├── doc
│   ├── FigNetwork.pdf
│   └── FigNetwork2.png
├── eval.py
├── LICENSE
├── main.py
├── utils
│   ├── pc_utils.py
│   ├── configs.py
│   ├── tfnp_compatibility.py
│   ├── align_utils.py
│   ├── spherical_utils.py
│   └── evaluation_utils.py
├── README.md
├── provider
│   ├── dataloder.py
│   └── training_data_prepare.py
└── model
    ├── modules.py
    ├── layers.py
    └── dualposenet.py

/doc/FigNetwork.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gorilla-Lab-SCUT/DualPoseNet/HEAD/doc/FigNetwork.pdf
--------------------------------------------------------------------------------
/doc/FigNetwork2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Gorilla-Lab-SCUT/DualPoseNet/HEAD/doc/FigNetwork2.png
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Gorilla Lab, SCUT.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | """ Evaluation of results of DualPoseNet on CAMERA25 and REAL275.
7 | """
8 |
9 | from __future__ import absolute_import
10 | from __future__ import division
11 | from __future__ import print_function
12 |
13 | import os
14 | import sys
15 |
16 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
17 | ROOT_DIR = BASE_DIR
18 | sys.path.append(os.path.join(ROOT_DIR, 'utils'))
19 | from evaluation_utils import evaluate
20 |
21 |
22 | CAMERA25_path = os.path.join('results', 'CAMERA25')
23 | REAL275_path = os.path.join('results', 'REAL275')
24 |
25 |
26 | print('\n*********** Evaluate the results of DualPoseNet on CAMERA25 ***********')
27 | evaluate(CAMERA25_path)
28 |
29 | print('\n*********** Evaluate the results of DualPoseNet on REAL275 ***********')
30 | evaluate(REAL275_path)
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jiehong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Gorilla Lab, SCUT.
2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Training/testing routines of DualPoseNet for category-level pose estimation on CAMERA25 or REAL275. 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import os 15 | import sys 16 | import logging 17 | import pprint 18 | pp = pprint.PrettyPrinter() 19 | 20 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 21 | ROOT_DIR = BASE_DIR 22 | sys.path.append(os.path.join(ROOT_DIR, 'model')) 23 | sys.path.append(os.path.join(ROOT_DIR, 'provider')) 24 | sys.path.append(os.path.join(ROOT_DIR, 'utils')) 25 | 26 | 27 | import tensorflow as tf 28 | import configs 29 | from dualposenet import DualPoseNet 30 | from evaluation_utils import evaluate 31 | 32 | 33 | def run(): 34 | FLAGS = configs.parse() 35 | assert FLAGS.dataset=='REAL275' or FLAGS.dataset=='CAMERA25', 'Error dataset of {}, which should be chosen from [REAL275, CAMERA25]'.format(FLAGS.dataset) 36 | assert FLAGS.phase in ['train', 'test', 'test_refine_encoder', 'test_refine_feature'], 'Error dataset of {}, which should be chosen from [train, test, test_refine_encoder, test_refine_feature]'.format(FLAGS.phase) 37 | 38 | FLAGS.log_dir = os.path.join('log', FLAGS.dataset) 39 | if not os.path.exists('log'): 40 | os.makedirs('log') 41 | if not os.path.exists(FLAGS.log_dir): 42 | os.makedirs(FLAGS.log_dir) 43 | if FLAGS.phase !='train': 44 | FLAGS.test_log_dir = os.path.join(FLAGS.log_dir, FLAGS.phase + '_epoch' + str(FLAGS.test_epoch)) 45 | if not os.path.exists(FLAGS.test_log_dir): 46 | os.makedirs(FLAGS.test_log_dir) 47 | 48 | run_config = tf.ConfigProto() 49 | run_config.gpu_options.allow_growth = True 50 | 51 | with tf.Session(config=run_config) as sess: 52 | model = DualPoseNet(FLAGS,sess) 53 | 54 | if FLAGS.phase == 'train': 55 | model.train() 56 | else: 57 | if FLAGS.phase == 'test': 58 | model.test() 59 | elif FLAGS.phase == 'test_refine_encoder': 60 | model.test_refine_encoder() 61 | elif FLAGS.phase == 'test_refine_feature': 62 | model.test_refine_feature() 63 | 64 | print('\n*********** Evaluate the results on {} ***********'.format(FLAGS.dataset)) 65 | evaluate(FLAGS.test_log_dir) 66 | 67 | 68 | def main(unused_argv): 69 | run() 70 | 71 | if __name__ == '__main__': 72 | logging.basicConfig(level=logging.INFO) 73 | tf.app.run() 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /utils/pc_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Modified based on: https://github.com/hughw19/NOCS_CVPR2019. 8 | """ 9 | 10 | import cv2 11 | import numpy as np 12 | 13 | import math 14 | import cmath 15 | 16 | 17 | def load_depth(img_path): 18 | """ Load depth image from img_path. 
""" 19 | depth_path = img_path + '_depth.png' 20 | depth = cv2.imread(depth_path, -1) 21 | if len(depth.shape) == 3: 22 | # This is encoded depth image, let's convert 23 | # NOTE: RGB is actually BGR in opencv 24 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 25 | depth16 = np.where(depth16==32001, 0, depth16) 26 | depth16 = depth16.astype(np.uint16) 27 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 28 | depth16 = depth 29 | else: 30 | assert False, '[ Error ]: Unsupported depth type.' 31 | return depth16 32 | 33 | 34 | def backproject(depth, intrinsics, instance_mask): 35 | intrinsics_inv = np.linalg.inv(intrinsics) 36 | image_shape = depth.shape 37 | width = image_shape[1] 38 | height = image_shape[0] 39 | 40 | x = np.arange(width) 41 | y = np.arange(height) 42 | 43 | #non_zero_mask = np.logical_and(depth > 0, depth < 5000) 44 | non_zero_mask = (depth > 0) 45 | final_instance_mask = np.logical_and(instance_mask, non_zero_mask) 46 | 47 | idxs = np.where(final_instance_mask) 48 | grid = np.array([idxs[1], idxs[0]]) 49 | 50 | # shape: height * width 51 | # mesh_grid = np.meshgrid(x, y) #[height, width, 2] 52 | # mesh_grid = np.reshape(mesh_grid, [2, -1]) 53 | length = grid.shape[1] 54 | ones = np.ones([1, length]) 55 | uv_grid = np.concatenate((grid, ones), axis=0) # [3, num_pixel] 56 | 57 | xyz = intrinsics_inv @ uv_grid # [3, num_pixel] 58 | xyz = np.transpose(xyz) #[num_pixel, 3] 59 | 60 | z = depth[idxs[0], idxs[1]] 61 | 62 | # print(np.amax(z), np.amin(z)) 63 | pts = xyz * z[:, np.newaxis]/xyz[:, -1:] 64 | pts[:, 0] = -pts[:, 0] 65 | pts[:, 1] = -pts[:, 1] 66 | 67 | return pts, idxs 68 | 69 | 70 | def pc2sphericalmap(pc, img, resolution=64): 71 | n = pc.shape[0] 72 | assert pc.shape[1] == 3 73 | 74 | pc = pc.astype(np.float32) 75 | x = pc[:, 0] 76 | y = pc[:, 1] 77 | z = pc[:, 2] 78 | t = np.pi / resolution 79 | k = 2 * np.pi / resolution 80 | r = np.sqrt(np.sum(pc ** 2, axis=1) + 1e-10).astype(np.float32) 81 | 82 | phi = np.around(np.arccos(z / r) / t).astype('int') % resolution 83 | arr = np.arctan2(y, x) 84 | rho = np.zeros(n) 85 | rho[y > 0] = np.around(arr[y > 0] / k) 86 | rho[y < 0] = np.around((arr[y < 0] + 2 * np.pi) / k) 87 | rho = rho.astype('int') % resolution 88 | f1 = np.zeros([resolution, resolution, 1], dtype='float32') 89 | f2 = np.zeros([resolution, resolution, 3], dtype='float32') 90 | 91 | for i in range(pc.shape[0]): 92 | tmp = np.real(cmath.rect(r[i], 0)) 93 | if f1[rho[i], phi[i], 0] <= tmp: 94 | f1[rho[i], phi[i], 0] = tmp 95 | f2[rho[i], phi[i], :] = img[i] 96 | 97 | return f1[np.newaxis, :, :, :], f2[np.newaxis, :, :, :] -------------------------------------------------------------------------------- /utils/configs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | 7 | import argparse 8 | import os 9 | import yaml 10 | 11 | def parse(args=None): 12 | parser = argparse.ArgumentParser(description='Configurations of DualPoseNet.', 13 | fromfile_prefix_chars='@') 14 | parser.add_argument('--phase', default='train', 15 | help="train/test/test_refine_encoder/test_refine_feature") 16 | parser.add_argument('--dataset', default='REAL275', 17 | help="REAL275/CAMERA25") 18 | 19 | # dataset 20 | parser.add_argument('--n_classes', '-nc', type=int, default=6, 21 | help='Number of classes in dataset') 22 | parser.add_argument('--input_res', '-res', type=int, default=64, 23 | help='Resolution for spherical inputs; may subsample if larger') 24 | 25 | # model 26 | parser.add_argument('--nfilters', default=[16, 16, 32, 32, 64, 64, 128, 128], 27 | type=lambda x: [int(_) for _ in x.split(',')], 28 | help='Number of filters per layer') 29 | parser.add_argument('--pool_layers', default=[0, 0, 1, 0, 1, 0, 1, 0], 30 | type=lambda x: [int(_) for _ in x.split(',')], 31 | help='Pooling layer indicator') 32 | parser.add_argument('--n_filter_params', '-nfp', type=int, default=8, 33 | help='Number of filter params (if 0, use max, else do spectral linear interpolation for localized filters.)') 34 | parser.add_argument('--nonlin', '-nl', type=str, default='prelu', 35 | help='Nonlinearity to be used') 36 | parser.add_argument('--fusion', '-fs', type=str, default='Max', 37 | help='fusion method in multi-scale aggregation[Max, Avg, Cat]') 38 | parser.add_argument('--transform_method', '-tm', choices=['naive', 'sep'], default='naive', 39 | help='SH transform method: NAIVE or SEParation of variables') 40 | parser.add_argument('--real_inputs', '-ri', action='store_true', default=True, 41 | help='Leverage symmetry when inputs are real.') 42 | 43 | # train 44 | parser.add_argument('--learning_rate', '-lr', type=yaml.load, default=0.0001, 45 | help='Initial learning rate') 46 | parser.add_argument('--lr_decay_steps', type=int, default=40000) 47 | parser.add_argument('--lr_decay_rate', type=float, default=0.5) 48 | parser.add_argument('--training_epoch', '-ne', type=int, default=30, 49 | help='Number of training epochs') 50 | parser.add_argument('--batch_size', '-bs', type=int, default=64, 51 | help='training batch size') 52 | parser.add_argument('--weight_decay', type=float, default=0.0) 53 | parser.add_argument('--implicit_loss_weight', type=float, default=10.0, 54 | help='Balanced parameter for the loss term of the inplicit pose decoder') 55 | 56 | # test 57 | parser.add_argument('--test_epoch', type=int, 58 | default=30, help='Epoch for test') 59 | 60 | # use given args instead of cmd line, if they exist 61 | if isinstance(args, list): 62 | # if list, parse as cmd lines arguments 63 | args_out = parser.parse_args(args) 64 | elif args is not None: 65 | # if dict, set values directly 66 | args_out = parser.parse_args('') 67 | for k, v in args.items(): 68 | setattr(args_out, k, v) 69 | else: 70 | args_out = parser.parse_args() 71 | return args_out 72 | -------------------------------------------------------------------------------- /utils/tfnp_compatibility.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some functions are designed to work with either numpy arrays or tensorflow tensors. 3 | These wrappers ensure compatibility. 4 | 5 | Source: https://github.com/daniilidis-group/spherical-cnn 6 | """ 7 | # this is a questionable design decision... 
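# Illustrative behaviour of the wrappers below (assuming TF1-style tensors):
#     shape(np.zeros((2, 3)))   -> (2, 3)            # numpy branch
#     shape(tf.zeros((2, 3)))   -> (2, 3)            # static shape via get_shape()
#     sum(x, axis=-1)           -> np.sum(...) or tf.reduce_sum(...) depending on x
# istf() is the single place where the numpy-vs-tensorflow dispatch happens.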
8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | 13 | def safe_cast(x, y): 14 | """ Cast x to type of y or y to type of x, without loss of precision. 15 | 16 | Works with complex and floats of any precision 17 | """ 18 | t = 'complex' if (x.dtype.is_complex or y.dtype.is_complex) else 'float' 19 | s = max(x.dtype.size, y.dtype.size) 20 | dtype = '{}{}'.format(t, s*8) 21 | 22 | return tf.cast(x, dtype), tf.cast(y, dtype) 23 | 24 | 25 | def istf(x): 26 | """ Check if argument is a tensorflow structure (or list of). """ 27 | if isinstance(x, list): 28 | x = x[0] 29 | return (isinstance(x, tf.Tensor) or 30 | isinstance(x, tf.Variable)) 31 | 32 | 33 | def shape(x): 34 | """ Return shape of either tensorflow Tensor or numpy array. """ 35 | return (tuple(x.get_shape().as_list()) if istf(x) 36 | else x.shape) 37 | 38 | 39 | def sum(*args, **kwargs): 40 | """ Return np.sum or tf.reduce_sum according to input. """ 41 | return fun(['reduce_sum', 'sum'], *args, **kwargs) 42 | 43 | 44 | def fun(fun_name, *args, **kwargs): 45 | """ Return np or tf version of function according to input. 46 | 47 | Args: 48 | fun_name (list or str): if str, return tf.fun_name or np.fun_name 49 | if list, return tf.fun_name[0] or np.fun_name[1] 50 | """ 51 | if isinstance(fun_name, list): 52 | f1, f2 = fun_name 53 | else: 54 | f1, f2 = fun_name, fun_name 55 | return (getattr(tf, f1)(*args, **kwargs) if any([istf(a) for a in args]) 56 | else getattr(np, f2)(*args, **kwargs)) 57 | 58 | 59 | def dot(x, y, *args, **kwargs): 60 | """ Return np.tensordot or tf.tensordot according to input. """ 61 | if istf(x) and not istf(y): 62 | y = tf.constant(y) 63 | if istf(y) and not istf(x): 64 | x = tf.constant(x) 65 | if istf(x) and istf(y): 66 | x, y = safe_cast(x, y) 67 | return tf.tensordot(x, y, *args, **kwargs) 68 | else: 69 | return np.tensordot(x, y, *args, **kwargs) 70 | 71 | 72 | def concat(*args, axis=0): 73 | """ Return np.concatenate or tf.concat according to input. """ 74 | # this is a bit fragile, why should y take type of x and not the other way around? 75 | return (tf.concat([*args], axis=axis) if istf(args[0]) 76 | else np.concatenate(args, axis=axis)) 77 | 78 | 79 | def fft(x, *args, **kwargs): 80 | """ Return np.fft.fft or tf.fft according to input. """ 81 | return (tf.fft(x, *args, **kwargs) if istf(x) 82 | else np.fft.fft(x, *args, **kwargs)) 83 | 84 | 85 | def conj(*args, **kwargs): 86 | return fun('conj', *args, **kwargs) 87 | 88 | 89 | def transpose(*args, **kwargs): 90 | return fun('transpose', *args, **kwargs) 91 | 92 | 93 | def reshape(*args, **kwargs): 94 | return fun('reshape', *args, **kwargs) 95 | 96 | 97 | def real_or_imag(x, part): 98 | """ Return real or imaginary part of either tensorflow Tensor or numpy array. """ 99 | if istf(x): 100 | fun = getattr(tf, part) 101 | if x.dtype.is_complex: 102 | return fun(x) 103 | else: 104 | nbits = x.dtype.size*8 105 | return fun(tf.cast(x, 'complex{}'.format(nbits*2))) 106 | else: 107 | fun = getattr(np, part) 108 | return fun(x) 109 | 110 | 111 | def real(x): 112 | return real_or_imag(x, 'real') 113 | 114 | 115 | def imag(x): 116 | return real_or_imag(x, 'imag') 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DualPoseNet 2 | Code for "DualPoseNet: Category-level 6D Object Pose and Size Estimation Using Dual Pose Network with Refined Learning of Pose Consistency", ICCV2021. 
[[Paper](https://openaccess.thecvf.com/content/ICCV2021/papers/Lin_DualPoseNet_Category-Level_6D_Object_Pose_and_Size_Estimation_Using_Dual_ICCV_2021_paper.pdf)][[Supp](https://openaccess.thecvf.com/content/ICCV2021/supplemental/Lin_DualPoseNet_Category-Level_6D_ICCV_2021_supplemental.pdf)][[Arxiv](https://arxiv.org/abs/2103.06526)]
3 |
4 | Created by Jiehong Lin, Zewei Wei, Zhihao Li, Songcen Xu, [Kui Jia](http://kuijia.site/), and Yuanqing Li.
5 |
6 | ![image](https://github.com/Gorilla-Lab-SCUT/DualPoseNet/blob/main/doc/FigNetwork2.png)
7 |
8 |
9 | ## Citation
10 | If you find our work useful in your research, please consider citing:
11 |
12 |     @InProceedings{Lin_2021_ICCV,
13 |         author    = {Lin, Jiehong and Wei, Zewei and Li, Zhihao and Xu, Songcen and Jia, Kui and Li, Yuanqing},
14 |         title     = {DualPoseNet: Category-Level 6D Object Pose and Size Estimation Using Dual Pose Network With Refined Learning of Pose Consistency},
15 |         booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
16 |         month     = {October},
17 |         year      = {2021},
18 |         pages     = {3560-3569}
19 |     }
20 |
21 | ## Requirements
22 | This code has been tested with
23 | - python 3.6.5
24 | - tensorflow-gpu 1.11.0
25 | - CUDA 11.2
26 |
27 | ## Downloads
28 | - Pre-trained models [[link](https://drive.google.com/file/d/16DVaudTE_K_dqXbKoW7SASgxRQnfHApI/view?usp=sharing)]
29 | - Segmentation predictions on CAMERA25 and REAL275 [[link](https://drive.google.com/file/d/1RwAbFWw2ITX9mXzLUEBjPy_g-MNdyHET/view?usp=sharing)]
30 | - Pose predictions on CAMERA25 and REAL275 [[link](https://drive.google.com/file/d/10TBFY73BMmTxfErlbMqZKfClZgxMzMd9/view?usp=sharing)]
31 |
32 | ## Evaluation
33 | Evaluate the results of DualPoseNet reported in the paper:
34 |
35 | ```
36 | python eval.py
37 | ```
38 | ## Data Preparation
39 |
40 | Download the data provided by [NOCS](https://github.com/hughw19/NOCS_CVPR2019) ([real_train](http://download.cs.stanford.edu/orion/nocs/real_train.zip), [real_test](http://download.cs.stanford.edu/orion/nocs/real_test.zip),
41 | [ground truths](http://download.cs.stanford.edu/orion/nocs/gts.zip),
42 | and [mesh models](http://download.cs.stanford.edu/orion/nocs/obj_models.zip)), and unzip them in the folder ```data/``` as follows:
43 |
44 | ```
45 | data
46 | ├── CAMERA
47 | │   ├── train
48 | │   └── val
49 | ├── Real
50 | │   ├── train
51 | │   └── test
52 | ├── gts
53 | │   ├── val
54 | │   └── real_test
55 | └── obj_models
56 |     ├── train
57 |     ├── val
58 |     ├── real_train
59 |     └── real_test
60 | ```
61 |
62 | Run the following script to prepare training instances:
63 |
64 | ```
65 | cd provider
66 | python training_data_prepare.py
67 | ```
68 |
69 | ## Training
70 |
71 | Command for training DualPoseNet:
72 | ```
73 | python main.py --phase train --dataset REAL275
74 | ```
75 | The configurations can be modified in ```utils/configs.py```.
76 |
77 | ## Testing
78 | Command for testing DualPoseNet without refined learning:
79 | ```
80 | python main.py --phase test --dataset REAL275
81 | ```
82 |
83 | Command for testing DualPoseNet with refined learning:
84 | ```
85 | python main.py --phase test_refine_encoder --dataset REAL275
86 | ```
87 |
88 | We also provide another, faster way of refinement by directly fine-tuning the pose-sensitive features:
89 | ```
90 | python main.py --phase test_refine_feature --dataset REAL275
91 | ```
92 |
93 | The configurations can also be modified in ```utils/configs.py```.
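Since ```configs.parse``` also accepts a dict of overrides (see ```utils/configs.py```), the test phases can be scripted without command-line flags. Below is a minimal sketch that mirrors what ```main.py``` does internally; it assumes it is run from the repository root and that a trained checkpoint for the chosen epoch already exists under ```log/REAL275```:

```
import os, sys
sys.path.extend(['model', 'provider', 'utils'])

import tensorflow as tf
import configs
from dualposenet import DualPoseNet
from evaluation_utils import evaluate

# Override only the options that differ from the defaults in utils/configs.py.
FLAGS = configs.parse({'phase': 'test', 'dataset': 'REAL275', 'test_epoch': 30})
FLAGS.log_dir = os.path.join('log', FLAGS.dataset)
FLAGS.test_log_dir = os.path.join(FLAGS.log_dir, FLAGS.phase + '_epoch' + str(FLAGS.test_epoch))
if not os.path.exists(FLAGS.test_log_dir):
    os.makedirs(FLAGS.test_log_dir)

run_config = tf.ConfigProto()
run_config.gpu_options.allow_growth = True
with tf.Session(config=run_config) as sess:
    DualPoseNet(FLAGS, sess).test()   # or .test_refine_encoder() / .test_refine_feature()

evaluate(FLAGS.test_log_dir)          # same metrics as eval.py
```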
94 | 95 | ## Acknowledgements 96 | 97 | Our implementation leverages the code from [SCNN](https://github.com/daniilidis-group/spherical-cnn), [NOCS](https://github.com/hughw19/NOCS_CVPR2019) and [SPD](https://github.com/mentian/object-deformnet). 98 | 99 | ## License 100 | Our code is released under MIT License (see LICENSE file for details). 101 | 102 | ## Contact 103 | `lin.jiehong@mail.scut.edu.cn` 104 | 105 | `kuijia@scut.edu.cn` 106 | 107 | 108 | -------------------------------------------------------------------------------- /provider/dataloder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Dataloders of REAL275 and CAMERA25. 8 | 9 | Author: Jiehong Lin 10 | """ 11 | 12 | import os 13 | import numpy as np 14 | import queue 15 | import threading 16 | import glob 17 | import _pickle as cPickle 18 | 19 | class Fetcher(threading.Thread): 20 | def __init__(self, opts): 21 | super(Fetcher, self).__init__() 22 | self.queue = queue.Queue(50) 23 | self.stopped = False 24 | self.opts = opts 25 | 26 | 27 | data_paths = glob.glob('data/training_instance/CAMERA25_*.pkl') 28 | if self.opts.dataset == 'REAL275': 29 | data_paths.append('data/training_instance/REAL275.pkl') 30 | print(data_paths) 31 | 32 | self.observed_pc = [] 33 | self.input_dis = [] 34 | self.input_rgb = [] 35 | self.rotation = [] 36 | self.translation = [] 37 | self.scale = [] 38 | 39 | for data_path in data_paths: 40 | print(data_path) 41 | with open(data_path, 'rb') as f: 42 | data = cPickle.load(f) 43 | self.observed_pc.append(data['observed_pc']) 44 | self.input_dis.append(data['input_dis']) 45 | self.input_rgb.append(data['input_rgb']) 46 | self.rotation.append(data['rotation']) 47 | self.translation.append(data['translation']) 48 | self.scale.append(data['scale']) 49 | 50 | self.observed_pc = np.concatenate(self.observed_pc, axis=0) 51 | self.input_dis = np.concatenate(self.input_dis, axis=0) 52 | self.input_rgb = np.concatenate(self.input_rgb, axis=0) 53 | self.rotation = np.concatenate(self.rotation, axis=0) 54 | self.translation = np.concatenate(self.translation, axis=0) 55 | self.scale = np.concatenate(self.scale, axis=0) 56 | 57 | self.batch_size = self.opts.batch_size 58 | self.sample_cnt = self.input_dis.shape[0] 59 | self.num_batches = self.sample_cnt//self.batch_size 60 | print ("NUM_INSTANCE is %s"%(self.sample_cnt)) 61 | print ("NUM_BATCH is %s"%(self.num_batches)) 62 | 63 | def run(self): 64 | while not self.stopped: 65 | idx = np.arange(self.sample_cnt) 66 | np.random.shuffle(idx) 67 | self.observed_pc = self.observed_pc[idx, ...] 68 | self.input_dis = self.input_dis[idx, ...] 69 | self.input_rgb = self.input_rgb[idx, ...] 70 | self.rotation = self.rotation[idx, ...] 71 | self.translation = self.translation[idx, ...] 72 | self.scale = self.scale[idx, ...] 
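            # All six arrays above are permuted with the same index, so the spherical
            # maps, observed points and pose labels stay aligned per instance. The loop
            # below slices them into batches of batch_size and pushes each tuple onto
            # the bounded queue (maxsize 50); the training loop pops them via fetch(),
            # which is a thin wrapper around self.queue.get().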
73 | 74 | for batch_idx in range(self.num_batches): 75 | if self.stopped: 76 | return None 77 | start_idx = batch_idx * self.batch_size 78 | end_idx = (batch_idx + 1) * self.batch_size 79 | 80 | batch_input_dis = self.input_dis[start_idx:end_idx, :, :, :].copy() 81 | batch_input_rgb = self.input_rgb[start_idx:end_idx, :, :, :].copy() 82 | batch_observed_pc = self.observed_pc[start_idx:end_idx, :, :].copy() 83 | batch_rotation = self.rotation[start_idx:end_idx, :].copy() 84 | batch_translation = self.translation[start_idx:end_idx, :].copy() 85 | batch_scale = self.scale[start_idx:end_idx, :].copy() 86 | self.queue.put((batch_input_dis, batch_input_rgb, batch_observed_pc, batch_rotation, batch_translation, batch_scale)) 87 | return None 88 | 89 | def fetch(self): 90 | if self.stopped: 91 | return None 92 | return self.queue.get() 93 | 94 | def shutdown(self): 95 | self.stopped = True 96 | print ("Shutdown .....") 97 | while not self.queue.empty(): 98 | self.queue.get() 99 | print ("Remove all queue data") 100 | 101 | -------------------------------------------------------------------------------- /utils/align_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | RANSAC for Similarity Transformation Estimation 3 | Modified from https://github.com/hughw19/NOCS_CVPR2019 & https://github.com/mentian/object-deformnet 4 | Originally Written by Srinath Sridhar 5 | """ 6 | import time 7 | import numpy as np 8 | 9 | from pc_utils import backproject, pc2sphericalmap 10 | 11 | 12 | def estimateSimilarityUmeyama(SourceHom, TargetHom): 13 | # Copy of original paper is at: http://web.stanford.edu/class/cs273/refs/umeyama.pdf 14 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 15 | TargetCentroid = np.mean(TargetHom[:3, :], axis=1) 16 | nPoints = SourceHom.shape[1] 17 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 18 | CenteredTarget = TargetHom[:3, :] - np.tile(TargetCentroid, (nPoints, 1)).transpose() 19 | CovMatrix = np.matmul(CenteredTarget, np.transpose(CenteredSource)) / nPoints 20 | if np.isnan(CovMatrix).any(): 21 | print('nPoints:', nPoints) 22 | print(SourceHom.shape) 23 | print(TargetHom.shape) 24 | raise RuntimeError('There are NANs in the input.') 25 | 26 | U, D, Vh = np.linalg.svd(CovMatrix, full_matrices=True) 27 | d = (np.linalg.det(U) * np.linalg.det(Vh)) < 0.0 28 | if d: 29 | D[-1] = -D[-1] 30 | U[:, -1] = -U[:, -1] 31 | # rotation 32 | Rotation = np.matmul(U, Vh) 33 | # scale 34 | varP = np.var(SourceHom[:3, :], axis=1).sum() 35 | Scale = 1 / varP * np.sum(D) 36 | # translation 37 | Translation = TargetHom[:3, :].mean(axis=1) - SourceHom[:3, :].mean(axis=1).dot(Scale*Rotation.T) 38 | # transformation matrix 39 | OutTransform = np.identity(4) 40 | OutTransform[:3, :3] = Scale * Rotation 41 | OutTransform[:3, 3] = Translation 42 | 43 | return Scale, Rotation, Translation, OutTransform 44 | 45 | 46 | def estimateSimilarityTransform(source: np.array, target: np.array, verbose=False): 47 | """ Add RANSAC algorithm to account for outliers. 48 | """ 49 | assert source.shape[0] == target.shape[0], 'Source and Target must have same number of points.' 
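    # Flow of the code below: lift both point sets to homogeneous coordinates,
    # derive an inlier threshold from the source diameter (diameter / 10), then run
    # up to 128 RANSAC iterations, each fitting a similarity transform to 5 random
    # correspondences with estimateSimilarityUmeyama. The hypothesis with the best
    # inlier ratio is refit on all of its inliers to produce the final scale,
    # rotation and translation.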
50 | SourceHom = np.transpose(np.hstack([source, np.ones([source.shape[0], 1])])) 51 | TargetHom = np.transpose(np.hstack([target, np.ones([target.shape[0], 1])])) 52 | # Auto-parameter selection based on source heuristics 53 | # Assume source is object model or gt nocs map, which is of high quality 54 | SourceCentroid = np.mean(SourceHom[:3, :], axis=1) 55 | nPoints = SourceHom.shape[1] 56 | CenteredSource = SourceHom[:3, :] - np.tile(SourceCentroid, (nPoints, 1)).transpose() 57 | SourceDiameter = 2 * np.amax(np.linalg.norm(CenteredSource, axis=0)) 58 | InlierT = SourceDiameter / 10.0 # 0.1 of source diameter 59 | maxIter = 128 60 | confidence = 0.99 61 | 62 | if verbose: 63 | print('Inlier threshold: ', InlierT) 64 | print('Max number of iterations: ', maxIter) 65 | 66 | BestInlierRatio = 0 67 | BestInlierIdx = np.arange(nPoints) 68 | for i in range(0, maxIter): 69 | # Pick 5 random (but corresponding) points from source and target 70 | RandIdx = np.random.randint(nPoints, size=5) 71 | Scale, _, _, OutTransform = estimateSimilarityUmeyama(SourceHom[:, RandIdx], TargetHom[:, RandIdx]) 72 | PassThreshold = Scale * InlierT # propagate inlier threshold to target scale 73 | Diff = TargetHom - np.matmul(OutTransform, SourceHom) 74 | ResidualVec = np.linalg.norm(Diff[:3, :], axis=0) 75 | InlierIdx = np.where(ResidualVec < PassThreshold)[0] 76 | nInliers = InlierIdx.shape[0] 77 | InlierRatio = nInliers / nPoints 78 | # update best hypothesis 79 | if InlierRatio > BestInlierRatio: 80 | BestInlierRatio = InlierRatio 81 | BestInlierIdx = InlierIdx 82 | if verbose: 83 | print('Iteration: ', i) 84 | print('Inlier ratio: ', BestInlierRatio) 85 | # early break 86 | if (1 - (1 - BestInlierRatio ** 5) ** i) > confidence: 87 | break 88 | 89 | if(BestInlierRatio < 0.1): 90 | print('[ WARN ] - Something is wrong. Small BestInlierRatio: ', BestInlierRatio) 91 | return None, None, None, None 92 | 93 | SourceInliersHom = SourceHom[:, BestInlierIdx] 94 | TargetInliersHom = TargetHom[:, BestInlierIdx] 95 | Scale, Rotation, Translation, OutTransform = estimateSimilarityUmeyama(SourceInliersHom, TargetInliersHom) 96 | 97 | if verbose: 98 | print('BestInlierRatio:', BestInlierRatio) 99 | print('Rotation:\n', Rotation) 100 | print('Translation:\n', Translation) 101 | print('Scale:', Scale) 102 | 103 | return Scale, Rotation, Translation, OutTransform 104 | 105 | 106 | def align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path, verbose=False): 107 | num_instances = len(instance_ids) 108 | error_messages = '' 109 | elapses = [] 110 | 111 | instance_pts = np.zeros((num_instances, 1024, 3)) 112 | dis_smaps = np.zeros((num_instances, 64, 64, 1)) 113 | rgb_smaps = np.zeros((num_instances, 64, 64, 3)) 114 | 115 | scales = np.zeros(num_instances) 116 | rotations = np.zeros((num_instances, 3, 3)) 117 | translations = np.zeros((num_instances, 3)) 118 | 119 | 120 | for i in range(num_instances): 121 | mask = masks[:, :, i] 122 | coord = coords[:, :, i, :] 123 | pts, idxs = backproject(depth, intrinsics, mask) 124 | coord_pts = coord[idxs[0], idxs[1], :] - 0.5 125 | try: 126 | start = time.time() 127 | s, R, T, outtransform = estimateSimilarityTransform(coord_pts, pts, False) 128 | elapsed = time.time() - start 129 | if verbose: 130 | print('elapsed: ', elapsed) 131 | elapses.append(elapsed) 132 | except Exception as e: 133 | message = '[ Error ] aligning instance {} in {} fails. 
Message: {}.'.format(instance_ids[i], img_path, str(e)) 134 | print(message) 135 | error_messages += message + '\n' 136 | s = 1.0 137 | R = np.eye(3) 138 | T = np.zeros(3) 139 | outtransform = np.identity(4, dtype=np.float32) 140 | 141 | pts = pts / 1000.0 142 | centroid = np.mean(pts, axis=0) 143 | pts = pts - centroid[np.newaxis, :] 144 | T = T / 1000.0 - centroid 145 | 146 | img = image[idxs[0], idxs[1], :] 147 | img = (img-np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 148 | dis_map, rgb_map = pc2sphericalmap( 149 | pts, img, resolution=64) 150 | 151 | if pts.shape[0]>1024: 152 | pts = pts[np.random.choice( 153 | pts.shape[0], 1024, replace=False), :] 154 | else: 155 | pts = pts[np.random.choice( 156 | pts.shape[0], 1024), :] 157 | 158 | instance_pts[i, :, :] = pts 159 | dis_smaps[i, :, :, :] = dis_map[0] 160 | rgb_smaps[i, :, :, :] = rgb_map[0] 161 | 162 | scales[i] = s / 1000.0 163 | rotations[i, :, :] = R.transpose() 164 | translations[i, :] = T 165 | 166 | return instance_pts, dis_smaps, rgb_smaps, scales, rotations, translations, error_messages, elapses -------------------------------------------------------------------------------- /model/modules.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Main Modules in DualPoseNet. 8 | 9 | * The code of encoder is modified based on: https://github.com/daniilidis-group/spherical-cnn. 10 | 11 | Author: Jiehong Lin 12 | """ 13 | import numpy as np 14 | import tensorflow as tf 15 | 16 | import spherical_utils 17 | from layers import sphconv, conv1d, fully_connected, area_weights, block 18 | 19 | 20 | def dup(x): 21 | """ Return two references for input; useful when creating NNs and storing references to layers """ 22 | return [x, x] 23 | 24 | 25 | class encoder(object): 26 | def __init__(self, opts, is_training, name="encoder"): 27 | self.opts = opts 28 | self.is_training = is_training 29 | self.name = name 30 | self.reuse = False 31 | self.bn = False 32 | 33 | self.method = self.opts.transform_method 34 | self.real = self.opts.real_inputs 35 | 36 | if self.method == 'naive': 37 | args = self.opts 38 | fun = lambda *args, **kwargs: spherical_utils.sph_harm_all( 39 | *args, **kwargs, real=self.real) 40 | 41 | with tf.name_scope('harmonics_or_legendre'): 42 | res = self.opts.input_res 43 | self.harmonics = [fun(res // (2**i), as_tfvar=True) 44 | for i in range(sum(self.opts.pool_layers) + 1)] 45 | 46 | def __call__(self, dis_inputs, rgb_inputs): 47 | 48 | net = {} 49 | high = 0 50 | low = 1 51 | args = self.opts 52 | convfun = sphconv 53 | l_or_h = self.harmonics 54 | method = self.method 55 | dis_curr = dis_inputs 56 | rgb_curr = rgb_inputs 57 | 58 | with tf.variable_scope(self.name, reuse=self.reuse): 59 | 60 | # Main Streams with Spherical Fusion 61 | for i, (nf, pool) in enumerate(zip(args.nfilters, args.pool_layers)): 62 | if i in [3, 5, 7]: 63 | with tf.variable_scope('fusion_conv{}'.format(i), reuse=tf.AUTO_REUSE): 64 | fusion_curr = tf.concat([dis_curr, rgb_curr], axis=-1) 65 | net['fusion_conv{}'.format(i)], fusion_curr = dup(block(args, convfun, self.is_training, fusion_curr, nf, 66 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 67 | harmonics_or_legendre=l_or_h[high], method=method)) 68 | dis_curr = tf.concat([dis_curr, fusion_curr], axis=-1) 69 | rgb_curr = 
tf.concat([rgb_curr, fusion_curr], axis=-1) 70 | else: 71 | if not pool: 72 | with tf.variable_scope('dis_conv{}'.format(i), reuse=tf.AUTO_REUSE): 73 | net['dis_conv{}'.format(i)], dis_curr = dup(block(args, convfun, self.is_training, dis_curr, nf, 74 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 75 | harmonics_or_legendre=l_or_h[high], method=method)) 76 | with tf.variable_scope('rgb_conv{}'.format(i), reuse=tf.AUTO_REUSE): 77 | net['rgb_conv{}'.format(i)], rgb_curr = dup(block(args, convfun, self.is_training, rgb_curr, nf, 78 | n_filter_params=args.n_filter_params, weight_decay=args.weight_decay, 79 | harmonics_or_legendre=l_or_h[high], method=method)) 80 | else: 81 | with tf.variable_scope('dis_conv{}'.format(i), reuse=tf.AUTO_REUSE): 82 | net['dis_conv{}'.format(i)], dis_curr = dup(block(args, convfun, self.is_training, dis_curr, nf, n_filter_params=args.n_filter_params, 83 | weight_decay=args.weight_decay, harmonics_or_legendre=l_or_h[ 84 | high], method=method, 85 | spectral_pool=0, harmonics_or_legendre_low=l_or_h[low])) 86 | with tf.variable_scope('rgb_conv{}'.format(i), reuse=tf.AUTO_REUSE): 87 | net['rgb_conv{}'.format(i)], rgb_curr = dup(block(args, convfun, self.is_training, rgb_curr, nf, n_filter_params=args.n_filter_params, 88 | weight_decay=args.weight_decay, harmonics_or_legendre=l_or_h[ 89 | high], method=method, 90 | spectral_pool=0, harmonics_or_legendre_low=l_or_h[low])) 91 | # pooling 92 | dis_curr = area_weights(tf.layers.average_pooling2d( 93 | area_weights(dis_curr), 2*pool, 2*pool, 'same'), invert=True) 94 | rgb_curr = area_weights(tf.layers.average_pooling2d( 95 | area_weights(rgb_curr), 2*pool, 2*pool, 'same'), invert=True) 96 | 97 | high += 1 98 | low += 1 99 | 100 | # Aggregation of Multi-scale Spherical Features 101 | feat1 = net['fusion_conv3'] 102 | feat1 = tf.layers.flatten(inputs=feat1) 103 | feat1 = fully_connected(feat1, 1024, scope='flatten_conv3', weight_decay=self.opts.weight_decay, 104 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 105 | 106 | feat2 = net['fusion_conv5'] 107 | feat2 = tf.layers.flatten(inputs=feat2) 108 | feat2 = fully_connected(feat2, 1024, scope='flatten_conv5', weight_decay=self.opts.weight_decay, 109 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 110 | 111 | feat3 = net['fusion_conv7'] 112 | feat3 = tf.layers.flatten(inputs=feat3) 113 | feat3 = fully_connected(feat3, 1024, scope='flatten_conv7', weight_decay=self.opts.weight_decay, 114 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 115 | 116 | if self.opts.fusion == 'Avg': 117 | curr = tf.concat([tf.expand_dims(feat1, 2), tf.expand_dims( 118 | feat2, 2), tf.expand_dims(feat3, 2)], axis=2) 119 | curr = tf.reduce_mean(curr, axis=2) 120 | elif self.opts.fusion == 'Max': 121 | curr = tf.concat([tf.expand_dims(feat1, 2), tf.expand_dims( 122 | feat2, 2), tf.expand_dims(feat3, 2)], axis=2) 123 | curr = tf.reduce_max(curr, axis=2) 124 | elif self.opts.fusion == 'Cat': 125 | curr = tf.concat([feat1, feat2, feat3], axis=-1) 126 | else: 127 | assert False 128 | 129 | curr = fully_connected(curr, 1024, scope='fc1', weight_decay=self.opts.weight_decay, 130 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 131 | curr = fully_connected(curr, 1024, scope='fc2', weight_decay=self.opts.weight_decay, 132 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 133 | 134 | self.reuse = True 135 | self.variables = tf.get_collection( 136 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 137 | 138 | return 
curr 139 | 140 | 141 | class explicit_decoder(object): 142 | def __init__(self, opts, is_training, name="explicit_decoder"): 143 | self.opts = opts 144 | self.is_training = is_training 145 | self.name = name 146 | self.reuse = False 147 | self.bn = False 148 | 149 | def __call__(self, feat): 150 | 151 | with tf.variable_scope(self.name, reuse=self.reuse): 152 | 153 | rot_feat = fully_connected(feat, 1024, scope='rot_fc1', weight_decay=self.opts.weight_decay, 154 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 155 | rot_feat = fully_connected(rot_feat, 512, scope='rot_fc2', weight_decay=self.opts.weight_decay, 156 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 157 | rot_feat = tf.layers.dense( 158 | inputs=rot_feat, activation=None, units=4, name="rot_fc3") 159 | rotation = rot_feat / \ 160 | (tf.linalg.norm(rot_feat, axis=1, keepdims=True)+1e-10) 161 | 162 | trans_feat = fully_connected(feat, 1024, scope='trans_fc1', weight_decay=self.opts.weight_decay, 163 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 164 | trans_feat = fully_connected(trans_feat, 512, scope='trans_fc2', weight_decay=self.opts.weight_decay, 165 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 166 | translation = tf.layers.dense( 167 | inputs=trans_feat, activation=None, units=3, name="trans_fc3") 168 | 169 | scale_feat = fully_connected(feat, 1024, scope='scale_fc1', weight_decay=self.opts.weight_decay, 170 | activation_fn=tf.nn.leaky_relu, bn=self.opts.batch_norm, is_training=self.is_training) 171 | scale_feat = fully_connected(scale_feat, 512, scope='scale_fc2', weight_decay=self.opts.weight_decay, 172 | activation_fn=tf.nn.leaky_relu, bn=self.opts.batch_norm, is_training=self.is_training) 173 | scale = tf.layers.dense( 174 | inputs=scale_feat, activation=None, units=3, name="scale_fc3") 175 | 176 | 177 | self.reuse = True 178 | self.variables = tf.get_collection( 179 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 180 | 181 | return translation, rotation, scale 182 | 183 | 184 | class implicit_decoder(object): 185 | def __init__(self, opts, is_training, output_npoint=1024, name="implcit_decoder"): 186 | self.opts = opts 187 | self.npoint = output_npoint 188 | self.is_training = is_training 189 | self.name = name 190 | self.reuse = False 191 | 192 | def __call__(self, pts, feat): 193 | with tf.variable_scope(self.name, reuse=self.reuse): 194 | b = pts.get_shape()[0].value 195 | n = pts.get_shape()[1].value 196 | c = feat.get_shape()[1].value 197 | 198 | feat = tf.concat( 199 | [pts, tf.tile(tf.reshape(feat, [b, 1, c]), [1, n, 1])], axis=-1) 200 | feat = conv1d(feat, 1024, 1, scope='conv1', weight_decay=self.opts.weight_decay, 201 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 202 | feat = conv1d(feat, 512, 1, scope='conv2', weight_decay=self.opts.weight_decay, 203 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 204 | feat = conv1d(feat, 256, 1, scope='conv3', weight_decay=self.opts.weight_decay, 205 | activation_fn=tf.nn.leaky_relu, is_training=self.is_training) 206 | Q = conv1d(feat, 3, 1, scope='conv4', weight_decay=self.opts.weight_decay, 207 | activation_fn=None, bn=False, is_training=self.is_training) 208 | 209 | self.reuse = True 210 | self.variables = tf.get_collection( 211 | tf.GraphKeys.TRAINABLE_VARIABLES, self.name) 212 | 213 | return Q 214 | -------------------------------------------------------------------------------- /model/layers.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ Custom layers. 7 | 8 | Modified based on: https://github.com/daniilidis-group/spherical-cnn. 9 | """ 10 | 11 | 12 | import tensorflow as tf 13 | import numpy as np 14 | 15 | import spherical_utils 16 | import tfnp_compatibility as tfnp 17 | 18 | 19 | def sphconv(inputs, filters, use_bias=True, n_filter_params=0, weight_decay=0, 20 | *args, **kwargs): 21 | shapei = tfnp.shape(inputs) 22 | spectral_input = True if len(shapei) == 5 else False 23 | nchan = shapei[-1] 24 | n = shapei[2] 25 | if spectral_input: 26 | n *= 2 27 | 28 | with tf.variable_scope(None, default_name='sphconv'): 29 | # we desire that variance to stay constant after every layer 30 | # factor n // 2 is because we sum n // 2 coefficients 31 | # factor nchan is because we sum nchan channels 32 | # factor 2pi is the 'gain' of integrating over SO(3) 33 | # the factor 2 takes into account non-zero mean 34 | # (see He et al 2015 "Delving deep into rectifiers") 35 | std = 2./(2 * np.pi * np.sqrt((n // 2) * (nchan))) 36 | regularizer = tf.contrib.layers.l2_regularizer( 37 | weight_decay) if weight_decay > 0 else None 38 | 39 | if n_filter_params == 0: 40 | weights = tf.get_variable('W', 41 | trainable=True, 42 | initializer=tf.truncated_normal([nchan, n // 2, filters], 43 | stddev=std), 44 | regularizer=regularizer) 45 | ker = weights[:, np.newaxis, :, np.newaxis, :] 46 | else: 47 | nw_in = n_filter_params 48 | if nw_in > n // 2: 49 | nw_in = n // 2 50 | 51 | weights = tf.get_variable('W', 52 | trainable=True, 53 | initializer=tf.truncated_normal([nchan, 54 | nw_in, 55 | filters], 56 | stddev=std), 57 | regularizer=regularizer) 58 | xw_in = np.linspace(0, 1, nw_in) 59 | xw_out = np.linspace(0, 1, n // 2) 60 | id_out = np.searchsorted(xw_in, xw_out) 61 | subws = [] 62 | for i, x in zip(id_out, xw_out): 63 | # linear interpolation 64 | # HACK! 
we assume the first indices match so i-1 when i==0 cancels out 65 | subws.append(weights[:, i-1, :] + 66 | (weights[:, i, :] - weights[:, i-1, :]) * 67 | (x-xw_in[i-1]) / 68 | ((xw_in[i]-xw_in[i-1]))) 69 | ker = tf.stack(subws, axis=1)[:, np.newaxis, :, np.newaxis, :] 70 | 71 | if use_bias: 72 | bias = tf.get_variable('b', 73 | trainable=True, 74 | initializer=tf.zeros([1, 1, 1, filters], dtype=tf.float32)) 75 | else: 76 | bias = tf.zeros([1, 1, 1, filters], dtype=tf.float32) 77 | 78 | conv = spherical_utils.sph_conv_batch(inputs, ker, *args, **kwargs) 79 | conv = conv + bias 80 | 81 | for k, v in {'W': weights, 'b': bias, 'activations': conv}.items(): 82 | tf.summary.histogram(k, v) 83 | # avg 84 | tf.summary.scalar('norm_activation', tf.reduce_mean( 85 | tf.norm(conv, axis=(1, 2)) / n)) 86 | 87 | return conv 88 | 89 | 90 | def block(params, fun, is_training=None, *args, **kwargs): 91 | """ Block consisting of weight layer + batch norm + nonlinearity""" 92 | params.batch_norm = False 93 | use_bias = not params.batch_norm 94 | curr = fun(*args, **kwargs, use_bias=use_bias) 95 | if params.batch_norm: 96 | curr = tf.layers.batch_normalization(curr, 97 | # doesn't make sense to learn scale when using ReLU 98 | fused=False, 99 | scale=False, 100 | training=is_training, 101 | renorm=params.batch_renorm) 102 | for v in tf.get_variable_scope().trainable_variables(): 103 | if 'batch_normalization' in v.name: 104 | tf.summary.histogram(v.name, v) 105 | 106 | return nonlin(params)(curr) 107 | 108 | 109 | def nonlin(params): 110 | return getattr(tf.nn, params.nonlin, globals().get(params.nonlin)) 111 | 112 | 113 | def identity(inputs): 114 | return inputs 115 | 116 | 117 | def prelu(inputs): 118 | """ From: https://stackoverflow.com/a/40264459 """ 119 | alphas = tf.Variable(0.1 * tf.ones(inputs.get_shape()[-1]), 120 | trainable=True, 121 | dtype=tf.float32) 122 | pos = tf.nn.relu(inputs) 123 | neg = alphas * (inputs - abs(inputs)) * 0.5 124 | 125 | return pos + neg 126 | 127 | 128 | def area_weights(x, invert=False): 129 | """ Apply weight each cell according to its area; useful for averaging/avg pooling. """ 130 | n = tfnp.shape(x)[1] 131 | phi, theta = spherical_utils.sph_sample(n) 132 | phi += np.diff(phi)[0]/2 133 | # this is proportional to the cell area, not exactly the area 134 | # this is the same as using |cos\phi_1 - cos\phi_2| 135 | if invert: 136 | x /= np.sin(phi)[np.newaxis, np.newaxis, :, np.newaxis] 137 | else: 138 | x *= np.sin(phi)[np.newaxis, np.newaxis, :, np.newaxis] 139 | 140 | return x 141 | 142 | 143 | def fully_connected(inputs, 144 | num_outputs, 145 | scope, 146 | use_xavier=True, 147 | stddev=1e-3, 148 | weight_decay=0.00001, 149 | activation_fn=tf.nn.relu, 150 | bn=False, 151 | bn_decay=None, 152 | use_bias=True, 153 | is_training=None): 154 | """ Fully connected layer with non-linear operation. 155 | 156 | Args: 157 | inputs: 2-D tensor BxN 158 | num_outputs: int 159 | 160 | Returns: 161 | Variable tensor of size B x num_outputs. 
162 | """ 163 | 164 | with tf.variable_scope(scope) as sc: 165 | if use_xavier: 166 | initializer = tf.contrib.layers.xavier_initializer() 167 | else: 168 | initializer = tf.truncated_normal_initializer(stddev=stddev) 169 | 170 | if weight_decay > 0: 171 | outputs = tf.layers.dense(inputs, num_outputs, 172 | use_bias=use_bias, kernel_initializer=initializer, 173 | kernel_regularizer=tf.contrib.layers.l2_regularizer( 174 | weight_decay), 175 | bias_regularizer=tf.contrib.layers.l2_regularizer( 176 | weight_decay), 177 | reuse=None) 178 | else: 179 | outputs = tf.layers.dense(inputs, num_outputs, 180 | use_bias=use_bias, kernel_initializer=initializer, 181 | kernel_regularizer=None, 182 | bias_regularizer=None, 183 | reuse=None) 184 | if bn: 185 | # outputs = tf.layers.batch_normalization(outputs, momentum=bn_decay, training=is_training, renorm=False) 186 | outputs = tf.layers.batch_normalization( 187 | outputs, training=is_training, renorm=False) 188 | 189 | if activation_fn is not None: 190 | outputs = activation_fn(outputs) 191 | 192 | return outputs 193 | 194 | 195 | def conv1d(inputs, 196 | num_output_channels, 197 | kernel_size, 198 | scope=None, 199 | stride=1, 200 | padding='SAME', 201 | use_xavier=True, 202 | stddev=1e-3, 203 | weight_decay=0.00001, 204 | activation_fn=tf.nn.relu, 205 | bn=False, 206 | bn_decay=None, 207 | use_bias=True, 208 | is_training=None, 209 | reuse=None): 210 | """ 1D convolution with non-linear operation. 211 | 212 | Args: 213 | inputs: 3-D tensor variable BxHxWxC 214 | num_output_channels: int 215 | kernel_size: int 216 | scope: string 217 | stride: a list of 2 ints 218 | padding: 'SAME' or 'VALID' 219 | use_xavier: bool, use xavier_initializer if true 220 | stddev: float, stddev for truncated_normal init 221 | weight_decay: float 222 | activation_fn: function 223 | bn: bool, whether to use batch norm 224 | bn_decay: float or float tensor variable in [0,1] 225 | is_training: bool Tensor variable 226 | 227 | Returns: 228 | Variable tensor 229 | """ 230 | with tf.variable_scope(scope, reuse=reuse): 231 | if use_xavier: 232 | initializer = tf.contrib.layers.xavier_initializer() 233 | else: 234 | initializer = tf.truncated_normal_initializer(stddev=stddev) 235 | 236 | if weight_decay > 0: 237 | outputs = tf.layers.conv1d(inputs, num_output_channels, kernel_size, stride, padding, 238 | kernel_initializer=initializer, 239 | kernel_regularizer=tf.contrib.layers.l2_regularizer( 240 | weight_decay), 241 | bias_regularizer=tf.contrib.layers.l2_regularizer( 242 | weight_decay), 243 | use_bias=use_bias, reuse=None) 244 | else: 245 | outputs = tf.layers.conv1d(inputs, num_output_channels, kernel_size, stride, padding, 246 | kernel_initializer=initializer, 247 | kernel_regularizer=None, 248 | bias_regularizer=None, 249 | use_bias=use_bias, reuse=None) 250 | 251 | if bn: 252 | outputs = tf.layers.batch_normalization( 253 | outputs, momentum=bn_decay, training=is_training, renorm=False, fused=True) 254 | 255 | if activation_fn is not None: 256 | outputs = activation_fn(outputs) 257 | 258 | return outputs 259 | 260 | 261 | def quaternion2mat(q): 262 | # quaternion2mat(q) = transforms3d.euler.qua2mat(q).T 263 | b = tf.shape(q)[0] 264 | 265 | w = tf.slice(q, [0, 0], [-1, 1]) 266 | x = tf.slice(q, [0, 1], [-1, 1]) 267 | y = tf.slice(q, [0, 2], [-1, 1]) 268 | z = tf.slice(q, [0, 3], [-1, 1]) 269 | 270 | r1 = tf.reshape( 271 | tf.concat([1-2*(y**2)-2*(z**2), 2*x*y+2*w*z, 2*x*z-2*w*y], axis=1), [b, 1, 3]) 272 | r2 = tf.reshape( 273 | tf.concat([2*x*y-2*w*z, 1-2*(x**2)-2*(z**2), 
2*y*z+2*w*x], axis=1), [b, 1, 3]) 274 | r3 = tf.reshape( 275 | tf.concat([2*x*z+2*w*y, 2*y*z-2*w*x, 1-2*(x**2)-2*(y**2)], axis=1), [b, 1, 3]) 276 | 277 | r = tf.concat([r1, r2, r3], axis=1) 278 | return r 279 | 280 | 281 | def point_transformation(pc, rotation, translation, scale): 282 | b = tf.shape(pc)[0] 283 | 284 | r = quaternion2mat(rotation) 285 | pc = (pc - tf.tile(tf.reshape(translation, [b, 1, 3]), [1, 1024, 1]))/tf.tile( 286 | tf.reshape((tf.linalg.norm(scale, axis=1)+1e-10), [b, 1, 1]), [1, 1024, 3]) 287 | pc = tf.matmul(pc, r) 288 | return pc -------------------------------------------------------------------------------- /utils/spherical_utils.py: -------------------------------------------------------------------------------- 1 | """ Spherical harmonics transforms/inverses and spherical convolution. 2 | 3 | Source: https://github.com/daniilidis-group/spherical-cnn 4 | """ 5 | import functools 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | from scipy.special import sph_harm 10 | 11 | import tfnp_compatibility as tfnp 12 | from tfnp_compatibility import istf 13 | 14 | 15 | def safe_cast(x, y): 16 | """ Cast x to type of y or y to type of x, without loss of precision. 17 | 18 | Works with complex and floats of any precision 19 | """ 20 | t = 'complex' if (x.dtype.is_complex or y.dtype.is_complex) else 'float' 21 | s = max(x.dtype.size, y.dtype.size) 22 | dtype = '{}{}'.format(t, s*8) 23 | 24 | return tf.cast(x, dtype), tf.cast(y, dtype) 25 | 26 | 27 | def sph_sample(n, mode='DH'): 28 | """ Sample grid on a sphere. 29 | 30 | Args: 31 | n (int): dimension is n x n 32 | mode (str): sampling mode; DH or GLQ 33 | 34 | Returns: 35 | theta, phi (1D arrays): polar and azimuthal angles 36 | """ 37 | assert n % 2 == 0 38 | j = np.arange(0, n) 39 | if mode == 'DH': 40 | return j*np.pi/n, j*2*np.pi/n 41 | elif mode == 'ours': 42 | return (2*j+1)*np.pi/2/n, j*2*np.pi/n 43 | elif mode == 'GLQ': 44 | from pyshtools.shtools import GLQGridCoord 45 | phi, theta = GLQGridCoord(n-1) 46 | # convert latitude to [0, np.pi/2] 47 | return np.radians(phi+90), np.radians(theta) 48 | elif mode == 'naive': 49 | # repeat first and last points; useful for plotting 50 | return np.linspace(0, np.pi, n), np.linspace(0, 2*np.pi, n) 51 | 52 | 53 | # cache outputs; 2050 > 32*64 54 | @functools.lru_cache(maxsize=2050, typed=False) 55 | def sph_harm_lm(l, m, n): 56 | """ Wrapper around scipy.special.sph_harm. Return spherical harmonic of degree l and order m. """ 57 | phi, theta = sph_sample(n) 58 | phi, theta = np.meshgrid(phi, theta) 59 | f = sph_harm(m, l, theta, phi) 60 | 61 | return f 62 | 63 | 64 | def sph_harm_all(n, as_tfvar=False, real=False): 65 | """ Compute spherical harmonics for an n x n input (degree up to n // 2) 66 | 67 | Args: 68 | n (int): input dimensions; order will be n // 2 69 | as_tfvar (bool): if True, return as list of tensorflow Variables. 70 | real (bool): if True, return real harmonics 71 | """ 72 | harmonics = [] 73 | 74 | for l in range(n // 2): 75 | if real: 76 | minl = 0 77 | else: 78 | minl = -l 79 | row = [] 80 | for m in range(minl, l+1): 81 | row.append(sph_harm_lm(l, m, n)) 82 | harmonics.append(row) 83 | 84 | if as_tfvar: 85 | return tf.cast(tf.constant(sph_harm_to_shtools(harmonics)), 86 | 'complex64') 87 | 88 | else: 89 | return harmonics 90 | 91 | 92 | def DHaj(n, mode='DH'): 93 | """ Sampling weights. 
""" 94 | # Driscoll and Healy sampling weights (on the phi dimension) 95 | # note: weights depend on the chosen grid, given by sph_sample 96 | if mode == 'DH': 97 | def gridfun(j): return np.pi*j/n 98 | elif mode == 'ours': 99 | def gridfun(j): return np.pi*(2*j+1)/2/n 100 | else: 101 | raise NotImplementedError() 102 | 103 | l = np.arange(0, n/2) 104 | a = [(2*np.sqrt(2)/n * 105 | np.sin(gridfun(j)) * 106 | (1/(2*l+1) * np.sin((2*l+1)*gridfun(j))).sum()) 107 | for j in range(n)] 108 | 109 | return a 110 | 111 | 112 | def sph_harm_transform(f, mode='DH', harmonics=None): 113 | """ Project spherical function into the spherical harmonics basis. """ 114 | assert f.shape[0] == f.shape[1] 115 | 116 | if isinstance(f, tf.Tensor): 117 | sumfun = tf.reduce_sum 118 | def conjfun(x): return tf.conj(x) 119 | n = f.shape[0].value 120 | else: 121 | sumfun = np.sum 122 | conjfun = np.conj 123 | n = f.shape[0] 124 | assert np.log2(n).is_integer() 125 | 126 | if harmonics is None: 127 | harmonics = sph_harm_all(n) 128 | 129 | a = DHaj(n, mode) 130 | 131 | f = f*np.array(a)[np.newaxis, :] 132 | 133 | real = is_real_sft(harmonics) 134 | 135 | coeffs = [] 136 | for l in range(n // 2): 137 | row = [] 138 | minl = 0 if real else -l 139 | for m in range(minl, l+1): 140 | # WARNING: results are off by this factor, when using driscoll1994computing formulas 141 | factor = 2*np.sqrt(np.pi) 142 | row.append(sumfun(factor * np.sqrt(2*np.pi)/n * 143 | f * conjfun(harmonics[l][m-minl]))) 144 | coeffs.append(row) 145 | 146 | return coeffs 147 | 148 | 149 | def sph_harm_inverse(c, harmonics=None): 150 | """ Inverse spherical harmonics transform. """ 151 | n = 2*len(c) 152 | 153 | real = is_real_sft(c) 154 | dtype = 'float32' if real else c[1][1].dtype 155 | if harmonics is None: 156 | harmonics = sph_harm_all(n, real=real) 157 | 158 | if isinstance(c[0][0], tf.Tensor): 159 | f = tf.zeros((n, n), dtype=dtype) 160 | else: 161 | f = np.zeros((n, n), dtype=dtype) 162 | 163 | for l in range(n // 2): 164 | lenm = l+1 if real else 2*l+1 165 | for m in range(lenm): 166 | if real: 167 | # leverage symmetry of coefficients and harmonics 168 | factor = 1 if m == 0 else 2 169 | f += factor*(tfnp.real(c[l][m]) * tfnp.real(harmonics[l][m]) - 170 | tfnp.imag(c[l][m]) * tfnp.imag(harmonics[l][m])) 171 | else: 172 | f += c[l][m] * harmonics[l][m] 173 | 174 | return f 175 | 176 | 177 | def sph_harm_transform_batch(f, method=None, *args, **kwargs): 178 | return sph_harm_transform_batch_naive(f, *args, **kwargs) 179 | 180 | 181 | def sph_harm_inverse_batch(c, method=None, *args, **kwargs): 182 | return sph_harm_inverse_batch_naive(c, *args, **kwargs) 183 | 184 | 185 | def sph_harm_transform_batch_naive(f, harmonics=None, m0_only=False): 186 | """ Spherical harmonics batch-transform. 
187 | 188 | Args: 189 | f (n, l, l, c)-array : functions are on l x l grid 190 | harmonics (2, l/2, l/2, l, l)-array: 191 | m0_only (bool): return only coefficients with order 0; 192 | only them are needed when computing convolutions 193 | 194 | Returns: 195 | coeffs ((n, 2, l/2, l/2, c)-array): 196 | """ 197 | shapef = tfnp.shape(f) 198 | n, l = shapef[:2] 199 | assert shapef[2] == l 200 | if harmonics is None: 201 | harmonics = sph_harm_to_shtools(sph_harm_all(l)) 202 | shapeh = tfnp.shape(harmonics) 203 | assert shapeh[1:] == (l//2, l//2, l, l) 204 | assert shapeh[0] in [1, 2] 205 | 206 | aj = np.array(DHaj(l)) 207 | 208 | # returns m=0 only; useful to expand kernel in spherical convolution 209 | if m0_only: 210 | harmonics = harmonics[slice(0, 1), :, slice(0, 1), ...] 211 | 212 | na = np.newaxis 213 | coeffs = tfnp.transpose(2*np.sqrt(2)*np.pi/l * 214 | tfnp.dot(f * aj[na, na, :, na], 215 | tfnp.conj(harmonics), 216 | [[1, 2], [3, 4]]), 217 | (0, 2, 3, 4, 1)) 218 | 219 | return coeffs 220 | 221 | 222 | def sph_harm_inverse_batch_naive(c, harmonics=None): 223 | """ Spherical harmonics batch inverse transform. 224 | 225 | Args: 226 | c ((n, 2, l/2, l/2, c)-array): sph harm coefficients; max degree is l/2 227 | harmonics (2, l/2, l/2, l, l)-array: 228 | 229 | Returns: 230 | recons ((n, l, l, c)-array): 231 | """ 232 | shapec = tfnp.shape(c) 233 | l = 2*shapec[2] 234 | assert shapec[3] == l//2 235 | if harmonics is None: 236 | harmonics = sph_harm_to_shtools(sph_harm_all(l)) 237 | shapeh = tfnp.shape(harmonics) 238 | assert shapeh[1:] == (l//2, l//2, l, l) 239 | assert shapeh[0] in [1, 2] 240 | 241 | real = True if shapeh[0] == 1 else False 242 | 243 | na = np.newaxis 244 | 245 | if real: 246 | # using m, -m symmetry: 247 | # c^{-m}Y^{-m} + c^mY^m = 2(Re(c^{m})Re(Y^m) - Im(c^{m})Im(Y^m)) 248 | # that does not apply to c_0 so we compensate by dividing it by two 249 | factor = np.ones(tfnp.shape(c)[1:])[np.newaxis, ...] 250 | factor[..., 0, :] = factor[..., 0, :]/2 251 | c = c * factor 252 | # c[..., 0, :] = c[..., 0, :]/2 253 | recons = tfnp.transpose(2*(tfnp.dot(tfnp.real(c), tfnp.real(harmonics), 254 | [[1, 2, 3], [0, 1, 2]]) - 255 | tfnp.dot(tfnp.imag(c), tfnp.imag(harmonics), 256 | [[1, 2, 3], [0, 1, 2]])), 257 | (0, 2, 3, 1)) 258 | else: 259 | recons = tfnp.transpose(tfnp.dot(c, harmonics, 260 | [[1, 2, 3], [0, 1, 2]]), 261 | (0, 2, 3, 1)) 262 | return recons 263 | 264 | 265 | def sph_conv(f, g, harmonics=None): 266 | """ Spherical convolution f * g. """ 267 | stackfun = tf.stack if isinstance(f, tf.Tensor) else np.array 268 | cf, cg = [sph_harm_transform(x, harmonics=harmonics) for x in [f, g]] 269 | cfg = [2*np.pi*np.sqrt(4*np.pi / (2*l+1)) * stackfun(c1) * c2[l] 270 | for l, (c1, c2) in enumerate(zip(cf, cg))] 271 | 272 | return sph_harm_inverse(cfg) 273 | 274 | 275 | def sph_conv_batch(f, g, 276 | harmonics_or_legendre=None, 277 | method=None, 278 | spectral_pool=0, 279 | harmonics_or_legendre_low=None): 280 | """ CNN-like batch spherical convolution. 281 | 282 | Args: 283 | f (n, l, l, c)-array: input feature map. 
n entries, c channels 284 | g (c, l, l, d)-array: convolution kernels 285 | harmonics_or_legendre (): spherical harmonics or legendre polynomials to expand f and g 286 | method (str): see sph_harm_transform_batch 287 | spectral_pool (int): if > 0 run spectral pooling before ISHT 288 | (bandwidth is reduced by a factor of 2**spectral_pool) 289 | harmonics_or_legendre_low (): low frequency harmonics of legendre to be used when spectral_pool==True 290 | 291 | Returns: 292 | fg (n, l, l, d)-array 293 | """ 294 | shapef, shapeg = [tfnp.shape(x) for x in [f, g]] 295 | spectral_filter = True if len(shapeg) == 5 else False 296 | spectral_input = True if len(shapef) == 5 else False 297 | n = shapef[2] 298 | if spectral_input: 299 | n *= 2 300 | 301 | if not spectral_input: 302 | cf = sph_harm_transform_batch( 303 | f, method, harmonics_or_legendre, m0_only=False) 304 | else: 305 | cf = f 306 | if not spectral_filter: 307 | cg = sph_harm_transform_batch( 308 | g, method, harmonics_or_legendre, m0_only=True) 309 | else: 310 | cg = g 311 | 312 | shapecf, shapecg = [tfnp.shape(x) for x in [cf, cg]] 313 | assert shapecf[4] == shapecg[0] 314 | assert shapecf[2] == shapecg[2] 315 | 316 | na = np.newaxis 317 | # per degree factor 318 | factor = (2*np.pi*np.sqrt(4*np.pi / (2*np.arange(n/2)+1)) 319 | )[na, na, :, na, na, na] 320 | cg = tfnp.transpose(cg, (1, 2, 3, 0, 4))[na, ...] 321 | cf = cf[..., na] 322 | if istf(cg) and istf(cf): 323 | cg, cf = safe_cast(cg, cf) 324 | cfg = factor * cf * cg 325 | else: 326 | cfg = factor * cf * cg 327 | 328 | if spectral_pool > 0: 329 | cfg = cfg[:, :, :n//(2**(spectral_pool+1)), :n // 330 | (2**(spectral_pool+1)), ...] 331 | hol = harmonics_or_legendre_low 332 | else: 333 | hol = harmonics_or_legendre 334 | 335 | # sum over channels 336 | cfg = tfnp.sum(cfg, axis=-2) 337 | 338 | return sph_harm_inverse_batch(cfg, method, hol) 339 | 340 | 341 | def is_real_sft(h_or_c): 342 | """ Detect if list of lists of harmonics or coefficients assumes real inputs (m>0) """ 343 | if istf(h_or_c): 344 | d = tfnp.shape(h_or_c[1])[0] 345 | else: 346 | d = len(h_or_c[1]) 347 | 348 | isreal = True if d == 2 else False 349 | 350 | return isreal 351 | 352 | 353 | def sph_harm_to_shtools(c): 354 | """ Convert our list format for the sph harm coefficients/harmonics to pyshtools (2, n, n) format. """ 355 | n = len(c) 356 | real = is_real_sft(c) 357 | dim1 = 1 if real else 2 358 | out = np.zeros((dim1, n, n, *c[0][0].shape)) + 0j 359 | for l, cc in enumerate(c): 360 | cc = np.array(cc) 361 | if not real: 362 | m_minus = cc[:l][::-1] 363 | m_plus = cc[l:] 364 | else: 365 | m_minus = np.array([]) 366 | m_plus = cc 367 | 368 | # we get warnings here when using reals 369 | if m_minus.size > 0: 370 | out[1, l, 1:l+1, ...] = m_minus 371 | out[0, l, :l+1, ...] 
= m_plus 372 | 373 | return out 374 | -------------------------------------------------------------------------------- /provider/training_data_prepare.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modified from https://github.com/mentian/object-deformnet 3 | """ 4 | 5 | import os 6 | import sys 7 | import glob 8 | import cv2 9 | import math 10 | import numpy as np 11 | import _pickle as cPickle 12 | from tqdm import tqdm 13 | from autolab_core import RigidTransform 14 | 15 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 16 | ROOT_DIR = BASE_DIR 17 | sys.path.append(os.path.join(ROOT_DIR, '..', 'utils')) 18 | from align_utils import align_nocs_to_depth 19 | 20 | sym_id = [0,1,3] 21 | DATA_DIR = 'data' 22 | OBJ_MODEL_DIR = os.path.join(DATA_DIR, 'obj_models') 23 | 24 | 25 | def create_img_list(data_dir): 26 | """ Create train/val/test data list for CAMERA and Real. """ 27 | # CAMERA dataset 28 | for subset in ['train', 'val']: 29 | img_list = [] 30 | img_dir = os.path.join(data_dir, 'camera', subset) 31 | folder_list = [name for name in os.listdir(img_dir) if os.path.isdir(os.path.join(img_dir, name))] 32 | for i in range(10*len(folder_list)): 33 | folder_id = int(i) // 10 34 | img_id = int(i) % 10 35 | img_path = os.path.join(subset, '{:05d}'.format(folder_id), '{:04d}'.format(img_id)) 36 | img_list.append(img_path) 37 | with open(os.path.join(data_dir, 'camera', subset+'_list_all.txt'), 'w') as f: 38 | for img_path in img_list: 39 | f.write("%s\n" % img_path) 40 | # Real dataset 41 | for subset in ['train', 'test']: 42 | img_list = [] 43 | img_dir = os.path.join(data_dir, 'real', subset) 44 | folder_list = [name for name in sorted(os.listdir(img_dir)) if os.path.isdir(os.path.join(img_dir, name))] 45 | for folder in folder_list: 46 | img_paths = glob.glob(os.path.join(img_dir, folder, '*_color.png')) 47 | img_paths = sorted(img_paths) 48 | for img_full_path in img_paths: 49 | img_name = os.path.basename(img_full_path) 50 | img_ind = img_name.split('_')[0] 51 | img_path = os.path.join(subset, folder, img_ind) 52 | img_list.append(img_path) 53 | with open(os.path.join(data_dir, 'real', subset+'_list_all.txt'), 'w') as f: 54 | for img_path in img_list: 55 | f.write("%s\n" % img_path) 56 | print('Write all data paths to file done!') 57 | 58 | def load_depth(img_path): 59 | """ Load depth image from img_path. """ 60 | depth_path = img_path + '_depth.png' 61 | depth = cv2.imread(depth_path, -1) 62 | if len(depth.shape) == 3: 63 | # This is encoded depth image, let's convert 64 | # NOTE: RGB is actually BGR in opencv 65 | depth16 = depth[:, :, 1]*256 + depth[:, :, 2] 66 | depth16 = np.where(depth16==32001, 0, depth16) 67 | depth16 = depth16.astype(np.uint16) 68 | elif len(depth.shape) == 2 and depth.dtype == 'uint16': 69 | depth16 = depth 70 | else: 71 | assert False, '[ Error ]: Unsupported depth type.' 72 | return depth16 73 | 74 | 75 | def process_data(img_path, depth, subset=None): 76 | """ Load instance masks for the objects in the image. 
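    Returns (masks, coords, class_ids, instance_ids, model_list, bboxes, scales)
    for the valid foreground instances, or a tuple of Nones when no valid
    instance is found or a rendering error is detected (see the checks below).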
""" 77 | 78 | mask_path = img_path + '_mask.png' 79 | mask = cv2.imread(mask_path)[:, :, 2] 80 | mask = np.array(mask, dtype=np.int32) 81 | all_inst_ids = sorted(list(np.unique(mask))) 82 | assert all_inst_ids[-1] == 255 83 | del all_inst_ids[-1] # remove background 84 | num_all_inst = len(all_inst_ids) 85 | h, w = mask.shape 86 | 87 | coord_path = img_path + '_coord.png' 88 | coord_map = cv2.imread(coord_path)[:, :, :3] 89 | coord_map = coord_map[:, :, (2, 1, 0)] 90 | # flip z axis of coord map 91 | coord_map = np.array(coord_map, dtype=np.float32) / 255 92 | coord_map[:, :, 2] = 1 - coord_map[:, :, 2] 93 | 94 | class_ids = [] 95 | instance_ids = [] 96 | model_list = [] 97 | masks = np.zeros([h, w, num_all_inst], dtype=np.uint8) 98 | coords = np.zeros((h, w, num_all_inst, 3), dtype=np.float32) 99 | bboxes = np.zeros((num_all_inst, 4), dtype=np.int32) 100 | scales = np.zeros([num_all_inst, 3], dtype=np.float32) 101 | 102 | meta_path = img_path + '_meta.txt' 103 | with open(meta_path, 'r') as f: 104 | i = 0 105 | for line in f: 106 | line_info = line.strip().split(' ') 107 | inst_id = int(line_info[0]) 108 | cls_id = int(line_info[1]) 109 | # background objects and non-existing objects 110 | if cls_id == 0 or (inst_id not in all_inst_ids): 111 | continue 112 | 113 | if len(line_info) == 3: 114 | model_id = line_info[2] # Real scanned objs 115 | if model_id[-3:] == 'npz': 116 | npz_path = os.path.join(OBJ_MODEL_DIR, 'real_val', model_id) 117 | with np.load(npz_path) as npz_file: 118 | scale = npz_file['scale'] 119 | else: 120 | bbox_file = os.path.join(OBJ_MODEL_DIR, 'real_'+subset, model_id+'.txt') 121 | scale = np.loadtxt(bbox_file) 122 | 123 | scales[i, :] = scale/ (np.linalg.norm(scale)+1e-10) 124 | else: 125 | model_id = line_info[3] # CAMERA objs 126 | bbox_file = os.path.join(OBJ_MODEL_DIR, subset, line_info[2], line_info[3], 'bbox.txt') 127 | bbox = np.loadtxt(bbox_file) 128 | scales[i, :] = bbox[0, :] - bbox[1, :] 129 | 130 | # remove one mug instance in CAMERA train due to improper model 131 | if model_id == 'b9be7cfe653740eb7633a2dd89cec754': 132 | continue 133 | # process foreground objects 134 | inst_mask = np.equal(mask, inst_id) 135 | # bounding box 136 | horizontal_indicies = np.where(np.any(inst_mask, axis=0))[0] 137 | vertical_indicies = np.where(np.any(inst_mask, axis=1))[0] 138 | assert horizontal_indicies.shape[0], print(img_path) 139 | x1, x2 = horizontal_indicies[[0, -1]] 140 | y1, y2 = vertical_indicies[[0, -1]] 141 | # x2 and y2 should not be part of the box. Increment by 1. 
142 | x2 += 1 143 | y2 += 1 144 | # object occupies full image, rendering error, happens in CAMERA dataset 145 | if np.any(np.logical_or((x2-x1) > 600, (y2-y1) > 440)): 146 | return None, None, None, None, None, None, None 147 | # not enough valid depth observation 148 | final_mask = np.logical_and(inst_mask, depth > 0) 149 | if np.sum(final_mask) < 64: 150 | continue 151 | class_ids.append(cls_id) 152 | instance_ids.append(inst_id) 153 | model_list.append(model_id) 154 | masks[:, :, i] = inst_mask 155 | coords[:, :, i, :] = np.multiply(coord_map, np.expand_dims(inst_mask, axis=-1)) 156 | bboxes[i] = np.array([y1, x1, y2, x2]) 157 | 158 | i += 1 159 | # no valid foreground objects 160 | if i == 0: 161 | return None, None, None, None, None, None, None 162 | 163 | masks = masks[:, :, :i] 164 | coords = np.clip(coords[:, :, :i, :], 0, 1) 165 | bboxes = bboxes[:i, :] 166 | scales = scales[:i, :] 167 | 168 | return masks, coords, class_ids, instance_ids, model_list, bboxes, scales 169 | 170 | 171 | def annotate_camera_train(data_dir): 172 | """ Generate gt labels for CAMERA train data. """ 173 | camera_train = open(os.path.join(data_dir, 'camera', 'train_list_all.txt')).read().splitlines() 174 | intrinsics = np.array([[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 175 | 176 | all_input_dis = [] 177 | all_input_rgb = [] 178 | all_observed_pc = [] 179 | all_translation = [] 180 | all_rotation = [] 181 | all_scale = [] 182 | 183 | part_counter = 1 184 | instance_counter = 0 185 | 186 | for i, img_path in enumerate(tqdm(camera_train)): 187 | img_full_path = os.path.join(data_dir, 'camera', img_path) 188 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 189 | os.path.exists(img_full_path + '_coord.png') and \ 190 | os.path.exists(img_full_path + '_depth.png') and \ 191 | os.path.exists(img_full_path + '_mask.png') and \ 192 | os.path.exists(img_full_path + '_meta.txt') 193 | if not all_exist: 194 | continue 195 | 196 | image = cv2.imread(img_full_path + '_color.png')[:, :, :3] 197 | image = image[:, :, ::-1] 198 | depth = load_depth(img_full_path) 199 | masks, coords, class_ids, instance_ids, model_list, bboxes, sizes = process_data(img_full_path, depth, subset='train') 200 | if instance_ids is None: 201 | continue 202 | # Umeyama alignment of GT NOCS map with depth image 203 | pts, input_dis, input_rgb, scales, rotations, translations, error_messages, _ = \ 204 | align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path) 205 | if error_messages: 206 | continue 207 | # write results 208 | quaternions = np.zeros((rotations.shape[0], 4)) 209 | for k in range(pts.shape[0]): 210 | label = class_ids[k] - 1 211 | r = rotations[k] 212 | if label in sym_id: 213 | theta_x = r[0, 0] + r[2, 2] 214 | theta_y = r[0, 2] - r[2, 0] 215 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 216 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 217 | [0.0, 1.0, 0.0 ], 218 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 219 | r = s_map@r 220 | quaternions[k] = RigidTransform(rotation=r).quaternion 221 | 222 | all_input_dis.append(input_dis) 223 | all_input_rgb.append(input_rgb) 224 | all_observed_pc.append(pts) 225 | all_translation.append(translations) 226 | all_rotation.append(quaternions) 227 | all_scale.append(scales[:, np.newaxis] * sizes) 228 | 229 | instance_counter += pts.shape[0] 230 | if instance_counter >=80000 or i==len(camera_train)-1: 231 | dataset = {} 232 | dataset['input_dis'] = np.concatenate(all_input_dis, axis=0).astype(np.float32) 233 | dataset['input_rgb'] = 
np.concatenate(all_input_rgb, axis=0).astype(np.float32) 234 | dataset['observed_pc'] = np.concatenate(all_observed_pc, axis=0).astype(np.float32) 235 | dataset['translation'] = np.concatenate(all_translation, axis=0).astype(np.float32) 236 | dataset['rotation'] = np.concatenate(all_rotation, axis=0).astype(np.float32) 237 | dataset['scale'] = np.concatenate(all_scale, axis=0).astype(np.float32) 238 | 239 | with open(os.path.join(ROOT_DIR, '..', 'data', 'training_instance', 'CAMERA25_'+str(part_counter)+'.pkl'), 'wb') as f: 240 | cPickle.dump(dataset, f) 241 | 242 | all_input_dis = [] 243 | all_input_rgb = [] 244 | all_observed_pc = [] 245 | all_translation = [] 246 | all_rotation = [] 247 | all_scale = [] 248 | 249 | part_counter += 1 250 | instance_counter = 0 251 | 252 | 253 | def annotate_real_train(data_dir): 254 | """ Generate gt labels for Real train data through PnP. """ 255 | real_train = open(os.path.join(data_dir, 'real/train_list_all.txt')).read().splitlines() 256 | intrinsics = np.array([[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 257 | 258 | all_input_dis = [] 259 | all_input_rgb = [] 260 | all_observed_pc = [] 261 | all_translation = [] 262 | all_rotation = [] 263 | all_scale = [] 264 | 265 | for img_path in tqdm(real_train): 266 | img_full_path = os.path.join(data_dir, 'real', img_path) 267 | all_exist = os.path.exists(img_full_path + '_color.png') and \ 268 | os.path.exists(img_full_path + '_coord.png') and \ 269 | os.path.exists(img_full_path + '_depth.png') and \ 270 | os.path.exists(img_full_path + '_mask.png') and \ 271 | os.path.exists(img_full_path + '_meta.txt') 272 | if not all_exist: 273 | continue 274 | image = cv2.imread(img_full_path + '_color.png')[:, :, :3] 275 | image = image[:, :, ::-1] 276 | depth = load_depth(img_full_path) 277 | masks, coords, class_ids, instance_ids, model_list, bboxes, sizes = process_data(img_full_path, depth, 'train') 278 | if instance_ids is None: 279 | continue 280 | # Umeyama alignment of GT NOCS map with depth image 281 | pts, input_dis, input_rgb, scales, rotations, translations, error_messages, _ = \ 282 | align_nocs_to_depth(masks, coords, depth, image, intrinsics, instance_ids, img_path) 283 | if error_messages: 284 | continue 285 | # write results 286 | quaternions = np.zeros((rotations.shape[0], 4)) 287 | for k in range(pts.shape[0]): 288 | label = class_ids[k] - 1 289 | r = rotations[k] 290 | if label in sym_id: 291 | theta_x = r[0, 0] + r[2, 2] 292 | theta_y = r[0, 2] - r[2, 0] 293 | r_norm = math.sqrt(theta_x**2 + theta_y**2) 294 | s_map = np.array([[theta_x/r_norm, 0.0, -theta_y/r_norm], 295 | [0.0, 1.0, 0.0 ], 296 | [theta_y/r_norm, 0.0, theta_x/r_norm]]) 297 | r = s_map@r 298 | quaternions[k] = RigidTransform(rotation=r).quaternion 299 | 300 | all_input_dis.append(input_dis) 301 | all_input_rgb.append(input_rgb) 302 | all_observed_pc.append(pts) 303 | all_translation.append(translations) 304 | all_rotation.append(quaternions) 305 | all_scale.append(scales[:, np.newaxis] * sizes) 306 | 307 | dataset = {} 308 | dataset['input_dis'] = np.concatenate(all_input_dis, axis=0).astype(np.float32) 309 | dataset['input_rgb'] = np.concatenate(all_input_rgb, axis=0).astype(np.float32) 310 | dataset['observed_pc'] = np.concatenate(all_observed_pc, axis=0).astype(np.float32) 311 | dataset['translation'] = np.concatenate(all_translation, axis=0).astype(np.float32) 312 | dataset['rotation'] = np.concatenate(all_rotation, axis=0).astype(np.float32) 313 | dataset['scale'] = np.concatenate(all_scale, 
axis=0).astype(np.float32) 314 | 315 | with open(os.path.join(ROOT_DIR, '..', 'data', 'training_instance', 'REAL275.pkl'), 'wb') as f: 316 | cPickle.dump(dataset, f) 317 | 318 | 319 | if __name__ == '__main__': 320 | 321 | if not os.path.exists(os.path.join(ROOT_DIR, '..', 'data', 'training_instance')): 322 | os.makedirs(os.path.join(ROOT_DIR, '..', 'data', 'training_instance')) 323 | 324 | # create list for all data 325 | create_img_list(DATA_DIR) 326 | # annotate dataset and re-write valid data to list 327 | annotate_camera_train(DATA_DIR) 328 | annotate_real_train(DATA_DIR) 329 | -------------------------------------------------------------------------------- /model/dualposenet.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ 7 | Dual Pose Network with Refined Learning of Pose Consistency. 8 | 9 | Author: Jiehong Lin 10 | """ 11 | 12 | import os 13 | import tensorflow as tf 14 | import numpy as np 15 | import math 16 | import cmath 17 | import glob 18 | import _pickle as cPickle 19 | from tqdm import tqdm 20 | import cv2 21 | from transforms3d.euler import quat2mat 22 | 23 | from layers import point_transformation 24 | from modules import encoder, explicit_decoder, implicit_decoder 25 | from dataloder import Fetcher 26 | from pc_utils import load_depth, backproject, pc2sphericalmap 27 | 28 | 29 | class DualPoseNet(object): 30 | def __init__(self, opts, sess): 31 | self.sess = sess 32 | self.opts = opts 33 | 34 | if self.opts.dataset == 'REAL275': 35 | self.intrinsics = np.array( 36 | [[591.0125, 0, 322.525], [0, 590.16775, 244.11084], [0, 0, 1]]) 37 | elif self.opts.dataset == 'CAMERA25': 38 | self.intrinsics = np.array( 39 | [[577.5, 0, 319.5], [0, 577.5, 239.5], [0, 0, 1]]) 40 | 41 | def allocate_placeholders(self): 42 | self.is_training = tf.placeholder_with_default( 43 | True, shape=[], name='is_training') 44 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 45 | 46 | self.input_dis = tf.placeholder(tf.float32, shape=[ 47 | self.opts.batch_size, self.opts.input_res, self.opts.input_res, 1]) 48 | self.input_rgb = tf.placeholder(tf.float32, shape=[ 49 | self.opts.batch_size, self.opts.input_res, self.opts.input_res, 3]) 50 | self.observed_pc = tf.placeholder( 51 | tf.float32, shape=[self.opts.batch_size, 1024, 3]) 52 | 53 | self.gt_rotation = tf.placeholder( 54 | tf.float32, shape=[self.opts.batch_size, 4]) 55 | self.gt_translation = tf.placeholder( 56 | tf.float32, shape=[self.opts.batch_size, 3]) 57 | self.gt_scale = tf.placeholder( 58 | tf.float32, shape=[self.opts.batch_size, 3]) 59 | 60 | def build_model(self): 61 | # model 62 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 63 | self.explicit_decoder = explicit_decoder( 64 | self.opts, self.is_training, name='explicit_decoder') 65 | self.implicit_decoder = implicit_decoder( 66 | self.opts, self.is_training, name="implicit_decoder") 67 | 68 | # graphs 69 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 70 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 71 | self.pose_feat) 72 | self.pred_canonical_points = self.implicit_decoder( 73 | self.observed_pc, self.pose_feat) 74 | self.gt_canonical_points = point_transformation( 75 | self.observed_pc, self.gt_rotation, self.gt_translation, self.gt_scale) 76 | 77 | # loss 
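        # Each explicit pose component (translation, rotation quaternion, scale)
        # is supervised with a Huber loss below; the implicit canonical-point
        # loss is added with weight opts.implicit_loss_weight in the total loss.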
78 | self.translation_loss = tf.losses.huber_loss( 79 | self.pred_translation, self.gt_translation) 80 | self.rotation_loss = tf.losses.huber_loss( 81 | self.pred_rotation, self.gt_rotation) 82 | self.scale_loss = tf.losses.huber_loss(self.pred_scale, self.gt_scale) 83 | self.implicit_loss = tf.losses.huber_loss( 84 | self.pred_canonical_points, self.gt_canonical_points) 85 | self.loss = self.rotation_loss + self.translation_loss + \ 86 | self.scale_loss + self.opts.implicit_loss_weight*self.implicit_loss 87 | 88 | def setup_optimizer(self): 89 | self.learning_rate = tf.train.exponential_decay(self.opts.learning_rate, self.global_step, 90 | self.opts.lr_decay_steps, self.opts.lr_decay_rate, staircase=True) 91 | self.learning_rate = tf.maximum(self.learning_rate, 0.000001) 92 | 93 | all_update_ops = [op for op in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if op.name.startswith( 94 | "encoder") or op.name.startswith("explicit_decoder") or op.name.startswith("implicit_decoder")] 95 | all_tvars = [var for var in tf.trainable_variables() if var.name.startswith( 96 | "encoder") or var.name.startswith("explicit_decoder") or var.name.startswith("implicit_decoder")] 97 | with tf.control_dependencies(all_update_ops): 98 | self.all_optimizers = tf.train.AdamOptimizer(self.learning_rate).minimize( 99 | self.loss, var_list=all_tvars, colocate_gradients_with_ops=True, global_step=self.global_step) 100 | 101 | def train(self): 102 | print('\n*********** Training of DualPoseNet ***********') 103 | 104 | # model & graph 105 | print('building model ...') 106 | self.allocate_placeholders() 107 | self.build_model() 108 | self.setup_optimizer() 109 | print('model built !') 110 | 111 | # dataset 112 | print('loading data ...') 113 | fetchworker = Fetcher(self.opts) 114 | fetchworker.start() 115 | print('data loaded !') 116 | 117 | print('starting training ...') 118 | self.sess.run(tf.global_variables_initializer()) 119 | self.saver = tf.train.Saver(max_to_keep=None) 120 | step = self.sess.run(self.global_step) 121 | 122 | for epoch in range(1, self.opts.training_epoch+1): 123 | sum_loss = 0.0 124 | sum_r_loss = 0.0 125 | sum_t_loss = 0.0 126 | sum_s_loss = 0.0 127 | sum_i_loss = 0.0 128 | count = 0 129 | 130 | for batch_idx in range(fetchworker.num_batches): 131 | 132 | batch_input_dis, batch_input_rgb, batch_observed_pc, batch_rotation, batch_translation, batch_scale = fetchworker.fetch() 133 | curr_bs = batch_input_dis.shape[0] 134 | 135 | feed_dict = {self.input_dis: batch_input_dis, 136 | self.input_rgb: batch_input_rgb, 137 | self.observed_pc: batch_observed_pc, 138 | self.gt_rotation: batch_rotation, 139 | self.gt_translation: batch_translation, 140 | self.gt_scale: batch_scale, 141 | self.is_training: True} 142 | 143 | _, loss, r_loss, t_loss, s_loss, i_loss = self.sess.run( 144 | [self.all_optimizers, self.loss, self.rotation_loss, self.translation_loss, self.scale_loss, self.implicit_loss], feed_dict=feed_dict) 145 | 146 | sum_loss += (loss*curr_bs) 147 | sum_r_loss += (r_loss*curr_bs) 148 | sum_t_loss += (t_loss*curr_bs) 149 | sum_s_loss += (s_loss*curr_bs) 150 | sum_i_loss += (i_loss*curr_bs) 151 | count += curr_bs 152 | 153 | print('[{}][{}/{}] loss: {:.5f}({:.5f}) r_loss: {:.5f}({:.5f}) t_loss: {:.5f}({:.5f}) s_loss: {:.5f}({:.5f}) i_loss: {:.5f}({:.5f})'.format( 154 | epoch, batch_idx, fetchworker.num_batches, sum_loss / 155 | float(count), loss, sum_r_loss / 156 | float(count), r_loss, sum_t_loss/float(count), 157 | t_loss, sum_s_loss/float(count), s_loss, sum_i_loss/float(count), i_loss)) 158 | 
step += 1 159 | 160 | if epoch % 10 == 0: 161 | self.saver.save(self.sess, os.path.join( 162 | self.opts.log_dir, 'model'), epoch) 163 | 164 | fetchworker.shutdown() 165 | print('training finished !') 166 | 167 | def test(self): 168 | 169 | print('\n*********** Testing on {} ***********'.format(self.opts.dataset)) 170 | 171 | # inputs 172 | self.input_dis = tf.placeholder( 173 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 174 | self.input_rgb = tf.placeholder( 175 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 176 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 177 | self.is_training = tf.placeholder_with_default( 178 | False, shape=[], name='is_training') 179 | 180 | # modules 181 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 182 | self.explicit_decoder = explicit_decoder( 183 | self.opts, self.is_training, name='explicit_decoder') 184 | self.implicit_decoder = implicit_decoder( 185 | self.opts, self.is_training, name="implicit_decoder") 186 | 187 | # graphs 188 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 189 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 190 | self.pose_feat) 191 | self.pred_canonical_points = self.implicit_decoder( 192 | self.observed_pc, self.pose_feat) 193 | 194 | # checkpoints 195 | print("loading from checkpoint ...") 196 | saver = tf.train.Saver() 197 | checkpoint_path = os.path.join( 198 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 199 | saver.restore(self.sess, checkpoint_path) 200 | 201 | # test 202 | result_pkl_list = glob.glob( 203 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 204 | result_pkl_list = sorted(result_pkl_list) 205 | n_image = len(result_pkl_list) 206 | print('no. 
of test images: {}\n'.format(n_image)) 207 | 208 | for i, path in tqdm(enumerate(result_pkl_list)): 209 | with open(path, 'rb') as f: 210 | data = cPickle.load(f) 211 | image_path = data['image_path'] 212 | num_instance = len(data['pred_class_ids']) 213 | 214 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 215 | image = image[:, :, ::-1] 216 | depth = load_depth(image_path) 217 | pred_mask = data['pred_masks'] 218 | 219 | pred_RTs = np.zeros((num_instance, 4, 4)) 220 | pred_scales = np.zeros((num_instance, 3)) 221 | pred_RTs[:, 3, 3] = 1 222 | 223 | if num_instance != 0: 224 | for j in range(num_instance): 225 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 226 | pts_ori, idx = backproject( 227 | depth, self.intrinsics, inst_mask) 228 | pts_ori = pts_ori/1000.0 229 | rgb_ori = image[idx[0], idx[1], :] 230 | rgb_ori = ( 231 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 232 | 233 | FLAG = pts_ori.shape[0] > 32 234 | pts = pts_ori.copy() 235 | rgb = rgb_ori.copy() 236 | 237 | for k in range(3): 238 | if FLAG: 239 | centroid = np.mean(pts, axis=0) 240 | pts = pts - centroid[np.newaxis, :] 241 | 242 | input_dis, input_rgb = pc2sphericalmap( 243 | pts, rgb, resolution=self.opts.input_res) 244 | if pts.shape[0] > 1024: 245 | input_pts = pts[np.random.choice( 246 | pts.shape[0], 1024, replace=False), :] 247 | else: 248 | input_pts = pts[np.random.choice( 249 | pts.shape[0], 1024), :] 250 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 251 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 252 | pred_rotation, pred_translation, pred_size = self.sess.run( 253 | [self.pred_rotation, self.pred_translation, self.pred_scale], feed_dict=feed_dict) 254 | 255 | pred_rotation = quat2mat(pred_rotation[0]) 256 | pred_translation = pred_translation[0] + centroid 257 | pred_scale = np.linalg.norm(pred_size[0]) 258 | pred_size = pred_size[0]/pred_scale 259 | 260 | pred_canonical_pts = ( 261 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 262 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 263 | 264 | FLAG = np.sum(dis < 0.6) > 32 265 | pts = pts_ori[dis < 0.6].copy() 266 | rgb = rgb_ori[dis < 0.6].copy() 267 | else: 268 | break 269 | 270 | if pts_ori.shape[0] > 32: 271 | pred_RTs[j, :3, :3] = np.diag( 272 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 273 | pred_RTs[j, :3, 3] = pred_translation 274 | pred_scales[j] = pred_size 275 | 276 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 277 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 278 | z_180_RT[3, 3] = 1 279 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 280 | else: 281 | pred_RTs[j] = np.eye(4) 282 | pred_scales[j] = np.ones((3)) 283 | 284 | data.pop('pred_masks') 285 | data['pred_RTs'] = pred_RTs 286 | data['pred_scales'] = pred_scales 287 | 288 | with open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 289 | cPickle.dump(data, f) 290 | 291 | def test_refine_encoder(self): 292 | 293 | print( 294 | '\n*********** Testing & Refining on {} ***********'.format(self.opts.dataset)) 295 | 296 | # inputs 297 | self.input_dis = tf.placeholder( 298 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 299 | self.input_rgb = tf.placeholder( 300 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 301 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 302 | self.is_training = tf.placeholder_with_default( 303 | False, shape=[], name='is_training') 304 | 
self.global_step = tf.Variable(0, trainable=False, name='global_step') 305 | 306 | # modules 307 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 308 | self.explicit_decoder = explicit_decoder( 309 | self.opts, self.is_training, name='explicit_decoder') 310 | self.implicit_decoder = implicit_decoder( 311 | self.opts, self.is_training, name="implicit_decoder") 312 | 313 | # graphs 314 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 315 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 316 | self.pose_feat) 317 | self.ex_canonical_points = point_transformation( 318 | self.observed_pc, self.pred_rotation, self.pred_translation, self.pred_scale) 319 | self.im_canonical_points = self.implicit_decoder( 320 | self.observed_pc, self.pose_feat) 321 | 322 | # self-adaptive loss 323 | self.loss = tf.losses.huber_loss( 324 | self.ex_canonical_points, self.im_canonical_points) 325 | 326 | # optimizer 327 | self.learning_rate = tf.train.exponential_decay( 328 | self.opts.refine_learning_rate, self.global_step, 100000000, 0, staircase=True) 329 | all_update_ops = [op for op in tf.get_collection( 330 | tf.GraphKeys.UPDATE_OPS) if op.name.startswith("encoder")] 331 | all_tvars = [var for var in tf.trainable_variables() 332 | if var.name.startswith("encoder")] 333 | with tf.control_dependencies(all_update_ops): 334 | self.all_optimizers = tf.train.AdamOptimizer(self.learning_rate).minimize( 335 | self.loss, var_list=all_tvars, colocate_gradients_with_ops=True, global_step=self.global_step) 336 | 337 | # checkpoints 338 | print("loading from checkpoint ...") 339 | self.sess.run(tf.global_variables_initializer()) 340 | checkpoint_path = os.path.join( 341 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 342 | var_to_restore = [ 343 | val for val in tf.global_variables() if ('/Adam' not in val.name) and ('encoder' in val.name or 'explicit_decoder' in val.name or 'implicit_decoder' in val.name)] 344 | saver = tf.train.Saver(var_to_restore) 345 | saver.restore(self.sess, checkpoint_path) 346 | step = self.sess.run(self.global_step) 347 | 348 | # test 349 | result_pkl_list = glob.glob( 350 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 351 | result_pkl_list = sorted(result_pkl_list) 352 | n_image = len(result_pkl_list) 353 | print('no. 
of test images: {}\n'.format(n_image)) 354 | 355 | for i, path in tqdm(enumerate(result_pkl_list)): 356 | with open(path, 'rb') as f: 357 | data = cPickle.load(f) 358 | image_path = data['image_path'] 359 | num_instance = len(data['pred_class_ids']) 360 | 361 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 362 | image = image[:, :, ::-1] 363 | depth = load_depth(image_path) 364 | pred_mask = data['pred_masks'] 365 | 366 | pred_RTs = np.zeros((num_instance, 4, 4)) 367 | pred_scales = np.zeros((num_instance, 3)) 368 | pred_RTs[:, 3, 3] = 1 369 | 370 | if num_instance != 0: 371 | for j in range(num_instance): 372 | saver.restore(self.sess, checkpoint_path) # re-load model 373 | 374 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 375 | pts_ori, idx = backproject( 376 | depth, self.intrinsics, inst_mask) 377 | pts_ori = pts_ori/1000.0 378 | rgb_ori = image[idx[0], idx[1], :] 379 | rgb_ori = ( 380 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 381 | 382 | # test 383 | dis = np.zeros((pts_ori.shape[0])) 384 | for k in range(3): 385 | pts = pts_ori[dis < 0.6].copy() 386 | rgb = rgb_ori[dis < 0.6].copy() 387 | 388 | if pts.shape[0] > 32: 389 | centroid = np.mean(pts, axis=0) 390 | pts = pts - centroid[np.newaxis, :] 391 | 392 | input_dis, input_rgb = pc2sphericalmap( 393 | pts, rgb, resolution=self.opts.input_res) 394 | if pts.shape[0] > 1024: 395 | input_pts = pts[np.random.choice( 396 | pts.shape[0], 1024, replace=False), :] 397 | else: 398 | input_pts = pts[np.random.choice( 399 | pts.shape[0], 1024), :] 400 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 401 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 402 | iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 403 | [self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 404 | 405 | pred_rotation = quat2mat(iter_rotation[0]) 406 | pred_translation = iter_translation[0] + centroid 407 | pred_scale = np.linalg.norm(iter_size[0]) 408 | pred_size = iter_size[0]/pred_scale 409 | pred_loss = iter_loss 410 | 411 | pred_canonical_pts = ( 412 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 413 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 414 | 415 | else: 416 | break 417 | 418 | # refinement 419 | if pts.shape[0] > 32: 420 | for k in range(self.opts.refine_iteration): 421 | _, iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 422 | [self.all_optimizers, self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 423 | if iter_loss < pred_loss: 424 | pred_rotation = quat2mat(iter_rotation[0]) 425 | pred_translation = iter_translation[0] + centroid 426 | pred_scale = np.linalg.norm(iter_size[0]) 427 | pred_size = iter_size[0]/pred_scale 428 | pred_loss = iter_loss 429 | if pred_loss <= self.opts.refine_threshold: 430 | break 431 | 432 | if pts_ori.shape[0] > 32: 433 | pred_RTs[j, :3, :3] = np.diag( 434 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 435 | pred_RTs[j, :3, 3] = pred_translation 436 | pred_scales[j] = pred_size 437 | 438 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 439 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 440 | z_180_RT[3, 3] = 1 441 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 442 | else: 443 | pred_RTs[j] = np.eye(4) 444 | pred_scales[j] = np.ones((3)) 445 | 446 | data.pop('pred_masks') 447 | data['pred_RTs'] = pred_RTs 448 | data['pred_scales'] = pred_scales 449 | 450 | with 
open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 451 | cPickle.dump(data, f) 452 | 453 | def test_refine_feature(self): 454 | print( 455 | '\n*********** Testing & Refining on {} ***********'.format(self.opts.dataset)) 456 | 457 | # inputs 458 | self.input_dis = tf.placeholder( 459 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 1]) 460 | self.input_rgb = tf.placeholder( 461 | tf.float32, shape=[1, self.opts.input_res, self.opts.input_res, 3]) 462 | self.observed_pc = tf.placeholder(tf.float32, shape=[1, 1024, 3]) 463 | self.new_pose_feat = tf.Variable(tf.zeros([1,1024]), trainable=True, name='new_pose_feat') 464 | self.is_training = tf.placeholder_with_default( 465 | False, shape=[], name='is_training') 466 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 467 | 468 | # modules 469 | self.encoder = encoder(self.opts, self.is_training, name='encoder') 470 | self.explicit_decoder = explicit_decoder( 471 | self.opts, self.is_training, name='explicit_decoder') 472 | self.implicit_decoder = implicit_decoder( 473 | self.opts, self.is_training, name="implicit_decoder") 474 | 475 | # graphs 476 | self.pose_feat = self.encoder(self.input_dis, self.input_rgb) 477 | self.pred_translation0, self.pred_rotation0, self.pred_scale0 = self.explicit_decoder( 478 | self.pose_feat) 479 | self.ex_canonical_points0 = point_transformation( 480 | self.observed_pc, self.pred_rotation0, self.pred_translation0, self.pred_scale0) 481 | self.im_canonical_points0 = self.implicit_decoder( 482 | self.observed_pc, self.pose_feat) 483 | 484 | # refine_graphs 485 | self.pred_translation, self.pred_rotation, self.pred_scale = self.explicit_decoder( 486 | self.new_pose_feat) 487 | self.ex_canonical_points = point_transformation( 488 | self.observed_pc, self.pred_rotation, self.pred_translation, self.pred_scale) 489 | self.im_canonical_points = self.implicit_decoder( 490 | self.observed_pc, self.new_pose_feat) 491 | 492 | # self.adaptive loss 493 | self.loss = tf.losses.huber_loss( 494 | self.ex_canonical_points, self.im_canonical_points) 495 | 496 | # optimizer 497 | self.learning_rate = tf.train.exponential_decay( 498 | self.opts.refine_learning_rate, self.global_step, 100000000, 0, staircase=True) 499 | self.all_optimizers = tf.train.AdamOptimizer(self.opts.refine_f_learning_rate).minimize( 500 | self.loss, var_list=[self.new_pose_feat], colocate_gradients_with_ops=True, global_step=self.global_step) 501 | 502 | # checkpoints 503 | print("loading from checkpoint ...") 504 | self.sess.run(tf.global_variables_initializer()) 505 | checkpoint_path = os.path.join( 506 | self.opts.log_dir, 'model-'+str(self.opts.test_epoch)) 507 | var_to_restore = [ 508 | val for val in tf.global_variables() if ('/Adam' not in val.name) and ('encoder' in val.name or 'explicit_decoder' in val.name or 'implicit_decoder' in val.name)] 509 | saver = tf.train.Saver(var_to_restore) 510 | saver.restore(self.sess, checkpoint_path) 511 | step = self.sess.run(self.global_step) 512 | 513 | # test 514 | result_pkl_list = glob.glob( 515 | 'data/segmentation_results/{}/results_*.pkl'.format(self.opts.dataset)) 516 | result_pkl_list = sorted(result_pkl_list) 517 | n_image = len(result_pkl_list) 518 | print('no. 
of test images: {}\n'.format(n_image)) 519 | 520 | for i, path in tqdm(enumerate(result_pkl_list)): 521 | with open(path, 'rb') as f: 522 | data = cPickle.load(f) 523 | image_path = data['image_path'] 524 | num_instance = len(data['pred_class_ids']) 525 | 526 | image = cv2.imread(image_path + '_color.png')[:, :, :3] 527 | image = image[:, :, ::-1] 528 | depth = load_depth(image_path) 529 | pred_mask = data['pred_masks'] 530 | 531 | pred_RTs = np.zeros((num_instance, 4, 4)) 532 | pred_scales = np.zeros((num_instance, 3)) 533 | pred_RTs[:, 3, 3] = 1 534 | 535 | if num_instance != 0: 536 | for j in range(num_instance): 537 | inst_mask = 255 * pred_mask[:, :, j].astype('uint8') 538 | pts_ori, idx = backproject( 539 | depth, self.intrinsics, inst_mask) 540 | pts_ori = pts_ori/1000.0 541 | rgb_ori = image[idx[0], idx[1], :] 542 | rgb_ori = ( 543 | rgb_ori - np.array([123.7, 116.8, 103.9])[np.newaxis, :])/255.0 544 | 545 | # test 546 | dis = np.zeros((pts_ori.shape[0])) 547 | for k in range(3): 548 | pts = pts_ori[dis < 0.6].copy() 549 | rgb = rgb_ori[dis < 0.6].copy() 550 | 551 | if pts.shape[0] > 32: 552 | centroid = np.mean(pts, axis=0) 553 | pts = pts - centroid[np.newaxis, :] 554 | 555 | input_dis, input_rgb = pc2sphericalmap( 556 | pts, rgb, resolution=self.opts.input_res) 557 | if pts.shape[0] > 1024: 558 | input_pts = pts[np.random.choice( 559 | pts.shape[0], 1024, replace=False), :] 560 | else: 561 | input_pts = pts[np.random.choice( 562 | pts.shape[0], 1024), :] 563 | feed_dict = {self.input_dis: input_dis, self.input_rgb: input_rgb, 564 | self.observed_pc: input_pts[np.newaxis, :, :], self.is_training: False} 565 | iter_rotation, iter_translation, iter_size = self.sess.run( 566 | [self.pred_rotation0, self.pred_translation0, self.pred_scale0], feed_dict=feed_dict) 567 | 568 | pred_rotation = quat2mat(iter_rotation[0]) 569 | pred_translation = iter_translation[0] + centroid 570 | pred_scale = np.linalg.norm(iter_size[0]) 571 | pred_size = iter_size[0]/pred_scale 572 | 573 | pred_canonical_pts = ( 574 | (pts_ori - pred_translation[np.newaxis, :])/pred_scale) @ np.transpose(pred_rotation) 575 | dis = np.linalg.norm(pred_canonical_pts, axis=1) 576 | 577 | else: 578 | break 579 | 580 | # refinement 581 | if pts.shape[0] > 32: 582 | pose_feat = self.sess.run([self.pose_feat], feed_dict=feed_dict) 583 | update = tf.assign(self.new_pose_feat, pose_feat[0]) 584 | self.sess.run(update) 585 | pred_loss = np.inf 586 | 587 | for k in range(self.opts.refine_f_iteration): 588 | _, iter_rotation, iter_translation, iter_size, iter_loss = self.sess.run( 589 | [self.all_optimizers, self.pred_rotation, self.pred_translation, self.pred_scale, self.loss], feed_dict=feed_dict) 590 | if iter_loss < pred_loss: 591 | pred_rotation = quat2mat(iter_rotation[0]) 592 | pred_translation = iter_translation[0] + centroid 593 | pred_scale = np.linalg.norm(iter_size[0]) 594 | pred_size = iter_size[0]/pred_scale 595 | pred_loss = iter_loss 596 | if pred_loss <= self.opts.refine_f_threshold: 597 | break 598 | 599 | if pts_ori.shape[0] > 32: 600 | pred_RTs[j, :3, :3] = np.diag( 601 | np.ones((3))*pred_scale) @ pred_rotation.transpose() 602 | pred_RTs[j, :3, 3] = pred_translation 603 | pred_scales[j] = pred_size 604 | 605 | z_180_RT = np.zeros((4, 4), dtype=np.float32) 606 | z_180_RT[:3, :3] = np.diag([-1, -1, 1]) 607 | z_180_RT[3, 3] = 1 608 | pred_RTs[j, :, :] = z_180_RT @ pred_RTs[j, :, :] 609 | else: 610 | pred_RTs[j] = np.eye(4) 611 | pred_scales[j] = np.ones((3)) 612 | 613 | data.pop('pred_masks') 614 | 
data['pred_RTs'] = pred_RTs 615 | data['pred_scales'] = pred_scales 616 | 617 | with open(os.path.join(self.opts.test_log_dir, path.split('/')[-1]), 'wb') as f: 618 | cPickle.dump(data, f) 619 | -------------------------------------------------------------------------------- /utils/evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Gorilla Lab, SCUT. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | """ Modified based on https://github.com/hughw19/NOCS_CVPR2019.""" 7 | 8 | 9 | import os 10 | import sys 11 | import numpy as np 12 | import glob 13 | import math 14 | import _pickle as cPickle 15 | from tqdm import tqdm 16 | 17 | 18 | def trim_zeros(x): 19 | """It's common to have tensors larger than the available data and 20 | pad with zeros. This function removes rows that are all zeros. 21 | x: [rows, columns]. 22 | """ 23 | 24 | pre_shape = x.shape 25 | assert len(x.shape) == 2, x.shape 26 | new_x = x[~np.all(x == 0, axis=1)] 27 | post_shape = new_x.shape 28 | assert pre_shape[0] == post_shape[0] 29 | assert pre_shape[1] == post_shape[1] 30 | 31 | return new_x 32 | 33 | 34 | def get_3d_bbox(scale, shift=0): 35 | """ 36 | Input: 37 | scale: [3] or scalar 38 | shift: [3] or scalar 39 | Return 40 | bbox_3d: [3, N] 41 | 42 | """ 43 | if hasattr(scale, "__iter__"): 44 | bbox_3d = np.array([[scale[0] / 2, +scale[1] / 2, scale[2] / 2], 45 | [scale[0] / 2, +scale[1] / 2, -scale[2] / 2], 46 | [-scale[0] / 2, +scale[1] / 2, scale[2] / 2], 47 | [-scale[0] / 2, +scale[1] / 2, -scale[2] / 2], 48 | [+scale[0] / 2, -scale[1] / 2, scale[2] / 2], 49 | [+scale[0] / 2, -scale[1] / 2, -scale[2] / 2], 50 | [-scale[0] / 2, -scale[1] / 2, scale[2] / 2], 51 | [-scale[0] / 2, -scale[1] / 2, -scale[2] / 2]]) + shift 52 | else: 53 | bbox_3d = np.array([[scale / 2, +scale / 2, scale / 2], 54 | [scale / 2, +scale / 2, -scale / 2], 55 | [-scale / 2, +scale / 2, scale / 2], 56 | [-scale / 2, +scale / 2, -scale / 2], 57 | [+scale / 2, -scale / 2, scale / 2], 58 | [+scale / 2, -scale / 2, -scale / 2], 59 | [-scale / 2, -scale / 2, scale / 2], 60 | [-scale / 2, -scale / 2, -scale / 2]]) + shift 61 | 62 | bbox_3d = bbox_3d.transpose() 63 | return bbox_3d 64 | 65 | 66 | def transform_coordinates_3d(coordinates, RT): 67 | """ 68 | Input: 69 | coordinates: [3, N] 70 | RT: [4, 4] 71 | Return 72 | new_coordinates: [3, N] 73 | 74 | """ 75 | assert coordinates.shape[0] == 3 76 | coordinates = np.vstack([coordinates, np.ones( 77 | (1, coordinates.shape[1]), dtype=np.float32)]) 78 | new_coordinates = RT @ coordinates 79 | new_coordinates = new_coordinates[:3, :]/new_coordinates[3, :] 80 | return new_coordinates 81 | 82 | 83 | def compute_ap_from_matches_scores(pred_match, pred_scores, gt_match): 84 | # sort the scores from high to low 85 | # print(pred_match.shape, pred_scores.shape) 86 | assert pred_match.shape[0] == pred_scores.shape[0] 87 | 88 | score_indices = np.argsort(pred_scores)[::-1] 89 | pred_scores = pred_scores[score_indices] 90 | pred_match = pred_match[score_indices] 91 | 92 | precisions = np.cumsum(pred_match > -1) / (np.arange(len(pred_match)) + 1) 93 | recalls = np.cumsum(pred_match > -1).astype(np.float32) / len(gt_match) 94 | 95 | # Pad with start and end values to simplify the math 96 | precisions = np.concatenate([[0], precisions, [0]]) 97 | recalls = np.concatenate([[0], recalls, [1]]) 98 | 99 | # Ensure precision values decrease but don't 
increase. This way, the 100 | # precision value at each recall threshold is the maximum it can be 101 | # for all following recall thresholds, as specified by the VOC paper. 102 | for i in range(len(precisions) - 2, -1, -1): 103 | precisions[i] = np.maximum(precisions[i], precisions[i + 1]) 104 | 105 | # Compute mean AP over recall range 106 | indices = np.where(recalls[:-1] != recalls[1:])[0] + 1 107 | ap = np.sum((recalls[indices] - recalls[indices - 1]) 108 | * precisions[indices]) 109 | return ap 110 | 111 | 112 | def compute_3d_iou_new(RT_1, RT_2, scales_1, scales_2, handle_visibility, class_name_1, class_name_2): 113 | '''Computes IoU overlaps between two 3d bboxes. 114 | bbox_3d_1, bbox_3d_1: [3, 8] 115 | ''' 116 | # flatten masks 117 | def asymmetric_3d_iou(RT_1, RT_2, scales_1, scales_2): 118 | noc_cube_1 = get_3d_bbox(scales_1, 0) 119 | bbox_3d_1 = transform_coordinates_3d(noc_cube_1, RT_1) 120 | 121 | noc_cube_2 = get_3d_bbox(scales_2, 0) 122 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, RT_2) 123 | 124 | bbox_1_max = np.amax(bbox_3d_1, axis=0) 125 | bbox_1_min = np.amin(bbox_3d_1, axis=0) 126 | bbox_2_max = np.amax(bbox_3d_2, axis=0) 127 | bbox_2_min = np.amin(bbox_3d_2, axis=0) 128 | 129 | overlap_min = np.maximum(bbox_1_min, bbox_2_min) 130 | overlap_max = np.minimum(bbox_1_max, bbox_2_max) 131 | 132 | # intersections and union 133 | if np.amin(overlap_max - overlap_min) < 0: 134 | intersections = 0 135 | else: 136 | intersections = np.prod(overlap_max - overlap_min) 137 | union = np.prod(bbox_1_max - bbox_1_min) + \ 138 | np.prod(bbox_2_max - bbox_2_min) - intersections 139 | overlaps = intersections / union 140 | return overlaps 141 | 142 | if RT_1 is None or RT_2 is None: 143 | return -1 144 | 145 | symmetry_flag = False 146 | if (class_name_1 in ['bottle', 'bowl', 'can'] and class_name_1 == class_name_2) or (class_name_1 == 'mug' and class_name_1 == class_name_2 and handle_visibility == 0): 147 | # print('*'*10) 148 | 149 | noc_cube_1 = get_3d_bbox(scales_1, 0) 150 | noc_cube_2 = get_3d_bbox(scales_2, 0) 151 | bbox_3d_2 = transform_coordinates_3d(noc_cube_2, RT_2) 152 | 153 | def y_rotation_matrix(theta): 154 | return np.array([[np.cos(theta), 0, np.sin(theta), 0], 155 | [0, 1, 0, 0], 156 | [-np.sin(theta), 0, np.cos(theta), 0], 157 | [0, 0, 0, 1]]) 158 | 159 | n = 20 160 | max_iou = 0 161 | for i in range(n): 162 | rotated_RT_1 = RT_1@y_rotation_matrix(2*math.pi*i/float(n)) 163 | max_iou = max(max_iou, 164 | asymmetric_3d_iou(rotated_RT_1, RT_2, scales_1, scales_2)) 165 | else: 166 | max_iou = asymmetric_3d_iou(RT_1, RT_2, scales_1, scales_2) 167 | 168 | return max_iou 169 | 170 | 171 | def compute_combination_RT_degree_cm_symmetry(RT_1, RT_2, scale, class_id, handle_visibility, synset_names): 172 | ''' 173 | :param RT_1: [4, 4]. homogeneous affine transformation 174 | :param RT_2: [4, 4]. 
homogeneous affine transformation 175 | :return: theta: angle difference of R in degree, shift: l2 difference of T in centimeter 176 | 177 | 178 | synset_names = ['BG', # 0 179 | 'bottle', # 1 180 | 'bowl', # 2 181 | 'camera', # 3 182 | 'can', # 4 183 | 'cap', # 5 184 | 'phone', # 6 185 | 'monitor', # 7 186 | 'laptop', # 8 187 | 'mug' # 9 188 | ] 189 | 190 | synset_names = ['BG', # 0 191 | 'bottle', # 1 192 | 'bowl', # 2 193 | 'camera', # 3 194 | 'can', # 4 195 | 'laptop', # 5 196 | 'mug' # 6 197 | ] 198 | ''' 199 | 200 | # make sure the last row is [0, 0, 0, 1] 201 | if RT_1 is None or RT_2 is None: 202 | return -1 203 | try: 204 | assert np.array_equal(RT_1[3, :], RT_2[3, :]) 205 | assert np.array_equal(RT_1[3, :], np.array([0, 0, 0, 1])) 206 | except AssertionError: 207 | print(RT_1[3, :], RT_2[3, :]) 208 | exit() 209 | 210 | R1 = RT_1[:3, :3] / np.cbrt(np.linalg.det(RT_1[:3, :3])) 211 | T1 = RT_1[:3, 3] 212 | 213 | R2 = RT_2[:3, :3] / np.cbrt(np.linalg.det(RT_2[:3, :3])) 214 | T2 = RT_2[:3, 3] 215 | 216 | # symmetric when rotating around y-axis 217 | if synset_names[class_id] in ['bottle', 'can', 'bowl']: 218 | y = np.array([0, 1, 0]) 219 | y1 = R1 @ y 220 | y2 = R2 @ y 221 | theta = np.arccos( 222 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2))) 223 | # symmetric when rotating around y-axis 224 | elif synset_names[class_id] == 'mug' and handle_visibility == 0: 225 | y = np.array([0, 1, 0]) 226 | y1 = R1 @ y 227 | y2 = R2 @ y 228 | theta = np.arccos( 229 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2))) 230 | elif synset_names[class_id] in ['phone', 'eggbox', 'glue']: 231 | y_180_RT = np.diag([-1.0, 1.0, -1.0]) 232 | R = R1 @ R2.transpose() 233 | R_rot = R1 @ y_180_RT @ R2.transpose() 234 | theta = min(np.arccos((np.trace(R) - 1) / 2), 235 | np.arccos((np.trace(R_rot) - 1) / 2)) 236 | else: 237 | R = R1 @ R2.transpose() 238 | theta = np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)) 239 | 240 | theta *= 180 / np.pi 241 | shift = np.linalg.norm(T1 - T2) / scale 242 | result = np.array([theta, shift]) 243 | 244 | return result 245 | 246 | 247 | def compute_combination_3d_matches(gt_class_ids, gt_RTs, gt_scales, gt_handle_visibility, synset_names, 248 | pred_boxes, pred_class_ids, pred_scores, pred_RTs, pred_scales, 249 | iou_3d_thresholds, degree_thesholds, shift_thesholds, score_threshold=0): 250 | """Finds matches between prediction and ground truth instances. 251 | Returns: 252 | gt_matches: 2-D array. For each GT box it has the index of the matched 253 | predicted box. 254 | pred_matches: 2-D array. For each predicted box, it has the index of 255 | the matched ground truth box. 256 | overlaps: [pred_boxes, gt_boxes] IoU overlaps. 
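        (In this combined variant the match arrays are additionally indexed by
        the degree/shift/IoU threshold lists, i.e. they have shape
        [num_degree_thres, num_shift_thres, num_iou_thres, num_instances];
        the third value actually returned is the score-sorting index array.)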
257 | """ 258 | # Trim zero padding 259 | # TODO: cleaner to do zero unpadding upstream 260 | num_pred = len(pred_class_ids) 261 | num_gt = len(gt_class_ids) 262 | indices = np.zeros(0) 263 | 264 | if num_pred: 265 | pred_boxes = trim_zeros(pred_boxes).copy() 266 | pred_scores = pred_scores[:pred_boxes.shape[0]].copy() 267 | 268 | # Sort predictions by score from high to low 269 | indices = np.argsort(pred_scores)[::-1] 270 | 271 | pred_boxes = pred_boxes[indices].copy() 272 | pred_class_ids = pred_class_ids[indices].copy() 273 | pred_scores = pred_scores[indices].copy() 274 | pred_scales = pred_scales[indices].copy() 275 | pred_RTs = pred_RTs[indices].copy() 276 | 277 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 278 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 279 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 280 | RT_overlaps = np.zeros((num_pred, num_gt, 2), dtype=np.float32) 281 | for i in range(num_pred): 282 | for j in range(num_gt): 283 | # overlaps[i, j] = compute_3d_iou(pred_3d_bboxs[i], gt_3d_bboxs[j], gt_handle_visibility[j], 284 | # synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 285 | overlaps[i, j] = compute_3d_iou_new(pred_RTs[i], gt_RTs[j], pred_scales[i, :], gt_scales[j], 286 | gt_handle_visibility[j], synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 287 | 288 | RT_overlaps[i, j, :] = compute_combination_RT_degree_cm_symmetry(pred_RTs[i], gt_RTs[j], np.cbrt( 289 | np.linalg.det(gt_RTs[j, :3, :3])), gt_class_ids[j], gt_handle_visibility[j], synset_names) 290 | 291 | # Loop through predictions and find matching ground truth boxes 292 | num_iou_3d_thres = len(iou_3d_thresholds) 293 | num_degree_thes = len(degree_thesholds) 294 | num_shift_thes = len(shift_thesholds) 295 | pred_matches = -1 * \ 296 | np.ones([num_degree_thes, num_shift_thes, num_iou_3d_thres, num_pred]) 297 | gt_matches = -1 * \ 298 | np.ones([num_degree_thes, num_shift_thes, num_iou_3d_thres, num_gt]) 299 | 300 | for s, iou_thres in enumerate(iou_3d_thresholds): 301 | for d, degree_thres in enumerate(degree_thesholds): 302 | for t, shift_thres in enumerate(shift_thesholds): 303 | for i in range(len(pred_boxes)): 304 | # Find best matching ground truth box 305 | # 1. Sort matches by score 306 | sorted_ixs_by_iou = np.argsort(overlaps[i])[::-1] 307 | # 2. Remove low scores 308 | low_score_idx = np.where( 309 | overlaps[i, sorted_ixs_by_iou] < score_threshold)[0] 310 | if low_score_idx.size > 0: 311 | sorted_ixs_by_iou = sorted_ixs_by_iou[:low_score_idx[0]] 312 | # 3. 
Find the match 313 | for j in sorted_ixs_by_iou: 314 | if gt_matches[d, t, s, j] > -1: 315 | continue 316 | # If we reach IoU smaller than the threshold, end the loop 317 | iou = overlaps[i, j] 318 | r_error = RT_overlaps[i, j, 0] 319 | t_error = RT_overlaps[i, j, 1] 320 | 321 | if iou < iou_thres or r_error > degree_thres or t_error > shift_thres: 322 | break 323 | 324 | if not pred_class_ids[i] == gt_class_ids[j]: 325 | continue 326 | 327 | if iou >= iou_thres or r_error <= degree_thres or t_error <= shift_thres: 328 | gt_matches[d, t, s, j] = i 329 | pred_matches[d, t, s, i] = j 330 | break 331 | 332 | return gt_matches, pred_matches, indices 333 | 334 | 335 | def compute_combination_mAP(final_results, synset_names, degree_thresholds=[5, 10, 15], shift_thresholds=[0.1, 0.2], iou_3d_thresholds=[0.1]): 336 | num_classes = len(synset_names) 337 | degree_thres_list = list(degree_thresholds) + [360] 338 | num_degree_thres = len(degree_thres_list) 339 | 340 | shift_thres_list = list(shift_thresholds) + [100] 341 | num_shift_thres = len(shift_thres_list) 342 | 343 | iou_thres_list = list(iou_3d_thresholds) 344 | num_iou_thres = len(iou_thres_list) 345 | 346 | aps = np.zeros((num_classes + 1, num_degree_thres, 347 | num_shift_thres, num_iou_thres)) 348 | pred_matches_all = [np.zeros( 349 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 350 | gt_matches_all = [np.zeros( 351 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 352 | pred_scores_all = [np.zeros( 353 | (num_degree_thres, num_shift_thres, num_iou_thres, 0)) for _ in range(num_classes)] 354 | 355 | for progress, result in tqdm(enumerate(final_results)): 356 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 357 | gt_RTs = np.array(result['gt_RTs']) 358 | gt_scales = np.array(result['gt_scales']) 359 | gt_handle_visibility = result['gt_handle_visibility'] 360 | 361 | pred_bboxes = np.array(result['pred_bboxes']) 362 | pred_class_ids = result['pred_class_ids'] 363 | pred_scales = result['pred_scales'] 364 | pred_scores = result['pred_scores'] 365 | pred_RTs = np.array(result['pred_RTs']) 366 | 367 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 368 | continue 369 | 370 | for cls_id in range(1, num_classes): 371 | # get gt and predictions in this class 372 | cls_gt_class_ids = gt_class_ids[gt_class_ids == cls_id] if len( 373 | gt_class_ids) else np.zeros(0) 374 | cls_gt_scales = gt_scales[gt_class_ids == cls_id] if len( 375 | gt_class_ids) else np.zeros((0, 3)) 376 | cls_gt_RTs = gt_RTs[gt_class_ids == cls_id] if len( 377 | gt_class_ids) else np.zeros((0, 4, 4)) 378 | 379 | cls_pred_class_ids = pred_class_ids[pred_class_ids == cls_id] if len( 380 | pred_class_ids) else np.zeros(0) 381 | cls_pred_bboxes = pred_bboxes[pred_class_ids == cls_id, :] if len( 382 | pred_class_ids) else np.zeros((0, 4)) 383 | cls_pred_scores = pred_scores[pred_class_ids == cls_id] if len( 384 | pred_class_ids) else np.zeros(0) 385 | cls_pred_RTs = pred_RTs[pred_class_ids == cls_id] if len( 386 | pred_class_ids) else np.zeros((0, 4, 4)) 387 | cls_pred_scales = pred_scales[pred_class_ids == cls_id] if len( 388 | pred_class_ids) else np.zeros((0, 3)) 389 | 390 | if synset_names[cls_id] != 'mug': 391 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 392 | else: 393 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids == cls_id] if len( 394 | gt_class_ids) else np.ones(0) 395 | 396 | gt_match, pred_match, pred_indiced = 
compute_combination_3d_matches(cls_gt_class_ids, cls_gt_RTs, cls_gt_scales, cls_gt_handle_visibility, synset_names, 397 | cls_pred_bboxes, cls_pred_class_ids, cls_pred_scores, cls_pred_RTs, cls_pred_scales, 398 | iou_thres_list, degree_thres_list, shift_thres_list) 399 | if len(pred_indiced): 400 | cls_pred_class_ids = cls_pred_class_ids[pred_indiced] 401 | cls_pred_RTs = cls_pred_RTs[pred_indiced] 402 | cls_pred_scores = cls_pred_scores[pred_indiced] 403 | cls_pred_bboxes = cls_pred_bboxes[pred_indiced] 404 | 405 | pred_matches_all[cls_id] = np.concatenate( 406 | (pred_matches_all[cls_id], pred_match), axis=-1) 407 | cls_pred_scores_tile = np.tile( 408 | cls_pred_scores, (num_degree_thres, num_shift_thres, num_iou_thres, 1)) 409 | pred_scores_all[cls_id] = np.concatenate( 410 | (pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 411 | assert pred_matches_all[cls_id].shape[-1] == pred_scores_all[cls_id].shape[-1] 412 | gt_matches_all[cls_id] = np.concatenate( 413 | (gt_matches_all[cls_id], gt_match), axis=-1) 414 | 415 | for cls_id in range(1, num_classes): 416 | class_name = synset_names[cls_id] 417 | for s, iou_thres in enumerate(iou_thres_list): 418 | for d, degree_thres in enumerate(degree_thres_list): 419 | for t, shift_thres in enumerate(shift_thres_list): 420 | aps[cls_id, d, t, s] = compute_ap_from_matches_scores(pred_matches_all[cls_id][d, t, s, :], 421 | pred_scores_all[cls_id][d, 422 | t, s, :], 423 | gt_matches_all[cls_id][d, t, s, :]) 424 | # for i in range(6): 425 | # print(i+1) 426 | # print('IoU75, 5 degree, 5% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 427 | # print('IoU75, 10 degree, 5% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 428 | # print('IoU75, 5 degree, 10% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.10), iou_thres_list.index(0.75)]*100)) 429 | # print('IoU50, 5 degree, 20% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(5), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 430 | # print('IoU50, 10 degree, 10% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.10), iou_thres_list.index(0.50)]*100)) 431 | # print('IoU50, 10 degree, 20% translation: {:.2f}'.format(aps[i+1, degree_thres_list.index(10), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 432 | 433 | # print('ALL:') 434 | aps[-1, :, :, :] = np.mean(aps[1:-1, :, :, :], axis=0) 435 | 436 | print('IoU75, 5 degree, 5% translation: {:.2f}'.format( 437 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 438 | print('IoU75, 10 degree, 5% translation: {:.2f}'.format( 439 | aps[-1, degree_thres_list.index(10), shift_thres_list.index(0.05), iou_thres_list.index(0.75)]*100)) 440 | print('IoU75, 5 degree, 10% translation: {:.2f}'.format( 441 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.10), iou_thres_list.index(0.75)]*100)) 442 | print('IoU50, 5 degree, 20% translation: {:.2f}'.format( 443 | aps[-1, degree_thres_list.index(5), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 444 | print('IoU50, 10 degree, 10% translation: {:.2f}'.format( 445 | aps[-1, degree_thres_list.index(10), shift_thres_list.index(0.10), iou_thres_list.index(0.50)]*100)) 446 | print('IoU50, 10 degree, 20% translation: {:.2f}'.format( 447 | aps[-1, 
degree_thres_list.index(10), shift_thres_list.index(0.20), iou_thres_list.index(0.50)]*100)) 448 | 449 | return aps 450 | 451 | 452 | def compute_3d_matches(gt_class_ids, gt_RTs, gt_scales, gt_handle_visibility, synset_names, 453 | pred_boxes, pred_class_ids, pred_scores, pred_RTs, pred_scales, 454 | iou_3d_thresholds, score_threshold=0): 455 | """Finds matches between prediction and ground truth instances. 456 | Returns: 457 | gt_matches: 2-D array. For each GT box it has the index of the matched 458 | predicted box. 459 | pred_matches: 2-D array. For each predicted box, it has the index of 460 | the matched ground truth box. 461 | overlaps: [pred_boxes, gt_boxes] IoU overlaps. 462 | """ 463 | # Trim zero padding 464 | # TODO: cleaner to do zero unpadding upstream 465 | num_pred = len(pred_class_ids) 466 | num_gt = len(gt_class_ids) 467 | indices = np.zeros(0) 468 | 469 | if num_pred: 470 | pred_boxes = trim_zeros(pred_boxes).copy() 471 | pred_scores = pred_scores[:pred_boxes.shape[0]].copy() 472 | 473 | # Sort predictions by score from high to low 474 | indices = np.argsort(pred_scores)[::-1] 475 | 476 | pred_boxes = pred_boxes[indices].copy() 477 | pred_class_ids = pred_class_ids[indices].copy() 478 | pred_scores = pred_scores[indices].copy() 479 | pred_scales = pred_scales[indices].copy() 480 | pred_RTs = pred_RTs[indices].copy() 481 | 482 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 483 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 484 | overlaps = np.zeros((num_pred, num_gt), dtype=np.float32) 485 | for i in range(num_pred): 486 | for j in range(num_gt): 487 | # overlaps[i, j] = compute_3d_iou(pred_3d_bboxs[i], gt_3d_bboxs[j], gt_handle_visibility[j], 488 | # synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 489 | overlaps[i, j] = compute_3d_iou_new(pred_RTs[i], gt_RTs[j], pred_scales[i, :], gt_scales[j], 490 | gt_handle_visibility[j], synset_names[pred_class_ids[i]], synset_names[gt_class_ids[j]]) 491 | 492 | # Loop through predictions and find matching ground truth boxes 493 | num_iou_3d_thres = len(iou_3d_thresholds) 494 | pred_matches = -1 * np.ones([num_iou_3d_thres, num_pred]) 495 | gt_matches = -1 * np.ones([num_iou_3d_thres, num_gt]) 496 | 497 | for s, iou_thres in enumerate(iou_3d_thresholds): 498 | for i in range(len(pred_boxes)): 499 | # Find best matching ground truth box 500 | # 1. Sort matches by score 501 | sorted_ixs = np.argsort(overlaps[i])[::-1] 502 | # 2. Remove low scores 503 | low_score_idx = np.where( 504 | overlaps[i, sorted_ixs] < score_threshold)[0] 505 | if low_score_idx.size > 0: 506 | sorted_ixs = sorted_ixs[:low_score_idx[0]] 507 | # 3. Find the match 508 | for j in sorted_ixs: 509 | # If ground truth box is already matched, go to next one 510 | #print('gt_match: ', gt_match[j]) 511 | if gt_matches[s, j] > -1: 512 | continue 513 | # If we reach IoU smaller than the threshold, end the loop 514 | iou = overlaps[i, j] 515 | #print('iou: ', iou) 516 | if iou < iou_thres: 517 | break 518 | # Do we have a match? 519 | if not pred_class_ids[i] == gt_class_ids[j]: 520 | continue 521 | 522 | if iou > iou_thres: 523 | gt_matches[s, j] = i 524 | pred_matches[s, i] = j 525 | break 526 | 527 | return gt_matches, pred_matches, overlaps, indices 528 | 529 | 530 | def compute_RT_degree_cm_symmetry(RT_1, RT_2, class_id, handle_visibility, synset_names): 531 | ''' 532 | :param RT_1: [4, 4]. homogeneous affine transformation 533 | :param RT_2: [4, 4]. 
homogeneous affine transformation 534 | :return: theta: angle difference of R in degrees, shift: L2 difference of T in centimeters 535 | 536 | 537 | synset_names = ['BG', # 0 538 | 'bottle', # 1 539 | 'bowl', # 2 540 | 'camera', # 3 541 | 'can', # 4 542 | 'cap', # 5 543 | 'phone', # 6 544 | 'monitor', # 7 545 | 'laptop', # 8 546 | 'mug' # 9 547 | ] 548 | 549 | synset_names = ['BG', # 0 550 | 'bottle', # 1 551 | 'bowl', # 2 552 | 'camera', # 3 553 | 'can', # 4 554 | 'laptop', # 5 555 | 'mug' # 6 556 | ] 557 | ''' 558 | 559 | # make sure the last row is [0, 0, 0, 1] 560 | if RT_1 is None or RT_2 is None: 561 | return -1 562 | try: 563 | assert np.array_equal(RT_1[3, :], RT_2[3, :]) 564 | assert np.array_equal(RT_1[3, :], np.array([0, 0, 0, 1])) 565 | except AssertionError: 566 | print(RT_1[3, :], RT_2[3, :]) 567 | exit() 568 | 569 | R1 = RT_1[:3, :3] / np.cbrt(np.linalg.det(RT_1[:3, :3])) 570 | T1 = RT_1[:3, 3] 571 | 572 | R2 = RT_2[:3, :3] / np.cbrt(np.linalg.det(RT_2[:3, :3])) 573 | T2 = RT_2[:3, 3] 574 | 575 | # symmetric when rotating around y-axis 576 | if synset_names[class_id] in ['bottle', 'can', 'bowl']: 577 | y = np.array([0, 1, 0]) 578 | y1 = R1 @ y 579 | y2 = R2 @ y 580 | theta = np.arccos(np.clip( # clip guards against NaN from floating-point round-off 581 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)), -1.0, 1.0)) 582 | # a mug is also treated as y-axis symmetric when its handle is not visible 583 | elif synset_names[class_id] == 'mug' and handle_visibility == 0: 584 | y = np.array([0, 1, 0]) 585 | y1 = R1 @ y 586 | y2 = R2 @ y 587 | theta = np.arccos(np.clip( 588 | y1.dot(y2) / (np.linalg.norm(y1) * np.linalg.norm(y2)), -1.0, 1.0)) 589 | elif synset_names[class_id] in ['phone', 'eggbox', 'glue']: 590 | y_180_RT = np.diag([-1.0, 1.0, -1.0]) 591 | R = R1 @ R2.transpose() 592 | R_rot = R1 @ y_180_RT @ R2.transpose() 593 | theta = min(np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)), 594 | np.arccos(np.clip((np.trace(R_rot) - 1) / 2, -1.0, 1.0))) 595 | else: 596 | R = R1 @ R2.transpose() 597 | theta = np.arccos(np.clip((np.trace(R) - 1) / 2, -1.0, 1.0)) 598 | 599 | theta *= 180 / np.pi 600 | shift = np.linalg.norm(T1 - T2) * 100 601 | result = np.array([theta, shift]) 602 | 603 | return result 604 | 605 | 606 | def compute_RT_overlaps(gt_class_ids, gt_RTs, gt_handle_visibility, 607 | pred_class_ids, pred_RTs, 608 | synset_names): 609 | """Finds overlaps between prediction and ground truth instances. 610 | Returns: 611 | overlaps: [num_pred, num_gt, 2] array of (rotation error in degrees, translation error in cm).
612 | """ 613 | # print('num of gt instances: {}, num of pred instances: {}'.format(len(gt_class_ids), len(gt_class_ids))) 614 | num_pred = len(pred_class_ids) 615 | num_gt = len(gt_class_ids) 616 | 617 | # Compute IoU overlaps [pred_bboxs gt_bboxs] 618 | #overlaps = [[0 for j in range(num_gt)] for i in range(num_pred)] 619 | overlaps = np.zeros((num_pred, num_gt, 2)) 620 | 621 | for i in range(num_pred): 622 | for j in range(num_gt): 623 | overlaps[i, j, :] = compute_RT_degree_cm_symmetry(pred_RTs[i], 624 | gt_RTs[j], 625 | gt_class_ids[j], 626 | gt_handle_visibility[j], 627 | synset_names) 628 | 629 | return overlaps 630 | 631 | 632 | def compute_match_from_degree_cm(overlaps, pred_class_ids, gt_class_ids, degree_thres_list, shift_thres_list): 633 | num_degree_thres = len(degree_thres_list) 634 | num_shift_thres = len(shift_thres_list) 635 | 636 | num_pred = len(pred_class_ids) 637 | num_gt = len(gt_class_ids) 638 | 639 | pred_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_pred)) 640 | gt_matches = -1 * np.ones((num_degree_thres, num_shift_thres, num_gt)) 641 | 642 | if num_pred == 0 or num_gt == 0: 643 | return gt_matches, pred_matches 644 | 645 | assert num_pred == overlaps.shape[0] 646 | assert num_gt == overlaps.shape[1] 647 | assert overlaps.shape[2] == 2 648 | 649 | for d, degree_thres in enumerate(degree_thres_list): 650 | for s, shift_thres in enumerate(shift_thres_list): 651 | for i in range(num_pred): 652 | # Find best matching ground truth box 653 | # 1. Sort matches by scores from low to high 654 | sum_degree_shift = np.sum(overlaps[i, :, :], axis=-1) 655 | sorted_ixs = np.argsort(sum_degree_shift) 656 | # 2. Remove low scores 657 | # low_score_idx = np.where(sum_degree_shift >= 100)[0] 658 | # if low_score_idx.size > 0: 659 | # sorted_ixs = sorted_ixs[:low_score_idx[0]] 660 | # 3. 
Find the match 661 | for j in sorted_ixs: 662 | # If ground truth box is already matched, go to next one 663 | #print(j, len(gt_match), len(pred_class_ids), len(gt_class_ids)) 664 | if gt_matches[d, s, j] > -1 or pred_class_ids[i] != gt_class_ids[j]: 665 | continue 666 | # If we reach IoU smaller than the threshold, end the loop 667 | if overlaps[i, j, 0] > degree_thres or overlaps[i, j, 1] > shift_thres: 668 | continue 669 | 670 | gt_matches[d, s, j] = i 671 | pred_matches[d, s, i] = j 672 | break 673 | 674 | return gt_matches, pred_matches 675 | 676 | 677 | def compute_independent_mAP(final_results, synset_names, degree_thresholds=[360], shift_thresholds=[100], iou_3d_thresholds=[0.1], iou_pose_thres=0.1, use_matches_for_pose=True): 678 | 679 | num_classes = len(synset_names) 680 | degree_thres_list = list(degree_thresholds) + [360] 681 | num_degree_thres = len(degree_thres_list) 682 | 683 | shift_thres_list = list(shift_thresholds) + [100] 684 | num_shift_thres = len(shift_thres_list) 685 | 686 | iou_thres_list = list(iou_3d_thresholds) 687 | num_iou_thres = len(iou_thres_list) 688 | 689 | if use_matches_for_pose: 690 | assert iou_pose_thres in iou_thres_list 691 | 692 | iou_3d_aps = np.zeros((num_classes + 1, num_iou_thres)) 693 | iou_pred_matches_all = [np.zeros((num_iou_thres, 0)) 694 | for _ in range(num_classes)] 695 | iou_pred_scores_all = [np.zeros((num_iou_thres, 0)) 696 | for _ in range(num_classes)] 697 | iou_gt_matches_all = [np.zeros((num_iou_thres, 0)) 698 | for _ in range(num_classes)] 699 | 700 | pose_aps = np.zeros((num_classes + 1, num_degree_thres, num_shift_thres)) 701 | pose_pred_matches_all = [ 702 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 703 | pose_gt_matches_all = [ 704 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 705 | pose_pred_scores_all = [ 706 | np.zeros((num_degree_thres, num_shift_thres, 0)) for _ in range(num_classes)] 707 | 708 | # loop over results to gather pred matches and gt matches for iou and pose metrics 709 | progress = 0 710 | for progress, result in tqdm(enumerate(final_results)): 711 | gt_class_ids = result['gt_class_ids'].astype(np.int32) 712 | gt_RTs = np.array(result['gt_RTs']) 713 | gt_scales = np.array(result['gt_scales']) 714 | gt_handle_visibility = result['gt_handle_visibility'] 715 | 716 | pred_bboxes = np.array(result['pred_bboxes']) 717 | pred_class_ids = result['pred_class_ids'] 718 | pred_scales = result['pred_scales'] 719 | pred_scores = result['pred_scores'] 720 | pred_RTs = np.array(result['pred_RTs']) 721 | 722 | if len(gt_class_ids) == 0 and len(pred_class_ids) == 0: 723 | continue 724 | 725 | for cls_id in range(1, num_classes): 726 | # get gt and predictions in this class 727 | cls_gt_class_ids = gt_class_ids[gt_class_ids == cls_id] if len( 728 | gt_class_ids) else np.zeros(0) 729 | cls_gt_scales = gt_scales[gt_class_ids == cls_id] if len( 730 | gt_class_ids) else np.zeros((0, 3)) 731 | cls_gt_RTs = gt_RTs[gt_class_ids == cls_id] if len( 732 | gt_class_ids) else np.zeros((0, 4, 4)) 733 | 734 | # ipdb.set_trace() 735 | cls_pred_class_ids = pred_class_ids[pred_class_ids == cls_id] if len( 736 | pred_class_ids) else np.zeros(0) 737 | cls_pred_bboxes = pred_bboxes[pred_class_ids == cls_id, :] if len( 738 | pred_class_ids) else np.zeros((0, 4)) 739 | cls_pred_scores = pred_scores[pred_class_ids == cls_id] if len( 740 | pred_class_ids) else np.zeros(0) 741 | cls_pred_RTs = pred_RTs[pred_class_ids == cls_id] if len( 742 | pred_class_ids) else np.zeros((0, 4, 
4)) 743 | cls_pred_scales = pred_scales[pred_class_ids == cls_id] if len( 744 | pred_class_ids) else np.zeros((0, 3)) 745 | 746 | # calculate the overlap between each gt instance and pred instance 747 | if synset_names[cls_id] != 'mug': 748 | cls_gt_handle_visibility = np.ones_like(cls_gt_class_ids) 749 | else: 750 | cls_gt_handle_visibility = gt_handle_visibility[gt_class_ids == cls_id] if len( 751 | gt_class_ids) else np.ones(0) 752 | 753 | iou_cls_gt_match, iou_cls_pred_match, _, iou_pred_indices = compute_3d_matches(cls_gt_class_ids, cls_gt_RTs, cls_gt_scales, cls_gt_handle_visibility, synset_names, 754 | cls_pred_bboxes, cls_pred_class_ids, cls_pred_scores, cls_pred_RTs, cls_pred_scales, 755 | iou_thres_list) 756 | if len(iou_pred_indices): 757 | cls_pred_class_ids = cls_pred_class_ids[iou_pred_indices] 758 | cls_pred_RTs = cls_pred_RTs[iou_pred_indices] 759 | cls_pred_scores = cls_pred_scores[iou_pred_indices] 760 | cls_pred_bboxes = cls_pred_bboxes[iou_pred_indices] 761 | 762 | iou_pred_matches_all[cls_id] = np.concatenate( 763 | (iou_pred_matches_all[cls_id], iou_cls_pred_match), axis=-1) 764 | cls_pred_scores_tile = np.tile(cls_pred_scores, (num_iou_thres, 1)) 765 | iou_pred_scores_all[cls_id] = np.concatenate( 766 | (iou_pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 767 | assert iou_pred_matches_all[cls_id].shape[1] == iou_pred_scores_all[cls_id].shape[1] 768 | iou_gt_matches_all[cls_id] = np.concatenate( 769 | (iou_gt_matches_all[cls_id], iou_cls_gt_match), axis=-1) 770 | 771 | if use_matches_for_pose: 772 | thres_ind = list(iou_thres_list).index(iou_pose_thres) 773 | 774 | iou_thres_pred_match = iou_cls_pred_match[thres_ind, :] 775 | 776 | cls_pred_class_ids = cls_pred_class_ids[iou_thres_pred_match > -1] if len( 777 | iou_thres_pred_match) > 0 else np.zeros(0) 778 | cls_pred_RTs = cls_pred_RTs[iou_thres_pred_match > -1] if len( 779 | iou_thres_pred_match) > 0 else np.zeros((0, 4, 4)) 780 | cls_pred_scores = cls_pred_scores[iou_thres_pred_match > -1] if len( 781 | iou_thres_pred_match) > 0 else np.zeros(0) 782 | cls_pred_bboxes = cls_pred_bboxes[iou_thres_pred_match > -1] if len( 783 | iou_thres_pred_match) > 0 else np.zeros((0, 4)) 784 | 785 | iou_thres_gt_match = iou_cls_gt_match[thres_ind, :] 786 | cls_gt_class_ids = cls_gt_class_ids[iou_thres_gt_match > -1] if len( 787 | iou_thres_gt_match) > 0 else np.zeros(0) 788 | cls_gt_RTs = cls_gt_RTs[iou_thres_gt_match > -1] if len( 789 | iou_thres_gt_match) > 0 else np.zeros((0, 4, 4)) 790 | cls_gt_handle_visibility = cls_gt_handle_visibility[iou_thres_gt_match > -1] if len( 791 | iou_thres_gt_match) > 0 else np.zeros(0) 792 | 793 | RT_overlaps = compute_RT_overlaps(cls_gt_class_ids, cls_gt_RTs, cls_gt_handle_visibility, 794 | cls_pred_class_ids, cls_pred_RTs, 795 | synset_names) 796 | 797 | pose_cls_gt_match, pose_cls_pred_match = compute_match_from_degree_cm(RT_overlaps, 798 | cls_pred_class_ids, 799 | cls_gt_class_ids, 800 | degree_thres_list, 801 | shift_thres_list) 802 | 803 | pose_pred_matches_all[cls_id] = np.concatenate( 804 | (pose_pred_matches_all[cls_id], pose_cls_pred_match), axis=-1) 805 | 806 | cls_pred_scores_tile = np.tile( 807 | cls_pred_scores, (num_degree_thres, num_shift_thres, 1)) 808 | pose_pred_scores_all[cls_id] = np.concatenate( 809 | (pose_pred_scores_all[cls_id], cls_pred_scores_tile), axis=-1) 810 | assert pose_pred_scores_all[cls_id].shape[2] == pose_pred_matches_all[cls_id].shape[2], '{} vs. 
{}'.format( 811 | pose_pred_scores_all[cls_id].shape, pose_pred_matches_all[cls_id].shape) 812 | pose_gt_matches_all[cls_id] = np.concatenate( 813 | (pose_gt_matches_all[cls_id], pose_cls_gt_match), axis=-1) 814 | 815 | iou_dict = {} 816 | iou_dict['thres_list'] = iou_thres_list 817 | for cls_id in range(1, num_classes): 818 | class_name = synset_names[cls_id] 819 | for s, iou_thres in enumerate(iou_thres_list): 820 | iou_3d_aps[cls_id, s] = compute_ap_from_matches_scores(iou_pred_matches_all[cls_id][s, :], 821 | iou_pred_scores_all[cls_id][s, :], 822 | iou_gt_matches_all[cls_id][s, :]) 823 | 824 | iou_3d_aps[-1, :] = np.mean(iou_3d_aps[1:-1, :], axis=0) 825 | 826 | for i, degree_thres in enumerate(degree_thres_list): 827 | for j, shift_thres in enumerate(shift_thres_list): 828 | for cls_id in range(1, num_classes): 829 | cls_pose_pred_matches_all = pose_pred_matches_all[cls_id][i, j, :] 830 | cls_pose_gt_matches_all = pose_gt_matches_all[cls_id][i, j, :] 831 | cls_pose_pred_scores_all = pose_pred_scores_all[cls_id][i, j, :] 832 | 833 | pose_aps[cls_id, i, j] = compute_ap_from_matches_scores(cls_pose_pred_matches_all, 834 | cls_pose_pred_scores_all, 835 | cls_pose_gt_matches_all) 836 | 837 | pose_aps[-1, i, j] = np.mean(pose_aps[1:-1, i, j]) 838 | 839 | print('3D IoU at 25: {:.1f}'.format( 840 | iou_3d_aps[-1, iou_thres_list.index(0.25)] * 100)) 841 | print('3D IoU at 50: {:.1f}'.format( 842 | iou_3d_aps[-1, iou_thres_list.index(0.5)] * 100)) 843 | print('3D IoU at 75: {:.1f}'.format( 844 | iou_3d_aps[-1, iou_thres_list.index(0.75)] * 100)) 845 | 846 | print('5 degree, 2cm: {:.1f}'.format( 847 | pose_aps[-1, degree_thres_list.index(5), shift_thres_list.index(2)] * 100)) 848 | print('5 degree, 5cm: {:.1f}'.format( 849 | pose_aps[-1, degree_thres_list.index(5), shift_thres_list.index(5)] * 100)) 850 | 851 | print('10 degree, 2cm: {:.1f}'.format( 852 | pose_aps[-1, degree_thres_list.index(10), shift_thres_list.index(2)] * 100)) 853 | print('10 degree, 5cm: {:.1f}'.format( 854 | pose_aps[-1, degree_thres_list.index(10), shift_thres_list.index(5)] * 100)) 855 | 856 | return iou_3d_aps, pose_aps 857 | 858 | 859 | def evaluate(path): 860 | synset_names = ['BG', # 0 861 | 'bottle', # 1 862 | 'bowl', # 2 863 | 'camera', # 3 864 | 'can', # 4 865 | 'laptop', # 5 866 | 'mug'] # 6 867 | 868 | result_pkl_list = glob.glob(os.path.join(path, 'results_*.pkl')) 869 | result_pkl_list = sorted(result_pkl_list) 870 | print('image num: {}'.format(len(result_pkl_list))) 871 | 872 | final_results = [] 873 | for pkl_path in result_pkl_list: 874 | with open(pkl_path, 'rb') as f: 875 | result = cPickle.load(f) 876 | if not 'gt_handle_visibility' in result: 877 | result['gt_handle_visibility'] = np.ones_like( 878 | result['gt_class_ids']) 879 | print('can\'t find gt_handle_visibility in the pkl.') 880 | else: 881 | assert len(result['gt_handle_visibility']) == len(result['gt_class_ids']), "{} {}".format( 882 | result['gt_handle_visibility'], result['gt_class_ids']) 883 | 884 | if type(result) is list: 885 | final_results += result 886 | elif type(result) is dict: 887 | final_results.append(result) 888 | else: 889 | assert False 890 | 891 | print("Compute combined mAP: ") 892 | compute_combination_mAP(final_results, synset_names, 893 | degree_thresholds=[5, 10, 20], 894 | shift_thresholds=[0.05, 0.1, 0.2], 895 | iou_3d_thresholds=[0.25, 0.50, 0.75]) 896 | 897 | print("Compute independent mAP: ") 898 | compute_independent_mAP(final_results, synset_names, 899 | degree_thresholds=[5, 10], 900 | shift_thresholds=[2, 5], 
901 | iou_3d_thresholds=[0.10, 0.25, 0.50, 0.75]) 902 | --------------------------------------------------------------------------------
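A quick way to sanity-check the pose-error convention in evaluation_utils.py above: compute_RT_degree_cm_symmetry takes two 4x4 homogeneous transforms with translations in meters, returns [rotation error in degrees, translation error in centimeters] (the *100 factor), and ignores rotation about the y-axis for the symmetric classes ('bottle', 'can', 'bowl', and 'mug' with a hidden handle). The snippet below is a minimal, hypothetical sketch rather than part of the repository; it assumes numpy and the module's own dependencies are installed, that the repository root is the working directory so utils/ can be added to sys.path, and make_RT, RT_a, RT_b are illustrative names introduced only here.

import sys
import numpy as np

sys.path.append('utils')  # assumption: run from the repository root
from evaluation_utils import compute_RT_degree_cm_symmetry

synset_names = ['BG', 'bottle', 'bowl', 'camera', 'can', 'laptop', 'mug']

def make_RT(R, t):
    # Assemble a 4x4 homogeneous transform from a 3x3 rotation and a translation in meters.
    RT = np.eye(4)
    RT[:3, :3] = R
    RT[:3, 3] = t
    return RT

angle = np.deg2rad(30.0)
R_y = np.array([[np.cos(angle), 0.0, np.sin(angle)],
                [0.0, 1.0, 0.0],
                [-np.sin(angle), 0.0, np.cos(angle)]])  # 30-degree rotation about the y-axis
RT_a = make_RT(np.eye(3), np.zeros(3))
RT_b = make_RT(R_y, np.array([0.0, 0.0, 0.02]))  # 2 cm offset along z

# 'bottle' (class 1) is modeled as y-axis symmetric, so the spin about y is not counted as error.
print(compute_RT_degree_cm_symmetry(RT_a, RT_b, 1, 1, synset_names))  # approx. [0.0, 2.0]

# 'laptop' (class 5) has no modeled symmetry, so the same pair yields roughly 30 degrees.
print(compute_RT_degree_cm_symmetry(RT_a, RT_b, 5, 1, synset_names))  # approx. [30.0, 2.0]

The *100 conversion is also why the 2 cm and 5 cm shift thresholds passed to compute_independent_mAP in evaluate() are expressed directly in centimeters while the stored ground-truth and predicted translations are in meters.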