├── colmap
│   ├── __init__.py
│   └── read_model.py
├── LICENSE
├── pltfuns.py
├── transforms.py
├── README.md
├── filters.py
├── losses.py
├── iofuns.py
├── refine.py
├── logger.py
├── refinement.py
└── misc.py

/colmap/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020,
4 | ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
5 | Laboratoire de Traitement des Signaux 4 (LTS4).
6 | All rights reserved.
7 | 
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 | 
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | 
--------------------------------------------------------------------------------
/pltfuns.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import numpy as np
27 | # import matplotlib.cm
28 | from typing import Tuple
29 | 
30 | 
31 | def normal2rgb(normal: np.array) -> np.array:
32 |     """It maps a 3D normal map into an RGB image.
33 | 
34 |     It maps the input 3D normal map into an RGB image. Since a normal vector has unitary norm, the set of all the
35 |     possible normals describes a unitary sphere. This function maps each point `(X, Y, Z)` on the sphere, hence each
36 |     normal vector, to an RGB value. All non zero normals are assumed valid and no check is performed on them.
37 | 
38 |     Args:
39 |         normal: normal map, arranged as an `(H, W, 3)` array.
40 | 
41 |     Returns:
42 |         An RGB image, arranged as an `(H, W, 3)` array, that encodes the normals.
43 |     """
44 | 
45 |     # Detect the entries of the grid where the 3D normals are available.
46 |     mask = (np.sum(normal != 0, axis=2) != 0)
47 | 
48 |     # Allocate the RGB representation of the normals.
49 |     normal_rgb = np.zeros_like(normal, dtype=np.uint8)
50 | 
51 |     # Map the X, Y and Z coordinates from [-1, 1] to [0, 255].
52 |     normal_rgb[mask] = np.round(((normal.astype(np.float64, copy=False)[mask] + 1.0) / 2.0) * 255).astype(np.uint8)
53 | 
54 |     return normal_rgb
55 | 
56 | 
57 | def normal2rgb_legend(n: int = 500) -> Tuple[np.array, np.array]:
58 |     """It returns a legend for the function `normal2rgb`.
59 | 
60 |     It returns a legend for the color coding adopted in the function `normal2rgb`. The legend comprises two images
61 |     representing the two hemispheres associated with the negative and positive `Z` semi-axes, respectively.
62 | 
63 |     Args:
64 |         n: height (and width) of the output legend images.
65 | 
66 |     Returns:
67 |         The legend arranged as two `(n, n, 3)` arrays, for the negative and positive `Z` semi-axes, respectively.
68 |     """
69 | 
70 |     # Build the X and Y components of the 3D normals.
71 |     x, y = np.meshgrid(np.linspace(- 1, 1, n), np.linspace(- 1, 1, n))
72 | 
73 |     # Detect the entries that are within the unitary circle.
74 |     mask = np.sqrt((x ** 2) + (y ** 2)) <= 1.0
75 | 
76 |     # Compute the Z component of the 3D unitary normals.
77 |     z = np.zeros_like(x)
78 |     z[mask] = np.sqrt(np.abs(1 - (x[mask] ** 2) - (y[mask] ** 2)))
79 | 
80 |     # Set the X and Y entries of the non unitary 3D normals to zero.
81 |     x[~mask] = 0
82 |     y[~mask] = 0
83 | 
84 |     # Build the negative hemisphere of the 3D normal legend.
85 |     normal_z_neg = np.stack((x, y, - z), axis=2)
86 | 
87 |     # Build the positive hemisphere of the 3D normal legend.
88 |     normal_z_pos = np.stack((x, y, z), axis=2)
89 | 
90 |     # Encode the 3D normals into an RGB image.
91 |     normal_z_neg_rgb = normal2rgb(normal_z_neg)
92 |     normal_z_pos_rgb = normal2rgb(normal_z_pos)
93 | 
94 |     return normal_z_neg_rgb, normal_z_pos_rgb
95 | 
96 | 
97 | # def plot_map(heat_map, mask=None, vmin=0.0, vmax=1.0, colormap='viridis'):
98 | #     """It turns the input heat map into an RGB image.
99 | #
100 | #     It turns the input heat map into an RGB image according to the specified input color map. The parameters `vmin`
101 | #     and `vmax` play the same role that they have in `matplotlib.pyplot.imshow`. In particular, calling `imshow` on
102 | #     the input heat map using `vmin` and `vmax` produces the same visual result as calling `imshow` on the RGB image
103 | #     created by this function.
104 | #
105 | #     In addition, the heat map pixels marked as `False` in the input `mask` are converted to white in the RGB image.
106 | #
107 | #     Args:
108 | #         heat_map: heat map, arranged as an `(H, W)` array.
109 | #         mask: binary mask, arranged as an `(H, W)` array.
110 | #         vmin: heat map lower bound.
111 | #         vmax: heat map upper bound.
112 | #         colormap: `matplotlib` colormap.
113 | #
114 | #     Returns:
115 | #         The input heat map converted to RGB.
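#
#     Example (a minimal sketch, assuming the function is uncommented and
#     `matplotlib` is installed; `depth` is an illustrative `(H, W)` array):
#
#         rgb = plot_map(depth, mask=depth > 0, vmin=0.0, vmax=10.0)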
116 | # """ 117 | # 118 | # # Clip the input heat map. 119 | # heat_map_clipped = np.clip(heat_map, vmin, vmax) 120 | # 121 | # # Color map object. 122 | # cmap = matplotlib.cm.get_cmap(colormap) 123 | # 124 | # # Convert the heat map intensity values to RGB triplets. 125 | # heat_map_rgb = cmap((heat_map_clipped - vmin) / (vmax - vmin))[:, :, 0:-1] 126 | # 127 | # # Non valid pixels are assigned the white color. 128 | # if mask is not None: 129 | # mask_rgb = np.repeat(mask[:, :, None], 3, axis=2) 130 | # heat_map_rgb[~mask_rgb] = 1.0 131 | # 132 | # return heat_map_rgb 133 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import numpy as np 27 | import torch 28 | from typing import Union, Tuple 29 | 30 | 31 | DEFAULT_DEPTH_RANGE = (1e-1, 1e3) 32 | 33 | 34 | def depth2depth_inv(depth: Union[np.array, torch.Tensor]) -> Union[np.array, torch.Tensor]: 35 | """It computes `1 / depth`. 36 | 37 | It applies the transformation `1 / depth` to the valid entries of `depth`. The remaining entries are set to zero. 38 | Valid entries of `depth` must belong to the interval `]0, +inf[`. 39 | 40 | Args: 41 | depth: depth map, arranged as an `(H, W)` array. 42 | 43 | Returns: 44 | The transformed depth map. 45 | """ 46 | 47 | # Check the class of the input data. 48 | depth_class = type(depth).__name__ 49 | 50 | # Detect the valid entries. 51 | mask = (depth > 0) & (depth < float('inf')) 52 | 53 | # Select the valid entries. 54 | selection = depth[mask] 55 | 56 | # Perform the transformation. 57 | selection = 1.0 / selection 58 | 59 | # Division could lead to non valid entries. Remove them. 60 | selection[~((selection > 0) & (selection < float('inf')))] = 0 61 | 62 | # Write the transformed depth. 63 | if depth_class == 'ndarray': 64 | 65 | depth_inv = np.zeros_like(depth) 66 | depth_inv[mask] = selection 67 | 68 | elif depth_class == 'Tensor': 69 | 70 | # >>> NOT TESTED !!! 
<<<
71 | 
72 |         depth_inv = torch.zeros_like(depth)
73 |         depth_inv[mask] = selection
74 | 
75 |     else:
76 | 
77 |         raise TypeError('The input must be either of type `numpy.ndarray` or `torch.Tensor`.')
78 | 
79 |     return depth_inv
80 | 
81 | 
82 | def depth_inv2depth(depth_inv: Union[np.array, torch.Tensor],
83 |                     depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE) -> Union[np.array, torch.Tensor]:
84 |     """It reverts the operation of the function `depth2depth_inv()`.
85 | 
86 |     It reverts the operation of the function `depth2depth_inv()` by applying the transformation `1 / depth_inv`
87 |     to the valid entries of `depth_inv`. The remaining entries are set to zero.
88 |     Valid entries of `depth_inv` must belong to the interval `]0, +inf[`.
89 |     Upon conversion, the valid depth entries are clipped to the interval `[depth_range[0], depth_range[1]]`,
90 |     which must belong to `]0, +inf[`.
91 | 
92 |     Args:
93 |         depth_inv: transformed depth map, arranged as an `(H, W)` array.
94 |         depth_range: 2-tuple specifying the final depth range.
95 | 
96 |     Returns:
97 |         The depth map resulting from the inverse transformation.
98 |     """
99 | 
100 |     # Check the class of the input data.
101 |     depth_inv_class = type(depth_inv).__name__
102 | 
103 |     # Check the final depth range.
104 |     low, up = depth_range
105 |     assert low > 0 and up < float('inf'), 'The depth range must belong to ]0, +inf[.'
106 | 
107 |     # Detect the valid entries.
108 |     mask = (depth_inv > 0) & (depth_inv < float('inf'))
109 | 
110 |     # Select the valid entries.
111 |     selection = depth_inv[mask]
112 | 
113 |     # Perform the transformation.
114 |     selection = 1.0 / selection
115 | 
116 |     # Division could lead to non valid entries. Remove them.
117 |     selection[~((selection > 0) & (selection < float('inf')))] = 0
118 | 
119 |     # Clip and write the transformed depth.
120 |     if depth_inv_class == 'ndarray':
121 | 
122 |         # Clip.
123 |         selection[selection > 0] = np.clip(selection[selection > 0], low, up)
124 | 
125 |         # Write.
126 |         depth = np.zeros_like(depth_inv)
127 |         depth[mask] = selection
128 | 
129 |     elif depth_inv_class == 'Tensor':
130 | 
131 |         # >>> NOT TESTED !!! <<<
132 | 
133 |         # Clip.
134 |         selection[selection > 0] = torch.clamp(selection[selection > 0], low, up)
135 | 
136 |         # Write.
137 |         depth = torch.zeros_like(depth_inv)
138 |         depth[mask] = selection
139 | 
140 |     else:
141 | 
142 |         raise TypeError('The input must be either of type `numpy.ndarray` or `torch.Tensor`.')
143 | 
144 |     return depth
145 | 
146 | 
147 | def depth_range2depth_inv_range(depth_range: Tuple[float, float]) -> Tuple[float, float]:
148 |     """It converts a depth range into the inverse depth range.
149 | 
150 |     Args:
151 |         depth_range: 2-tuple specifying the depth range.
152 | 
153 |     Returns:
154 |         The inverse depth range 2-tuple.
155 |     """
156 | 
157 |     assert depth_range[0] <= depth_range[1], 'The input depth range is empty.'
158 | 
159 |     assert depth_range[0] > 0 and depth_range[1] < float('inf'), 'The input depth range must belong to ]0, +inf[.'
160 | 
161 |     return 1.0 / depth_range[1], 1.0 / depth_range[0]
162 | 
163 | 
164 | def tensor2array(tensor: torch.Tensor) -> np.array:
165 |     """It converts a torch batch to a numpy batch.
166 | 
167 |     It converts a batch of images stored as a torch tensor of dimensions `(B, C, H, W)` or `(C, H, W)` into a numpy
168 |     array of dimensions `(B, H, W, C)` or `(H, W, C)`, respectively.
169 | 
170 |     Args:
171 |         tensor: tensor to convert.
172 | 
173 |     Returns:
174 |         The converted tensor.
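
    Example (illustrative shapes; the comment states the resulting dimensions):
        batch = torch.zeros(2, 3, 4, 5)
        array = tensor2array(batch)  # array.shape == (2, 4, 5, 3)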
175 |     """
176 | 
177 |     if tensor.dim() == 3:
178 |         array = np.transpose(tensor.numpy(), (1, 2, 0))
179 |     elif tensor.dim() == 4:
180 |         array = np.transpose(tensor.numpy(), (0, 2, 3, 1))
181 |     else:
182 |         raise ValueError('Input tensor dimension must be 3 or 4.')
183 | 
184 |     return array
185 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # depth-refinement-and-normal-estimation
2 | 
3 | This software is meant to refine a noisy and potentially incomplete depth map,
4 | given the corresponding image.
5 | Since the depth map refinement algorithm underneath this software assumes a piece-wise planar world,
6 | this software estimates a normal map jointly with the refined depth map.
7 | The software can take advantage of a continuous confidence map with entries in `[0, 1]`,
8 | where `0` denotes unreliable depth values and `1` denotes reliable ones.
9 | In the absence of a confidence map, a pixel is assigned a confidence equal to `1`
10 | if it has a valid depth, `0` otherwise.
11 | 
12 | This software is released under the MIT license.
13 | If you use this software in your research, please cite the following article:
14 | 
15 |     @inproceedings{rossi_refinement_2020,
16 |         author = {Mattia Rossi and Mireille El Gheche and Andreas Kuhn and Pascal Frossard},
17 |         title = {Joint Graph-based Depth Refinement and Normal Estimation},
18 |         booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Seattle, WA, USA},
19 |         year = {2020}
20 |     }
21 | 
22 | ## Installation
23 | 
24 | The software has been tested with Python 3.7 and it has the following dependencies (the tested
25 | versions are reported in brackets):
26 | 
27 | - pytorch (v.1.4.0),
28 | - opencv (v.3.4.2),
29 | - visdom (v.1.8.9).
30 | 
31 | The software relies on `pytorch`, therefore it can run on both CPU and GPU: the latter is recommended.
32 | Processing depth maps of resolution approximately 3000x2000 pixels requires a GPU equipped with
33 | 12 GB of memory.
34 | The software does not support the parallel use of multiple GPUs.
35 | `visdom` is not a mandatory dependency: it is required only for runtime plotting.
36 | In particular, `visdom` makes it possible to observe the progressive refinement of the input depth map from
37 | a web browser, even if the computation is taking place on a remote server.
38 | 
39 | ## How to run the software
40 | 
41 | The software has a command-line interface, but it can be integrated into third-party code
42 | easily by calling the function `refine` in `refinement.py`.
43 | The following command (new lines must be replaced with spaces) provides an example of usage
44 | of the command-line interface:
45 | 
46 |     python refine.py
47 |     --image <image path>
48 |     --depth <input depth map path>
49 |     --confidence <confidence map path>
50 |     --depth_out <refined depth map path>
51 |     --normal_out <estimated normal map path>
52 |     --cam_focal <camera focal length>
53 |     --cam_center <camera center>
54 |     --depth_min 0.1
55 |     --depth_max 50
56 |     --confidence_threshold 0.5
57 |     --gpu_id 0
58 |     --scale_nb 4
59 |     --lambda_regularization 7.5 7.5 7.5 7.5
60 |     --gamma_regularization 5.5 5.5 5.5 5.5
61 |     --window_size 9 9 9 9
62 |     --patch_size 3 3 3 3
63 |     --sigma_int 0.07 0.07 0.07 0.07
64 |     --sigma_spa 3.0 3.0 3.0 3.0
65 |     --degree_max 20 20 20 20
66 |     --iter_max 4000 3000 2000 1000
67 |     --eps_stop 0.000001 0.000001 0.000001 0.000001
68 |     --attempt_max 50 50 50 50
69 |     --lr_start 0.01 0.01 0.001 0.0001
70 |     --lr_slot_nb 3 3 2 1
71 | 
72 | The above command refines the input depth map with a multi-scale scheme
73 | comprising 4 scales.
74 | As a consequence, the scale-dependent parameters require 4 input values each.
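
Once the refinement has terminated, the refined depth map and the estimated normal map can be
loaded back for inspection with the function `read_bin_file` in `iofuns.py` (described below).
A minimal sketch (the two file names are hypothetical values passed to `--depth_out` and
`--normal_out` above):

    from iofuns import read_bin_file

    depth = read_bin_file('depth_refined.bin')    # `(H, W)` array.
    normal = read_bin_file('normal_refined.bin')  # `(H, W, 3)` array.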
75 | For more details on the software input parameters, please run `python refine.py --help`.
76 | Finally, please note that the input depth and confidence maps must be in binary format (the same used in
77 | [COLMAP](https://github.com/colmap/colmap)).
78 | This is also the same format used to save the refined depth map and the corresponding normal map
79 | on disk.
80 | Reading and writing in binary format is performed by the functions `read_bin_file` and `write_bin_file`,
81 | respectively, in `iofuns.py`.
82 | 
83 | ## Interactive plotting
84 | 
85 | The software allows visualizing the progress of the input depth map refinement via a web browser.
86 | This is implemented using a [VISDOM](https://github.com/facebookresearch/visdom) server.
87 | 
88 | The VISDOM server can be started with the following command:
89 | 
90 |     python -m visdom.server -port <visdom_port> -base_url /<visdom_base_url> &
91 | 
92 | where `<visdom_port>` and `<visdom_base_url>` are an arbitrary port and string, respectively.
93 | The server will be accessible at the web page `<hostname>:<visdom_port>/<visdom_base_url>`,
94 | where `<hostname>` is the address of the machine where the refinement software runs.
95 | If the software is run locally, then `<hostname>` is `localhost`.
96 | 
97 | In order to have the software plot the intermediate results on the VISDOM server, it is necessary
98 | to specify the following two additional parameters when launching the refinement:
99 | 
100 |     --visdom_display_port <visdom_port>
101 |     --visdom_base_url <visdom_base_url>
102 | 
103 | ## License
104 | 
105 | This software itself is licensed under the MIT license.
106 | The software dependencies and the content of the folder `colmap` may have different licenses:
107 | using these within the depth refinement software may affect the resulting software license.
108 | 
109 | Copyright (c) 2020,
110 | ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
111 | Laboratoire de Traitement des Signaux 4 (LTS4).
112 | All rights reserved.
113 | 
114 | Permission is hereby granted, free of charge, to any person obtaining a copy
115 | of this software and associated documentation files (the "Software"), to deal
116 | in the Software without restriction, including without limitation the rights
117 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
118 | copies of the Software, and to permit persons to whom the Software is
119 | furnished to do so, subject to the following conditions:
120 | 
121 | The above copyright notice and this permission notice shall be included in all
122 | copies or substantial portions of the Software.
123 | 
124 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
125 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
126 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
127 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
128 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
129 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
130 | SOFTWARE.
131 | 
132 | Author: Mattia Rossi (rossi-mattia-at-gmail-com)
133 | 
--------------------------------------------------------------------------------
/filters.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import numpy as np 27 | import torch 28 | from typing import Tuple, Union 29 | 30 | 31 | def gauss_filter_1d(length: int, sigma: float) -> np.array: 32 | """It builds a 1D Gaussian filter. 33 | 34 | Args: 35 | length: number of filter taps. 36 | sigma: standard deviation. 37 | 38 | Returns: 39 | A 1D Gaussian filter arranged as a `(length,)` array. 40 | """ 41 | 42 | # Check the filter length. 43 | if (length % 2) == 0: 44 | raise ValueError('The length of the filter must be odd.') 45 | 46 | # Build the filter. 47 | radius = int((length - 1) / 2.0) 48 | x = np.arange(-radius, radius + 1, dtype=np.float32) 49 | y = np.exp(- (x ** 2) / (2 * (sigma ** 2))) / (sigma * np.sqrt(2 * np.pi)) 50 | 51 | # Normalize the filter. 52 | const = np.sum(y) 53 | assert const != 0, 'The filter is zero everywhere.' 54 | y = y / const 55 | 56 | return y 57 | 58 | 59 | def gauss_filter_deriv_1d(length: int, sigma: float) -> np.array: 60 | """It builds the derivative of a 1D Gaussian filter. 61 | 62 | Args: 63 | length: number of filter taps. 64 | sigma: standard deviation. 65 | 66 | Returns: 67 | A 1D Gaussian filter derivative, arranged as a `(length,)` array. 68 | """ 69 | 70 | # Check the filter length. 71 | if (length % 2) == 0: 72 | raise ValueError('The length of the filter must be odd.') 73 | 74 | # Build the filter. 75 | radius = int((length - 1) / 2.0) 76 | x = np.arange(-radius, radius + 1, dtype=np.float32) 77 | y = gauss_filter_1d(length, sigma) * (- x / (sigma ** 2)) 78 | 79 | # Normalize the filter. 80 | const = np.sum(np.abs(y)) 81 | assert const != 0, 'The filter is zero everywhere.' 82 | y = y / const 83 | # TODO: check whether this normalization makes sense. 84 | 85 | return y 86 | 87 | 88 | def gauss_filter_2d(size: int, sigma: float) -> np.array: 89 | """It builds a 2D Gaussian filter. 90 | 91 | Args: 92 | size: height (and width) of the filter. 93 | sigma: standard deviation (in pixels) of the Gaussian filter. 94 | 95 | Returns: 96 | A 2D Gaussian filter arranged as a `(size, size)` array. 97 | """ 98 | 99 | # Build the filter. 100 | y = (gauss_filter_1d(size, sigma)[:, None]).dot(gauss_filter_1d(size, sigma)[None, :]) 101 | 102 | # Normalize the filter. 103 | const = np.sum(y) 104 | assert const != 0, 'The filter is zero everywhere.' 
105 |     y = y / const
106 | 
107 |     return y
108 | 
109 | 
110 | def gauss_filter_deriv_2d(size: int, sigma: float) -> np.array:
111 |     """It builds the vertical derivative of a 2D Gaussian filter.
112 | 
113 |     It builds the vertical derivative of a 2D Gaussian filter. The horizontal derivative can be obtained just by taking
114 |     the transpose of the vertical one.
115 | 
116 |     Args:
117 |         size: height (and width) of the filters.
118 |         sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.
119 | 
120 |     Returns:
121 |         The vertical derivative of a 2D Gaussian filter arranged as a `(size, size)` array.
122 |     """
123 | 
124 |     # Build the filter.
125 |     y = (gauss_filter_deriv_1d(size, sigma)[:, None]).dot(gauss_filter_1d(size, sigma)[None, :])
126 | 
127 |     # Normalize the filter.
128 |     const = np.sum(np.abs(y))
129 |     assert const != 0, 'The filter is zero everywhere.'
130 |     y = y / const
131 | 
132 |     return y
133 | 
134 | 
135 | def gradient_filter(size: int, sigma: float) -> torch.Tensor:
136 |     """It builds a gradient filter for images in PyTorch tensor format.
137 | 
138 |     It builds a filter that can be used with `torch.nn.functional.conv2d` to compute the gradient of a batch of images
139 |     or, more generally, of maps. The images or maps must have only one channel.
140 |     The filter is arranged as a `(2, 1, H, W)` tensor with `[0, :, :, :]` and `[1, :, :, :]` the 2D horizontal and
141 |     vertical derivative filters.
142 | 
143 |     Example:
144 |         batch_nb = 5
145 |         height = 100
146 |         width = 200
147 |         size = 7
148 |         image = torch.randn(batch_nb, 1, height, width)
149 |         filt = gradient_filter(size, 0.1)
150 |         pad = tuple([int((size - 1) / 2)] * 4)
151 |         image_grad = torch.nn.functional.conv2d(torch.nn.functional.pad(image, pad, mode='replicate'), filt)
152 | 
153 |     In the example, `image_grad` is a `(batch_nb, 2, height, width)` tensor with `image_grad[k, 0, :, :]` and
154 |     `image_grad[k, 1, :, :]` the horizontal and vertical derivatives of the image `k`.
155 | 
156 |     Args:
157 |         size: height (and width) of the filters.
158 |         sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.
159 | 
160 |     Returns:
161 |         The gradient filter, arranged as a `(2, 1, H, W)` tensor.
162 |     """
163 | 
164 |     # Build the vertical (y) derivative filter.
165 |     d_gauss_dy = gauss_filter_deriv_2d(size, sigma)
166 | 
167 |     # Flip the filter around the (x, y) origin, as torch.nn.functional.conv2d() performs just cross-correlation rather
168 |     # than the standard convolution.
169 |     d_gauss_dy = np.fliplr(d_gauss_dy)
170 |     d_gauss_dy = np.flipud(d_gauss_dy)
171 | 
172 |     # Build the horizontal (x) derivative filter, which is just the transpose of the vertical one.
173 |     d_gauss_dx = d_gauss_dy.T
174 | 
175 |     # Expand the filters to make them compliant with torch.nn.functional.conv2d.
176 |     d_gauss_dy = d_gauss_dy[None, None, :, :]  # [1, 1, size, size]
177 |     d_gauss_dx = d_gauss_dx[None, None, :, :]  # [1, 1, size, size]
178 | 
179 |     # Concatenate the two filters into a single filter with two channels.
180 |     grad_filter = np.concatenate((d_gauss_dx, d_gauss_dy), axis=0)  # [2, 1, size, size]
181 | 
182 |     # Change the filter type to torch.Tensor.
183 |     grad_filter = torch.from_numpy(grad_filter)
184 | 
185 |     return grad_filter
186 | 
187 | 
188 | def diff_filter_bank(size: Union[int, Tuple[int, int]] = 5) -> torch.Tensor:
189 |     """It builds a derivative filter bank.
190 | 
191 |     It builds a set of `HxW` filters where each filter has only two non zero entries: the central one, whose value
192 |     is `-1`, and another non central one, whose value is `1`. The number of filters is `H*W - 1`, i.e., all the
193 |     possible filters of the described type.
194 | 
195 |     Args:
196 |         size: tuple specifying the height and width of the filter (square filter if only one dimension is specified).
197 | 
198 |     Returns:
199 |         The derivative filter bank, arranged as an `(H*W - 1, 1, H, W)` tensor.
200 |     """
201 | 
202 |     # Filter bank spatial dimensions.
203 |     if isinstance(size, tuple):
204 |         filter_height = size[0]
205 |         filter_width = size[1]
206 |     elif isinstance(size, int):
207 |         filter_height = size
208 |         filter_width = size
209 |     else:
210 |         raise TypeError('Input must be either an integer or a 2-tuple of integers.')
211 | 
212 |     # Number of filters in the filter bank.
213 |     filter_nb = int((filter_height * filter_width) - 1)
214 | 
215 |     # Center of each filter in the filter bank.
216 |     filter_center_y = int((filter_height - 1) / 2.0)
217 |     filter_center_x = int((filter_width - 1) / 2.0)
218 | 
219 |     # Create the filter bank.
220 |     index = 0
221 |     filter_bank = torch.zeros(filter_nb, 1, filter_height, filter_width)
222 |     filter_bank[:, :, filter_center_y, filter_center_x] = - 1.0
223 |     for y in range(filter_height):
224 |         for x in range(filter_width):
225 | 
226 |             if y != filter_center_y or x != filter_center_x:
227 |                 filter_bank[index, :, y, x] = 1.0
228 |                 index += 1
229 | 
230 |     return filter_bank
231 | 
--------------------------------------------------------------------------------
/colmap/read_model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
2 | # All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | #
7 | #     * Redistributions of source code must retain the above copyright
8 | #       notice, this list of conditions and the following disclaimer.
9 | #
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #
14 | #     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
15 | #       its contributors may be used to endorse or promote products derived
16 | #       from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
22 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 | # POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Author: Johannes L.
Schoenberger (jsch-at-demuc-dot-de) 31 | 32 | import os 33 | import sys 34 | import collections 35 | import numpy as np 36 | import struct 37 | 38 | 39 | CameraModel = collections.namedtuple( 40 | "CameraModel", ["model_id", "model_name", "num_params"]) 41 | Camera = collections.namedtuple( 42 | "Camera", ["id", "model", "width", "height", "params"]) 43 | BaseImage = collections.namedtuple( 44 | "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) 45 | Point3D = collections.namedtuple( 46 | "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) 47 | 48 | class Image(BaseImage): 49 | def qvec2rotmat(self): 50 | return qvec2rotmat(self.qvec) 51 | 52 | 53 | CAMERA_MODELS = { 54 | CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), 55 | CameraModel(model_id=1, model_name="PINHOLE", num_params=4), 56 | CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), 57 | CameraModel(model_id=3, model_name="RADIAL", num_params=5), 58 | CameraModel(model_id=4, model_name="OPENCV", num_params=8), 59 | CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), 60 | CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), 61 | CameraModel(model_id=7, model_name="FOV", num_params=5), 62 | CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), 63 | CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), 64 | CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) 65 | } 66 | CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) \ 67 | for camera_model in CAMERA_MODELS]) 68 | 69 | 70 | def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): 71 | """Read and unpack the next bytes from a binary file. 72 | :param fid: 73 | :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. 74 | :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. 75 | :param endian_character: Any of {@, =, <, >, !} 76 | :return: Tuple of read and unpacked values. 
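    Example (illustrative): read_next_bytes(fid, 8, "Q") reads one
    little-endian unsigned 64-bit integer, e.g. the number of cameras.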
77 | """ 78 | data = fid.read(num_bytes) 79 | return struct.unpack(endian_character + format_char_sequence, data) 80 | 81 | 82 | def read_cameras_text(path): 83 | """ 84 | see: src/base/reconstruction.cc 85 | void Reconstruction::WriteCamerasText(const std::string& path) 86 | void Reconstruction::ReadCamerasText(const std::string& path) 87 | """ 88 | cameras = {} 89 | with open(path, "r") as fid: 90 | while True: 91 | line = fid.readline() 92 | if not line: 93 | break 94 | line = line.strip() 95 | if len(line) > 0 and line[0] != "#": 96 | elems = line.split() 97 | camera_id = int(elems[0]) 98 | model = elems[1] 99 | width = int(elems[2]) 100 | height = int(elems[3]) 101 | params = np.array(tuple(map(float, elems[4:]))) 102 | cameras[camera_id] = Camera(id=camera_id, model=model, 103 | width=width, height=height, 104 | params=params) 105 | return cameras 106 | 107 | 108 | def read_cameras_binary(path_to_model_file): 109 | """ 110 | see: src/base/reconstruction.cc 111 | void Reconstruction::WriteCamerasBinary(const std::string& path) 112 | void Reconstruction::ReadCamerasBinary(const std::string& path) 113 | """ 114 | cameras = {} 115 | with open(path_to_model_file, "rb") as fid: 116 | num_cameras = read_next_bytes(fid, 8, "Q")[0] 117 | for camera_line_index in range(num_cameras): 118 | camera_properties = read_next_bytes( 119 | fid, num_bytes=24, format_char_sequence="iiQQ") 120 | camera_id = camera_properties[0] 121 | model_id = camera_properties[1] 122 | model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name 123 | width = camera_properties[2] 124 | height = camera_properties[3] 125 | num_params = CAMERA_MODEL_IDS[model_id].num_params 126 | params = read_next_bytes(fid, num_bytes=8*num_params, 127 | format_char_sequence="d"*num_params) 128 | cameras[camera_id] = Camera(id=camera_id, 129 | model=model_name, 130 | width=width, 131 | height=height, 132 | params=np.array(params)) 133 | assert len(cameras) == num_cameras 134 | return cameras 135 | 136 | 137 | def read_images_text(path): 138 | """ 139 | see: src/base/reconstruction.cc 140 | void Reconstruction::ReadImagesText(const std::string& path) 141 | void Reconstruction::WriteImagesText(const std::string& path) 142 | """ 143 | images = {} 144 | with open(path, "r") as fid: 145 | while True: 146 | line = fid.readline() 147 | if not line: 148 | break 149 | line = line.strip() 150 | if len(line) > 0 and line[0] != "#": 151 | elems = line.split() 152 | image_id = int(elems[0]) 153 | qvec = np.array(tuple(map(float, elems[1:5]))) 154 | tvec = np.array(tuple(map(float, elems[5:8]))) 155 | camera_id = int(elems[8]) 156 | image_name = elems[9] 157 | elems = fid.readline().split() 158 | xys = np.column_stack([tuple(map(float, elems[0::3])), 159 | tuple(map(float, elems[1::3]))]) 160 | point3D_ids = np.array(tuple(map(int, elems[2::3]))) 161 | images[image_id] = Image( 162 | id=image_id, qvec=qvec, tvec=tvec, 163 | camera_id=camera_id, name=image_name, 164 | xys=xys, point3D_ids=point3D_ids) 165 | return images 166 | 167 | 168 | def read_images_binary(path_to_model_file): 169 | """ 170 | see: src/base/reconstruction.cc 171 | void Reconstruction::ReadImagesBinary(const std::string& path) 172 | void Reconstruction::WriteImagesBinary(const std::string& path) 173 | """ 174 | images = {} 175 | with open(path_to_model_file, "rb") as fid: 176 | num_reg_images = read_next_bytes(fid, 8, "Q")[0] 177 | for image_index in range(num_reg_images): 178 | binary_image_properties = read_next_bytes( 179 | fid, num_bytes=64, format_char_sequence="idddddddi") 
180 | image_id = binary_image_properties[0] 181 | qvec = np.array(binary_image_properties[1:5]) 182 | tvec = np.array(binary_image_properties[5:8]) 183 | camera_id = binary_image_properties[8] 184 | image_name = "" 185 | current_char = read_next_bytes(fid, 1, "c")[0] 186 | while current_char != b"\x00": # look for the ASCII 0 entry 187 | image_name += current_char.decode("utf-8") 188 | current_char = read_next_bytes(fid, 1, "c")[0] 189 | num_points2D = read_next_bytes(fid, num_bytes=8, 190 | format_char_sequence="Q")[0] 191 | x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, 192 | format_char_sequence="ddq"*num_points2D) 193 | xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), 194 | tuple(map(float, x_y_id_s[1::3]))]) 195 | point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) 196 | images[image_id] = Image( 197 | id=image_id, qvec=qvec, tvec=tvec, 198 | camera_id=camera_id, name=image_name, 199 | xys=xys, point3D_ids=point3D_ids) 200 | return images 201 | 202 | 203 | def read_points3D_text(path): 204 | """ 205 | see: src/base/reconstruction.cc 206 | void Reconstruction::ReadPoints3DText(const std::string& path) 207 | void Reconstruction::WritePoints3DText(const std::string& path) 208 | """ 209 | points3D = {} 210 | with open(path, "r") as fid: 211 | while True: 212 | line = fid.readline() 213 | if not line: 214 | break 215 | line = line.strip() 216 | if len(line) > 0 and line[0] != "#": 217 | elems = line.split() 218 | point3D_id = int(elems[0]) 219 | xyz = np.array(tuple(map(float, elems[1:4]))) 220 | rgb = np.array(tuple(map(int, elems[4:7]))) 221 | error = float(elems[7]) 222 | image_ids = np.array(tuple(map(int, elems[8::2]))) 223 | point2D_idxs = np.array(tuple(map(int, elems[9::2]))) 224 | points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb, 225 | error=error, image_ids=image_ids, 226 | point2D_idxs=point2D_idxs) 227 | return points3D 228 | 229 | 230 | def read_points3d_binary(path_to_model_file): 231 | """ 232 | see: src/base/reconstruction.cc 233 | void Reconstruction::ReadPoints3DBinary(const std::string& path) 234 | void Reconstruction::WritePoints3DBinary(const std::string& path) 235 | """ 236 | points3D = {} 237 | with open(path_to_model_file, "rb") as fid: 238 | num_points = read_next_bytes(fid, 8, "Q")[0] 239 | for point_line_index in range(num_points): 240 | binary_point_line_properties = read_next_bytes( 241 | fid, num_bytes=43, format_char_sequence="QdddBBBd") 242 | point3D_id = binary_point_line_properties[0] 243 | xyz = np.array(binary_point_line_properties[1:4]) 244 | rgb = np.array(binary_point_line_properties[4:7]) 245 | error = np.array(binary_point_line_properties[7]) 246 | track_length = read_next_bytes( 247 | fid, num_bytes=8, format_char_sequence="Q")[0] 248 | track_elems = read_next_bytes( 249 | fid, num_bytes=8*track_length, 250 | format_char_sequence="ii"*track_length) 251 | image_ids = np.array(tuple(map(int, track_elems[0::2]))) 252 | point2D_idxs = np.array(tuple(map(int, track_elems[1::2]))) 253 | points3D[point3D_id] = Point3D( 254 | id=point3D_id, xyz=xyz, rgb=rgb, 255 | error=error, image_ids=image_ids, 256 | point2D_idxs=point2D_idxs) 257 | return points3D 258 | 259 | 260 | def read_model(path, ext): 261 | if ext == ".txt": 262 | cameras = read_cameras_text(os.path.join(path, "cameras" + ext)) 263 | images = read_images_text(os.path.join(path, "images" + ext)) 264 | points3D = read_points3D_text(os.path.join(path, "points3D") + ext) 265 | else: 266 | cameras = read_cameras_binary(os.path.join(path, "cameras" + ext)) 267 | 
images = read_images_binary(os.path.join(path, "images" + ext)) 268 | points3D = read_points3d_binary(os.path.join(path, "points3D") + ext) 269 | return cameras, images, points3D 270 | 271 | 272 | def qvec2rotmat(qvec): 273 | return np.array([ 274 | [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, 275 | 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 276 | 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]], 277 | [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 278 | 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, 279 | 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]], 280 | [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 281 | 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 282 | 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]]) 283 | 284 | 285 | def rotmat2qvec(R): 286 | Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat 287 | K = np.array([ 288 | [Rxx - Ryy - Rzz, 0, 0, 0], 289 | [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], 290 | [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], 291 | [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 292 | eigvals, eigvecs = np.linalg.eigh(K) 293 | qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] 294 | if qvec[0] < 0: 295 | qvec *= -1 296 | return qvec 297 | 298 | 299 | def main(): 300 | if len(sys.argv) != 3: 301 | print("Usage: python read_model.py path/to/model/folder [.txt,.bin]") 302 | return 303 | 304 | cameras, images, points3D = read_model(path=sys.argv[1], ext=sys.argv[2]) 305 | 306 | print("num_cameras:", len(cameras)) 307 | print("num_images:", len(images)) 308 | print("num_points3D:", len(points3D)) 309 | 310 | 311 | if __name__ == "__main__": 312 | main() 313 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import torch 27 | import torch.nn as nn 28 | from misc import similarity_graph, unravel_index 29 | import numpy as np 30 | from typing import Tuple 31 | 32 | 33 | class DepthConsistencyL1(nn.Module): 34 | """This class implements a consistency loss between the input depth map and the estimated one. 
The consistency is
35 |     measured in terms of the L1-norm of the error between the input depth map and the estimated one.
36 |     """
37 | 
38 |     def __init__(self,
39 |                  depth: np.array, depth_range: Tuple[float, float],
40 |                  depth_confidence: np.array = None,
41 |                  multiplier: float = 0.0):
42 |         """Constructor.
43 | 
44 |         Args:
45 |             depth: depth map to refine, arranged as an `(H, W)` array.
46 |             depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
47 |             depth_confidence: confidence map associated with the depth map to refine. It must have entries in `[0, 1]`.
48 |             multiplier: loss multiplier.
49 |         """
50 | 
51 |         super(DepthConsistencyL1, self).__init__()
52 | 
53 |         # Check the input depth range.
54 |         depth_min, depth_max = depth_range
55 |         assert depth_min < depth_max, 'The specified depth range is empty.'
56 | 
57 |         # Extract the depth map confidence.
58 |         if depth_confidence is not None:
59 |             assert (depth_confidence >= 0).all() and (depth_confidence <= 1).all(), \
60 |                 'Depth confidence entries must belong to [0, 1].'
61 |             confidence = depth_confidence
62 |         else:
63 |             confidence = 1
64 | 
65 |         # The confidence is set to zero at non valid depth entries.
66 |         confidence = confidence * ((depth > depth_min) & (depth < depth_max))
67 | 
68 |         # Convert the confidence to tensor and register it.
69 |         self.register_buffer('confidence', torch.as_tensor(confidence[None, None, ]))
70 | 
71 |         # Convert the depth map to tensor and register it.
72 |         self.register_buffer('depth', torch.as_tensor(depth[None, None, ]))
73 | 
74 |         # Register the normalization constant.
75 |         # self.norm_const = self.confidence.sum()
76 |         pixel_nb = depth.shape[0] * depth.shape[1]
77 |         self.norm_const = pixel_nb
78 | 
79 |         # Register the loss multiplier.
80 |         self.multiplier = multiplier
81 | 
82 |     def forward(self, depth: torch.Tensor) -> torch.Tensor:
83 | 
84 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
85 |         loss = depth.new_zeros(1, requires_grad=True)
86 | 
87 |         # If the loss is enabled, evaluate it.
88 |         if self.multiplier > 0:
89 | 
90 |             # Evaluate the loss.
91 |             loss = (depth - self.depth).mul(self.confidence).abs().sum().div(self.norm_const)
92 | 
93 |             # Weight the loss.
94 |             loss = self.multiplier * loss
95 | 
96 |         return loss
97 | 
98 | 
99 | class NormalConsistencyL1(nn.Module):
100 |     """This class implements a consistency loss between the input normal map and the estimated one. The consistency is
101 |     measured in terms of the L1-norm of the error between each pair of input and estimated normals.
102 |     """
103 | 
104 |     def __init__(self,
105 |                  normal: np.array,
106 |                  normal_confidence: np.array = None,
107 |                  multiplier: float = 0.0):
108 |         """Constructor.
109 | 
110 |         Args:
111 |             normal: 2D or 3D normal map to refine, arranged as an `(H, W, 2)` or `(H, W, 3)` array.
112 |             normal_confidence: confidence map associated with the normal map to refine. It must have entries in `[0, 1]`.
113 |             multiplier: loss multiplier.
114 |         """
115 | 
116 |         super(NormalConsistencyL1, self).__init__()
117 | 
118 |         # Extract the normal map confidence.
119 |         if normal_confidence is not None:
120 |             assert (normal_confidence >= 0).all() and (normal_confidence <= 1).all(), \
121 |                 'Normal confidence entries must belong to [0, 1].'
122 |             confidence = normal_confidence
123 |         else:
124 |             confidence = 1
125 | 
126 |         # The confidence is set to zero at non valid normal entries.
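        # (An entry is treated as valid when the sum of its normal components is
        # strictly positive and finite.)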
127 |         aux = np.sum(normal, axis=2)
128 |         confidence = confidence * ((aux > 0) & (aux < float('inf')))
129 | 
130 |         # Convert the confidence to tensor and register it.
131 |         self.register_buffer('confidence', torch.as_tensor(confidence[None, None, ]))
132 | 
133 |         # Convert the normal map to tensor and register it.
134 |         self.register_buffer('normal', torch.as_tensor((np.transpose(normal, (2, 0, 1))[None, ]).copy()))
135 | 
136 |         # Register the normalization constant.
137 |         # self.norm_const = self.confidence.sum()
138 |         pixel_nb = normal.shape[0] * normal.shape[1]
139 |         self.norm_const = pixel_nb
140 | 
141 |         # Register the loss multiplier.
142 |         self.multiplier = multiplier
143 | 
144 |     def forward(self, normal: torch.Tensor) -> torch.Tensor:
145 | 
146 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
147 |         loss = normal.new_zeros(1, requires_grad=True)
148 | 
149 |         # If the loss is enabled, evaluate it.
150 |         if self.multiplier > 0:
151 | 
152 |             # Evaluate the loss.
153 |             loss = (normal - self.normal).mul(self.confidence).abs().sum().div(self.norm_const)
154 | 
155 |             # Weight the loss.
156 |             loss = self.multiplier * loss
157 | 
158 |         return loss
159 | 
160 | 
161 | class PieceWisePlanarRegularization(nn.Module):
162 |     """This class implements a regularizer promoting piece-wise planar functions.
163 |     """
164 | 
165 |     def __init__(self,
166 |                  image: np.array,
167 |                  gamma: float,
168 |                  window_size: int = 9, patch_size: int = 7,
169 |                  sigma_intensity: float = 0.2, sigma_spatial: float = 3.0,
170 |                  degree_max: int = 15,
171 |                  version: int = 1,
172 |                  multiplier: float = 0.0,
173 |                  device: torch.device = torch.device('cpu')):
174 |         """Constructor.
175 | 
176 |         Args:
177 |             image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
178 |             gamma: internal multiplier associated with the vector field part of the loss.
179 |             window_size: search window size (window_size x window_size) to be used in the graph construction.
180 |             patch_size: patch size (patch_size x patch_size) to be used in the graph construction.
181 |             sigma_intensity: color difference standard deviation for patch comparison in the graph construction.
182 |             sigma_spatial: Euclidean distance standard deviation for patch comparison in the graph construction.
183 |             degree_max: maximum number of per pixel neighbors in the graph.
184 |             version: regularization version (`0` for NLTGV or `1` for ours; only `1` is currently implemented).
185 |             multiplier: loss multiplier.
186 |             device: device where the graph computation must take place.
187 |         """
188 | 
189 |         super(PieceWisePlanarRegularization, self).__init__()
190 | 
191 |         # Convert the reference image to tensor.
192 |         if image.ndim == 2:
193 |             image_aux = torch.as_tensor(image[None, None, ])
194 |         elif image.ndim == 3:
195 |             image_aux = torch.as_tensor((np.transpose(image, (2, 0, 1))[None, ]).copy())
196 |         else:
197 |             raise ValueError('The input image must be either gray scale or RGB.')
198 | 
199 |         # Image dimensions.
200 |         height = image_aux.size(2)
201 |         width = image_aux.size(3)
202 | 
203 |         # Compute the neighboring pixels and the corresponding weights.
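        # Each pixel is connected to at most `degree_max` neighbours, selected within a
        # `window_size x window_size` search window by comparing `patch_size x patch_size`
        # patches; `weights` stores the resulting edge weights.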
204 |         weights, neighbours = similarity_graph(
205 |             image_aux.to(device),
206 |             window_size=window_size, patch_size=patch_size,
207 |             sigma_intensity=sigma_intensity, sigma_spatial=sigma_spatial,
208 |             degree_max=degree_max)
209 |         weights = weights.to('cpu')
210 |         neighbours = neighbours.to('cpu')
211 |         # The function `similarity_graph` is fed with a copy of `image_aux` on `device`, therefore the output is on
212 |         # `device` as well and it must be brought back to the CPU.
213 | 
214 |         # Register the number of neighbors per pixel.
215 |         self.neighbour_nb = weights.size(1)
216 | 
217 |         # Flatten the spatial dimensions of `weights` and `neighbours`, and register them.
218 |         weights = weights.view(self.neighbour_nb, -1)
219 |         neighbours = neighbours.view(self.neighbour_nb, -1)
220 |         self.register_buffer('weights', weights)
221 |         self.register_buffer('neighbours', neighbours)
222 | 
223 |         # Compute the distance vector between each pixel and its neighbours, and register it.
224 |         y_source, x_source = unravel_index(
225 |             torch.arange(height * width).view(1, -1),
226 |             (height, width))
227 |         y_target, x_target = unravel_index(
228 |             self.neighbours,
229 |             (height, width))
230 |         dist = torch.cat(
231 |             (x_source.add(-1, x_target.to(x_source))[:, None, ],
232 |              y_source.add(-1, y_target.to(y_source))[:, None, ]),
233 |             dim=1)
234 |         self.register_buffer('dist', dist.to(torch.float64))
235 |         # Note that `dist` is cast to `torch.float64` before being registered. In fact, the function `forward()`
236 |         # requires `self.dist` data type to match the float data type (16, 32 or 64) of the other tensors involved
237 |         # in the computation. One could argue that calling `to()` on the module and specifying the data type would
238 |         # convert all its registered tensors. However, this is not the case for integer tensors. Therefore, in order
239 |         # to have `self.dist` converted by `to()`, its data type must already be a float type. The data type
240 |         # `torch.float64` is chosen to avoid any loss of precision.
241 | 
242 |         # Number of pixels.
243 |         pixel_nb = height * width
244 | 
245 |         # Register the normalization constant.
246 |         self.norm_const = pixel_nb
247 | 
248 |         # Register the multiplier associated with the second order derivative.
249 |         self.gamma = gamma
250 | 
251 |         # Register the regularization type.
252 |         if version == 1:
253 |             self.forward_internal = self.ours
254 |         else:
255 |             raise NotImplementedError('The required regularization does not exist.')
256 | 
257 |         # Register the loss multiplier.
258 |         self.multiplier = multiplier
259 | 
260 |     def forward(self, sig1: torch.Tensor, sig2: torch.Tensor) -> torch.Tensor:
261 | 
262 |         return self.forward_internal(sig1, sig2)
263 | 
264 |     # Our regularization.
265 |     def ours(self, sig1: torch.Tensor, sig2: torch.Tensor) -> torch.Tensor:
266 |         """
267 |         It implements the regularization proposed in the following article:
268 | 
269 |             Mattia Rossi, Mireille El Gheche, Andreas Kuhn, Pascal Frossard,
270 |             "Joint Graph-based Depth Refinement and Normal Estimation",
271 |             in IEEE Computer Vision and Pattern Recognition Conference (CVPR), Seattle, WA, USA, 2020.
272 | 
273 |         Args:
274 |             sig1: main signal, arranged as a `(1, 1, H, W)` tensor.
275 |             sig2: secondary signal, arranged as a `(1, 2, H, W)` tensor.
276 | 
277 |         Returns:
278 |             The considered regularization evaluated at `(sig1, sig2)`.
279 |         """
280 | 
281 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
282 |         loss = sig1.new_zeros(1, requires_grad=True)
283 | 
284 |         # If the loss is enabled, evaluate it.
285 |         if self.multiplier > 0:
286 | 
287 |             # Expand and flatten `sig1` and `sig2`.
288 |             sig1_flattened = sig1[:, None, ]
289 |             sig1_flattened = sig1_flattened.expand(
290 |                 -1, self.neighbour_nb, -1, -1, -1).view(self.neighbour_nb, -1)
291 |             sig2_flattened = sig2[:, None, ]
292 |             sig2_flattened = sig2_flattened.expand(
293 |                 -1, self.neighbour_nb, -1, -1, -1).view(self.neighbour_nb, 2, -1)
294 | 
295 |             # Compute the left part of the regularization.
296 |             aux1 = (sig1_flattened -
297 |                     torch.gather(sig1_flattened, 1, self.neighbours) -
298 |                     (sig2_flattened * self.dist).sum(dim=1))
299 |             aux1 = (aux1 * self.weights).norm(dim=0).sum()
300 | 
301 |             # Compute the right part of the regularization.
302 |             aux2 = (sig2_flattened -
303 |                     torch.gather(sig2_flattened, 2, self.neighbours[:, None, ].expand(-1, 2, -1))).norm(dim=1)
304 |             aux2 = (aux2 * self.weights).sum()
305 | 
306 |             # Add the contribution of the left and right parts.
307 |             loss = aux1 + (self.gamma * aux2)
308 | 
309 |             # Normalize the loss.
310 |             loss = loss.div(self.norm_const)
311 | 
312 |             # Weight the loss.
313 |             loss = self.multiplier * loss
314 | 
315 |         return loss
316 | 
--------------------------------------------------------------------------------
/iofuns.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import numpy as np
27 | from typing import Tuple, Dict
28 | import os
29 | import re
30 | import sys
31 | import struct
32 | from colmap.read_model import read_cameras_binary
33 | from cv2 import imread
34 | 
35 | 
36 | def read_depth_map(path: str, data_format: str,
37 |                    size: Tuple[int, int] = None, stereo_param: Dict = None) -> np.array:
38 |     """It reads a depth map in the ETH3D, COLMAP or Middlebury format.
39 | 
40 |     Args:
41 |         path: path to the depth map.
42 |         data_format: depth map format (`ETH3D`, `COLMAP` or `MIDDLEBURY`).
43 |         size: a 2-tuple specifying the depth map height and width (mandatory only for the ETH3D format).
44 |         stereo_param: stereo parameters (mandatory for `data_format` equal to `'MIDDLEBURY'`).
45 | 
46 | Returns:
47 | The read depth map (in meters) arranged as an `(H, W)` array. Non valid values are signalled with zero entries.
48 | """
49 | 
50 | # Read the depth map.
51 | if data_format == 'ETH3D':
52 | 
53 | # Depth map dimensions.
54 | if size is not None:
55 | height, width = size
56 | else:
57 | raise ValueError('For ETH3D depth type, the `size` parameter is mandatory.')
58 | 
59 | with open(path, "rb") as fid:
60 | depth = np.reshape(np.fromfile(fid, dtype=np.float32), (height, width))
61 | # Note that depth values are of type np.float32.
62 | 
63 | elif data_format == 'COLMAP':
64 | 
65 | depth = read_bin_file(path)
66 | 
67 | elif data_format == 'MIDDLEBURY':
68 | 
69 | assert stereo_param is not None, 'For `data_format` equal to MIDDLEBURY, `stereo_param` is mandatory.'
70 | 
71 | # Read the disparity map.
72 | disparity = read_middlebury_disparity(path)
73 | 
74 | # Convert the disparity to depth: Middlebury 2014 defines `Z = baseline * f / (d + doffs)`, in millimeters.
75 | depth = (stereo_param['baseline'] * stereo_param['cam0'][0, 0]) / (disparity + stereo_param['doffs']) / 1000.0
76 | 
77 | else:
78 | 
79 | raise ValueError('Bad depth format.')
80 | 
81 | # Depending on the source, non valid pixels are signalled either with non positive entries or with infinite ones.
82 | 
83 | # Signal the non valid entries with zero.
84 | depth[(depth < 0) | (depth == float('inf'))] = 0
85 | 
86 | return depth
87 | 
88 | 
89 | def read_normal_map(path: str, data_format: str) -> np.array:
90 | """It reads a normal map in the COLMAP format.
91 | 
92 | Args:
93 | path: path to the normal map.
94 | data_format: the normal map format (currently, only 'COLMAP').
95 | 
96 | Returns:
97 | The read normal map arranged as an `(H, W, 3)` array.
98 | """
99 | 
100 | if data_format == 'COLMAP':
101 | 
102 | normal = read_bin_file(path)
103 | 
104 | else:
105 | 
106 | raise ValueError('Bad normal format.')
107 | 
108 | return normal
109 | 
110 | 
111 | def read_confidence_map(path: str) -> np.array:
112 | """It reads a confidence map stored as an 8-bit image and rescales its entries to `[0, 1]`."""
113 | confidence = (imread(path)[:, :, 0]).astype(np.float64) / 255.0
114 | 
115 | return confidence
116 | 
117 | 
118 | def read_middlebury_disparity(file_name: str) -> np.array:
119 | """It reads a disparity map from the Middlebury 2014 dataset.
120 | 
121 | Args:
122 | file_name: path to the disparity map.
123 | 
124 | Returns:
125 | The loaded disparity map, arranged as an `(H, W)` array.
126 | """
127 | 
128 | # Read the disparity.
129 | disparity = load_pfm(file_name)
130 | 
131 | # It is necessary to flip the disparity upside down.
132 | disparity = np.flipud(disparity)
133 | 
134 | # Non valid disparity entries are signalled with infinity.
135 | 
136 | return disparity
137 | 
138 | 
139 | def read_middlebury_calib_file(file_name: str) -> Dict:
140 | """It reads the calibration file of a scene from the Middlebury 2014 training dataset.
141 | 
142 | It reads the calibration file of a scene from the Middlebury 2014 training dataset provided in
143 | `http://vision.middlebury.edu/stereo/data/scenes2014/`
144 | 
145 | Args:
146 | file_name: calibration file name.
147 | 
148 | Returns:
149 | A dictionary containing all the calibration file parameters.
150 | - cam0: left camera intrinsic matrix, arranged as a `(3, 3)` array,
151 | - cam1: right camera intrinsic matrix, arranged as a `(3, 3)` array,
152 | - doffs: correction offset,
153 | - baseline: camera baseline,
154 | - width: image width,
155 | - height: image height,
156 | - ndisp: ground truth disparity resolution,
157 | - isint: ... ,
158 | - vmin: minimum disparity,
159 | - vmax: maximum disparity,
160 | - dyavg: ... ,
161 | - dymax: ... .
162 | """ 163 | 164 | # Create the parameter dictionary. 165 | param = {} 166 | 167 | with open(file_name) as fp: 168 | 169 | # Read the left camera intrinsic matrix. 170 | line = fp.readline() 171 | data = (line.split('='))[1].rstrip() 172 | data = data.replace('[', '').replace(']', '').replace(';', '') 173 | cam0 = np.reshape(np.fromstring(data, dtype=np.float32, sep=' '), (3, 3)) 174 | param['cam0'] = cam0 175 | 176 | # Read the right camera intrinsic matrix. 177 | line = fp.readline() 178 | data = (line.split('='))[1].rstrip() 179 | data = data.replace('[', '').replace(']', '').replace(';', '') 180 | cam1 = np.reshape(np.fromstring(data, dtype=np.float, sep=' '), (3, 3)) 181 | param['cam1'] = cam1 182 | 183 | # Read doffs. 184 | line = fp.readline() 185 | data = (line.split('='))[1].rstrip() 186 | doffs = float(data) 187 | param['doffs'] = doffs 188 | 189 | # Read baseline. 190 | line = fp.readline() 191 | data = (line.split('='))[1].rstrip() 192 | baseline = float(data) 193 | param['baseline'] = baseline 194 | 195 | # Read width. 196 | line = fp.readline() 197 | data = (line.split('='))[1].rstrip() 198 | width = int(data) 199 | param['width'] = width 200 | 201 | # Read height. 202 | line = fp.readline() 203 | data = (line.split('='))[1].rstrip() 204 | height = int(data) 205 | param['height'] = height 206 | 207 | # Read ndisp. 208 | line = fp.readline() 209 | data = (line.split('='))[1].rstrip() 210 | ndisp = int(data) 211 | param['ndisp'] = ndisp 212 | 213 | # Read isint. 214 | line = fp.readline() 215 | data = (line.split('='))[1].rstrip() 216 | isint = int(data) 217 | param['isint'] = isint 218 | 219 | # Read vmin. 220 | line = fp.readline() 221 | data = (line.split('='))[1].rstrip() 222 | vmin = float(data) 223 | param['vmin'] = vmin 224 | 225 | # Read vmax. 226 | line = fp.readline() 227 | data = (line.split('='))[1].rstrip() 228 | vmax = float(data) 229 | param['vmax'] = vmax 230 | 231 | # Read dyavg. 232 | line = fp.readline() 233 | data = (line.split('='))[1].rstrip() 234 | dyavg = float(data) 235 | param['dyavg'] = dyavg 236 | 237 | # Read dymax. 238 | line = fp.readline() 239 | data = (line.split('='))[1].rstrip() 240 | dymax = float(data) 241 | param['dymax'] = dymax 242 | 243 | return param 244 | 245 | 246 | def read_kitti_calib_file(filename: str) -> np.array: 247 | """It reads the calibration file of a scene from the KITTI 2015 training dataset. 248 | 249 | It reads the calibration file of a scene from the KITTI 2015 training dataset provided in 250 | `http://www.cvlibs.net/datasets/kitti/`. 251 | 252 | Args: 253 | filename: calibration file name. 254 | 255 | Returns: 256 | A dictionary containing all the calibration file parameters. 257 | - P_rect_02: rectified left color camera intrinsic matrix, arranged as a `(3, 3)` array, 258 | - P_rect_03: rectified right color camera intrinsic matrix, arranged as a `(3, 3)` array, 259 | - baseline: rectified camera baseline. 260 | """ 261 | 262 | param = {} 263 | 264 | with open(filename) as fp: 265 | 266 | while True: 267 | 268 | # Read a new line. 269 | line = fp.readline() 270 | 271 | # If the line is empty, the end of the file has been reached. 272 | if line == '': 273 | break 274 | 275 | # Split the line into parameter name and it value. 276 | param_name, data = line.split(':', maxsplit=1) 277 | 278 | # If the current line contains one of the parameters of interest, save it. 
279 | if param_name == 'P_rect_02':
280 | param[param_name] = np.reshape(
281 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
282 | (3, 4))
283 | elif param_name == 'T_02':
284 | param[param_name] = np.reshape(
285 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
286 | (3,))
287 | elif param_name == 'P_rect_03':
288 | param[param_name] = np.reshape(
289 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
290 | (3, 4))
291 | elif param_name == 'T_03':
292 | param[param_name] = np.reshape(
293 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
294 | (3,))
295 | 
296 | # Check that all the parameters have been read.
297 | assert 'P_rect_02' in param, 'Could not read left camera intrinsic matrix.'
298 | assert 'T_02' in param, 'Could not read left camera translation vector.'
299 | assert 'P_rect_03' in param, 'Could not read right camera intrinsic matrix.'
300 | assert 'T_03' in param, 'Could not read right camera translation vector.'
301 | 
302 | # Compute the baseline.
303 | param['baseline'] = abs(
304 | (param['P_rect_02'][0, 3] / param['P_rect_02'][0, 0]) - (param['P_rect_03'][0, 3] / param['P_rect_03'][0, 0]))
305 | 
306 | return param
307 | 
308 | 
309 | def read_bin_file(file_name: str) -> np.array:
310 | """It reads a depth map or normal map in the COLMAP bin format.
311 | 
312 | It reads a depth map or normal map in the COLMAP bin format. In practice, it can read any 2D or 3D array.
313 | It is a modified version of the COLMAP `read_array()` Python function.
314 | 
315 | Args:
316 | file_name: source file.
317 | 
318 | Returns:
319 | A depth map or a normal map, arranged as an `(H, W)` or an `(H, W, 3)` array, respectively.
320 | """
321 | 
322 | with open(file_name, "rb") as fid:
323 | 
324 | # Read the file header. It is in the format 'width&height&channel_nb&' where channel_nb is 1 for 2D data.
325 | width, height, channel_nb = np.genfromtxt(fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int)
326 | fid.seek(0)
327 | num_delimiter = 0
328 | byte = fid.read(1)
329 | 
330 | while True:
331 | 
332 | if byte == b"&":
333 | num_delimiter += 1
334 | if num_delimiter >= 3:
335 | break
336 | 
337 | byte = fid.read(1)
338 | 
339 | # Read the data, stored as float32.
340 | data = np.fromfile(fid, np.float32)
341 | 
342 | # Reshape the read data into a (width, height, channel_nb) array.
343 | data = data.reshape((width, height, channel_nb), order='F')
344 | 
345 | # Transpose the data to get an array in the (height, width, channel_nb) format.
346 | data = np.transpose(data, (1, 0, 2))
347 | 
348 | # In the case of 2D data, remove the last dimension.
349 | if channel_nb == 1:
350 | data = data[:, :, 0]
351 | 
352 | return data
353 | 
354 | 
355 | def write_bin_file(data: np.array, file_name: str) -> None:
356 | """It writes a depth map or a normal map in the COLMAP bin format.
357 | 
358 | It writes a depth map or a normal map in the COLMAP bin format. In practice, it can write any 2D or 3D array.
359 | 
360 | Args:
361 | data: depth map or normal map, arranged as an `(H, W)` or an `(H, W, 3)` array, respectively.
362 | file_name: destination file name.
363 | """
364 | 
365 | # Check the input data.
366 | assert data.ndim == 2 or data.ndim == 3, 'The input data must be 2D or 3D.'
367 | 
368 | # If the input data are 2D, a fake 3D dimension is added. This makes it possible to treat 2D and 3D data uniformly.
369 | if data.ndim == 2:
370 | data = data[:, :, None]
371 | 
372 | # Number of color channels.
373 | channel_nb = data.shape[2]
374 | 
375 | with open(file_name, "wb") as file:
376 | 
377 | # Write the file header.
378 | file.write(bytearray(str(data.shape[1]), 'utf8'))
379 | file.write(bytearray('&', 'utf8'))
380 | file.write(bytearray(str(data.shape[0]), 'utf8'))
381 | file.write(bytearray('&', 'utf8'))
382 | file.write(bytearray(str(channel_nb), 'utf8'))
383 | file.write(bytearray('&', 'utf8'))
384 | 
385 | # Write the data.
386 | with open(file_name, "ab") as file:
387 | 
388 | for c in range(channel_nb):
389 | for y in range(data.shape[0]):
390 | for x in range(data.shape[1]):
391 | file.write(struct.pack('f', data[y, x, c]))
392 | 
393 | 
394 | def load_pfm(file_name: str) -> np.array:
395 | """It reads a PFM file.
396 | 
397 | It reads a PFM file. Adapted from the following web page:
398 | `https://stackoverflow.com/questions/48809433/read-pfm-format-in-python`
399 | 
400 | Args:
401 | file_name: PFM file name.
402 | 
403 | Returns:
404 | The PFM file, arranged as an `(H, W, C)` or an `(H, W)` array.
405 | """
406 | 
407 | with open(file_name, "rb") as f:
408 | 
409 | # Line 1: the number of channels.
410 | channel_type = f.readline().decode('latin-1')
411 | if "PF" in channel_type:
412 | channels = 3
413 | 
414 | elif "Pf" in channel_type:
415 | channels = 1
416 | 
417 | else:
418 | raise ValueError('Not a PFM file.')
419 | 
420 | # Line 2: width and height.
421 | line = f.readline().decode('latin-1')
422 | width, height = re.findall(r'\d+', line)
423 | width = int(width)
424 | height = int(height)
425 | 
426 | # Line 3: positive number means big endian, negative means little endian.
427 | line = f.readline().decode('latin-1')
428 | big_endian = True
429 | if "-" in line:
430 | big_endian = False
431 | 
432 | # Slurp all the binary data.
433 | samples = width * height * channels
434 | buffer = f.read(samples * 4)
435 | 
436 | # Unpack the floats with the appropriate endianness.
437 | if big_endian:
438 | fmt = ">"
439 | else:
440 | fmt = "<"
441 | 
442 | fmt = fmt + str(samples) + "f"
443 | data = struct.unpack(fmt, buffer)
444 | 
445 | # Reshape the data.
446 | data = np.reshape(np.array(data), (height, width, channels)).squeeze()
447 | 
448 | return data
449 | 
450 | 
451 | def colmap_camera_intrinsic(path: str) -> Dict[str, float]:
452 | """It reads the camera intrinsic parameters stored in COLMAP format.
453 | 
454 | It reads the intrinsic parameters of the first pinhole camera stored in a `.bin` COLMAP camera file.
455 | 
456 | Args:
457 | path: path to the COLMAP file.
458 | 
459 | Returns:
460 | A dictionary with the following fields:
461 | - the horizontal and vertical focal lengths `f_x` and `f_y`,
462 | - the horizontal and vertical coordinates of the camera center of projection `c_x` and `c_y`.
463 | """
464 | 
465 | camera = next(iter(read_cameras_binary(path).values()))  # COLMAP camera identifiers usually start at 1, not 0.
466 | 
467 | assert camera.model == 'PINHOLE', 'The input camera must refer to a pinhole model.'
468 | 
469 | focal_x, focal_y, center_x, center_y = camera.params
470 | 
471 | camera_intrinsic = dict(
472 | f_x=focal_x,
473 | f_y=focal_y,
474 | c_x=center_x,
475 | c_y=center_y)
476 | 
477 | return camera_intrinsic
478 | 
479 | 
480 | def read_camera_data_text(path: str) -> Dict:
481 | """It extracts the camera info stored in a text file (Andreas' format).
482 | 
483 | Args:
484 | path: path to the camera info file.
485 | 
486 | Returns:
487 | A dictionary containing all the camera info.
488 | """
489 | 
490 | # Dictionary containing the camera data.
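# The expected file layout, inferred from the parser below rather than from a formal specification, is one
# `camera.<name>=<value>` entry per line, with `#` marking comment lines. Hypothetical entries could read:
#
#     camera.width=1920
#     camera.height=1080
#     camera.A=[1500.0 0.0 960.0; 0.0 1500.0 540.0; 0.0 0.0 1.0]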
491 | camera = {
492 | 'height': None,
493 | 'width': None,
494 | 'A': None,
495 | 'k1': None,
496 | 'k2': None,
497 | 'R': None,
498 | 'T': None,
499 | 'zmin': None,
500 | 'zmax': None,
501 | 'match': None}
502 | 
503 | # Extract the camera data from the text file.
504 | with open(path, "r") as fid:
505 | while True:
506 | line = fid.readline()
507 | if not line:
508 | break
509 | line = line.strip()
510 | if len(line) > 0 and line[0] != "#":
511 | line_elems = line.split('=')
512 | if len(line_elems) != 2:
513 | continue
514 | param_name = line_elems[0]
515 | param_value = line_elems[1]
516 | param_name_elems = param_name.split('.')
517 | if (len(param_name_elems) != 2) or (param_name_elems[0] != 'camera'):
518 | continue
519 | param_name = param_name_elems[1]
520 | camera[param_name] = param_value
521 | 
522 | # Convert the camera height to `int`.
523 | if (camera['height'] is not None) and ('.' not in camera['height']):
524 | camera['height'] = int(camera['height'])
525 | else:
526 | camera['height'] = None
527 | 
528 | # Convert the camera width to `int`.
529 | if (camera['width'] is not None) and ('.' not in camera['width']):
530 | camera['width'] = int(camera['width'])
531 | else:
532 | camera['width'] = None
533 | 
534 | # Convert the camera `k1` parameter to `float`.
535 | if camera['k1'] is not None:
536 | camera['k1'] = float(camera['k1'])
537 | else:
538 | camera['k1'] = None
539 | 
540 | # Convert the camera `k2` parameter to `float`.
541 | if camera['k2'] is not None:
542 | camera['k2'] = float(camera['k2'])
543 | else:
544 | camera['k2'] = None
545 | 
546 | # Convert the camera intrinsic matrix to `np.array`.
547 | if camera['A'] is not None:
548 | mtx_intrinsic = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['A'])]
549 | mtx_intrinsic = np.asarray(mtx_intrinsic, dtype=float)
550 | if len(mtx_intrinsic) != 9:
551 | camera['A'] = None
552 | else:
553 | camera['A'] = np.reshape(mtx_intrinsic, (3, 3))
554 | 
555 | # Convert the camera rotation matrix to `np.array`.
556 | if camera['R'] is not None:
557 | mtx_rotation = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['R'])]
558 | mtx_rotation = np.asarray(mtx_rotation, dtype=float)
559 | if len(mtx_rotation) != 9:
560 | camera['R'] = None
561 | else:
562 | camera['R'] = np.reshape(mtx_rotation, (3, 3))
563 | 
564 | # Convert the camera translation vector to `np.array`.
565 | if camera['T'] is not None:
566 | vec_translation = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['T'])]
567 | vec_translation = np.asarray(vec_translation, dtype=float)
568 | if len(vec_translation) != 3:
569 | camera['T'] = None
570 | else:
571 | camera['T'] = np.reshape(vec_translation, (3, 1))
572 | 
573 | return camera
574 | 
--------------------------------------------------------------------------------
/refine.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import argparse 27 | import os 28 | import numpy as np 29 | from cv2 import imread 30 | from iofuns import read_depth_map, read_normal_map, write_bin_file 31 | from misc import resize_map, depth_percentage_error 32 | from refinement import refine_depth 33 | from logger import Logger 34 | import torch.optim 35 | import time 36 | import math 37 | 38 | 39 | def read_param() -> argparse.Namespace: 40 | """It parses the command-line parameters. 41 | 42 | Returns: 43 | The input parameters. 44 | """ 45 | 46 | # Create the parser. 47 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 48 | 49 | # ========================================== SCALE-INDEPENDENT PARAMETERS ========================================== 50 | 51 | # Input/output paths. 52 | parser.add_argument( 53 | '--image', type=str, required=True, 54 | help='input image path') 55 | parser.add_argument( 56 | '--depth', type=str, required=True, 57 | help='input depth map path') 58 | parser.add_argument( 59 | '--normal', type=str, 60 | help='input normal map path') 61 | parser.add_argument( 62 | '--confidence', type=str, required=True, 63 | help='input (depth) confidence map path') 64 | parser.add_argument( 65 | '--depth_gt', type=str, 66 | help='ground truth depth map path') 67 | parser.add_argument( 68 | '--depth_out', type=str, required=True, 69 | help='refined depth map saving path') 70 | parser.add_argument( 71 | '--normal_out', type=str, required=True, 72 | help='estimated normal map saving path') 73 | 74 | # Camera parameters. 75 | parser.add_argument( 76 | '--cam_focal', type=float, nargs=2, required=True, 77 | help='camera focal lengths (f_x, f_y)') 78 | parser.add_argument( 79 | '--cam_center', type=float, nargs=2, required=True, 80 | help='camera principal point coordinates (c_x, c_y)') 81 | 82 | # Depth range. 83 | parser.add_argument( 84 | '--depth_min', type=float, default=1e-1, 85 | help='minimum depth value (in meters)') 86 | parser.add_argument( 87 | '--depth_max', type=float, default=100, 88 | help='maximum depth value (in meters)') 89 | 90 | # Confidence binarization. 91 | parser.add_argument( 92 | '--confidence_threshold', type=float, default=None, 93 | help='threshold for confidence binarization') 94 | 95 | # Plotting parameters. 
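# (The plotting parameters below configure a connection to a running `visdom` server; one can typically be
# launched with `python -m visdom.server -port 8097`, the port being illustrative. A non-positive
# `visdom_display_port` disables plotting altogether.)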
96 | parser.add_argument(
97 | '--visdom_display_port', type=int, default=-1,
98 | help='port to be used by the VISDOM server')
99 | parser.add_argument(
100 | '--visdom_base_url', type=str, default='refinement',
101 | help='string to customize the VISDOM server URL')
102 | parser.add_argument(
103 | '--plotting_step', type=int, default=500,
104 | help='number of steps between two plot updates in the VISDOM server')
105 | 
106 | # Device and precision.
107 | parser.add_argument(
108 | '--gpu_id', type=int, default=-1,
109 | help='gpu id (negative numbers trigger CPU computation)')
110 | parser.add_argument(
111 | '--precision', type=str, choices=['single', 'double'], default='single',
112 | help='computation precision (32 or 64 bits)')
113 | 
114 | # Error evaluation.
115 | parser.add_argument(
116 | '--depth_error_threshold', type=float, default=0.02,
117 | help='error threshold (in meters) to be used in the evaluation against the ground truth')
118 | 
119 | # Multi-scale strategy.
120 | parser.add_argument(
121 | '--scale_nb', type=int, default=1,
122 | help='number of scales in the multi-scale pyramid')
123 | 
124 | # Regularization.
125 | parser.add_argument(
126 | '--regularization', type=int, choices=[0, 1], default=1,
127 | help='regularization type (0 for NLTGV, 1 for our regularization)')
128 | 
129 | # =========================================== SCALE-DEPENDENT PARAMETERS ===========================================
130 | 
131 | # Loss parameters.
132 | parser.add_argument(
133 | '--lambda_depth_consistency', nargs='*', type=float, default=1.0,
134 | help='depth consistency term multiplier (one per scale)')
135 | parser.add_argument(
136 | '--lambda_normal_consistency', nargs='*', type=float, default=0.0,
137 | help='normal consistency term multiplier (one per scale)')
138 | parser.add_argument(
139 | '--lambda_regularization', nargs='*', type=float, default=7.5,
140 | help='depth regularization term multiplier (one per scale)')
141 | parser.add_argument(
142 | '--gamma_regularization', nargs='*', type=float, default=5.5,
143 | help='depth regularization term internal multiplier (one per scale)')
144 | 
145 | # Graph parameters.
146 | parser.add_argument(
147 | '--window_size', nargs='*', type=int, default=9,
148 | help='search window size (window_size x window_size) to be used in the graph construction (one per scale)')
149 | parser.add_argument(
150 | '--patch_size', nargs='*', type=int, default=3,
151 | help='patch size (patch_size x patch_size) to be used in the graph construction (one per scale)')
152 | parser.add_argument(
153 | '--sigma_int', nargs='*', type=float, default=0.07,
154 | help='color difference standard deviation for patch comparison in the graph construction (one per scale)')
155 | parser.add_argument(
156 | '--sigma_spa', nargs='*', type=float, default=3.0,
157 | help='euclidean distance standard deviation for patch comparison in the graph construction (one per scale)')
158 | parser.add_argument(
159 | '--degree_max', nargs='*', type=int, default=20,
160 | help='maximum number of per pixel neighbors in the graph (one per scale)')
161 | 
162 | # Stopping criteria.
163 | parser.add_argument(
164 | '--iter_max', nargs='*', type=int, default=4000,
165 | help='maximum number of iterations (one per scale)')
166 | parser.add_argument(
167 | '--eps_stop', nargs='*', type=float, default=1e-6,
168 | help=('minimum relative change between the current and the previous '
169 | 'iteration depth maps (one per scale)'))
170 | parser.add_argument(
171 | '--attempt_max', nargs='*', type=int, default=50,
172 | help='maximum number of iterations without improving the loss (one per scale)')
173 | 
174 | # Learning rate update policies.
175 | parser.add_argument(
176 | '--lr_start', nargs='*', type=float, default=1e-4,
177 | help='initial learning rate (one per scale)')
178 | parser.add_argument(
179 | '--lr_slot_nb', nargs='*', type=int, default=1,
180 | help=('number of partitions (one per scale); '
181 | 'each partition adopts a learning rate that is 1/10 of the one employed in the previous partition; '
182 | '0 excludes the relative depth map change stopping criterion.'))
183 | 
184 | # ==================================================================================================================
185 | 
186 | # Perform parsing.
187 | param = parser.parse_args()
188 | 
189 | # =================================== CHECK AND ADJUST THE INPUT PARAMETER FORMAT ==================================
190 | 
191 | # Cases:
192 | # - if the value for a parameter is provided, then this must be a list of length `param.scale_nb`.
193 | # - if the value is not provided, the default value is used to fill a list of length `param.scale_nb`.
194 | 
195 | # Check `lambda_depth_consistency`.
196 | if isinstance(param.lambda_depth_consistency, list):
197 | assert (len(param.lambda_depth_consistency) == param.scale_nb)
198 | else:
199 | param.lambda_depth_consistency = [param.lambda_depth_consistency] * param.scale_nb
200 | 
201 | # Check `lambda_normal_consistency`.
202 | if isinstance(param.lambda_normal_consistency, list):
203 | assert (len(param.lambda_normal_consistency) == param.scale_nb)
204 | else:
205 | param.lambda_normal_consistency = [param.lambda_normal_consistency] * param.scale_nb
206 | 
207 | # Check `lambda_regularization`.
208 | if isinstance(param.lambda_regularization, list):
209 | assert (len(param.lambda_regularization) == param.scale_nb)
210 | else:
211 | param.lambda_regularization = [param.lambda_regularization] * param.scale_nb
212 | 
213 | # Check `gamma_regularization`.
214 | if isinstance(param.gamma_regularization, list):
215 | assert (len(param.gamma_regularization) == param.scale_nb)
216 | else:
217 | param.gamma_regularization = [param.gamma_regularization] * param.scale_nb
218 | 
219 | # Check `window_size`.
220 | if isinstance(param.window_size, list):
221 | assert (len(param.window_size) == param.scale_nb)
222 | else:
223 | param.window_size = [param.window_size] * param.scale_nb
224 | 
225 | # Check `patch_size`.
226 | if isinstance(param.patch_size, list):
227 | assert (len(param.patch_size) == param.scale_nb)
228 | else:
229 | param.patch_size = [param.patch_size] * param.scale_nb
230 | 
231 | # Check `sigma_int`.
232 | if isinstance(param.sigma_int, list):
233 | assert (len(param.sigma_int) == param.scale_nb)
234 | else:
235 | param.sigma_int = [param.sigma_int] * param.scale_nb
236 | 
237 | # Check `sigma_spa`.
238 | if isinstance(param.sigma_spa, list):
239 | assert (len(param.sigma_spa) == param.scale_nb)
240 | else:
241 | param.sigma_spa = [param.sigma_spa] * param.scale_nb
242 | 
243 | # Check `degree_max`.
244 | if isinstance(param.degree_max, list):
245 | assert (len(param.degree_max) == param.scale_nb)
246 | else:
247 | param.degree_max = [param.degree_max] * param.scale_nb
248 | 
249 | # Check `iter_max`.
250 | if isinstance(param.iter_max, list):
251 | assert (len(param.iter_max) == param.scale_nb)
252 | else:
253 | param.iter_max = [param.iter_max] * param.scale_nb
254 | 
255 | # Check `eps_stop`.
256 | if isinstance(param.eps_stop, list):
257 | assert (len(param.eps_stop) == param.scale_nb)
258 | else:
259 | param.eps_stop = [param.eps_stop] * param.scale_nb
260 | 
261 | # Check `attempt_max`.
262 | if isinstance(param.attempt_max, list):
263 | assert (len(param.attempt_max) == param.scale_nb)
264 | else:
265 | param.attempt_max = [param.attempt_max] * param.scale_nb
266 | 
267 | # Check `lr_start`.
268 | if isinstance(param.lr_start, list):
269 | assert (len(param.lr_start) == param.scale_nb)
270 | else:
271 | param.lr_start = [param.lr_start] * param.scale_nb
272 | 
273 | # Check `lr_slot_nb`.
274 | if isinstance(param.lr_slot_nb, list):
275 | assert (len(param.lr_slot_nb) == param.scale_nb)
276 | else:
277 | param.lr_slot_nb = [param.lr_slot_nb] * param.scale_nb
278 | 
279 | return param
280 | 
281 | 
282 | def print_param(param: argparse.Namespace) -> None:
283 | """It prints the input parameters.
284 | 
285 | Args:
286 | param: parameters to be printed.
287 | """
288 | 
289 | # Organize the parameters into a single string.
290 | message = ''
291 | message += '---------------------- Options ----------------------\n'
292 | for k, v in sorted(vars(param).items()):
293 | 
294 | # Turn `v` into a string.
295 | if isinstance(v, list):
296 | v_str = ', '.join([str(item) for item in v])
297 | else:
298 | v_str = str(v)
299 | 
300 | # Write the current pair.
301 | message += '{:>30}: {:<30}\n'.format(str(k), v_str)
302 | message += '------------------------ End ------------------------'
303 | 
304 | # Print the options to standard output.
305 | print(message)
306 | 
307 | # Save the parameters to disk.
308 | file_name = 'param.txt'
309 | with open(file_name, 'wt') as param_file:
310 | param_file.write(message)
311 | param_file.write('\n')
312 | 
313 | 
314 | def main():
315 | # Read the input parameters.
316 | param = read_param()
317 | 
318 | # Interrupt the script if the refined depth and normal maps already exist.
319 | if os.path.exists(param.depth_out) and os.path.exists(param.normal_out):
320 | print('The refined depth and normal maps already exist.')
321 | return
322 | 
323 | # Organize the camera parameters in a dictionary.
324 | camera_param = {
325 | 'f_x': param.cam_focal[0],
326 | 'f_y': param.cam_focal[1],
327 | 'c_x': param.cam_center[0],
328 | 'c_y': param.cam_center[1]}
329 | 
330 | # Store the loss parameters as a list of dictionaries (one dictionary for each scale of the multi-scale pyramid).
331 | # The same approach is adopted for the optimization parameters.
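# For instance, invoking the script with `--scale_nb 2 --lambda_regularization 7.5 5.0` (illustrative values)
# yields `loss_param[0]['lambda_regularization'] == 7.5` and `loss_param[1]['lambda_regularization'] == 5.0`,
# i.e., one dictionary per scale.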
332 | loss_param = [None] * param.scale_nb 333 | opt_param = [None] * param.scale_nb 334 | for i in range(param.scale_nb): 335 | loss_param[i] = { 336 | 'lambda_depth_consistency': param.lambda_depth_consistency[i], 337 | 'lambda_normal_consistency': param.lambda_normal_consistency[i], 338 | 'lambda_regularization': param.lambda_regularization[i], 339 | 'gamma_regularization': param.gamma_regularization[i], 340 | 'window_size': param.window_size[i], 341 | 'patch_size': param.patch_size[i], 342 | 'sigma_intensity': param.sigma_int[i], 343 | 'sigma_spatial': param.sigma_spa[i], 344 | 'degree_max': param.degree_max[i], 345 | 'regularization': param.regularization} 346 | 347 | opt_param[i] = { 348 | 'iter_max': param.iter_max[i], 349 | 'plotting_step': param.plotting_step, 350 | 'eps_stop': param.eps_stop[i], 351 | 'attempt_max': param.attempt_max[i], 352 | 'learning_rate': {'lr_start': param.lr_start[i], 'lr_slot_nb': param.lr_slot_nb[i]}, 353 | 'depth_error_threshold': param.depth_error_threshold} 354 | 355 | # Set the device. 356 | if torch.cuda.is_available() and (param.gpu_id >= 0): 357 | device = torch.device('cuda:{}'.format(param.gpu_id)) 358 | else: 359 | device = torch.device('cpu') 360 | 361 | # Create the logger object for plotting. 362 | logger = None 363 | if param.visdom_display_port > 0: 364 | logger = Logger( 365 | param.depth_error_threshold, 366 | display_port=param.visdom_display_port, base_url=('/' + param.visdom_base_url)) 367 | 368 | ################################################## REFERENCE IMAGE ################################################# 369 | 370 | # Read the reference image. 371 | image = imread(param.image) 372 | if image is None: 373 | raise FileNotFoundError('The reference image could not be loaded.') 374 | 375 | # Convert the image to [0, 1] and flip the color channels, as OpenCV assumes that the image is in BGR format on disk. 376 | image = np.flip(image.astype(param.precision) / 255, axis=2) 377 | 378 | ############################################ NOISY/INCOMPLETE DEPTH MAP ############################################ 379 | 380 | # Read the noisy/incomplete depth map. 381 | depth = read_depth_map(param.depth, 'COLMAP').astype(param.precision) 382 | if depth is None: 383 | raise FileNotFoundError('The noisy/incomplete depth map to process could not be loaded.') 384 | 385 | # Clip the valid entries of the MVS depth map to `[param.depth_min, param.depth_max]`. 386 | mask = (depth > 0) & (depth < float('inf')) 387 | depth[~mask] = 0 # Non valid pixels are set to zero. 388 | depth[mask] = np.clip(depth[mask], param.depth_min, param.depth_max) # Valid pixels are clipped. 389 | 390 | ############################################ NOISY/INCOMPLETE NORMAL MAP ########################################### 391 | 392 | # Read the noisy/incomplete normal map. 393 | normal = read_normal_map(param.normal, 'COLMAP') 394 | if normal is None: 395 | print('WARNING: The noisy/incomplete normal map could not be loaded.') 396 | else: 397 | # Set to zero all the 3D normals without a corresponding depth value. 398 | normal[~mask] = 0 399 | 400 | # We do not care about normal having unitary norm at this stage. 401 | 402 | ################################################## CONFIDENCE MAP ################################################## 403 | 404 | # Read the confidence map associated to the noisy/incomplete depth map. 405 | depth_confidence = read_depth_map(param.confidence, 'COLMAP').astype(param.precision) 406 | 407 | # Check the confidence map. 
408 | assert (np.min(depth_confidence) >= 0) and (np.max(depth_confidence) <= 1), \
409 | 'Depth map confidence entries must be in [0, 1].'
410 | 
411 | # Make the confidence binary.
412 | if param.confidence_threshold is not None:
413 | if (param.confidence_threshold >= 0) and (param.confidence_threshold <= 1):
414 | mask_confidence = depth_confidence < param.confidence_threshold
415 | depth_confidence = np.ones_like(depth_confidence)
416 | depth_confidence[mask_confidence] = 0
417 | else:
418 | print('WARNING: the specified confidence threshold is outside [0, 1], therefore it will be ignored.')
419 | 
420 | ############################################## GROUND TRUTH DEPTH MAP ##############################################
421 | 
422 | # Read the ground truth depth map.
423 | depth_gt = None
424 | if param.depth_gt is not None:
425 | depth_gt = read_depth_map(param.depth_gt, 'COLMAP').astype(param.precision)
426 | if depth_gt is None:
427 | raise FileNotFoundError('The ground truth depth map could not be loaded.')
428 | 
429 | ############################################ ADJUST REFERENCE IMAGE SIZE ###########################################
430 | 
431 | # If the reference image size differs from the noisy/incomplete depth map one, then the image is resized.
432 | # Note that, since the camera parameters are associated to the reference image, the camera parameters must be
433 | # adjusted accordingly, if the reference image is resized.
434 | height = depth.shape[0]
435 | width = depth.shape[1]
436 | if (image.shape[0] != height) or (image.shape[1] != width):
437 | x_ratio = float(width) / float(image.shape[1])  # Horizontal scaling factor.
438 | y_ratio = float(height) / float(image.shape[0])  # Vertical scaling factor.
439 | camera_param['f_x'] = camera_param['f_x'] * x_ratio
440 | camera_param['f_y'] = camera_param['f_y'] * y_ratio
441 | camera_param['c_x'] = camera_param['c_x'] * x_ratio
442 | camera_param['c_y'] = camera_param['c_y'] * y_ratio
443 | image = resize_map(image, (height, width), order=1)
444 | print('WARNING: the reference image has been resized in order to match the input depth map height and width.')
445 | 
446 | # The other maps must have the same height and width of the noisy/incomplete depth map. No resizing for them.
447 | if normal is not None:
448 | assert normal.shape == (height, width, 3), \
449 | 'Input normal map size not compatible with the reference image one.'
450 | if depth_confidence is not None:
451 | assert depth_confidence.shape == (height, width), \
452 | 'Input depth map confidence size not compatible with the reference image one.'
453 | if depth_gt is not None:
454 | assert depth_gt.shape == (height, width), \
455 | 'Ground truth depth size not compatible with the reference image one.'
456 | 
457 | ####################################################################################################################
458 | #################################################### REFINEMENT ####################################################
459 | ####################################################################################################################
460 | 
461 | # Start measuring the processing time.
462 | time_start = time.time()
463 | 
464 | # Refine the noisy/incomplete depth map.
465 | depth_refined, normal_refined = refine_depth(
466 | image, depth, (param.depth_min, param.depth_max),
467 | camera_param, loss_param, opt_param,
468 | depth_confidence=depth_confidence,
469 | normal=normal,
470 | depth_gt=depth_gt,
471 | logger=logger,
472 | device=device)
473 | 
474 | # Check the processing time.
475 | time_elapsed = time.time() - time_start
476 | minute_elapsed = math.floor(time_elapsed / 60)
477 | print('Elapsed time: {}min {}sec'.format(
478 | minute_elapsed, math.ceil(time_elapsed - minute_elapsed * 60)), flush=True)
479 | 
480 | #################################################### EVALUATION ####################################################
481 | 
482 | # Compute the depth percentage error of the noisy/incomplete depth map.
483 | if depth_gt is not None:
484 | print('Percentage of input depth map values with error larger than {}: {:.2f}'.format(
485 | param.depth_error_threshold,
486 | depth_percentage_error(depth, depth_gt, param.depth_error_threshold)))
487 | 
488 | print('Percentage of refined depth map values with error larger than {}: {:.2f}'.format(
489 | param.depth_error_threshold,
490 | depth_percentage_error(depth_refined, depth_gt, param.depth_error_threshold)))
491 | 
492 | ###################################################### SAVING ######################################################
493 | 
494 | # Save the refined depth map.
495 | saving_path, _ = os.path.split(param.depth_out)
496 | os.makedirs(saving_path, exist_ok=True)
497 | write_bin_file(
498 | depth_refined, param.depth_out)
499 | 
500 | # Save the refined/estimated normal map.
501 | saving_path, _ = os.path.split(param.normal_out)
502 | os.makedirs(saving_path, exist_ok=True)
503 | write_bin_file(
504 | normal_refined, param.normal_out)
505 | 
506 | 
507 | if __name__ == '__main__':
508 | main()
509 | 
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import torch
27 | import numpy as np
28 | from importlib import import_module
29 | from misc import resize_map, depth_percentage_error
30 | from pltfuns import normal2rgb
31 | from typing import Tuple
32 | 
33 | 
34 | # Maximum height of any image (not heat map) plotted on the screen.
35 | HEIGHT_MAX = int(300) 36 | 37 | 38 | class Logger: 39 | 40 | def __init__(self, 41 | error_threshold: float, 42 | display_port: int = 8097, base_url: str = '/1234'): 43 | 44 | # Windows. 45 | self.texture_win = None # reference image. 46 | self.depth_win = None # noisy and possibly incomplete depth map. 47 | self.depth_init_win = None # initial depth map used in the refinement. 48 | self.depth_refined_win = None # refined depth map. 49 | self.normal_win = None # noisy and possibly incomplete normal map. 50 | self.normal_init_win = None # initial normal map used in the refinement. 51 | self.normal_refined_win = None # normal map associated to the refined depth map. 52 | self.depth_gt_win = None # ground truth depth map. 53 | self.depth_error_win = None # noisy and possibly incomplete depth map error. 54 | self.depth_refined_error_win = None # refined depth map error. 55 | 56 | # Windows associated to the partial and global losses (in the inverse depth domain). 57 | self.idepth_consistency_loss_win = None 58 | self.inormal_consistency_loss_win = None 59 | self.regularization_loss_win = None 60 | self.global_loss_win = None 61 | 62 | # Typically, the logger is called to plot a new complete depth map and to compute its error with respect to 63 | # the ground truth depth map, which does not change very often. Therefore, we store the ground truth depth map. 64 | self.depth_gt = None 65 | 66 | # Minimum and maximum depth values to be used in the plots. 67 | self.depth_min = None 68 | self.depth_max = None 69 | 70 | # Record the error threshold to be used in the percentage error computation. 71 | self.depth_error_threshold = error_threshold 72 | 73 | # Instantiate the online visualization tool. 74 | visdom = import_module('visdom') 75 | self.vis = visdom.Visdom(port=display_port, base_url=base_url) 76 | # The `visdom` module is imported here to avoid its installation in the case the user does not need the logger. 77 | 78 | # Environment default name. 79 | self.environment = 'main' 80 | 81 | def plot(self, 82 | texture: np.array = None, 83 | depth: np.array = None, 84 | depth_init: np.array = None, 85 | depth_refined: np.array = None, 86 | depth_gt: np.array = None, 87 | normal: np.array = None, 88 | normal_init: np.array = None, 89 | normal_refined: np.array = None, 90 | idepth_consistency_loss: np.array = None, 91 | inormal_consistency_loss: np.array = None, 92 | regularization_loss: np.array = None, 93 | global_loss: np.array = None) -> None: 94 | 95 | # ============================================================================================================== 96 | 97 | # Reference camera texture. 98 | if texture is not None: 99 | 100 | # Texture dimensions. 101 | aux = texture 102 | if texture.ndim == 2: 103 | height = texture.shape[0] 104 | width = texture.shape[1] 105 | aux = aux[:, :, None] 106 | elif texture.ndim == 3 and texture.shape[2] == 3: 107 | height = texture.shape[0] 108 | width = texture.shape[1] 109 | else: 110 | raise ValueError('The input texture must be gray scale or RGB.') 111 | 112 | # Resize the texture if too large. 113 | img_ratio = float(width) / float(height) 114 | if height > HEIGHT_MAX: 115 | aux = resize_map(aux, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)]) 116 | 117 | # Convert the texture to tensor. 118 | texture_t = torch.from_numpy(np.transpose(aux, axes=(2, 0, 1)).copy()) 119 | 120 | # Plot the texture. 
121 | if self.texture_win is not None: 122 | 123 | self.vis.image( 124 | texture_t, 125 | env=self.environment, 126 | win=self.texture_win, 127 | opts=dict(title='texture')) 128 | 129 | else: 130 | 131 | self.texture_win = self.vis.image( 132 | texture_t, 133 | env=self.environment, 134 | opts=dict(title='texture')) 135 | 136 | # ============================================================================================================== 137 | 138 | # Ground truth depth map. 139 | if depth_gt is not None: 140 | 141 | # Store the ground truth depth map. 142 | self.depth_gt = depth_gt 143 | 144 | # Set the minimum and maximum depth values. 145 | xmin = self.depth_min if self.depth_min is not None else np.min(self.depth_gt) 146 | xmax = self.depth_max if self.depth_max is not None else np.max(self.depth_gt) 147 | 148 | # Convert the ground truth depth map to tensor. 149 | depth_gt_t = torch.from_numpy(self.depth_gt).flip([0]) 150 | 151 | # Plot. 152 | if self.depth_gt_win is not None: 153 | 154 | self.vis.heatmap( 155 | depth_gt_t, 156 | env=self.environment, 157 | win=self.depth_gt_win, 158 | opts=dict( 159 | xmin=xmin, 160 | xmax=xmax, 161 | title='depth gt')) 162 | 163 | else: 164 | 165 | self.depth_gt_win = self.vis.heatmap( 166 | depth_gt_t, 167 | env=self.environment, 168 | opts=dict( 169 | xmin=xmin, 170 | xmax=xmax, 171 | title='depth gt')) 172 | 173 | # ============================================================================================================== 174 | 175 | # Noisy and possibly incomplete depth map. 176 | if depth is not None: 177 | 178 | # Set the minimum and maximum depth values. 179 | aux_min, aux_max = np.percentile(depth, [5, 95]) 180 | xmin = self.depth_min if self.depth_min is not None else aux_min 181 | xmax = self.depth_max if self.depth_max is not None else aux_max 182 | 183 | # Convert the depth map to tensor. 184 | depth_t = torch.from_numpy(depth).flip([0]) 185 | 186 | # Plot. 187 | if self.depth_win is not None: 188 | 189 | self.vis.heatmap( 190 | depth_t, 191 | env=self.environment, 192 | win=self.depth_win, 193 | opts=dict( 194 | xmin=xmin, 195 | xmax=xmax, 196 | title='input depth')) 197 | 198 | else: 199 | 200 | self.depth_win = self.vis.heatmap( 201 | depth_t, 202 | env=self.environment, 203 | opts=dict( 204 | xmin=xmin, 205 | xmax=xmax, 206 | title='input depth')) 207 | 208 | # Error map. 209 | if self.depth_gt is not None: 210 | 211 | # Detect the valid entries in `self.depth_gt`. 212 | mask = (self.depth_gt > 0) & (self.depth_gt < float('inf')) 213 | 214 | # Compute the error. 215 | error = np.abs(self.depth_gt - depth) 216 | error[~mask] = 0 217 | 218 | # Compute the percentage error. 219 | percentage_error = depth_percentage_error( 220 | depth, self.depth_gt, self.depth_error_threshold) 221 | 222 | # Convert the error to tensor. 223 | error_t = torch.from_numpy(error).flip([0]) 224 | 225 | # Plot the depth map error. 
226 | if self.depth_error_win is not None: 227 | 228 | self.vis.heatmap( 229 | error_t, 230 | env=self.environment, 231 | win=self.depth_error_win, 232 | opts=dict( 233 | xmin=0.0, 234 | xmax=self.depth_error_threshold, 235 | title='input depth error: {:.2f}% ({})'.format( 236 | percentage_error, self.depth_error_threshold))) 237 | 238 | else: 239 | 240 | self.depth_error_win = self.vis.heatmap( 241 | error_t, 242 | env=self.environment, 243 | opts=dict( 244 | xmin=0.0, 245 | xmax=self.depth_error_threshold, 246 | title='input depth error: {:.2f}% ({})'.format( 247 | percentage_error, self.depth_error_threshold))) 248 | 249 | # ============================================================================================================== 250 | 251 | # Initial depth map. 252 | if depth_init is not None: 253 | 254 | # Set the minimum and maximum depth values. 255 | aux_min, aux_max = np.percentile(depth_init, [5, 95]) 256 | xmin = self.depth_min if self.depth_min is not None else aux_min 257 | xmax = self.depth_max if self.depth_max is not None else aux_max 258 | 259 | # Convert the depth map to tensor. 260 | depth_init_t = torch.from_numpy(depth_init).flip([0]) 261 | 262 | # Plot the depth map. 263 | if self.depth_init_win is not None: 264 | 265 | self.vis.heatmap( 266 | depth_init_t, 267 | env=self.environment, 268 | win=self.depth_init_win, 269 | opts=dict( 270 | xmin=xmin, 271 | xmax=xmax, 272 | title='initial depth')) 273 | 274 | else: 275 | 276 | self.depth_init_win = self.vis.heatmap( 277 | depth_init_t, 278 | env=self.environment, 279 | opts=dict( 280 | xmin=xmin, 281 | xmax=xmax, 282 | title='initial depth')) 283 | 284 | # ============================================================================================================== 285 | 286 | # Refined depth map. 287 | if depth_refined is not None: 288 | 289 | # Set the minimum and maximum depth values. 290 | aux_min, aux_max = np.percentile(depth_refined, [5, 95]) 291 | xmin = self.depth_min if self.depth_min is not None else aux_min 292 | xmax = self.depth_max if self.depth_max is not None else aux_max 293 | 294 | # Convert the depth map to tensor. 295 | depth_refined_t = torch.from_numpy(depth_refined).flip([0]) 296 | 297 | # Plot the depth map. 298 | if self.depth_refined_win is not None: 299 | 300 | self.vis.heatmap( 301 | depth_refined_t, 302 | env=self.environment, 303 | win=self.depth_refined_win, 304 | opts=dict( 305 | xmin=xmin, 306 | xmax=xmax, 307 | title='refined depth')) 308 | 309 | else: 310 | 311 | self.depth_refined_win = self.vis.heatmap( 312 | depth_refined_t, 313 | env=self.environment, 314 | opts=dict( 315 | xmin=xmin, 316 | xmax=xmax, 317 | title='refined depth')) 318 | 319 | # Depth map error. 320 | if self.depth_gt is not None: 321 | 322 | # Detect the valid entries in `self.depth_gt`. 323 | mask = (self.depth_gt > 0) & (self.depth_gt < float('inf')) 324 | 325 | # Compute the error. 326 | error = np.abs(self.depth_gt - depth_refined) 327 | error[~mask] = 0 328 | 329 | # Compute the percentage error. 330 | percentage_error = depth_percentage_error( 331 | depth_refined, self.depth_gt, self.depth_error_threshold) 332 | 333 | # Convert the error to tensor. 334 | error_t = torch.from_numpy(error).flip([0]) 335 | 336 | # Plot the depth map error. 
337 | if self.depth_refined_error_win is not None: 338 | 339 | self.vis.heatmap( 340 | error_t, 341 | env=self.environment, 342 | win=self.depth_refined_error_win, 343 | opts=dict( 344 | xmin=0.0, 345 | xmax=self.depth_error_threshold, 346 | title='refined depth error: {:.2f}% ({})'.format( 347 | percentage_error, self.depth_error_threshold))) 348 | 349 | else: 350 | 351 | self.depth_refined_error_win = self.vis.heatmap( 352 | error_t, 353 | env=self.environment, 354 | opts=dict( 355 | xmin=0.0, 356 | xmax=self.depth_error_threshold, 357 | title='refined depth error: {:.2f}% ({})'.format( 358 | percentage_error, self.depth_error_threshold))) 359 | 360 | # ============================================================================================================== 361 | 362 | # Noisy/incomplete normal map. 363 | if normal is not None: 364 | 365 | if depth is not None: 366 | 367 | # Spatial dimensions. 368 | height = normal.shape[0] 369 | width = normal.shape[1] 370 | 371 | # Encode the 3D normals into an RGB image. 372 | normal_rgb = normal2rgb(normal) 373 | 374 | # Resize the normal map, if too large. 375 | img_ratio = float(width) / float(height) 376 | if height > HEIGHT_MAX: 377 | normal_rgb = resize_map( 378 | normal_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 379 | 380 | # Convert the normal map to tensor. 381 | normal_rgb_t = torch.from_numpy(np.transpose(normal_rgb, axes=(2, 0, 1))) 382 | 383 | # Plot the normal map. 384 | if self.normal_win is not None: 385 | 386 | self.vis.image( 387 | normal_rgb_t, 388 | env=self.environment, 389 | win=self.normal_win, 390 | opts=dict(title='input normal')) 391 | 392 | else: 393 | 394 | self.normal_win = self.vis.image( 395 | normal_rgb_t, 396 | env=self.environment, 397 | opts=dict(title='input normal')) 398 | 399 | # ============================================================================================================== 400 | 401 | # Initial normal map. 402 | if normal_init is not None: 403 | 404 | if depth_init is not None: 405 | 406 | # Spatial dimensions. 407 | height = normal_init.shape[0] 408 | width = normal_init.shape[1] 409 | 410 | # Encode the 3D normals into an RGB image. 411 | normal_init_rgb = normal2rgb(normal_init) 412 | 413 | # Resize the normal map, if too large. 414 | img_ratio = float(width) / float(height) 415 | if height > HEIGHT_MAX: 416 | normal_init_rgb = resize_map( 417 | normal_init_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 418 | 419 | # Convert the normal map to tensor. 420 | normal_init_rgb_t = torch.from_numpy(np.transpose(normal_init_rgb, axes=(2, 0, 1))) 421 | 422 | # Plot the normal map. 423 | if self.normal_init_win is not None: 424 | 425 | self.vis.image( 426 | normal_init_rgb_t, 427 | env=self.environment, 428 | win=self.normal_init_win, 429 | opts=dict(title='initial normal')) 430 | 431 | else: 432 | 433 | self.normal_init_win = self.vis.image( 434 | normal_init_rgb_t, 435 | env=self.environment, 436 | opts=dict(title='initial normal')) 437 | 438 | # ============================================================================================================== 439 | 440 | # Normal map associated to the refined depth map. 441 | if normal_refined is not None: 442 | 443 | if depth_refined is not None: 444 | 445 | # Spatial dimensions. 446 | height = normal_refined.shape[0] 447 | width = normal_refined.shape[1] 448 | 449 | # Encode the 3D normals into an RGB image. 450 | normal_refined_rgb = normal2rgb(normal_refined) 451 | 452 | # Resize the normal map, if too large. 
453 | img_ratio = float(width) / float(height) 454 | if height > HEIGHT_MAX: 455 | normal_refined_rgb = resize_map( 456 | normal_refined_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 457 | 458 | # Convert the normal map to tensor. 459 | normal_refined_rgb_t = torch.from_numpy(np.transpose(normal_refined_rgb, axes=(2, 0, 1))) 460 | 461 | # Plot the normal map. 462 | if self.normal_refined_win is not None: 463 | 464 | self.vis.image( 465 | normal_refined_rgb_t, 466 | env=self.environment, 467 | win=self.normal_refined_win, 468 | opts=dict(title='refined normal')) 469 | 470 | else: 471 | 472 | self.normal_refined_win = self.vis.image( 473 | normal_refined_rgb_t, 474 | env=self.environment, 475 | opts=dict(title='refined normal')) 476 | 477 | # ============================================================================================================== 478 | 479 | # Depth consistency loss. 480 | if idepth_consistency_loss is not None: 481 | 482 | if self.idepth_consistency_loss_win is not None: 483 | 484 | self.vis.line( 485 | X=idepth_consistency_loss[0], 486 | Y=idepth_consistency_loss[1], 487 | env=self.environment, 488 | win=self.idepth_consistency_loss_win, 489 | update='append') 490 | 491 | else: 492 | 493 | self.idepth_consistency_loss_win = self.vis.line( 494 | X=idepth_consistency_loss[0], 495 | Y=idepth_consistency_loss[1], 496 | env=self.environment, 497 | opts=dict( 498 | xlabel='iterations', 499 | ylabel='loss', 500 | title='inverse depth consistency loss', 501 | markers=True, 502 | markersymbol='dot')) 503 | 504 | # ============================================================================================================== 505 | 506 | # Normal consistency loss. 507 | if inormal_consistency_loss is not None: 508 | 509 | if self.inormal_consistency_loss_win is not None: 510 | 511 | self.vis.line( 512 | X=inormal_consistency_loss[0], 513 | Y=inormal_consistency_loss[1], 514 | env=self.environment, 515 | win=self.inormal_consistency_loss_win, 516 | update='append') 517 | 518 | else: 519 | 520 | self.inormal_consistency_loss_win = self.vis.line( 521 | X=inormal_consistency_loss[0], 522 | Y=inormal_consistency_loss[1], 523 | env=self.environment, 524 | opts=dict( 525 | xlabel='iterations', 526 | ylabel='loss', 527 | title='2D normal consistency loss', 528 | markers=True, 529 | markersymbol='dot')) 530 | 531 | # ============================================================================================================== 532 | 533 | # Depth regularization loss. 534 | if regularization_loss is not None: 535 | 536 | if self.regularization_loss_win is not None: 537 | 538 | self.vis.line( 539 | X=regularization_loss[0], 540 | Y=regularization_loss[1], 541 | env=self.environment, 542 | win=self.regularization_loss_win, 543 | update='append') 544 | 545 | else: 546 | 547 | self.regularization_loss_win = self.vis.line( 548 | X=regularization_loss[0], 549 | Y=regularization_loss[1], 550 | env=self.environment, 551 | opts=dict( 552 | xlabel='iterations', 553 | ylabel='loss', 554 | title='regularization loss', 555 | markers=True, 556 | markersymbol='dot')) 557 | 558 | # ============================================================================================================== 559 | 560 | # Global loss. 
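# (As with the previous windows, the first call to `vis.line` below creates the plot and stores its handle,
# while the subsequent calls pass `win=...` together with `update='append'` to extend it in place.)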
561 | if global_loss is not None: 562 | 563 | if self.global_loss_win is not None: 564 | 565 | self.vis.line( 566 | X=global_loss[0], 567 | Y=global_loss[1], 568 | env=self.environment, 569 | win=self.global_loss_win, 570 | update='append') 571 | 572 | else: 573 | 574 | self.global_loss_win = self.vis.line( 575 | X=global_loss[0], 576 | Y=global_loss[1], 577 | env=self.environment, 578 | opts=dict( 579 | xlabel='iterations', 580 | ylabel='loss', 581 | title='global loss', 582 | markers=True, 583 | markersymbol='dot')) 584 | 585 | def setup(self, 586 | env_name: str = None, 587 | depth_range: Tuple[float, float] = None) -> None: 588 | 589 | # Reset the plot windows. 590 | self.texture_win = None 591 | self.depth_win = None 592 | self.depth_init_win = None 593 | self.depth_refined_win = None 594 | self.normal_win = None 595 | self.normal_init_win = None 596 | self.normal_refined_win = None 597 | self.depth_gt_win = None 598 | self.depth_error_win = None 599 | self.depth_refined_error_win = None 600 | 601 | # Reset the loss windows. 602 | self.idepth_consistency_loss_win = None 603 | self.inormal_consistency_loss_win = None 604 | self.regularization_loss_win = None 605 | self.global_loss_win = None 606 | 607 | # Reset the ground truth depth map. 608 | self.depth_gt = None 609 | 610 | # Reset the plotting depth range. 611 | self.depth_min = None 612 | self.depth_max = None 613 | 614 | # Set the new plotting environment name. 615 | if env_name is not None: 616 | self.environment = env_name 617 | 618 | # Set the plotting depth range. 619 | if depth_range is not None: 620 | self.depth_min = depth_range[0] 621 | self.depth_max = depth_range[1] 622 | -------------------------------------------------------------------------------- /refinement.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 |
26 | import math
27 | import torch
28 | import torch.nn as nn
29 | import torch.nn.functional as fun
30 | from torch import device as dev
31 | import numpy as np
32 | from misc import resize_map, space2plane_normal, plane2space_normal, depth2normal
33 | from filters import gradient_filter
34 | from losses import DepthConsistencyL1, NormalConsistencyL1, PieceWisePlanarRegularization
35 | from cv2 import cvtColor, COLOR_RGB2GRAY
36 | from transforms import depth2depth_inv, depth_inv2depth, depth_range2depth_inv_range
37 | from logger import Logger
38 | from typing import Tuple, List, Dict
39 |
40 |
41 | class Loss(nn.Module):
42 |     """It creates a loss function consisting of an inverse depth map consistency loss, a 2D normal map consistency loss
43 |     and a joint inverse depth map and normal map regularization. The 2D normal map is a 2D vector field capturing the
44 |     orientation of the inverse depth map.
45 |
46 |     The independent variables of this loss are `self.idepth` and `self.inormal`.
47 |     """
48 |
49 |     def __init__(self,
50 |                  image: np.array, idepth: np.array, idepth_range: Tuple[float, float],
51 |                  loss_param: Dict[str, float],
52 |                  idepth_confidence: np.array = None,
53 |                  inormal: np.array = None,
54 |                  idepth_init: np.array = None,
55 |                  inormal_init: np.array = None,
56 |                  device: torch.device = torch.device('cpu')) -> None:
57 |         """`Loss` constructor. It considers the inverse depth map and the corresponding 2D normal map.
58 |
59 |         Args:
60 |             image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
61 |             idepth: inverse depth map to refine, arranged as an `(H, W)` array.
62 |             idepth_range: inverse depth values must belong to the interval `[idepth_range[0], idepth_range[1]]`.
63 |             loss_param: dictionary containing the loss parameters.
64 |             idepth_confidence: confidence map associated to the inverse depth map to refine.
65 |                 It must have entries in `[0, 1]`.
66 |             inormal: 2D normal map associated to the depth map to refine, arranged as an `(H, W, 2)` array.
67 |                 It is ignored if the normal consistency loss is off.
68 |             idepth_init: initial guess for the refined inverse depth map.
69 |             inormal_init: initial guess for the 2D normal map associated to the refined inverse depth map.
70 |             device: device on which the computation will take place.
71 |         """
72 |
73 |         # Call the parent constructor.
74 |         super(Loss, self).__init__()
75 |
76 |         # Convert the input data from `np.array` to `torch.Tensor`. In particular, arrays are converted into 4D tensors
77 |         # of size `(1, C, H, W)` with `H`, `W` and `C` representing the height, width and channel number, respectively.
78 |
79 |         # Check the inverse depth range and register it.
80 |         if idepth_range[0] <= 0 or idepth_range[1] == float('inf') or idepth_range[0] > idepth_range[1]:
81 |             raise ValueError('Invalid inverse depth range.')
82 |         self.idepth_min = idepth_range[0]
83 |         self.idepth_max = idepth_range[1]
84 |
85 |         # Register the first optimization variable, i.e., the refined inverse depth map, and initialize it.
86 |         if idepth_init is not None:
87 |             aux = torch.as_tensor(idepth_init[None, None, ])
88 |         else:
89 |             aux = torch.as_tensor(idepth[None, None, ])
90 |         self.idepth = nn.Parameter(aux.clone(), requires_grad=True)
91 |         # Note that the data passed to `self.idepth` is copied in order to avoid shared data between different tensors.
92 |
93 |         # Register the second optimization variable, i.e., the normal map, and initialize it.
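        # Three initialization sources are tried in order: an explicit initial guess, the input 2D normal
        # map and, as a fallback, the spatial gradient of the inverse depth map itself, computed below with
        # a smoothed derivative filter. The fallback is consistent with the local plane model
        # `idepth(x, y) ~ idepth(x_0, y_0) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`,
        # whose parameters `(w_0, w_1)` are exactly the two channels of the 2D normal map.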
94 |         if inormal_init is not None:
95 |             aux = torch.as_tensor((np.transpose(inormal_init, (2, 0, 1))[None, ]).copy())
96 |         elif inormal is not None:
97 |             aux = torch.as_tensor((np.transpose(inormal, (2, 0, 1))[None, ]).copy())
98 |         else:
99 |             with torch.no_grad():
100 |                 filter_size = 5
101 |                 filter_sigma = 5.0
102 |                 grad_filter = gradient_filter(filter_size, filter_sigma)
103 |                 pad = tuple([int((filter_size - 1) / 2)] * 4)
104 |                 aux = fun.conv2d(
105 |                     fun.pad(self.idepth, pad, mode='replicate'),
106 |                     grad_filter.to(self.idepth))
107 |         self.inormal = nn.Parameter(aux.clone(), requires_grad=True)
108 |         # The `torch.no_grad()` block prevents PyTorch from tracking the operation.
109 |
110 |         # Create the depth consistency loss.
111 |         self.idepth_consistency_loss = DepthConsistencyL1(
112 |             idepth, idepth_range,
113 |             depth_confidence=idepth_confidence,
114 |             multiplier=loss_param['lambda_depth_consistency'])
115 |
116 |         # Create the 2D normal consistency loss.
117 |         if loss_param['lambda_normal_consistency'] > 0:
118 |
119 |             assert inormal is not None, 'Cannot activate the normal consistency term with no input normal map.'
120 |
121 |             self.inormal_consistency_loss = NormalConsistencyL1(
122 |                 inormal,
123 |                 normal_confidence=idepth_confidence,
124 |                 multiplier=loss_param['lambda_normal_consistency'])
125 |
126 |         else:
127 |             self.inormal_consistency_loss = None
128 |
129 |         # Create the depth regularization loss.
130 |         self.regularization_loss = PieceWisePlanarRegularization(
131 |             image,
132 |             loss_param['gamma_regularization'],
133 |             window_size=loss_param['window_size'],
134 |             patch_size=loss_param['patch_size'],
135 |             sigma_intensity=loss_param['sigma_intensity'],
136 |             sigma_spatial=loss_param['sigma_spatial'],
137 |             degree_max=loss_param['degree_max'],
138 |             version=loss_param['regularization'],
139 |             multiplier=loss_param['lambda_regularization'],
140 |             device=device)
141 |
142 |     def forward(self) -> Tuple[torch.Tensor, float, float, float]:
143 |         """It evaluates the loss function at (`self.idepth`, `self.inormal`).
144 |
145 |         Returns:
146 |             the loss function value, and the value of its three terms, at (`self.idepth`, `self.inormal`).
147 |         """
148 |
149 |         # Inverse depth consistency loss.
150 |         idepth_consistency_loss = self.idepth_consistency_loss(self.idepth)
151 |
152 |         # 2D normal consistency loss.
153 |         if self.inormal_consistency_loss is not None:
154 |             inormal_consistency_loss = self.inormal_consistency_loss(self.inormal)
155 |         else:
156 |             inormal_consistency_loss = self.idepth.new_zeros(1, requires_grad=True)
157 |
158 |         # Regularization loss.
159 |         regularization_loss = self.regularization_loss(self.idepth, self.inormal)
160 |
161 |         # Assemble the full loss.
162 |         loss = idepth_consistency_loss + inormal_consistency_loss + regularization_loss
163 |
164 |         return loss, idepth_consistency_loss.item(), inormal_consistency_loss.item(), regularization_loss.item()
165 |
166 |
167 | def refine_depth(image: np.array, depth: np.array, depth_range: Tuple[float, float],
168 |                  camera_param: Dict[str, float], loss_param: List[Dict], opt_param: List[Dict],
169 |                  depth_confidence: np.array = None,
170 |                  normal: np.array = None,
171 |                  depth_init: np.array = None,
172 |                  normal_init: np.array = None,
173 |                  depth_gt: np.array = None,
174 |                  logger: Logger = None,
175 |                  device: dev = dev('cpu')) -> Tuple[np.array, np.array]:
176 |     """It refines the input depth map and estimates the corresponding normal map in a multi-scale fashion.
177 |
178 |     It refines the input depth map and estimates the corresponding normal map according to the method described
179 |     in the following article:
180 |
181 |     Mattia Rossi, Mireille El Gheche, Andreas Kuhn, Pascal Frossard,
182 |     "Joint Graph-based Depth Refinement and Normal Estimation",
183 |     in IEEE Computer Vision and Pattern Recognition Conference (CVPR), Seattle, WA, USA, 2020.
184 |
185 |     If the input depth map comes together with a normal map, the latter can be refined as well (rather than estimated)
186 |     by activating the normal consistency term (not described in the article).
187 |
188 |     The `loss_param` input parameter contains a list of dictionaries, one for each scale. Each dictionary must contain
189 |     the following keys:
190 |     - lambda_depth_consistency: depth consistency term multiplier.
191 |     - lambda_normal_consistency: normal consistency term multiplier.
192 |     - lambda_regularization: depth regularization term multiplier.
193 |     - gamma_regularization: depth regularization term internal multiplier.
194 |     - window_size: search window size (window_size x window_size) to be used in the graph construction.
195 |     - patch_size: patch size (patch_size x patch_size) to be used in the graph construction.
196 |     - sigma_intensity: color difference standard deviation for patch comparison in the graph construction.
197 |     - sigma_spatial: euclidean distance standard deviation for patch comparison in the graph construction.
198 |     - degree_max: maximum number of per pixel neighbors in the graph.
199 |     - regularization: regularization type (0 for NLTGV, 1 for our regularization).
200 |
201 |     The `opt_param` input parameter contains a list of dictionaries, one for each scale. Each dictionary must contain
202 |     the following keys:
203 |     - iter_max: maximum number of iterations.
204 |     - eps_stop: minimum relative change between the depth maps of two consecutive iterations; below this threshold the optimization stops.
205 |     - attempt_max: maximum number of iterations without improving the loss.
206 |     - learning_rate: dictionary containing the following keys:
207 |         - lr_start: initial learning rate.
208 |         - lr_slot_nb: number of partitions; each partition adopts a learning rate which is 1/10 of the one employed
209 |           in the previous partition; a value greater than or equal to 1 disables the relative depth map change and loss stagnation stopping criteria.
210 |     - plotting_step: number of steps between two plot updates of the logger.
211 |     - depth_error_threshold: error threshold (in meters) to be used in the evaluation against the ground truth.
212 |
213 |     Args:
214 |         image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
215 |         depth: depth map to refine, arranged as an `(H, W)` array.
216 |         depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
217 |         camera_param: dictionary containing `f_x`, `f_y`, `c_x`, `c_y`.
218 |         loss_param: list of dictionaries, each one containing the loss parameters for a given scale.
219 |         opt_param: list of dictionaries, each one containing the solver parameters for a given scale.
220 |         depth_confidence: confidence map associated to the depth map to refine. It must have entries in `[0, 1]`.
221 |         normal: 3D normal map to refine, arranged as an `(H, W, 3)` array. It is ignored if the normal consistency loss is off.
222 |         depth_init: initial guess for the refined depth map.
223 |         normal_init: initial guess for the 3D normal map associated to the refined depth map.
224 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
225 |         logger: logger to plot visual results and statistics at runtime.
226 | device: device on which the computation will take place. 227 | 228 | Returns: 229 | The refined depth map and the corresponding normal map. 230 | """ 231 | 232 | # Number of scales in the multi-scale pyramid. 233 | scale_nb = len(opt_param) 234 | 235 | # Allocate the multi-scale pyramid. 236 | scale_pyramid = [None] * scale_nb 237 | camera_param_pyramid = [None] * scale_nb 238 | image_pyramid = [None] * scale_nb 239 | depth_pyramid = [None] * scale_nb 240 | depth_confidence_pyramid = [None] * scale_nb 241 | normal_pyramid = [None] * scale_nb 242 | depth_init_pyramid = [None] * scale_nb 243 | normal_init_pyramid = [None] * scale_nb 244 | depth_gt_pyramid = [None] * scale_nb 245 | 246 | # Build the multi-scale pyramid. 247 | for i in range(scale_nb): 248 | 249 | if i > 0: 250 | 251 | # Compute the image dimensions for the current scale. 252 | height = int(round(scale_pyramid[i - 1][0] / 2.0)) 253 | width = int(round(scale_pyramid[i - 1][1] / 2.0)) 254 | scale_pyramid[i] = (height, width) 255 | 256 | # Compute the camera parameters for the current scale. 257 | x_ratio = scale_pyramid[i][1] / scale_pyramid[i - 1][1] 258 | y_ratio = scale_pyramid[i][0] / scale_pyramid[i - 1][0] 259 | camera_param_pyramid[i] = {'f_x': camera_param_pyramid[i - 1]['f_x'] * x_ratio, 260 | 'f_y': camera_param_pyramid[i - 1]['f_y'] * y_ratio, 261 | 'c_x': camera_param_pyramid[i - 1]['c_x'] * x_ratio, 262 | 'c_y': camera_param_pyramid[i - 1]['c_y'] * y_ratio} 263 | 264 | # Downscale the image. 265 | image_pyramid[i] = resize_map(image_pyramid[i - 1], scale_pyramid[i], order=1) 266 | 267 | # Downscale the noisy/incomplete depth map. 268 | depth_pyramid[i] = resize_map(depth_pyramid[i - 1], scale_pyramid[i], order=0) 269 | 270 | # Downscale the noisy/incomplete depth map confidence. 271 | if depth_confidence_pyramid[i - 1] is not None: 272 | depth_confidence_pyramid[i] = resize_map(depth_confidence_pyramid[i - 1], scale_pyramid[i], order=0) 273 | else: 274 | depth_confidence_pyramid[i] = None 275 | 276 | # Downscale the noisy/incomplete normal map. 277 | if normal_pyramid[i - 1] is not None: 278 | normal_pyramid[i] = resize_map(normal_pyramid[i - 1], scale_pyramid[i], order=0) 279 | 280 | else: 281 | normal_pyramid[i] = None 282 | 283 | # Downscale the initial depth map estimate (we need only the lowest scale). 284 | if depth_init_pyramid[i - 1] is not None: 285 | depth_init_pyramid[i] = resize_map(depth_init_pyramid[i - 1], scale_pyramid[i], order=0) 286 | depth_init_pyramid[i - 1] = None 287 | else: 288 | depth_init_pyramid[i] = None 289 | 290 | # Downscale the initial normal map estimate (we need only the lowest scale). 291 | if normal_init_pyramid[i - 1] is not None: 292 | normal_init_pyramid[i] = resize_map(normal_init_pyramid[i - 1], scale_pyramid[i], order=0) 293 | normal_init_pyramid[i - 1] = None 294 | else: 295 | normal_init_pyramid[i] = None 296 | 297 | # Downscale the ground truth depth map. 298 | if depth_gt_pyramid[i - 1] is not None: 299 | depth_gt_pyramid[i] = resize_map(depth_gt_pyramid[i - 1], scale_pyramid[i], order=0) 300 | else: 301 | depth_gt_pyramid[i] = None 302 | 303 | else: 304 | 305 | # Store the original image dimensions. 306 | scale_pyramid[i] = (image.shape[0], image.shape[1]) 307 | 308 | # Store the original camera parameters. 309 | camera_param_pyramid[i] = camera_param 310 | 311 | # The lowest scale hosts the original data. 
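            # As an illustration, with a 480x640 reference image and `scale_nb = 3`, this loop produces
            # `scale_pyramid = [(480, 640), (240, 320), (120, 160)]` (before the reversal performed below),
            # and the focal lengths and principal point are rescaled by the same per-axis ratios, so that
            # each scale remains geometrically consistent with its down-sampled image.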
312 | image_pyramid[i] = image 313 | depth_pyramid[i] = depth 314 | depth_confidence_pyramid[i] = depth_confidence 315 | normal_pyramid[i] = normal 316 | depth_init_pyramid[i] = depth_init 317 | normal_init_pyramid[i] = normal_init 318 | depth_gt_pyramid[i] = depth_gt 319 | 320 | # Reverse the multi-scale pyramid. 321 | scale_pyramid.reverse() 322 | camera_param_pyramid.reverse() 323 | image_pyramid.reverse() 324 | depth_pyramid.reverse() 325 | depth_confidence_pyramid.reverse() 326 | normal_pyramid.reverse() 327 | depth_init_pyramid.reverse() # It contains only the lowest scale. 328 | normal_init_pyramid.reverse() # It contains only the lowest scale. 329 | depth_gt_pyramid.reverse() 330 | 331 | # Perform the multi-scale depth refinement. 332 | scale_name_pyramid = [None] * scale_nb 333 | depth_refined_pyramid = [None] * scale_nb 334 | normal_refined_pyramid = [None] * scale_nb 335 | for i in range(scale_nb): 336 | 337 | scale_name_pyramid[i] = ('{} ({}x{})'.format(i, scale_pyramid[i][0], scale_pyramid[i][1])) 338 | print('Processing scale {}'.format(scale_name_pyramid[i])) 339 | 340 | # Setup a new plotting environment. 341 | if logger is not None: 342 | 343 | if depth_gt_pyramid[i] is not None: 344 | depth_plotting_range = (np.min(depth_gt_pyramid[i]).item(), np.max(depth_gt_pyramid[i]).item()) 345 | else: 346 | depth_plotting_range = np.percentile(depth, [5, 95]) 347 | logger.setup(env_name=scale_name_pyramid[i], depth_range=depth_plotting_range) 348 | 349 | # Initialize the next scale with the refined depth map and the corresponding normal map from the previous scale. 350 | # The two maps are up-sampled first. 351 | if i > 0: 352 | depth_init_pyramid[i] = resize_map(depth_refined_pyramid[i - 1], scale_pyramid[i], order=0) 353 | if normal_refined_pyramid[i - 1] is not None: 354 | normal_init_pyramid[i] = resize_map(normal_refined_pyramid[i - 1], scale_pyramid[i], order=0) 355 | 356 | # Refine the depth map of the current scale. 357 | depth_refined, normal_refined = refine( 358 | image_pyramid[i], depth_pyramid[i], depth_range, 359 | camera_param_pyramid[i], loss_param[i], opt_param[i], 360 | depth_confidence=depth_confidence_pyramid[i], 361 | depth_init=depth_init_pyramid[i], 362 | normal=normal_pyramid[i], 363 | normal_init=normal_init_pyramid[i], 364 | depth_gt=depth_gt_pyramid[i], 365 | logger=logger, 366 | device=device) 367 | 368 | depth_refined_pyramid[i] = depth_refined 369 | normal_refined_pyramid[i] = normal_refined 370 | 371 | # Extract the refined depth map and the corresponding normal map. 372 | depth_refined = depth_refined_pyramid[-1] 373 | normal_refined = normal_refined_pyramid[-1] 374 | 375 | # Delete all the plotting environments. 376 | if logger is not None: 377 | for i in range(scale_nb): 378 | logger.vis.delete_env(scale_name_pyramid[i]) 379 | 380 | return depth_refined, normal_refined 381 | 382 | 383 | def refine(image: np.array, depth: np.array, depth_range: Tuple[float, float], 384 | camera_param: Dict[str, float], loss_param: Dict, opt_param: Dict, 385 | depth_confidence: np.array = None, 386 | normal: np.array = None, 387 | depth_init: np.array = None, 388 | normal_init: np.array = None, 389 | depth_gt: np.array = None, 390 | logger: Logger = None, 391 | device: dev = dev('cpu')) -> Tuple[np.array, np.array]: 392 | """It implements one scale of the multi-scale pyramid of the function `refine_depth`. 393 | 394 | Args: 395 | image: reference image, arranged as an `(H, W)` or `(H, W, C)` array. 396 | depth: depth map to refine, arranged as an `(H, W)` array. 
397 |         depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
398 |         camera_param: dictionary containing `f_x`, `f_y`, `c_x`, `c_y`.
399 |         loss_param: dictionary containing the loss parameters.
400 |         opt_param: dictionary containing the solver parameters.
401 |         depth_confidence: confidence map associated to the depth map to refine. It must have entries in `[0, 1]`.
402 |         normal: 3D normal map to refine, arranged as an `(H, W, 3)` array. It is ignored if the normal consistency loss is off.
403 |         depth_init: initial guess for the refined depth map.
404 |         normal_init: initial guess for the 3D normal map associated to the refined depth map.
405 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
406 |         logger: logger to plot visual results and statistics at runtime.
407 |         device: device on which the computation will take place.
408 |
409 |     Returns:
410 |         The refined depth map and the corresponding normal map.
411 |     """
412 |
413 |     # Check that the input maps have the same height and width as the input reference image.
414 |     height = image.shape[0]
415 |     width = image.shape[1]
416 |     assert depth.shape == (height, width),\
417 |         'Input depth map size not compatible with the reference image size.'
418 |     if depth_confidence is not None:
419 |         assert depth_confidence.shape == (height, width),\
420 |             'Input depth map confidence size not compatible with the reference image size.'
421 |     if normal is not None:
422 |         assert normal.shape == (height, width, 3),\
423 |             'Input normal map size not compatible with the reference image size.'
424 |     if depth_init is not None:
425 |         assert depth_init.shape == (height, width),\
426 |             'Input initial depth map size not compatible with the reference image size.'
427 |     if normal_init is not None:
428 |         assert normal_init.shape == (height, width, 3),\
429 |             'Input initial normal map size not compatible with the reference image size.'
430 |     if depth_gt is not None:
431 |         assert depth_gt.shape == (height, width),\
432 |             'Ground truth depth size not compatible with the reference image size.'
433 |
434 |     # Check the depth map data type.
435 |     if depth.dtype == np.float32:
436 |         depth_dtype = torch.float
437 |     elif depth.dtype == np.float64:
438 |         depth_dtype = torch.double
439 |     else:
440 |         raise TypeError('The input depth map must be either of type double or float.')
441 |
442 |     # Convert the reference image to gray scale.
443 |     image_gray = image
444 |     if image_gray.ndim == 3:
445 |         image_gray = cvtColor(image_gray.astype(np.float32), COLOR_RGB2GRAY)
446 |         image_gray = image_gray.astype(image.dtype)
447 |     # The function `cvtColor` requires an input image of type uint8, uint16 or float32. Therefore, `image_gray` is
448 |     # first converted to float32 (to minimize the precision loss) and then back to its original data type.
449 |
450 |     # Plot.
451 |     if logger is not None:
452 |         logger.plot(
453 |             texture=image,
454 |             depth=depth,
455 |             depth_init=depth_init,
456 |             depth_gt=depth_gt,
457 |             normal=normal,
458 |             normal_init=normal_init)
459 |
460 |     # Convert the depth maps.
461 |     idepth = depth2depth_inv(depth)
462 |     idepth_init = depth2depth_inv(depth_init) if depth_init is not None else None
463 |     idepth_range = depth_range2depth_inv_range(depth_range)
464 |
465 |     # Convert the normal maps.
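    # The refinement operates in the inverse depth domain: `depth2depth_inv` maps the depth maps to
    # inverse depth (a disparity-like quantity), and the 3D unit normals are mapped below to the
    # equivalent 2D plane normals `(w_0, w_1)` by `space2plane_normal` (see `misc.py`), using the
    # pinhole camera intrinsics.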
466 |     inormal = None
467 |     inormal_init = None
468 |     if normal is not None:
469 |         inormal = space2plane_normal(
470 |             depth,
471 |             normal,
472 |             (camera_param['f_x'], camera_param['f_y']),
473 |             (camera_param['c_x'], camera_param['c_y']))
474 |     if normal_init is not None:
475 |         inormal_init = space2plane_normal(
476 |             depth_init if depth_init is not None else depth,
477 |             normal_init,
478 |             (camera_param['f_x'], camera_param['f_y']),
479 |             (camera_param['c_x'], camera_param['c_y']))
480 |
481 |     # Create the loss object.
482 |     loss = Loss(image_gray, idepth, idepth_range,
483 |                 loss_param,
484 |                 idepth_confidence=depth_confidence,
485 |                 inormal=inormal,
486 |                 idepth_init=idepth_init,
487 |                 inormal_init=inormal_init,
488 |                 device=device).to(device=device, dtype=depth_dtype)
489 |
490 |     # Set the maximum number of iterations.
491 |     assert 'iter_max' in opt_param, 'Missing \'iter_max\' in `opt_param`.'
492 |     iter_max = opt_param['iter_max']
493 |
494 |     # Set the learning rate and define the optimization policy (i.e., with or without a scheduler).
495 |     assert 'learning_rate' in opt_param, 'Missing \'learning_rate\' in `opt_param`.'
496 |     assert 'lr_start' in opt_param['learning_rate'], 'Missing \'lr_start\' in `opt_param[\'learning_rate\']`.'
497 |     assert 'lr_slot_nb' in opt_param['learning_rate'], 'Missing \'lr_slot_nb\' in `opt_param[\'learning_rate\']`.'
498 |     learning_rate_start = opt_param['learning_rate']['lr_start']
499 |     learning_rate_slot_nb = opt_param['learning_rate']['lr_slot_nb']
500 |
501 |     # Define the stopping condition.
502 |     if learning_rate_slot_nb < 1:
503 |
504 |         # The learning rate is kept constant.
505 |
506 |         # The optimization terminates if one of the following events occurs:
507 |         # - the relative depth change is smaller than `eps_stop`,
508 |         # - the loss is not improved for more than `attempt_max` consecutive iterations,
509 |         # - `iter_max` iterations have been performed.
510 |
511 |         assert 'eps_stop' in opt_param, 'Missing \'eps_stop\' in `opt_param`.'
512 |         assert 'attempt_max' in opt_param, 'Missing \'attempt_max\' in `opt_param`.'
513 |
514 |         eps_stop = opt_param['eps_stop']
515 |         attempt_max = opt_param['attempt_max']
516 |         scheduler_step_size = iter_max * 2
517 |
518 |     else:
519 |
520 |         # The learning rate is dynamically updated.
521 |
522 |         # The optimization terminates only when `iter_max` iterations have been performed.
523 |         # However, in this scenario the learning rate is progressively decreased:
524 |         # - the learning rate starts at `learning_rate_start`,
525 |         # - it is decreased `learning_rate_slot_nb - 1` times by a factor `10`.
526 |
527 |         eps_stop = 0.0
528 |         attempt_max = float('inf')
529 |         scheduler_step_size = int(math.ceil(float(iter_max) / float(learning_rate_slot_nb)))
530 |
531 |     # Set the plotting step.
532 |     assert 'plotting_step' in opt_param, 'Missing \'plotting_step\' in `opt_param`.'
533 |     plotting_step = opt_param['plotting_step']
534 |
535 |     # Allocate the arrays used to store the loss function values.
536 |     loss_history = np.zeros(iter_max + 1)
537 |     idepth_consistency_history = np.zeros(iter_max + 1)
538 |     inormal_consistency_history = np.zeros(iter_max + 1) if loss_param['lambda_normal_consistency'] > 0 else None
539 |     regularization_history = np.zeros(iter_max + 1)
540 |
541 |     # Create an ADAM optimizer.
542 |     optimizer = torch.optim.Adam(loss.parameters(), lr=learning_rate_start)
543 |
544 |     # Create a learning rate scheduler.
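    # Illustrative behavior of the scheduler created below (`gamma=0.1`): with `iter_max = 1000` and
    # `lr_slot_nb = 2`, `scheduler_step_size = 500`, so the learning rate equals `lr_start` for the first
    # 500 iterations and `lr_start / 10` afterwards; with `lr_slot_nb < 1`, `scheduler_step_size = 2 * iter_max`
    # and the learning rate therefore never decays within the run.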
545 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, scheduler_step_size, gamma=0.1) 546 | 547 | #################################################################################################################### 548 | ################################################# OPTIMIZATION ##################################################### 549 | #################################################################################################################### 550 | 551 | # Lowest minimum value of the loss encountered during the optimization. 552 | loss_value_min = float('inf') 553 | 554 | # Number of consecutive iterations without improving `loss_value_min`. 555 | attempt_counter = 0 556 | 557 | # Relative change of the depth map between two consecutive iterations. 558 | relative_depth_change = float('inf') 559 | 560 | ################################################# CASE `i == 0` #################################################### 561 | 562 | # Evaluate the loss function. 563 | optimizer.zero_grad() 564 | loss_value, idepth_consistency_value, inormal_consistency_value, regularization_value = loss.forward() 565 | 566 | # Log operations. 567 | with torch.no_grad(): 568 | 569 | # Store the current value of the loss. 570 | idepth_consistency_history[0] = idepth_consistency_value 571 | if inormal_consistency_history is not None: 572 | inormal_consistency_history[0] = inormal_consistency_value 573 | regularization_history[0] = regularization_value 574 | loss_history[0] = loss_value.item() 575 | 576 | # Log the optimization status to the standard output. 577 | print('Iteration: {:6}, Fails: {:3}, Rel. depth change: {:.6f}, Loss: {:.6f}'.format( 578 | 0, attempt_counter, relative_depth_change, loss_history[0]), flush=True) 579 | 580 | # Plot the optimization status. 581 | indexes = np.arange(0, 1) 582 | if logger is not None: 583 | depth_aux = depth_inv2depth( 584 | loss.idepth.data.to('cpu').squeeze().numpy(), depth_range) 585 | normal_aux = plane2space_normal( 586 | depth_aux, 587 | np.transpose(loss.inormal.data.to('cpu').squeeze().numpy(), (1, 2, 0)), 588 | (camera_param['f_x'], camera_param['f_y']), 589 | (camera_param['c_x'], camera_param['c_y'])) 590 | logger.plot( 591 | depth_refined=depth_aux, 592 | normal_refined=normal_aux, 593 | idepth_consistency_loss=(indexes, idepth_consistency_history[indexes]), 594 | inormal_consistency_loss=((indexes, inormal_consistency_history[indexes]) 595 | if inormal_consistency_history is not None else None), 596 | regularization_loss=(indexes, regularization_history[indexes]), 597 | global_loss=(indexes, loss_history[indexes])) 598 | 599 | ################################################# CASE `i > 0` ##################################################### 600 | 601 | for i in range(1, iter_max + 1): 602 | 603 | # Compute the gradient of each parameter of the loss (i.e., the depth map and the normal maps). 604 | loss_value.backward() 605 | 606 | # Store a copy of the old depth map. 607 | idepth_old = loss.idepth.clone().detach() 608 | 609 | # Update the old depth map. 610 | optimizer.step() 611 | 612 | # Update the optimizer learning rate. 613 | scheduler.step() 614 | 615 | # Without PyTorch tracking, project the new depth map into the specified depth range. 616 | with torch.no_grad(): 617 | loss.idepth.data = loss.idepth.data.clamp(idepth_range[0], idepth_range[1]) 618 | 619 | # Evaluate the loss function at the new depth map and normal map. 
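        # Together with the clamping above, each iteration amounts to a projected gradient step: ADAM
        # updates `(idepth, inormal)`, the inverse depth is projected back onto the box
        # `[idepth_range[0], idepth_range[1]]`, and the loss is then re-evaluated at the projected point.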
620 |         optimizer.zero_grad()
621 |         loss_value, idepth_consistency_value, inormal_consistency_value, regularization_value = loss.forward()
622 |
623 |         # Without PyTorch tracking, perform the logging and bookkeeping routines.
624 |         with torch.no_grad():
625 |
626 |             # Store the value of the loss evaluated at the new depth map.
627 |             idepth_consistency_history[i] = idepth_consistency_value
628 |             if inormal_consistency_history is not None:
629 |                 inormal_consistency_history[i] = inormal_consistency_value
630 |             regularization_history[i] = regularization_value
631 |             loss_history[i] = loss_value.item()
632 |
633 |             # Compute the relative depth map change.
634 |             relative_depth_change = torch.norm(
635 |                 (idepth_old - loss.idepth).view(-1, 1)) / torch.norm(idepth_old.view(-1, 1))
636 |
637 |             # Update the lowest encountered minimum.
638 |             if loss_history[i] >= loss_value_min:
639 |                 attempt_counter = attempt_counter + 1
640 |             else:
641 |                 attempt_counter = 0
642 |                 loss_value_min = loss_history[i]
643 |
644 |             # Evaluate the stopping condition.
645 |             stop_now = (relative_depth_change <= eps_stop) or (attempt_counter >= attempt_max)
646 |
647 |             if (i % plotting_step == 0) or stop_now or ((i + 1) > iter_max):
648 |
649 |                 # Log the optimization status to the standard output.
650 |                 print('Iteration: {:6}, Fails: {:3}, Rel. depth change: {:.6f}, Loss: {:.6f}'.format(
651 |                     i, attempt_counter, relative_depth_change, loss_history[i]), flush=True)
652 |
653 |                 # Plot the optimization status.
654 |                 indexes = np.arange(max(0, i - (plotting_step - 1)), i + 1)  # The index `i` is included; the lower bound is clamped at zero for early stops.
655 |                 if logger is not None:
656 |                     depth_aux = depth_inv2depth(
657 |                         loss.idepth.data.to('cpu').squeeze().numpy(), depth_range)
658 |                     normal_aux = plane2space_normal(
659 |                         depth_aux,
660 |                         np.transpose(loss.inormal.data.to('cpu').squeeze().numpy(), (1, 2, 0)),
661 |                         (camera_param['f_x'], camera_param['f_y']),
662 |                         (camera_param['c_x'], camera_param['c_y']))
663 |                     logger.plot(
664 |                         depth_refined=depth_aux,
665 |                         normal_refined=normal_aux,
666 |                         idepth_consistency_loss=(indexes, idepth_consistency_history[indexes]),
667 |                         inormal_consistency_loss=((indexes, inormal_consistency_history[indexes])
668 |                                                   if inormal_consistency_history is not None else None),
669 |                         regularization_loss=(indexes, regularization_history[indexes]),
670 |                         global_loss=(indexes, loss_history[indexes]))
671 |
672 |             # If the stopping condition is met, terminate.
673 |             if stop_now:
674 |                 break
675 |
676 | ####################################################################################################################
677 | ####################################################################################################################
678 | ####################################################################################################################
679 |
680 |     # Extract the refined depth map.
681 |     depth_refined = depth_inv2depth(
682 |         loss.idepth.detach().to('cpu').numpy().squeeze(), depth_range)
683 |
684 |     # Extract the normal map associated to the refined depth map.
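    # `plane2space_normal` inverts `space2plane_normal`: given the refined depth map and the optimized 2D
    # plane normals, it recovers the corresponding 3D unit normals.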
685 | normal_refined = plane2space_normal( 686 | depth_refined, 687 | np.transpose(loss.inormal.detach().to('cpu').numpy().squeeze(), (1, 2, 0)), 688 | (camera_param['f_x'], camera_param['f_y']), 689 | (camera_param['c_x'], camera_param['c_y'])) 690 | 691 | return depth_refined, normal_refined 692 | -------------------------------------------------------------------------------- /misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import math 27 | import numpy as np 28 | import torch 29 | from torch.nn import functional as fun 30 | from scipy.ndimage import map_coordinates 31 | from scipy.interpolate import griddata 32 | from scipy.signal import convolve2d 33 | from filters import gauss_filter_deriv_2d, diff_filter_bank 34 | from transforms import depth2depth_inv 35 | from typing import Tuple, Union 36 | 37 | 38 | def resize_map(data: np.array, size_new: Tuple[int, int], order: int = 0) -> np.array: 39 | """It re-sizes the input map. 40 | 41 | It up-samples or down-samples any map (e.g., an image) with one or more channels. 42 | 43 | Args: 44 | data: map to resize, arranged as an `(H, W)` or `(H, W, C)` array. 45 | size_new: 2-tuple specifying the new height and width. 46 | order: order of the spline to be used in the re-sizing. 47 | 48 | Returns: 49 | The re-sized map, with dimensions `size_new[0], size_new[1]` or `size_new[0], size_new[1], C`. The output 50 | data type reflects the input one. 51 | """ 52 | 53 | # Check that the data is either 2D or 3D. 54 | if (data.ndim != 2) & (data.ndim != 3): 55 | raise ValueError('Input data must be either 2D or 3D.') 56 | 57 | # Input data dimensions. 58 | height = data.shape[0] 59 | width = data.shape[1] 60 | 61 | # The target dimensions. 62 | height_new, width_new = size_new 63 | 64 | # We make the following assumptions: 65 | # - each pixel in the input data has height `1` and width `1`, 66 | # - `data[y, x]` is concentrated at the spatial coordinates `(y, x)`. 
67 |     # According to the previous two assumptions:
68 |     # - the top left corner of the pixel associated to `data[y, x]` is at spatial coordinates `(y - 0.5, x - 0.5)`,
69 |     # - the bottom right corner of the pixel associated to `data[y, x]` is at spatial coordinates `(y + 0.5, x + 0.5)`,
70 |     # - `data` has its top left corner at the spatial coordinates `(- 0.5, - 0.5)`,
71 |     # - `data` has its bottom right corner at the spatial coordinates `(height - 1 + 0.5, width - 1 + 0.5)`.
72 |
73 |     # NOTE:
74 |     # Re-sizing the input data means enlarging the pixel size, not decreasing the data (image or depth) area.
75 |     # After resizing, the top left and bottom right corners of `data` will still be located at spatial coordinates
76 |     # `(- 0.5, - 0.5)` and `(height - 1 + 0.5, width - 1 + 0.5)`, respectively.
77 |
78 |     # New pixel dimensions.
79 |     pixel_height_new = float(height) / height_new
80 |     pixel_width_new = float(width) / width_new
81 |
82 |     # Compute the coordinates of the center of the top left pixel in the re-sized data.
83 |     start_y = - 0.5 + (pixel_height_new / 2.0)
84 |     start_x = - 0.5 + (pixel_width_new / 2.0)
85 |
86 |     # Compute the coordinates of the center of the bottom right pixel in the new data.
87 |     end_y = height - 1 + 0.5 - (pixel_height_new / 2.0)
88 |     end_x = width - 1 + 0.5 - (pixel_width_new / 2.0)
89 |
90 |     # Compute the new sampling grid.
91 |     y_coord_new, x_coord_new = np.mgrid[start_y:end_y:(height_new * 1j), start_x:end_x:(width_new * 1j)]
92 |
93 |     # Organize the sampling grid in a single array.
94 |     points_new = np.stack((y_coord_new.flatten(), x_coord_new.flatten()), axis=1)
95 |
96 |     # Re-sample the input data.
97 |     if data.ndim == 2:
98 |
99 |         # Single channel input.
100 |
101 |         aux = map_coordinates(data, points_new.T, order=order, mode='nearest')
102 |         data_resized = np.reshape(aux, (height_new, width_new))
103 |
104 |     else:
105 |
106 |         # Multiple channel input.
107 |
108 |         # Number of channels.
109 |         channel_nb = data.shape[2]
110 |
111 |         aux = tuple(
112 |             map_coordinates(data[:, :, i], points_new.T, order=order, mode='nearest') for i in range(channel_nb))
113 |         aux = np.stack(aux, axis=1)
114 |         data_resized = np.reshape(aux, (height_new, width_new, channel_nb))
115 |
116 |     return data_resized
117 |
118 |
119 | def filler_1d(data: np.array, mask: np.array) -> np.array:
120 |     """It fills a sparse 1D array.
121 |
122 |     It fills the missing entries in the sparse 1D array `data` using linear interpolation.
123 |     A missing entry can be filled if and only if it is included between two available entries.
124 |
125 |     Args:
126 |         data: sparse array of dimension `(N,)`.
127 |         mask: array of dimension `(N,)` with `mask[i]` equal to `1` if the entry `data[i]` is available, equal to `0`
128 |             if `data[i]` needs to be filled.
129 |
130 |     Returns:
131 |         A new filled array with `nan` values at those entries that could not be filled.
132 |     """
133 |
134 |     # Array length.
135 |     length = len(data)
136 |
137 |     # Array support.
138 |     line = np.arange(0, length)
139 |
140 |     # Compute the coordinates of the available entries.
141 |     available_entries = line[mask.astype(bool)]
142 |
143 |     # Compute the coordinates of the non available entries.
144 |     target_entries = line[~mask.astype(bool)]
145 |
146 |     # Allocate the filled array.
147 |     data_filled = np.copy(data)
148 |
149 |     # Perform the filling.
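    # Illustrative example: with `data = [1., 0., 3., 0.]` and `mask = [1, 0, 1, 0]`, the available entries
    # sit at positions 0 and 2, so linear interpolation fills position 1 with `2.0`, while position 3,
    # which is not bracketed by two available entries, is assigned `nan`.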
150 |     if len(available_entries) <= 1:
151 |         data_filled[~mask.astype(bool)] = math.nan
152 |     else:
153 |         data_filled[target_entries] = griddata(
154 |             available_entries, (data[available_entries]), target_entries, method='linear')
155 |         # The target entries to the left of the left-most available entry or to the right of the right-most available
156 |         # entry are assigned the `nan` value.
157 |
158 |     return data_filled
159 |
160 |
161 | def filler_2d(data: np.array, mask: np.array) -> np.array:
162 |     """It fills a sparse 2D array.
163 |
164 |     It fills the missing entries in the sparse 2D array `data` using the following approach.
165 |     First, two candidates are computed:
166 |     - one obtained by interpolating linearly all the rows separately,
167 |     - one obtained by interpolating linearly all the columns separately.
168 |     Then, the derivatives of the two candidates are computed:
169 |     - the horizontal candidate derivative is obtained by deriving each row separately,
170 |     - the vertical candidate derivative is obtained by deriving each column separately.
171 |     For each missing entry, the candidate with the lowest derivative (in absolute value) is selected.
172 |
173 |     Missing entries with only one candidate are assigned that candidate. Missing entries without any candidate are
174 |     filled with nearest neighbor interpolation.
175 |
176 |     Args:
177 |         data: sparse array of dimensions `(H, W)`.
178 |         mask: array of dimension `(H, W)` with `mask[i, j]` equal to `1` if the entry `data[i, j]` is available,
179 |             equal to `0` if `data[i, j]` needs to be filled.
180 |
181 |     Returns:
182 |         A new filled array.
183 |     """
184 |
185 |     # Initialize the filled data with the input one.
186 |     data_filled = np.copy(data)
187 |
188 |     # Check whether there are entries to fill. If there are, then fill them.
189 |     if np.sum(mask) != data.size:
190 |
191 |         # Input data dimensions.
192 |         height, width = data.shape
193 |
194 |         # Perform the horizontal filling.
195 |         data_horiz = np.zeros_like(data)
196 |         for i in range(height):
197 |             data_horiz[i, :] = filler_1d(data[i, :], mask[i, :])
198 |
199 |         # Compute the horizontal derivative. `nan` derivatives are set to infinity.
200 |         derivative_horiz = np.abs(np.diff(np.append(data_horiz, data_horiz[:, -2:-1], axis=1), axis=1))
201 |         derivative_horiz[np.isnan(derivative_horiz)] = float('inf')
202 |
203 |         # Perform the vertical filling.
204 |         data_vert = np.zeros_like(data)
205 |         for i in range(width):
206 |             data_vert[:, i] = filler_1d(data[:, i], mask[:, i])
207 |
208 |         # Compute the vertical derivative. `nan` derivatives are set to infinity.
209 |         derivative_vert = np.abs(np.diff(np.append(data_vert, data_vert[-2:-1, :], axis=0), axis=0))
210 |         derivative_vert[np.isnan(derivative_vert)] = float('inf')
211 |
212 |         # Detect those pixels where the horizontal derivative is stronger than the vertical one, in absolute value.
213 |         mask_orientation = derivative_horiz > derivative_vert
214 |
215 |         # Perform the merging.
216 |         data_filled = np.copy(data_horiz)
217 |         data_filled[mask_orientation] = data_vert[mask_orientation]
218 |         # Entries where no estimate is available (if any) are equal to `nan`.
219 |
220 |         # Detect the entries where no estimate is available (if any), and fill them via nearest neighbor interpolation.
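        # An entry may end up with no estimate when neither its row nor its column brackets it between two
        # available samples (e.g., a border region with no valid entry on either side): both candidates are
        # `nan` there, so the nearest available value is used as a last resort.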
221 |         mask_unfilled = np.isnan(data_filled)
222 |         if np.sum(mask_unfilled) > 0:
223 |             i, j = np.mgrid[0:data.shape[0]:1, 0:data.shape[1]:1]
224 |             available_entries = np.stack((i[~mask_unfilled], j[~mask_unfilled]), axis=1)
225 |             target_entries = np.stack((i[mask_unfilled], j[mask_unfilled]), axis=1)
226 |             data_filled[mask_unfilled] = griddata(
227 |                 available_entries, (data_filled[~mask_unfilled]), target_entries, method='nearest')
228 |
229 |     return data_filled
230 |
231 |
232 | def filler_2d_nearest(data: np.array, mask: np.array) -> np.array:
233 |     """It fills a sparse 2D array using nearest neighbour interpolation.
234 |
235 |     Args:
236 |         data: sparse array of dimensions `(H, W)`.
237 |         mask: array of dimension `(H, W)` with `mask[i, j]` equal to `1` if the entry `data[i, j]` is available,
238 |             equal to `0` if `data[i, j]` needs to be filled.
239 |
240 |     Returns:
241 |         A new filled array.
242 |     """
243 |
244 |     # Initialize the filled data with the input one.
245 |     data_filled = np.copy(data)
246 |
247 |     mask_available = mask.astype(bool)
248 |
249 |     # Check whether there are entries to fill. If there are, then fill them.
250 |     if np.sum(mask) != data.size:
251 |
252 |         i, j = np.mgrid[0:data.shape[0]:1, 0:data.shape[1]:1]
253 |         available_entries = np.stack((i[mask_available], j[mask_available]), axis=1)
254 |         target_entries = np.stack((i[~mask_available], j[~mask_available]), axis=1)
255 |         data_filled[~mask_available] = griddata(
256 |             available_entries, (data_filled[mask_available]), target_entries, method='nearest')
257 |
258 |     return data_filled
259 |
260 |
261 | def similarity_graph(image: torch.Tensor,
262 |                      window_size: int = 9, patch_size: int = 7,
263 |                      sigma_intensity: float = 0.2, sigma_spatial: float = 3.0,
264 |                      degree_max: int = 15) -> Tuple[torch.Tensor, torch.Tensor]:
265 |     """It builds a similarity graph on the input image.
266 |
267 |     Args:
268 |         image: reference image, arranged as a `(1, C, H, W)` tensor.
269 |         window_size: edge size of the square searching window.
270 |         patch_size: edge size of the square patch used in the similarity computation.
271 |         sigma_intensity: intensity standard deviation for the gaussian similarity weights.
272 |         sigma_spatial: spatial standard deviation for the gaussian similarity weights.
273 |         degree_max: maximum number of neighbors for each node (pixel) in the similarity graph.
274 |
275 |     Returns:
276 |         A tuple containing two `(1, degree_max, H, W)` tensors. The entry `(0, k, i, j)` of the first tensor stores the
277 |         similarity weight between the pixel `(i, j)` of the input image and its k-th best neighbor.
278 |         The linear index of the k-th best neighbor is stored in the entry `(0, k, i, j)` of the second tensor.
279 |         A pixel `(i, j)` with fewer than `degree_max` neighbors has the unused entries of `(0, :, i, j)` in the first
280 |         tensor set to zero. The linear index, in the second tensor, associated to the aforementioned zero weights is
281 |         the linear index of the pixel `(i, j)` itself.
282 |     """
283 |
284 |     # Check the input image type.
285 |     assert image.is_floating_point(), "The input image must be of type float."
286 |
287 |     # Image dimensions.
288 |     channel_nb = image.size(1)
289 |     height = image.size(2)
290 |     width = image.size(3)
291 |
292 |     # Organize the channels in the batch dimension.
293 |     image_aux = image
294 |     if channel_nb > 1:
295 |         image_aux = image.transpose(0, 1).contiguous()
296 |
297 |     # Create the filters to be used to compute the patch similarity.
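    # The construction below compares each pixel with the other `(window_size ** 2) - 1` pixels in its
    # search window: per-pixel squared differences are computed with a bank of shift-and-subtract filters,
    # and an integral image turns them into `patch_size x patch_size` patch SSDs in constant time per
    # pixel. The resulting edge weight is
    # `exp(- ssd / (2 * sigma_intensity ** 2) - dist ** 2 / (2 * sigma_spatial ** 2))`,
    # i.e., a non-local-means-style patch similarity modulated by the spatial distance between the pixels.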
298 |     filter_bank = diff_filter_bank(window_size).to(image_aux)
299 |
300 |     # Compute the padding for the patch similarity computation.
301 |     window_radius = int((window_size - 1) / 2.0)
302 |     patch_radius = int((patch_size - 1) / 2.0)
303 |     pad = [window_radius + patch_radius] * 4
304 |
305 |     # Compute the pixel similarity.
306 |     pixel_similarity = fun.conv2d(
307 |         fun.pad(image_aux, pad, mode='replicate'), filter_bank).pow(2).sum(dim=0, keepdim=True)
308 |     # `pixel_similarity` is `(1, (window_size * window_size) - 1, height + (2 * patch_radius), width + (2 * patch_radius))`.
309 |
310 |     # Compute the integral image associated to `pixel_similarity`.
311 |     pad = (1, 0, 1, 0)  # (pad_left, pad_right, pad_top, pad_bottom)
312 |     integral = fun.pad(pixel_similarity, pad, mode='constant', value=0).cumsum(dim=2).cumsum(dim=3)
313 |     # `integral` is `(1, (window_size * window_size) - 1, height + (2 * patch_radius) + 1, width + (2 * patch_radius) + 1)`.
314 |
315 |     # Free the memory associated to `pixel_similarity`.
316 |     del pixel_similarity
317 |
318 |     # Exploit the integral image to compute the patch similarity in constant time.
319 |     integral_height = integral.size(2)
320 |     integral_width = integral.size(3)
321 |     bottom_right = integral.narrow(2, integral_height - height, height).narrow(3, integral_width - width, width)
322 |     bottom_left = integral.narrow(2, integral_height - height, height).narrow(3, 0, width)
323 |     top_right = integral.narrow(2, 0, height).narrow(3, integral_width - width, width)
324 |     top_left = integral.narrow(2, 0, height).narrow(3, 0, width)
325 |     patch_similarity = bottom_right.clone().add_(bottom_left, alpha=-1.0).add_(top_right, alpha=-1.0).add_(top_left)
326 |
327 |     # DEBUG.
328 |     # patch_similarity.sqrt_()
329 |
330 |     # Normalize the patch similarity.
331 |     patch_similarity.div_((- 2.0) * (sigma_intensity ** 2))
332 |
333 |     # Free the memory associated to `integral`.
334 |     del integral
335 |
336 |     # Define the window grid.
337 |     y_window, x_window = torch.meshgrid(
338 |         [torch.arange(- window_radius, window_radius + 1, dtype=torch.int16, device=image_aux.device),
339 |          torch.arange(- window_radius, window_radius + 1, dtype=torch.int16, device=image_aux.device)])
340 |     y_window = y_window.reshape(1, -1)
341 |     x_window = x_window.reshape(1, -1)
342 |
343 |     # Remove the entry `(0, 0)` from the window grid, as `filter_bank` does not contain any filter for this coordinate.
344 |     mask = (y_window == 0) & (x_window == 0)
345 |     y_window = y_window[~mask].reshape(1, -1, 1, 1)
346 |     x_window = x_window[~mask].reshape(1, -1, 1, 1)
347 |
348 |     # Compute the squared spatial distance.
349 |     spatial_weights = x_window.to(patch_similarity).pow_(2) + y_window.to(patch_similarity).pow_(2)
350 |
351 |     # Normalize the spatial distance.
352 |     spatial_weights.div_((- 2.0) * (sigma_spatial ** 2))
353 |
354 |     # Compute the global weights (based on both patch similarity and spatial distance).
355 |     weights = patch_similarity.add_(spatial_weights).exp_()
356 |     # weights = patch_similarity.exp_()  # DEBUG.
357 |
358 |     # Define the image grid.
359 |     y_source, x_source = torch.meshgrid(
360 |         [torch.arange(height, dtype=torch.int16, device=image_aux.device),
361 |          torch.arange(width, dtype=torch.int16, device=image_aux.device)])
362 |     y_source = y_source[None, None, ]
363 |     x_source = x_source[None, None, ]
364 |
365 |     # Detect and remove the non valid weights, i.e., those associated to pixels outside the actual image support.
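    # Out-of-image neighbors are handled in two stages: here their weights are zeroed out by multiplying
    # with the validity masks, and further below the linear index of any such neighbor that survives the
    # top-k selection is redirected to the pixel itself, so that a zero-weight self-loop replaces the
    # invalid edge.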
366 |     y_target = torch.zeros_like(y_source)
367 |     x_target = torch.zeros_like(x_source)
368 |     for i in range(weights.size(1)):
369 |
370 |         # Compute the neighbouring pixel coordinates.
371 |         torch.add(y_source, y_window.narrow(1, i, 1), out=y_target)
372 |         torch.add(x_source, x_window.narrow(1, i, 1), out=x_target)
373 |
374 |         # Detect the non valid coordinates and set them to zero.
375 |         weights.narrow(1, i, 1).mul_(
376 |             (y_target >= 0).to(weights)).mul_(
377 |             (y_target < height).to(weights)).mul_(
378 |             (x_target >= 0).to(weights)).mul_(
379 |             (x_target < width).to(weights))
380 |
381 |     # For each pixel, select the `degree_max` neighbours with the largest weights.
382 |     weights_top, indexes = torch.topk(weights, degree_max, dim=1)
383 |     # Note that, although the weights associated to non valid neighbours have been set equal to zero, some of these
384 |     # neighbours may still have been selected. This must be taken into account later.
385 |
386 |     # Free the memory associated to `weights`.
387 |     del weights
388 |
389 |     # Normalize the vector of weights associated to each pixel by its sum.
390 |     weights_top.div_(
391 |         torch.max(weights_top.sum(dim=1, keepdim=True).expand_as(weights_top), weights_top.new_ones(1) * 1e-12))
392 |
393 |     # Build the tensor `index_linear`.
394 |     index_linear = torch.zeros_like(weights_top, dtype=torch.long)
395 |     for i in range(degree_max):
396 |
397 |         # Flatten the spatial dimensions of `indexes`.
398 |         indexes_flattened = indexes.narrow(1, i, 1).view(1, -1, 1, 1)
399 |
400 |         # Compute the neighboring pixel coordinates.
401 |         torch.add(
402 |             y_source,
403 |             torch.gather(y_window, 1, indexes_flattened).view(y_source.size()),
404 |             out=y_target)
405 |         torch.add(
406 |             x_source,
407 |             torch.gather(x_window, 1, indexes_flattened).view(x_source.size()),
408 |             out=x_target)
409 |
410 |         # The coordinates of the non valid neighbors of a pixel `p` are set equal to the coordinates of `p` itself.
411 |         mask = None
412 |         if (y_target < 0).any() or (y_target >= height).any():
413 |             mask = (y_target < 0) | (y_target >= height)
414 |             y_target[mask] = y_source[mask]
415 |         if (x_target < 0).any() or (x_target >= width).any():
416 |             mask = (x_target < 0) | (x_target >= width)
417 |             x_target[mask] = x_source[mask]
418 |
419 |         # Convert the spatial indexes into linear ones.
420 |         torch.add(
421 |             x_target.to(index_linear),
422 |             y_target.to(index_linear),
423 |             alpha=width,
424 |             out=index_linear.narrow(1, i, 1))
425 |
426 |     # Free the memory associated to `y_target`, `x_target`, `mask`.
427 |     del y_target, x_target, mask
428 |
429 |     return weights_top, index_linear
430 |
431 |
432 | def unravel_index(index: Union[np.ndarray, torch.Tensor], size: Tuple[int, int])\
433 |         -> Union[Tuple[np.ndarray, np.ndarray], Tuple[torch.Tensor, torch.Tensor]]:
434 |     """It converts linear indexes into matrix indexes.
435 |
436 |     It converts each input linear index `i` into a pair `(row, col)` for the matrix whose shape is specified as input.
437 |
438 |     Args:
439 |         index: linear indexes, arranged as an `(N,)` array or tensor.
440 |         size: matrix shape.
441 |
442 |     Returns:
443 |         A tuple containing the row and column indexes, each one arranged as an `(N,)` array or tensor.
444 |     """
445 |
446 |     height, width = size
447 |
448 |     # # Check the class of the input data.
449 |     # index_class = type(index).__name__
450 |     #
451 |     # if index_class == 'ndarray':
452 |     #
453 |     #     row, col = np.divmod(index, width)
454 |     #
455 |     # elif index_class == 'Tensor':
456 |     #
457 |     #     row = (index.div(width)).floor_()
458 |     #     col = index.fmod(width)
459 |     #
460 |     # else:
461 |     #
462 |     #     raise TypeError('The input index data type must be ndarray or Tensor.')
463 |
464 |     row = index // width
465 |     col = index % width
466 |
467 |     return row, col
468 |
469 |
470 | def depth_percentage_error(depth: np.array, depth_gt: np.array, threshold: float):
471 |     """It computes the percentage of pixels whose depth has an error larger than a predefined threshold.
472 |
473 |     Args:
474 |         depth: depth map to check, arranged as an `(H, W)` array.
475 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
476 |         threshold: error threshold.
477 |
478 |     Returns:
479 |         The percentage of pixels in the input depth map with an error larger than the specified threshold.
480 |     """
481 |
482 |     mask = (depth_gt > 0) & (depth_gt < float('inf'))
483 |     error = np.abs(depth_gt - depth)
484 |     error = (np.sum(error[mask] > threshold) / np.sum(mask)) * 100
485 |
486 |     return error
487 |
488 |
489 | def space2plane_normal(depth: np.array, normal: np.array,
490 |                        focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
491 |     """It computes the 2D normals associated to the inverse depth, starting from the 3D normals.
492 |
493 |     The unitary normal associated to a 3D point `(X_0, Y_0, Z_0)` defines a plane `P` that locally approximates the
494 |     surface around the point itself. Let us indicate with `(x_0, y_0)` the coordinates of the projection of
495 |     `(X_0, Y_0, Z_0)` onto the camera image plane. Assuming a pinhole camera model, the inverse depth of the points
496 |     of `P`, seen as a function over the image plane, is a plane as well, `P1` hereafter, passing through the point
497 |     `(x_0, y_0, 1 / depth[x_0, y_0])`. In particular, the plane `P1` is described by the following equation:
498 |
499 |     `(1 / depth[x, y]) = (1 / depth[x_0, y_0]) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`
500 |
501 |     where the direction of the (not necessarily unitary) vector `(w_0, w_1, -1)` defines the orientation of `P1`.
502 |     For each pixel in the input depth map, this function leverages the normal of the corresponding 3D point to compute
503 |     the corresponding vector `(w_0, w_1)`.
504 |
505 |     Input 3D normals perpendicular to the line of sight of the corresponding 3D point are mapped to the 2D zero vector.
506 |     Input 3D normals whose corresponding depth is not valid are mapped to the 2D zero vector.
507 |
508 |     Args:
509 |         depth: depth map, arranged as an `(H, W)` array.
510 |         normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
511 |         focal: tuple containing the camera focal lengths `(f_x, f_y)`.
512 |         center: tuple containing the camera principal point coordinates `(c_x, c_y)`.
513 |
514 |     Returns:
515 |         The 2D normals associated to the input 3D normals, arranged as an `(H, W, 2)` array.
516 |     """
517 |
518 |     # Define the data type to be used below: 64-bit precision is recommended.
519 |     dtype = np.float64
520 |
521 |     # Convert the input depth map to `dtype`.
522 |     d = depth.astype(dtype, copy=False)
523 |
524 |     # Depth map dimensions.
525 |     height = depth.shape[0]
526 |     width = depth.shape[1]
527 |
528 |     # Build the depth map grid.
529 |     x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))
530 |
531 |     # Extract the camera focal lengths and the coordinates of the camera center of projection.
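    # Sanity check for the mapping computed below: a fronto-parallel surface patch with unit normal
    # `(0, 0, -1)` at a pixel with valid depth `d` yields `rho = -d`, hence `w_0 = w_1 = 0`: the inverse
    # depth is locally constant, as expected for a plane parallel to the image plane.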
def space2plane_normal(depth: np.array, normal: np.array,
                       focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It computes the 2D normals associated to the inverse depth, starting from the 3D normals.

    The unitary normal associated to a 3D point `(X_0, Y_0, Z_0)` defines a plane `P` that locally approximates
    the surface around the point itself. Let us indicate with `(x_0, y_0)` the coordinates of the projection of
    `(X_0, Y_0, Z_0)` onto the camera image plane. Assuming a pinhole camera model, the inverse depth associated
    to the plane `P` is a plane as well, `P1` hereafter, passing through the point `(x_0, y_0, 1 / depth[x_0, y_0])`.
    In particular, the plane `P1` is described by the following equation:

    `(1 / depth[x, y]) = (1 / depth[x_0, y_0]) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`

    where the direction of the (not necessarily unitary) vector `(w_0, w_1, -1)` defines the orientation of `P1`.
    For each pixel in the input depth map, this function leverages the normal of the corresponding 3D point to
    compute the corresponding vector `(w_0, w_1)`.

    Input 3D normals orthogonal to the line of sight of the corresponding 3D point are mapped to the 2D zero vector.
    Input 3D normals whose corresponding depth is not valid are mapped to the 2D zero vector as well.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The 2D normals associated to the input 3D normals, arranged as an `(H, W, 2)` array.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Create a copy of the 3D normals where those associated to non available depth entries are set to zero.
    normal_new = np.zeros_like(normal, dtype=dtype)
    normal_new[mask] = normal[mask]

    # Re-normalize the normals.
    normal_norm = np.linalg.norm(normal_new, axis=2)
    mask_nnz = (normal_norm > 0)
    for i in range(3):
        normal_new[:, :, i][mask_nnz] = normal_new[:, :, i][mask_nnz] / normal_norm[mask_nnz]

    # Name the 3D normal components as in the report.
    a = normal_new[:, :, 0]
    b = normal_new[:, :, 1]
    c = normal_new[:, :, 2]

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros_like(depth, dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).
    #
    # As the normals `n` and `-n` are both projected to the same 2D vector, it is not necessary to flip the
    # normals corresponding to Case 2.

    # Allocate the space for the 2D normals and name their components as in the report.
    plane_normal = np.zeros((height, width, 2), dtype=dtype)
    w_0 = plane_normal[:, :, 0]
    w_1 = plane_normal[:, :, 1]

    # Compute the 2D normals associated to the available 3D normals.
    mask = (mask & (rho != 0))
    w_0[mask] = a[mask] / (rho[mask] * focal_x)
    w_1[mask] = b[mask] / (rho[mask] * focal_y)

    # The 3D normals with a valid depth, but corresponding to Case 3 (i.e., `rho == 0`), are not valid.
    # These 3D normals are arbitrarily mapped to the 2D normal `[0, 0]`.

    return plane_normal

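
# A minimal illustrative sketch of `space2plane_normal` (the helper name `_demo_space2plane_normal`
# and the toy camera parameters are assumptions): for a fronto-parallel surface, whose unitary normal
# is `(0, 0, -1)` everywhere, the inverse depth is locally constant, so the associated 2D normal
# `(w_0, w_1)` should be zero at every valid pixel.
def _demo_space2plane_normal() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # All the normals point towards the camera, along the negative `Z` axis.
    normal = np.zeros((height, width, 3))
    normal[:, :, 2] = -1.0

    plane_normal = space2plane_normal(depth, normal, focal=(100.0, 100.0), center=(2.0, 2.0))
    assert np.allclose(plane_normal, 0.0)
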
def plane2space_normal(depth: np.array, normal: np.array,
                       focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It reverts the operation performed by `space2plane_normal`.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: 2D normal map, arranged as an `(H, W, 2)` array.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The 3D normals associated to the input 2D normals, arranged as an `(H, W, 3)` array.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Name the 2D normal components as in the report.
    w_0 = normal[:, :, 0].astype(dtype, copy=False)
    w_1 = normal[:, :, 1].astype(dtype, copy=False)

    # Compute the coefficients of the first linear equation.
    alpha = np.zeros_like(depth, dtype=dtype)
    beta = np.zeros_like(depth, dtype=dtype)
    gamma = np.zeros_like(depth, dtype=dtype)
    alpha[mask] = (w_0[mask] * (x[mask] - center_x) * d[mask] * focal_y) - focal_y
    beta[mask] = w_0[mask] * (y[mask] - center_y) * d[mask] * focal_x
    gamma[mask] = w_0[mask] * d[mask] * (focal_x * focal_y)

    # Compute the coefficients of the second linear equation.
    delta = np.zeros_like(depth, dtype=dtype)
    epsilon = np.zeros_like(depth, dtype=dtype)
    phi = np.zeros_like(depth, dtype=dtype)
    delta[mask] = w_1[mask] * (x[mask] - center_x) * d[mask] * focal_y
    epsilon[mask] = (w_1[mask] * (y[mask] - center_y) * d[mask] * focal_x) - focal_x
    phi[mask] = w_1[mask] * d[mask] * (focal_x * focal_y)

    # Allocate the space for the 3D normals and name their components as in the report.
    space_normal = np.zeros((height, width, 3), dtype=dtype)
    a = space_normal[:, :, 0]
    b = space_normal[:, :, 1]
    c = space_normal[:, :, 2]

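    # A reconstruction of the reasoning behind the case analysis below (the wording is an editor's
    # assumption; the algebra follows from the coefficients defined above): each case solves, for the
    # unitary normal `(a, b, c)`, the system
    #
    #     `(alpha * a) + (beta * b) + (gamma * c) = 0`
    #     `(delta * a) + (epsilon * b) + (phi * c) = 0`,    with `a^2 + b^2 + c^2 = 1`.
    #
    # For instance, when both `w_0` and `w_1` are non-zero, combining the two equations cancels the `c`
    # terms (since `gamma * w_1 == phi * w_0`) and yields `b = kappa * a`, with
    # `kappa = (w_1 * focal_y) / (w_0 * focal_x)`; substituting into the first equation gives `c`, and
    # the unit-norm constraint fixes `|a|`. The sign of `a` is chosen so that the recovered normal
    # points towards the camera (negative `rho`, checked at the end of the function).
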
    # ==== CASE w_0(x, y) NOT ZERO AND w_1(x, y) NOT ZERO ==============================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 != 0) & (w_1 != 0) & mask

    # Auxiliary variables.
    kappa = np.zeros_like(depth, dtype=dtype)
    alpha_beta_kappa = np.zeros_like(depth, dtype=dtype)
    one_plus_kappa_sq = np.zeros_like(depth, dtype=dtype)
    kappa[mask_case] = (w_1[mask_case] * focal_y) / (w_0[mask_case] * focal_x)
    alpha_beta_kappa[mask_case] = alpha[mask_case] + (beta[mask_case] * kappa[mask_case])
    one_plus_kappa_sq[mask_case] = 1.0 + (kappa[mask_case] ** 2)

    a[mask_case] = - (np.sign(w_0[mask_case]) * np.abs(gamma[mask_case])) / np.sqrt(
        (alpha_beta_kappa[mask_case] ** 2) + ((gamma[mask_case] ** 2) * one_plus_kappa_sq[mask_case]))
    b[mask_case] = kappa[mask_case] * a[mask_case]
    c[mask_case] = - ((alpha[mask_case] * a[mask_case]) + (beta[mask_case] * b[mask_case])) / gamma[mask_case]

    # ==== CASE w_0(x, y) NOT ZERO AND w_1(x, y) EQUAL TO ZERO =========================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 != 0) & (w_1 == 0) & mask

    a[mask_case] = - (np.sign(w_0[mask_case]) * np.abs(gamma[mask_case])) / np.sqrt(
        (alpha[mask_case] ** 2) + (gamma[mask_case] ** 2))
    c[mask_case] = - (alpha[mask_case] / gamma[mask_case]) * a[mask_case]

    # ==== CASE w_0(x, y) EQUAL TO ZERO AND w_1(x, y) NOT ZERO =========================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 == 0) & (w_1 != 0) & mask

    b[mask_case] = - (np.sign(w_1[mask_case]) * np.abs(phi[mask_case])) / np.sqrt(
        (epsilon[mask_case] ** 2) + (phi[mask_case] ** 2))
    c[mask_case] = - (epsilon[mask_case] / phi[mask_case]) * b[mask_case]

    # ==== CASE w_0(x, y) EQUAL TO ZERO AND w_1(x, y) EQUAL TO ZERO ====================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 == 0) & (w_1 == 0) & mask

    c[mask_case] = - 1.0

    # ==================================================================================================================

    # Check the normal orientations and invalidate those incompatible with a visible 3D point.

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros((height, width), dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).

    # The sign choices in the case analysis above guarantee that no normal falls in Case 2.
    assert np.sum(rho > 0) == 0, 'Error in the normal map correction.'

    # Detect the 3D normals whose orientation is not compatible with a visible point (Case 3) and set them to zero.
    mask = (rho == 0)
    space_normal[mask] = 0

    return space_normal

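
# An illustrative round-trip check (the helper name `_demo_normal_round_trip` and the toy camera
# parameters are assumptions): projecting a unitary, camera-facing 3D normal map with
# `space2plane_normal` and lifting it back with `plane2space_normal` should recover the input
# normals at every valid pixel.
def _demo_normal_round_trip() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # A tilted normal pointing towards the camera (negative `Z` component), normalized to unit norm.
    n = np.array([0.3, -0.2, -0.9])
    n = n / np.linalg.norm(n)
    normal = np.broadcast_to(n, (height, width, 3)).copy()

    focal = (100.0, 100.0)
    center = (2.0, 2.0)
    plane_normal = space2plane_normal(depth, normal, focal, center)
    normal_back = plane2space_normal(depth, plane_normal, focal, center)

    print(np.abs(normal_back - normal).max())  # Should be close to zero.
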
def depth2normal(depth: np.array,
                 focal: Tuple[float, float], center: Tuple[float, float],
                 filter_size: int = 7, filter_sigma: float = 5.0) -> np.array:
    """It computes the 3D normals associated to the 3D points described by the input depth map.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.
        filter_size: height (and width) of the derivative filters.
        filter_sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.

    Returns:
        The 3D normals associated to the 3D points in the input depth map, arranged as an `(H, W, 3)` array.
    """

    # Build the vertical (y) derivative filter.
    d_gauss_dy = gauss_filter_deriv_2d(filter_size, filter_sigma)

    # Build the gradient filter: the x and y derivative filters are encoded in the real and imaginary parts,
    # respectively, so that a single complex convolution computes both derivatives.
    grad_filter = d_gauss_dy.T + (1j * d_gauss_dy)

    # Compute the inverse depth.
    depth_inv = depth2depth_inv(depth)

    # Compute the inverse depth gradient.
    depth_inv_grad = convolve2d(depth_inv, grad_filter, mode='same', boundary='symm')
    depth_inv_grad = np.stack((np.real(depth_inv_grad), np.imag(depth_inv_grad)), axis=2)

    # Convert the inverse depth gradient field into 3D normals.
    normal = plane2space_normal(depth, depth_inv_grad, focal, center)

    return normal


def check_normal(depth: np.array, normal: np.array,
                 focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It computes the inner product between the 3D point associated to each pixel and the corresponding 3D normal.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The inner product, arranged as an `(H, W)` array, between the 3D point associated to each pixel and the
        corresponding 3D normal. Entries equal to zero correspond either to pixels with no available normal or
        to pixels whose corresponding 3D point is not visible by the camera.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Name the 3D normal components as in the report.
    a = normal[:, :, 0]
    b = normal[:, :, 1]
    c = normal[:, :, 2]

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros_like(depth, dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).

    return rho
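
# A final illustrative sketch (the helper name `_demo_check_normal` and the toy camera parameters
# are assumptions): for camera-facing normals, the inner product returned by `check_normal` should
# be strictly negative at every valid pixel (Case 1 above).
def _demo_check_normal() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # Camera-facing normals: the inner product with the line of sight must be negative.
    normal = np.zeros((height, width, 3))
    normal[:, :, 2] = -1.0

    rho = check_normal(depth, normal, focal=(100.0, 100.0), center=(2.0, 2.0))
    assert (rho < 0).all()

--------------------------------------------------------------------------------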