├── colmap
│   ├── __init__.py
│   └── read_model.py
├── LICENSE
├── pltfuns.py
├── transforms.py
├── README.md
├── filters.py
├── losses.py
├── iofuns.py
├── refine.py
├── logger.py
├── refinement.py
└── misc.py

/colmap/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020,
4 | ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
5 | Laboratoire de Traitement des Signaux 4 (LTS4).
6 | All rights reserved.
7 | 
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 | 
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | 
--------------------------------------------------------------------------------
/pltfuns.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import numpy as np
27 | # import matplotlib.cm
28 | from typing import Tuple
29 | 
30 | 
31 | def normal2rgb(normal: np.array) -> np.array:
32 |     """It maps a 3D normal map into an RGB image.
33 | 
34 |     It maps the input 3D normal map into an RGB image. Since a normal vector has unitary norm, the set of all the
35 |     possible normals describes a unitary sphere. This function maps each point `(X, Y, Z)` on the sphere, hence each
36 |     normal vector, to an RGB value. All non zero normals are assumed valid and no check is performed on them.
37 | 
38 |     Args:
39 |         normal: normal map, arranged as an `(H, W, 3)` array.
40 | 
41 |     Returns:
42 |         An RGB image, arranged as an `(H, W, 3)` array, that encodes the normals.
43 |     """
44 | 
45 |     # Detect the entries of the grid where the 3D normals are available.
46 |     mask = (np.sum(normal != 0, axis=2) != 0)
47 | 
48 |     # Allocate the RGB representation of the normals.
49 |     normal_rgb = np.zeros_like(normal, dtype=np.uint8)
50 | 
51 |     # Map the X, Y and Z coordinates from [-1, 1] to [0, 255].
52 |     normal_rgb[mask] = np.round(((normal.astype(np.float64, copy=False)[mask] + 1.0) / 2.0) * 255).astype(np.uint8)
53 | 
54 |     return normal_rgb
55 | 
56 | 
57 | def normal2rgb_legend(n: int = 500) -> Tuple[np.array, np.array]:
58 |     """It returns a legend for the function `normal2rgb`.
59 | 
60 |     It returns a legend for the color coding adopted in the function `normal2rgb`. The legend comprises two images
61 |     representing the two hemispheres associated with the negative and positive `Z` semi-axes, respectively.
62 | 
63 |     Args:
64 |         n: height (and width) of the output legend images.
65 | 
66 |     Returns:
67 |         The legend arranged as two `(n, n, 3)` arrays, for the negative and positive `Z` semi-axes, respectively.
68 |     """
69 | 
70 |     # Build the X and Y components of the 3D normals.
71 |     x, y = np.meshgrid(np.linspace(- 1, 1, n), np.linspace(- 1, 1, n))
72 | 
73 |     # Detect the entries that are within the unitary circle.
74 |     mask = np.sqrt((x ** 2) + (y ** 2)) <= 1.0
75 | 
76 |     # Compute the Z component of the 3D unitary normals.
77 |     z = np.zeros_like(x)
78 |     z[mask] = np.sqrt(np.abs(1 - (x[mask] ** 2) - (y[mask] ** 2)))
79 | 
80 |     # Set the X and Y entries of the non unitary 3D normals to zero.
81 |     x[~mask] = 0
82 |     y[~mask] = 0
83 | 
84 |     # Build the negative hemisphere of the 3D normal legend.
85 |     normal_z_neg = np.stack((x, y, - z), axis=2)
86 | 
87 |     # Build the positive hemisphere of the 3D normal legend.
88 |     normal_z_pos = np.stack((x, y, z), axis=2)
89 | 
90 |     # Encode the 3D normals into an RGB image.
91 |     normal_z_neg_rgb = normal2rgb(normal_z_neg)
92 |     normal_z_pos_rgb = normal2rgb(normal_z_pos)
93 | 
94 |     return normal_z_neg_rgb, normal_z_pos_rgb
95 | 
96 | 
97 | # def plot_map(heat_map, mask=None, vmin=0.0, vmax=1.0, colormap='viridis'):
98 | #     """It turns the input heat map into an RGB image.
99 | #
100 | #     It turns the input heat map into an RGB image according to the specified input color map. The parameters `vmin`
101 | #     and `vmax` play the same role that they have in `matplotlib.pyplot.imshow`. In particular, calling `imshow` on
102 | #     the input heat map using `vmin` and `vmax` produces the same visual result as calling `imshow` on the RGB image
103 | #     created by this function.
104 | #
105 | #     In addition, the heat map pixels marked as `False` in the input `mask` are converted to white in the RGB image.
106 | #
107 | #     Args:
108 | #         heat_map: heat map, arranged as an `(H, W)` array.
109 | #         mask: binary mask, arranged as an `(H, W)` array.
110 | #         vmin: heat map lower bound.
111 | #         vmax: heat map upper bound.
112 | #         colormap: `matplotlib` colormap.
113 | #
114 | #     Returns:
115 | #         The input heat map converted to RGB.
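#
#     Example (a minimal sketch, assuming the function is uncommented and
#     `matplotlib` is installed; `depth` is an illustrative `(H, W)` array):
#
#         rgb = plot_map(depth, mask=depth > 0, vmin=0.0, vmax=10.0)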
116 | # """ 117 | # 118 | # # Clip the input heat map. 119 | # heat_map_clipped = np.clip(heat_map, vmin, vmax) 120 | # 121 | # # Color map object. 122 | # cmap = matplotlib.cm.get_cmap(colormap) 123 | # 124 | # # Convert the heat map intensity values to RGB triplets. 125 | # heat_map_rgb = cmap((heat_map_clipped - vmin) / (vmax - vmin))[:, :, 0:-1] 126 | # 127 | # # Non valid pixels are assigned the white color. 128 | # if mask is not None: 129 | # mask_rgb = np.repeat(mask[:, :, None], 3, axis=2) 130 | # heat_map_rgb[~mask_rgb] = 1.0 131 | # 132 | # return heat_map_rgb 133 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import numpy as np 27 | import torch 28 | from typing import Union, Tuple 29 | 30 | 31 | DEFAULT_DEPTH_RANGE = (1e-1, 1e3) 32 | 33 | 34 | def depth2depth_inv(depth: Union[np.array, torch.Tensor]) -> Union[np.array, torch.Tensor]: 35 | """It computes `1 / depth`. 36 | 37 | It applies the transformation `1 / depth` to the valid entries of `depth`. The remaining entries are set to zero. 38 | Valid entries of `depth` must belong to the interval `]0, +inf[`. 39 | 40 | Args: 41 | depth: depth map, arranged as an `(H, W)` array. 42 | 43 | Returns: 44 | The transformed depth map. 45 | """ 46 | 47 | # Check the class of the input data. 48 | depth_class = type(depth).__name__ 49 | 50 | # Detect the valid entries. 51 | mask = (depth > 0) & (depth < float('inf')) 52 | 53 | # Select the valid entries. 54 | selection = depth[mask] 55 | 56 | # Perform the transformation. 57 | selection = 1.0 / selection 58 | 59 | # Division could lead to non valid entries. Remove them. 60 | selection[~((selection > 0) & (selection < float('inf')))] = 0 61 | 62 | # Write the transformed depth. 63 | if depth_class == 'ndarray': 64 | 65 | depth_inv = np.zeros_like(depth) 66 | depth_inv[mask] = selection 67 | 68 | elif depth_class == 'Tensor': 69 | 70 | # >>> NOT TESTED !!! 
<<<
71 | 
72 |         depth_inv = torch.zeros_like(depth)
73 |         depth_inv[mask] = selection
74 | 
75 |     else:
76 | 
77 |         raise TypeError('The input must be either of type `numpy.ndarray` or `torch.Tensor`.')
78 | 
79 |     return depth_inv
80 | 
81 | 
82 | def depth_inv2depth(depth_inv: Union[np.array, torch.Tensor],
83 |                     depth_range: Tuple[float, float] = DEFAULT_DEPTH_RANGE) -> Union[np.array, torch.Tensor]:
84 |     """It reverts the operation of the function `depth2depth_inv()`.
85 | 
86 |     It reverts the operation of the function `depth2depth_inv()` by applying the transformation `1 / depth_inv`
87 |     to the valid entries of `depth_inv`. The remaining entries are set to zero.
88 |     Valid entries of `depth_inv` must belong to the interval `]0, +inf[`.
89 |     Upon conversion, the valid depth entries are clipped to the interval `[depth_range[0], depth_range[1]]`,
90 |     which must belong to `]0, +inf[`.
91 | 
92 |     Args:
93 |         depth_inv: transformed depth map, arranged as an `(H, W)` array.
94 |         depth_range: 2-tuple specifying the final depth range.
95 | 
96 |     Returns:
97 |         The depth map resulting from the inverse transformation.
98 |     """
99 | 
100 |     # Check the class of the input data.
101 |     depth_inv_class = type(depth_inv).__name__
102 | 
103 |     # Check the final depth range.
104 |     low, up = depth_range
105 |     assert low > 0 and up < float('inf'), 'The depth range must belong to ]0, +inf[.'
106 | 
107 |     # Detect the valid entries.
108 |     mask = (depth_inv > 0) & (depth_inv < float('inf'))
109 | 
110 |     # Select the valid entries.
111 |     selection = depth_inv[mask]
112 | 
113 |     # Perform the transformation.
114 |     selection = 1.0 / selection
115 | 
116 |     # Division could lead to non valid entries. Remove them.
117 |     selection[~((selection > 0) & (selection < float('inf')))] = 0
118 | 
119 |     # Clip and write the transformed depth.
120 |     if depth_inv_class == 'ndarray':
121 | 
122 |         # Clip.
123 |         selection[selection > 0] = np.clip(selection[selection > 0], low, up)
124 | 
125 |         # Write.
126 |         depth = np.zeros_like(depth_inv)
127 |         depth[mask] = selection
128 | 
129 |     elif depth_inv_class == 'Tensor':
130 | 
131 |         # >>> NOT TESTED !!! <<<
132 | 
133 |         # Clip.
134 |         selection[selection > 0] = torch.clamp(selection[selection > 0], low, up)
135 | 
136 |         # Write.
137 |         depth = torch.zeros_like(depth_inv)
138 |         depth[mask] = selection
139 | 
140 |     else:
141 | 
142 |         raise TypeError('The input must be either of type `numpy.ndarray` or `torch.Tensor`.')
143 | 
144 |     return depth
145 | 
146 | 
147 | def depth_range2depth_inv_range(depth_range: Tuple[float, float]) -> Tuple[float, float]:
148 |     """It converts a depth range into the inverse depth range.
149 | 
150 |     Args:
151 |         depth_range: 2-tuple specifying the depth range.
152 | 
153 |     Returns:
154 |         The inverse depth range 2-tuple.
155 |     """
156 | 
157 |     assert depth_range[0] <= depth_range[1], 'The input depth range is empty.'
158 | 
159 |     assert depth_range[0] > 0 and depth_range[1] < float('inf'), 'The input depth range must belong to ]0, +inf[.'
160 | 
161 |     return 1.0 / depth_range[1], 1.0 / depth_range[0]
162 | 
163 | 
164 | def tensor2array(tensor: torch.Tensor) -> np.array:
165 |     """It converts a torch batch to a numpy batch.
166 | 
167 |     It converts a batch of images stored as a torch tensor of dimensions `(B, C, H, W)` or `(C, H, W)` into a numpy
168 |     array of dimensions `(B, H, W, C)` or `(H, W, C)`, respectively.
169 | 
170 |     Args:
171 |         tensor: tensor to convert.
172 | 
173 |     Returns:
174 |         The converted tensor.
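
    Example (illustrative shapes; the comment states the resulting dimensions):
        batch = torch.zeros(2, 3, 4, 5)
        array = tensor2array(batch)  # array.shape == (2, 4, 5, 3)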
175 |     """
176 | 
177 |     if tensor.dim() == 3:
178 |         array = np.transpose(tensor.numpy(), (1, 2, 0))
179 |     elif tensor.dim() == 4:
180 |         array = np.transpose(tensor.numpy(), (0, 2, 3, 1))
181 |     else:
182 |         raise ValueError('Input tensor dimension must be 3 or 4.')
183 | 
184 |     return array
185 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # depth-refinement-and-normal-estimation
2 | 
3 | This software is meant to refine a noisy and potentially incomplete depth map,
4 | given the corresponding image.
5 | Since the depth map refinement algorithm underneath this software assumes a piece-wise planar world,
6 | this software estimates a normal map jointly with the refined depth map.
7 | The software can take advantage of a continuous confidence map with entries in `[0, 1]`,
8 | where `0` denotes unreliable depth values and `1` denotes reliable ones.
9 | In the absence of a confidence map, a pixel is assigned a confidence equal to `1`
10 | if it has a valid depth, `0` otherwise.
11 | 
12 | This software is released under the MIT license.
13 | If you use this software in your research, please cite the following article:
14 | 
15 |     @inproceedings{rossi_refinement_2020,
16 |         author = {Mattia Rossi and Mireille El Gheche and Andreas Kuhn and Pascal Frossard},
17 |         title = {Joint Graph-based Depth Refinement and Normal Estimation},
18 |         booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Seattle, WA, USA},
19 |         year = {2020}
20 |     }
21 | 
22 | ## Installation
23 | 
24 | The software has been tested with Python 3.7 and it has the following dependencies (the tested
25 | versions are reported in brackets):
26 | 
27 | - pytorch (v.1.4.0),
28 | - opencv (v.3.4.2),
29 | - visdom (v.1.8.9).
30 | 
31 | The software relies on `pytorch`, therefore it can run on both CPU and GPU: the latter is recommended.
32 | Processing depth maps of resolution approximately 3000x2000 pixels requires a GPU equipped with
33 | 12 GB of memory.
34 | The software does not support the parallel use of multiple GPUs.
35 | `visdom` is not a mandatory dependency: it is required only for runtime plotting.
36 | In particular, `visdom` makes it possible to observe the progressive refinement of the input depth map from
37 | a web browser, even if the computation is taking place on a remote server.
38 | 
39 | ## How to run the software
40 | 
41 | The software has a command-line interface, but it can be integrated into third-party code
42 | easily by calling the function `refine` in `refinement.py`.
43 | The following command (new lines must be replaced with spaces) provides an example of usage
44 | of the command-line interface:
45 | 
46 |     python refine.py
47 |     --image <image path>
48 |     --depth <input depth map path>
49 |     --confidence <confidence map path>
50 |     --depth_out <refined depth map path>
51 |     --normal_out <estimated normal map path>
52 |     --cam_focal <camera focal length>
53 |     --cam_center <camera center>
54 |     --depth_min 0.1
55 |     --depth_max 50
56 |     --confidence_threshold 0.5
57 |     --gpu_id 0
58 |     --scale_nb 4
59 |     --lambda_regularization 7.5 7.5 7.5 7.5
60 |     --gamma_regularization 5.5 5.5 5.5 5.5
61 |     --window_size 9 9 9 9
62 |     --patch_size 3 3 3 3
63 |     --sigma_int 0.07 0.07 0.07 0.07
64 |     --sigma_spa 3.0 3.0 3.0 3.0
65 |     --degree_max 20 20 20 20
66 |     --iter_max 4000 3000 2000 1000
67 |     --eps_stop 0.000001 0.000001 0.000001 0.000001
68 |     --attempt_max 50 50 50 50
69 |     --lr_start 0.01 0.01 0.001 0.0001
70 |     --lr_slot_nb 3 3 2 1
71 | 
72 | The above command refines the input depth map with a multi-scale scheme
73 | comprising 4 scales.
74 | As a consequence, the scale-dependent parameters require 4 input values each.
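
Once the refinement has terminated, the refined depth map and the estimated normal map can be
loaded back for inspection with the function `read_bin_file` in `iofuns.py` (described below).
A minimal sketch (the two file names are hypothetical values passed to `--depth_out` and
`--normal_out` above):

    from iofuns import read_bin_file

    depth = read_bin_file('depth_refined.bin')    # `(H, W)` array.
    normal = read_bin_file('normal_refined.bin')  # `(H, W, 3)` array.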
75 | For more details on the software input parameters, please run `python refine.py --help`.
76 | Finally, please note that the input depth and confidence maps must be in binary format (the same used in
77 | [COLMAP](https://github.com/colmap/colmap)).
78 | This is also the same format used to save the refined depth map and the corresponding normal map
79 | on disk.
80 | Reading and writing in binary format is performed by the functions `read_bin_file` and `write_bin_file`,
81 | respectively, in `iofuns.py`.
82 | 
83 | ## Interactive plotting
84 | 
85 | The software allows visualizing the progress of the input depth map refinement via a web browser.
86 | This is implemented using a [VISDOM](https://github.com/facebookresearch/visdom) server.
87 | 
88 | The VISDOM server can be started with the following command:
89 | 
90 |     python -m visdom.server -port <visdom_port> -base_url /<visdom_base_url> &
91 | 
92 | where `<visdom_port>` and `<visdom_base_url>` are an arbitrary port and string, respectively.
93 | The server will be accessible at the web page `<hostname>:<visdom_port>/<visdom_base_url>`,
94 | where `<hostname>` is the address of the machine where the refinement software runs.
95 | If the software is run locally, then `<hostname>` is `localhost`.
96 | 
97 | In order to have the software plot the intermediate results on the VISDOM server, it is necessary
98 | to specify the following two additional parameters when launching the refinement:
99 | 
100 |     --visdom_display_port <visdom_port>
101 |     --visdom_base_url <visdom_base_url>
102 | 
103 | ## License
104 | 
105 | This software itself is licensed under the MIT license.
106 | The software dependencies and the content of the folder `colmap` may have different licenses:
107 | using these within the depth refinement software may affect the resulting software license.
108 | 
109 | Copyright (c) 2020,
110 | ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
111 | Laboratoire de Traitement des Signaux 4 (LTS4).
112 | All rights reserved.
113 | 
114 | Permission is hereby granted, free of charge, to any person obtaining a copy
115 | of this software and associated documentation files (the "Software"), to deal
116 | in the Software without restriction, including without limitation the rights
117 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
118 | copies of the Software, and to permit persons to whom the Software is
119 | furnished to do so, subject to the following conditions:
120 | 
121 | The above copyright notice and this permission notice shall be included in all
122 | copies or substantial portions of the Software.
123 | 
124 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
125 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
126 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
127 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
128 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
129 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
130 | SOFTWARE.
131 | 
132 | Author: Mattia Rossi (rossi-mattia-at-gmail-com)
133 | 
--------------------------------------------------------------------------------
/filters.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import numpy as np 27 | import torch 28 | from typing import Tuple, Union 29 | 30 | 31 | def gauss_filter_1d(length: int, sigma: float) -> np.array: 32 | """It builds a 1D Gaussian filter. 33 | 34 | Args: 35 | length: number of filter taps. 36 | sigma: standard deviation. 37 | 38 | Returns: 39 | A 1D Gaussian filter arranged as a `(length,)` array. 40 | """ 41 | 42 | # Check the filter length. 43 | if (length % 2) == 0: 44 | raise ValueError('The length of the filter must be odd.') 45 | 46 | # Build the filter. 47 | radius = int((length - 1) / 2.0) 48 | x = np.arange(-radius, radius + 1, dtype=np.float32) 49 | y = np.exp(- (x ** 2) / (2 * (sigma ** 2))) / (sigma * np.sqrt(2 * np.pi)) 50 | 51 | # Normalize the filter. 52 | const = np.sum(y) 53 | assert const != 0, 'The filter is zero everywhere.' 54 | y = y / const 55 | 56 | return y 57 | 58 | 59 | def gauss_filter_deriv_1d(length: int, sigma: float) -> np.array: 60 | """It builds the derivative of a 1D Gaussian filter. 61 | 62 | Args: 63 | length: number of filter taps. 64 | sigma: standard deviation. 65 | 66 | Returns: 67 | A 1D Gaussian filter derivative, arranged as a `(length,)` array. 68 | """ 69 | 70 | # Check the filter length. 71 | if (length % 2) == 0: 72 | raise ValueError('The length of the filter must be odd.') 73 | 74 | # Build the filter. 75 | radius = int((length - 1) / 2.0) 76 | x = np.arange(-radius, radius + 1, dtype=np.float32) 77 | y = gauss_filter_1d(length, sigma) * (- x / (sigma ** 2)) 78 | 79 | # Normalize the filter. 80 | const = np.sum(np.abs(y)) 81 | assert const != 0, 'The filter is zero everywhere.' 82 | y = y / const 83 | # TODO: check whether this normalization makes sense. 84 | 85 | return y 86 | 87 | 88 | def gauss_filter_2d(size: int, sigma: float) -> np.array: 89 | """It builds a 2D Gaussian filter. 90 | 91 | Args: 92 | size: height (and width) of the filter. 93 | sigma: standard deviation (in pixels) of the Gaussian filter. 94 | 95 | Returns: 96 | A 2D Gaussian filter arranged as a `(size, size)` array. 97 | """ 98 | 99 | # Build the filter. 100 | y = (gauss_filter_1d(size, sigma)[:, None]).dot(gauss_filter_1d(size, sigma)[None, :]) 101 | 102 | # Normalize the filter. 103 | const = np.sum(y) 104 | assert const != 0, 'The filter is zero everywhere.' 
105 |     y = y / const
106 | 
107 |     return y
108 | 
109 | 
110 | def gauss_filter_deriv_2d(size: int, sigma: float) -> np.array:
111 |     """It builds the vertical derivative of a 2D Gaussian filter.
112 | 
113 |     It builds the vertical derivative of a 2D Gaussian filter. The horizontal derivative can be obtained just by taking
114 |     the transpose of the vertical one.
115 | 
116 |     Args:
117 |         size: height (and width) of the filters.
118 |         sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.
119 | 
120 |     Returns:
121 |         The vertical derivative of a 2D Gaussian filter arranged as a `(size, size)` array.
122 |     """
123 | 
124 |     # Build the filter.
125 |     y = (gauss_filter_deriv_1d(size, sigma)[:, None]).dot(gauss_filter_1d(size, sigma)[None, :])
126 | 
127 |     # Normalize the filter.
128 |     const = np.sum(np.abs(y))
129 |     assert const != 0, 'The filter is zero everywhere.'
130 |     y = y / const
131 | 
132 |     return y
133 | 
134 | 
135 | def gradient_filter(size: int, sigma: float) -> torch.Tensor:
136 |     """It builds a gradient filter for images in PyTorch tensor format.
137 | 
138 |     It builds a filter that can be used with `torch.nn.functional.conv2d` to compute the gradient of a batch of images
139 |     or, more generally, of maps. The images or maps must have only one channel.
140 |     The filter is arranged as a `(2, 1, H, W)` tensor with `[0, :, :, :]` and `[1, :, :, :]` the 2D horizontal and
141 |     vertical derivative filters.
142 | 
143 |     Example:
144 |         batch_nb = 5
145 |         height = 100
146 |         width = 200
147 |         size = 7
148 |         image = torch.randn(batch_nb, 1, height, width)
149 |         filt = gradient_filter(size, 0.1)
150 |         pad = tuple([int((size - 1) / 2)] * 4)
151 |         image_grad = torch.nn.functional.conv2d(torch.nn.functional.pad(image, pad, mode='replicate'), filt)
152 | 
153 |     In the example, `image_grad` is a `(batch_nb, 2, height, width)` tensor with `image_grad[k, 0, :, :]` and
154 |     `image_grad[k, 1, :, :]` the horizontal and vertical derivatives of the image `k`.
155 | 
156 |     Args:
157 |         size: height (and width) of the filters.
158 |         sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.
159 | 
160 |     Returns:
161 |         The gradient filter, arranged as a `(2, 1, H, W)` tensor.
162 |     """
163 | 
164 |     # Build the vertical (y) derivative filter.
165 |     d_gauss_dy = gauss_filter_deriv_2d(size, sigma)
166 | 
167 |     # Flip the filter around the (x, y) origin, as torch.nn.functional.conv2d() performs just cross-correlation rather
168 |     # than the standard convolution.
169 |     d_gauss_dy = np.fliplr(d_gauss_dy)
170 |     d_gauss_dy = np.flipud(d_gauss_dy)
171 | 
172 |     # Build the horizontal (x) derivative filter, which is just the transpose of the vertical one.
173 |     d_gauss_dx = d_gauss_dy.T
174 | 
175 |     # Expand the filters to make them compliant with torch.nn.functional.conv2d.
176 |     d_gauss_dy = d_gauss_dy[None, None, :, :]  # [1, 1, size, size]
177 |     d_gauss_dx = d_gauss_dx[None, None, :, :]  # [1, 1, size, size]
178 | 
179 |     # Concatenate the two filters into a single filter with two channels.
180 |     grad_filter = np.concatenate((d_gauss_dx, d_gauss_dy), axis=0)  # [2, 1, size, size]
181 | 
182 |     # Change the filter type to torch.Tensor.
183 |     grad_filter = torch.from_numpy(grad_filter)
184 | 
185 |     return grad_filter
186 | 
187 | 
188 | def diff_filter_bank(size: Union[int, Tuple[int, int]] = 5) -> torch.Tensor:
189 |     """It builds a derivative filter bank.
190 | 
191 |     It builds a set of `HxW` filters where each filter has only two non zero entries: the central one, whose value
192 |     is `-1`, and another non central one, whose value is `1`. The number of filters is `H*W - 1`, i.e., all the
193 |     possible filters of the described type.
194 | 
195 |     Args:
196 |         size: tuple specifying the height and width of the filter (square filter if only one dimension is specified).
197 | 
198 |     Returns:
199 |         The derivative filter bank, arranged as an `(H*W - 1, 1, H, W)` tensor.
200 |     """
201 | 
202 |     # Filter bank spatial dimensions.
203 |     if isinstance(size, tuple):
204 |         filter_height = size[0]
205 |         filter_width = size[1]
206 |     elif isinstance(size, int):
207 |         filter_height = size
208 |         filter_width = size
209 |     else:
210 |         raise TypeError('Input must be either an integer or a 2-tuple of integers.')
211 | 
212 |     # Number of filters in the filter bank.
213 |     filter_nb = int((filter_height * filter_width) - 1)
214 | 
215 |     # Center of each filter in the filter bank.
216 |     filter_center_y = int((filter_height - 1) / 2.0)
217 |     filter_center_x = int((filter_width - 1) / 2.0)
218 | 
219 |     # Create the filter bank.
220 |     index = 0
221 |     filter_bank = torch.zeros(filter_nb, 1, filter_height, filter_width)
222 |     filter_bank[:, :, filter_center_y, filter_center_x] = - 1.0
223 |     for y in range(filter_height):
224 |         for x in range(filter_width):
225 | 
226 |             if y != filter_center_y or x != filter_center_x:
227 |                 filter_bank[index, :, y, x] = 1.0
228 |                 index += 1
229 | 
230 |     return filter_bank
231 | 
--------------------------------------------------------------------------------
/colmap/read_model.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
2 | # All rights reserved.
3 | #
4 | # Redistribution and use in source and binary forms, with or without
5 | # modification, are permitted provided that the following conditions are met:
6 | #
7 | #     * Redistributions of source code must retain the above copyright
8 | #       notice, this list of conditions and the following disclaimer.
9 | #
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #
14 | #     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
15 | #       its contributors may be used to endorse or promote products derived
16 | #       from this software without specific prior written permission.
17 | #
18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
22 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 | # POSSIBILITY OF SUCH DAMAGE.
29 | #
30 | # Author: Johannes L.
Schoenberger (jsch-at-demuc-dot-de) 31 | 32 | import os 33 | import sys 34 | import collections 35 | import numpy as np 36 | import struct 37 | 38 | 39 | CameraModel = collections.namedtuple( 40 | "CameraModel", ["model_id", "model_name", "num_params"]) 41 | Camera = collections.namedtuple( 42 | "Camera", ["id", "model", "width", "height", "params"]) 43 | BaseImage = collections.namedtuple( 44 | "Image", ["id", "qvec", "tvec", "camera_id", "name", "xys", "point3D_ids"]) 45 | Point3D = collections.namedtuple( 46 | "Point3D", ["id", "xyz", "rgb", "error", "image_ids", "point2D_idxs"]) 47 | 48 | class Image(BaseImage): 49 | def qvec2rotmat(self): 50 | return qvec2rotmat(self.qvec) 51 | 52 | 53 | CAMERA_MODELS = { 54 | CameraModel(model_id=0, model_name="SIMPLE_PINHOLE", num_params=3), 55 | CameraModel(model_id=1, model_name="PINHOLE", num_params=4), 56 | CameraModel(model_id=2, model_name="SIMPLE_RADIAL", num_params=4), 57 | CameraModel(model_id=3, model_name="RADIAL", num_params=5), 58 | CameraModel(model_id=4, model_name="OPENCV", num_params=8), 59 | CameraModel(model_id=5, model_name="OPENCV_FISHEYE", num_params=8), 60 | CameraModel(model_id=6, model_name="FULL_OPENCV", num_params=12), 61 | CameraModel(model_id=7, model_name="FOV", num_params=5), 62 | CameraModel(model_id=8, model_name="SIMPLE_RADIAL_FISHEYE", num_params=4), 63 | CameraModel(model_id=9, model_name="RADIAL_FISHEYE", num_params=5), 64 | CameraModel(model_id=10, model_name="THIN_PRISM_FISHEYE", num_params=12) 65 | } 66 | CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) \ 67 | for camera_model in CAMERA_MODELS]) 68 | 69 | 70 | def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): 71 | """Read and unpack the next bytes from a binary file. 72 | :param fid: 73 | :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. 74 | :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. 75 | :param endian_character: Any of {@, =, <, >, !} 76 | :return: Tuple of read and unpacked values. 
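    Example (illustrative): read_next_bytes(fid, 8, "Q") reads one
    little-endian unsigned 64-bit integer, e.g. the number of cameras.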
77 | """ 78 | data = fid.read(num_bytes) 79 | return struct.unpack(endian_character + format_char_sequence, data) 80 | 81 | 82 | def read_cameras_text(path): 83 | """ 84 | see: src/base/reconstruction.cc 85 | void Reconstruction::WriteCamerasText(const std::string& path) 86 | void Reconstruction::ReadCamerasText(const std::string& path) 87 | """ 88 | cameras = {} 89 | with open(path, "r") as fid: 90 | while True: 91 | line = fid.readline() 92 | if not line: 93 | break 94 | line = line.strip() 95 | if len(line) > 0 and line[0] != "#": 96 | elems = line.split() 97 | camera_id = int(elems[0]) 98 | model = elems[1] 99 | width = int(elems[2]) 100 | height = int(elems[3]) 101 | params = np.array(tuple(map(float, elems[4:]))) 102 | cameras[camera_id] = Camera(id=camera_id, model=model, 103 | width=width, height=height, 104 | params=params) 105 | return cameras 106 | 107 | 108 | def read_cameras_binary(path_to_model_file): 109 | """ 110 | see: src/base/reconstruction.cc 111 | void Reconstruction::WriteCamerasBinary(const std::string& path) 112 | void Reconstruction::ReadCamerasBinary(const std::string& path) 113 | """ 114 | cameras = {} 115 | with open(path_to_model_file, "rb") as fid: 116 | num_cameras = read_next_bytes(fid, 8, "Q")[0] 117 | for camera_line_index in range(num_cameras): 118 | camera_properties = read_next_bytes( 119 | fid, num_bytes=24, format_char_sequence="iiQQ") 120 | camera_id = camera_properties[0] 121 | model_id = camera_properties[1] 122 | model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name 123 | width = camera_properties[2] 124 | height = camera_properties[3] 125 | num_params = CAMERA_MODEL_IDS[model_id].num_params 126 | params = read_next_bytes(fid, num_bytes=8*num_params, 127 | format_char_sequence="d"*num_params) 128 | cameras[camera_id] = Camera(id=camera_id, 129 | model=model_name, 130 | width=width, 131 | height=height, 132 | params=np.array(params)) 133 | assert len(cameras) == num_cameras 134 | return cameras 135 | 136 | 137 | def read_images_text(path): 138 | """ 139 | see: src/base/reconstruction.cc 140 | void Reconstruction::ReadImagesText(const std::string& path) 141 | void Reconstruction::WriteImagesText(const std::string& path) 142 | """ 143 | images = {} 144 | with open(path, "r") as fid: 145 | while True: 146 | line = fid.readline() 147 | if not line: 148 | break 149 | line = line.strip() 150 | if len(line) > 0 and line[0] != "#": 151 | elems = line.split() 152 | image_id = int(elems[0]) 153 | qvec = np.array(tuple(map(float, elems[1:5]))) 154 | tvec = np.array(tuple(map(float, elems[5:8]))) 155 | camera_id = int(elems[8]) 156 | image_name = elems[9] 157 | elems = fid.readline().split() 158 | xys = np.column_stack([tuple(map(float, elems[0::3])), 159 | tuple(map(float, elems[1::3]))]) 160 | point3D_ids = np.array(tuple(map(int, elems[2::3]))) 161 | images[image_id] = Image( 162 | id=image_id, qvec=qvec, tvec=tvec, 163 | camera_id=camera_id, name=image_name, 164 | xys=xys, point3D_ids=point3D_ids) 165 | return images 166 | 167 | 168 | def read_images_binary(path_to_model_file): 169 | """ 170 | see: src/base/reconstruction.cc 171 | void Reconstruction::ReadImagesBinary(const std::string& path) 172 | void Reconstruction::WriteImagesBinary(const std::string& path) 173 | """ 174 | images = {} 175 | with open(path_to_model_file, "rb") as fid: 176 | num_reg_images = read_next_bytes(fid, 8, "Q")[0] 177 | for image_index in range(num_reg_images): 178 | binary_image_properties = read_next_bytes( 179 | fid, num_bytes=64, format_char_sequence="idddddddi") 
180 | image_id = binary_image_properties[0] 181 | qvec = np.array(binary_image_properties[1:5]) 182 | tvec = np.array(binary_image_properties[5:8]) 183 | camera_id = binary_image_properties[8] 184 | image_name = "" 185 | current_char = read_next_bytes(fid, 1, "c")[0] 186 | while current_char != b"\x00": # look for the ASCII 0 entry 187 | image_name += current_char.decode("utf-8") 188 | current_char = read_next_bytes(fid, 1, "c")[0] 189 | num_points2D = read_next_bytes(fid, num_bytes=8, 190 | format_char_sequence="Q")[0] 191 | x_y_id_s = read_next_bytes(fid, num_bytes=24*num_points2D, 192 | format_char_sequence="ddq"*num_points2D) 193 | xys = np.column_stack([tuple(map(float, x_y_id_s[0::3])), 194 | tuple(map(float, x_y_id_s[1::3]))]) 195 | point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) 196 | images[image_id] = Image( 197 | id=image_id, qvec=qvec, tvec=tvec, 198 | camera_id=camera_id, name=image_name, 199 | xys=xys, point3D_ids=point3D_ids) 200 | return images 201 | 202 | 203 | def read_points3D_text(path): 204 | """ 205 | see: src/base/reconstruction.cc 206 | void Reconstruction::ReadPoints3DText(const std::string& path) 207 | void Reconstruction::WritePoints3DText(const std::string& path) 208 | """ 209 | points3D = {} 210 | with open(path, "r") as fid: 211 | while True: 212 | line = fid.readline() 213 | if not line: 214 | break 215 | line = line.strip() 216 | if len(line) > 0 and line[0] != "#": 217 | elems = line.split() 218 | point3D_id = int(elems[0]) 219 | xyz = np.array(tuple(map(float, elems[1:4]))) 220 | rgb = np.array(tuple(map(int, elems[4:7]))) 221 | error = float(elems[7]) 222 | image_ids = np.array(tuple(map(int, elems[8::2]))) 223 | point2D_idxs = np.array(tuple(map(int, elems[9::2]))) 224 | points3D[point3D_id] = Point3D(id=point3D_id, xyz=xyz, rgb=rgb, 225 | error=error, image_ids=image_ids, 226 | point2D_idxs=point2D_idxs) 227 | return points3D 228 | 229 | 230 | def read_points3d_binary(path_to_model_file): 231 | """ 232 | see: src/base/reconstruction.cc 233 | void Reconstruction::ReadPoints3DBinary(const std::string& path) 234 | void Reconstruction::WritePoints3DBinary(const std::string& path) 235 | """ 236 | points3D = {} 237 | with open(path_to_model_file, "rb") as fid: 238 | num_points = read_next_bytes(fid, 8, "Q")[0] 239 | for point_line_index in range(num_points): 240 | binary_point_line_properties = read_next_bytes( 241 | fid, num_bytes=43, format_char_sequence="QdddBBBd") 242 | point3D_id = binary_point_line_properties[0] 243 | xyz = np.array(binary_point_line_properties[1:4]) 244 | rgb = np.array(binary_point_line_properties[4:7]) 245 | error = np.array(binary_point_line_properties[7]) 246 | track_length = read_next_bytes( 247 | fid, num_bytes=8, format_char_sequence="Q")[0] 248 | track_elems = read_next_bytes( 249 | fid, num_bytes=8*track_length, 250 | format_char_sequence="ii"*track_length) 251 | image_ids = np.array(tuple(map(int, track_elems[0::2]))) 252 | point2D_idxs = np.array(tuple(map(int, track_elems[1::2]))) 253 | points3D[point3D_id] = Point3D( 254 | id=point3D_id, xyz=xyz, rgb=rgb, 255 | error=error, image_ids=image_ids, 256 | point2D_idxs=point2D_idxs) 257 | return points3D 258 | 259 | 260 | def read_model(path, ext): 261 | if ext == ".txt": 262 | cameras = read_cameras_text(os.path.join(path, "cameras" + ext)) 263 | images = read_images_text(os.path.join(path, "images" + ext)) 264 | points3D = read_points3D_text(os.path.join(path, "points3D") + ext) 265 | else: 266 | cameras = read_cameras_binary(os.path.join(path, "cameras" + ext)) 267 | 
images = read_images_binary(os.path.join(path, "images" + ext)) 268 | points3D = read_points3d_binary(os.path.join(path, "points3D") + ext) 269 | return cameras, images, points3D 270 | 271 | 272 | def qvec2rotmat(qvec): 273 | return np.array([ 274 | [1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, 275 | 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], 276 | 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2]], 277 | [2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], 278 | 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, 279 | 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1]], 280 | [2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], 281 | 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], 282 | 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2]]) 283 | 284 | 285 | def rotmat2qvec(R): 286 | Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat 287 | K = np.array([ 288 | [Rxx - Ryy - Rzz, 0, 0, 0], 289 | [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], 290 | [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], 291 | [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 292 | eigvals, eigvecs = np.linalg.eigh(K) 293 | qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] 294 | if qvec[0] < 0: 295 | qvec *= -1 296 | return qvec 297 | 298 | 299 | def main(): 300 | if len(sys.argv) != 3: 301 | print("Usage: python read_model.py path/to/model/folder [.txt,.bin]") 302 | return 303 | 304 | cameras, images, points3D = read_model(path=sys.argv[1], ext=sys.argv[2]) 305 | 306 | print("num_cameras:", len(cameras)) 307 | print("num_images:", len(images)) 308 | print("num_points3D:", len(points3D)) 309 | 310 | 311 | if __name__ == "__main__": 312 | main() 313 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import torch 27 | import torch.nn as nn 28 | from misc import similarity_graph, unravel_index 29 | import numpy as np 30 | from typing import Tuple 31 | 32 | 33 | class DepthConsistencyL1(nn.Module): 34 | """This class implements a consistency loss between the input depth map and the estimated one. 
The consistency is
35 |     measured in terms of the L1-norm of the error between the input depth map and the estimated one.
36 |     """
37 | 
38 |     def __init__(self,
39 |                  depth: np.array, depth_range: Tuple[float, float],
40 |                  depth_confidence: np.array = None,
41 |                  multiplier: float = 0.0):
42 |         """Constructor.
43 | 
44 |         Args:
45 |             depth: depth map to refine, arranged as an `(H, W)` array.
46 |             depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
47 |             depth_confidence: confidence map associated with the depth map to refine. It must have entries in `[0, 1]`.
48 |             multiplier: loss multiplier.
49 |         """
50 | 
51 |         super(DepthConsistencyL1, self).__init__()
52 | 
53 |         # Check the input depth range.
54 |         depth_min, depth_max = depth_range
55 |         assert depth_min < depth_max, 'The specified depth range is empty.'
56 | 
57 |         # Extract the depth map confidence.
58 |         if depth_confidence is not None:
59 |             assert (depth_confidence >= 0).all() and (depth_confidence <= 1).all(), \
60 |                 'Depth confidence entries must belong to [0, 1].'
61 |             confidence = depth_confidence
62 |         else:
63 |             confidence = 1
64 | 
65 |         # The confidence is set to zero at non valid depth entries.
66 |         confidence = confidence * ((depth > depth_min) & (depth < depth_max))
67 | 
68 |         # Convert the confidence to tensor and register it.
69 |         self.register_buffer('confidence', torch.as_tensor(confidence[None, None, ]))
70 | 
71 |         # Convert the depth map to tensor and register it.
72 |         self.register_buffer('depth', torch.as_tensor(depth[None, None, ]))
73 | 
74 |         # Register the normalization constant.
75 |         # self.norm_const = self.confidence.sum()
76 |         pixel_nb = depth.shape[0] * depth.shape[1]
77 |         self.norm_const = pixel_nb
78 | 
79 |         # Register the loss multiplier.
80 |         self.multiplier = multiplier
81 | 
82 |     def forward(self, depth: torch.Tensor) -> torch.Tensor:
83 | 
84 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
85 |         loss = depth.new_zeros(1, requires_grad=True)
86 | 
87 |         # If the loss is enabled, evaluate it.
88 |         if self.multiplier > 0:
89 | 
90 |             # Evaluate the loss.
91 |             loss = (depth - self.depth).mul(self.confidence).abs().sum().div(self.norm_const)
92 | 
93 |             # Weight the loss.
94 |             loss = self.multiplier * loss
95 | 
96 |         return loss
97 | 
98 | 
99 | class NormalConsistencyL1(nn.Module):
100 |     """This class implements a consistency loss between the input normal map and the estimated one. The consistency is
101 |     measured in terms of the L1-norm of the error between each pair of input and estimated normals.
102 |     """
103 | 
104 |     def __init__(self,
105 |                  normal: np.array,
106 |                  normal_confidence: np.array = None,
107 |                  multiplier: float = 0.0):
108 |         """Constructor.
109 | 
110 |         Args:
111 |             normal: 2D or 3D normal map to refine, arranged as an `(H, W, 2)` or `(H, W, 3)` array.
112 |             normal_confidence: confidence map associated with the normal map to refine. It must have entries in `[0, 1]`.
113 |             multiplier: loss multiplier.
114 |         """
115 | 
116 |         super(NormalConsistencyL1, self).__init__()
117 | 
118 |         # Extract the normal map confidence.
119 |         if normal_confidence is not None:
120 |             assert (normal_confidence >= 0).all() and (normal_confidence <= 1).all(), \
121 |                 'Normal confidence entries must belong to [0, 1].'
122 |             confidence = normal_confidence
123 |         else:
124 |             confidence = 1
125 | 
126 |         # The confidence is set to zero at non valid normal entries.
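        # (An entry is treated as valid when the sum of its normal components is
        # strictly positive and finite.)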
127 |         aux = np.sum(normal, axis=2)
128 |         confidence = confidence * ((aux > 0) & (aux < float('inf')))
129 | 
130 |         # Convert the confidence to tensor and register it.
131 |         self.register_buffer('confidence', torch.as_tensor(confidence[None, None, ]))
132 | 
133 |         # Convert the normal map to tensor and register it.
134 |         self.register_buffer('normal', torch.as_tensor((np.transpose(normal, (2, 0, 1))[None, ]).copy()))
135 | 
136 |         # Register the normalization constant.
137 |         # self.norm_const = self.confidence.sum()
138 |         pixel_nb = normal.shape[0] * normal.shape[1]
139 |         self.norm_const = pixel_nb
140 | 
141 |         # Register the loss multiplier.
142 |         self.multiplier = multiplier
143 | 
144 |     def forward(self, normal: torch.Tensor) -> torch.Tensor:
145 | 
146 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
147 |         loss = normal.new_zeros(1, requires_grad=True)
148 | 
149 |         # If the loss is enabled, evaluate it.
150 |         if self.multiplier > 0:
151 | 
152 |             # Evaluate the loss.
153 |             loss = (normal - self.normal).mul(self.confidence).abs().sum().div(self.norm_const)
154 | 
155 |             # Weight the loss.
156 |             loss = self.multiplier * loss
157 | 
158 |         return loss
159 | 
160 | 
161 | class PieceWisePlanarRegularization(nn.Module):
162 |     """This class implements a regularizer promoting piece-wise planar functions.
163 |     """
164 | 
165 |     def __init__(self,
166 |                  image: np.array,
167 |                  gamma: float,
168 |                  window_size: int = 9, patch_size: int = 7,
169 |                  sigma_intensity: float = 0.2, sigma_spatial: float = 3.0,
170 |                  degree_max: int = 15,
171 |                  version: int = 1,
172 |                  multiplier: float = 0.0,
173 |                  device: torch.device = torch.device('cpu')):
174 |         """Constructor.
175 | 
176 |         Args:
177 |             image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
178 |             gamma: internal multiplier associated with the vector field part of the loss.
179 |             window_size: search window size (window_size x window_size) to be used in the graph construction.
180 |             patch_size: patch size (patch_size x patch_size) to be used in the graph construction.
181 |             sigma_intensity: color difference standard deviation for patch comparison in the graph construction.
182 |             sigma_spatial: Euclidean distance standard deviation for patch comparison in the graph construction.
183 |             degree_max: maximum number of per pixel neighbors in the graph.
184 |             version: regularization version (`0` for NLTGV or `1` for ours; only `1` is currently implemented).
185 |             multiplier: loss multiplier.
186 |             device: device where the graph computation must take place.
187 |         """
188 | 
189 |         super(PieceWisePlanarRegularization, self).__init__()
190 | 
191 |         # Convert the reference image to tensor.
192 |         if image.ndim == 2:
193 |             image_aux = torch.as_tensor(image[None, None, ])
194 |         elif image.ndim == 3:
195 |             image_aux = torch.as_tensor((np.transpose(image, (2, 0, 1))[None, ]).copy())
196 |         else:
197 |             raise ValueError('The input image must be either gray scale or RGB.')
198 | 
199 |         # Image dimensions.
200 |         height = image_aux.size(2)
201 |         width = image_aux.size(3)
202 | 
203 |         # Compute the neighboring pixels and the corresponding weights.
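        # Each pixel is connected to at most `degree_max` neighbours, selected within a
        # `window_size x window_size` search window by comparing `patch_size x patch_size`
        # patches; `weights` stores the resulting edge weights.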
204 |         weights, neighbours = similarity_graph(
205 |             image_aux.to(device),
206 |             window_size=window_size, patch_size=patch_size,
207 |             sigma_intensity=sigma_intensity, sigma_spatial=sigma_spatial,
208 |             degree_max=degree_max)
209 |         weights = weights.to('cpu')
210 |         neighbours = neighbours.to('cpu')
211 |         # The function `similarity_graph` is fed with a copy of `image_aux` on `device`, therefore the output is on
212 |         # `device` as well and it must be brought back to the CPU.
213 | 
214 |         # Register the number of neighbors per pixel.
215 |         self.neighbour_nb = weights.size(1)
216 | 
217 |         # Flatten the spatial dimensions of `weights` and `neighbours`, and register them.
218 |         weights = weights.view(self.neighbour_nb, -1)
219 |         neighbours = neighbours.view(self.neighbour_nb, -1)
220 |         self.register_buffer('weights', weights)
221 |         self.register_buffer('neighbours', neighbours)
222 | 
223 |         # Compute the distance vector between each pixel and its neighbours, and register it.
224 |         y_source, x_source = unravel_index(
225 |             torch.arange(height * width).view(1, -1),
226 |             (height, width))
227 |         y_target, x_target = unravel_index(
228 |             self.neighbours,
229 |             (height, width))
230 |         dist = torch.cat(
231 |             (x_source.add(-1, x_target.to(x_source))[:, None, ],
232 |              y_source.add(-1, y_target.to(y_source))[:, None, ]),
233 |             dim=1)
234 |         self.register_buffer('dist', dist.to(torch.float64))
235 |         # Note that `dist` is cast to `torch.float64` before being registered. In fact, the function `forward()`
236 |         # requires `self.dist` data type to match the float data type (16, 32 or 64) of the other tensors involved
237 |         # in the computation. One could argue that calling `to()` on the module and specifying the data type would
238 |         # convert all its registered tensors. However, this is not the case for integer tensors. Therefore, in order
239 |         # to have `self.dist` converted by `to()`, its data type must already be a float type. The data type
240 |         # `torch.float64` is chosen to avoid any loss of precision.
241 | 
242 |         # Number of pixels.
243 |         pixel_nb = height * width
244 | 
245 |         # Register the normalization constant.
246 |         self.norm_const = pixel_nb
247 | 
248 |         # Register the multiplier associated with the second order derivative.
249 |         self.gamma = gamma
250 | 
251 |         # Register the regularization type.
252 |         if version == 1:
253 |             self.forward_internal = self.ours
254 |         else:
255 |             raise NotImplementedError('The required regularization does not exist.')
256 | 
257 |         # Register the loss multiplier.
258 |         self.multiplier = multiplier
259 | 
260 |     def forward(self, sig1: torch.Tensor, sig2: torch.Tensor) -> torch.Tensor:
261 | 
262 |         return self.forward_internal(sig1, sig2)
263 | 
264 |     # Our regularization.
265 |     def ours(self, sig1: torch.Tensor, sig2: torch.Tensor) -> torch.Tensor:
266 |         """
267 |         It implements the regularization proposed in the following article:
268 | 
269 |             Mattia Rossi, Mireille El Gheche, Andreas Kuhn, Pascal Frossard,
270 |             "Joint Graph-based Depth Refinement and Normal Estimation",
271 |             in IEEE Computer Vision and Pattern Recognition Conference (CVPR), Seattle, WA, USA, 2020.
272 | 
273 |         Args:
274 |             sig1: main signal, arranged as a `(1, 1, H, W)` tensor.
275 |             sig2: secondary signal, arranged as a `(1, 2, H, W)` tensor.
276 | 
277 |         Returns:
278 |             The considered regularization evaluated at `(sig1, sig2)`.
279 |         """
280 | 
281 |         # Allocate a zero loss in the case that the loss is disabled, i.e., `self.multiplier` is zero.
282 |         loss = sig1.new_zeros(1, requires_grad=True)
283 | 
284 |         # If the loss is enabled, evaluate it.
285 |         if self.multiplier > 0:
286 | 
287 |             # Expand and flatten `sig1` and `sig2`.
288 |             sig1_flattened = sig1[:, None, ]
289 |             sig1_flattened = sig1_flattened.expand(
290 |                 -1, self.neighbour_nb, -1, -1, -1).view(self.neighbour_nb, -1)
291 |             sig2_flattened = sig2[:, None, ]
292 |             sig2_flattened = sig2_flattened.expand(
293 |                 -1, self.neighbour_nb, -1, -1, -1).view(self.neighbour_nb, 2, -1)
294 | 
295 |             # Compute the left part of the regularization.
296 |             aux1 = (sig1_flattened -
297 |                     torch.gather(sig1_flattened, 1, self.neighbours) -
298 |                     (sig2_flattened * self.dist).sum(dim=1))
299 |             aux1 = (aux1 * self.weights).norm(dim=0).sum()
300 | 
301 |             # Compute the right part of the regularization.
302 |             aux2 = (sig2_flattened -
303 |                     torch.gather(sig2_flattened, 2, self.neighbours[:, None, ].expand(-1, 2, -1))).norm(dim=1)
304 |             aux2 = (aux2 * self.weights).sum()
305 | 
306 |             # Add the contribution of the left and right parts.
307 |             loss = aux1 + (self.gamma * aux2)
308 | 
309 |             # Normalize the loss.
310 |             loss = loss.div(self.norm_const)
311 | 
312 |             # Weight the loss.
313 |             loss = self.multiplier * loss
314 | 
315 |         return loss
316 | 
--------------------------------------------------------------------------------
/iofuns.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import numpy as np
27 | from typing import Tuple, Dict
28 | import os
29 | import re
30 | import sys
31 | import struct
32 | from colmap.read_model import read_cameras_binary
33 | from cv2 import imread
34 | 
35 | 
36 | def read_depth_map(path: str, data_format: str,
37 |                    size: Tuple[int, int] = None, stereo_param: Dict = None) -> np.array:
38 |     """It reads a depth map in the ETH3D, COLMAP or Middlebury format.
39 | 
40 |     Args:
41 |         path: path to the depth map.
42 |         data_format: depth map format (`ETH3D`, `COLMAP` or `MIDDLEBURY`).
43 |         size: a 2-tuple specifying the depth map height and width (mandatory only for the ETH3D format).
44 |         stereo_param: stereo parameters (mandatory for `data_format` equal to `'MIDDLEBURY'`).
45 | 
46 | Returns:
47 | The read depth map (in meters) arranged as an `(H, W)` array. Non valid values are signalled with zero entries.
48 | """
49 | 
50 | # Read the depth map.
51 | if data_format == 'ETH3D':
52 | 
53 | # Depth map dimensions.
54 | if size is not None:
55 | height, width = size
56 | else:
57 | raise ValueError('For ETH3D depth type, the `size` parameter is mandatory.')
58 | 
59 | with open(path, "rb") as fid:
60 | depth = np.reshape(np.fromfile(fid, dtype=np.float32), (height, width))
61 | # Note that depth values are of type np.float32.
62 | 
63 | elif data_format == 'COLMAP':
64 | 
65 | depth = read_bin_file(path)
66 | 
67 | elif data_format == 'MIDDLEBURY':
68 | 
69 | assert stereo_param is not None, 'For `data_format` equal to MIDDLEBURY, `stereo_param` is mandatory.'
70 | 
71 | # Read the disparity map.
72 | disparity = read_middlebury_disparity(path)
73 | 
74 | # Convert the disparity to depth: Middlebury 2014 defines `Z = baseline * f / (d + doffs)`, in millimeters.
75 | depth = (stereo_param['baseline'] * stereo_param['cam0'][0, 0]) / (disparity + stereo_param['doffs']) / 1000.0
76 | 
77 | else:
78 | 
79 | raise ValueError('Bad depth format.')
80 | 
81 | # Depending on the source, non valid pixels are signalled either with non positive entries or with infinite ones.
82 | 
83 | # Signal the non valid entries with zero.
84 | depth[(depth < 0) | (depth == float('inf'))] = 0
85 | 
86 | return depth
87 | 
88 | 
89 | def read_normal_map(path: str, data_format: str) -> np.array:
90 | """It reads a normal map in the COLMAP format.
91 | 
92 | Args:
93 | path: path to the normal map.
94 | data_format: the normal map format (currently, only 'COLMAP').
95 | 
96 | Returns:
97 | The read normal map arranged as an `(H, W, 3)` array.
98 | """
99 | 
100 | if data_format == 'COLMAP':
101 | 
102 | normal = read_bin_file(path)
103 | 
104 | else:
105 | 
106 | raise ValueError('Bad normal format.')
107 | 
108 | return normal
109 | 
110 | 
111 | def read_confidence_map(path: str) -> np.array:
112 | """It reads a confidence map stored as an 8-bit image and rescales its entries to `[0, 1]`."""
113 | confidence = (imread(path)[:, :, 0]).astype(np.float64) / 255.0
114 | 
115 | return confidence
116 | 
117 | 
118 | def read_middlebury_disparity(file_name: str) -> np.array:
119 | """It reads a disparity map from the Middlebury 2014 dataset.
120 | 
121 | Args:
122 | file_name: path to the disparity map.
123 | 
124 | Returns:
125 | The loaded disparity map, arranged as an `(H, W)` array.
126 | """
127 | 
128 | # Read the disparity.
129 | disparity = load_pfm(file_name)
130 | 
131 | # It is necessary to flip the disparity upside down.
132 | disparity = np.flipud(disparity)
133 | 
134 | # Non valid disparity entries are signalled with infinity.
135 | 
136 | return disparity
137 | 
138 | 
139 | def read_middlebury_calib_file(file_name: str) -> Dict:
140 | """It reads the calibration file of a scene from the Middlebury 2014 training dataset.
141 | 
142 | It reads the calibration file of a scene from the Middlebury 2014 training dataset provided in
143 | `http://vision.middlebury.edu/stereo/data/scenes2014/`
144 | 
145 | Args:
146 | file_name: calibration file name.
147 | 
148 | Returns:
149 | A dictionary containing all the calibration file parameters.
150 | - cam0: left camera intrinsic matrix, arranged as a `(3, 3)` array,
151 | - cam1: right camera intrinsic matrix, arranged as a `(3, 3)` array,
152 | - doffs: correction offset,
153 | - baseline: camera baseline,
154 | - width: image width,
155 | - height: image height,
156 | - ndisp: ground truth disparity resolution,
157 | - isint: ... ,
158 | - vmin: minimum disparity,
159 | - vmax: maximum disparity,
160 | - dyavg: ... ,
161 | - dymax: ... .
162 | """ 163 | 164 | # Create the parameter dictionary. 165 | param = {} 166 | 167 | with open(file_name) as fp: 168 | 169 | # Read the left camera intrinsic matrix. 170 | line = fp.readline() 171 | data = (line.split('='))[1].rstrip() 172 | data = data.replace('[', '').replace(']', '').replace(';', '') 173 | cam0 = np.reshape(np.fromstring(data, dtype=np.float32, sep=' '), (3, 3)) 174 | param['cam0'] = cam0 175 | 176 | # Read the right camera intrinsic matrix. 177 | line = fp.readline() 178 | data = (line.split('='))[1].rstrip() 179 | data = data.replace('[', '').replace(']', '').replace(';', '') 180 | cam1 = np.reshape(np.fromstring(data, dtype=np.float, sep=' '), (3, 3)) 181 | param['cam1'] = cam1 182 | 183 | # Read doffs. 184 | line = fp.readline() 185 | data = (line.split('='))[1].rstrip() 186 | doffs = float(data) 187 | param['doffs'] = doffs 188 | 189 | # Read baseline. 190 | line = fp.readline() 191 | data = (line.split('='))[1].rstrip() 192 | baseline = float(data) 193 | param['baseline'] = baseline 194 | 195 | # Read width. 196 | line = fp.readline() 197 | data = (line.split('='))[1].rstrip() 198 | width = int(data) 199 | param['width'] = width 200 | 201 | # Read height. 202 | line = fp.readline() 203 | data = (line.split('='))[1].rstrip() 204 | height = int(data) 205 | param['height'] = height 206 | 207 | # Read ndisp. 208 | line = fp.readline() 209 | data = (line.split('='))[1].rstrip() 210 | ndisp = int(data) 211 | param['ndisp'] = ndisp 212 | 213 | # Read isint. 214 | line = fp.readline() 215 | data = (line.split('='))[1].rstrip() 216 | isint = int(data) 217 | param['isint'] = isint 218 | 219 | # Read vmin. 220 | line = fp.readline() 221 | data = (line.split('='))[1].rstrip() 222 | vmin = float(data) 223 | param['vmin'] = vmin 224 | 225 | # Read vmax. 226 | line = fp.readline() 227 | data = (line.split('='))[1].rstrip() 228 | vmax = float(data) 229 | param['vmax'] = vmax 230 | 231 | # Read dyavg. 232 | line = fp.readline() 233 | data = (line.split('='))[1].rstrip() 234 | dyavg = float(data) 235 | param['dyavg'] = dyavg 236 | 237 | # Read dymax. 238 | line = fp.readline() 239 | data = (line.split('='))[1].rstrip() 240 | dymax = float(data) 241 | param['dymax'] = dymax 242 | 243 | return param 244 | 245 | 246 | def read_kitti_calib_file(filename: str) -> np.array: 247 | """It reads the calibration file of a scene from the KITTI 2015 training dataset. 248 | 249 | It reads the calibration file of a scene from the KITTI 2015 training dataset provided in 250 | `http://www.cvlibs.net/datasets/kitti/`. 251 | 252 | Args: 253 | filename: calibration file name. 254 | 255 | Returns: 256 | A dictionary containing all the calibration file parameters. 257 | - P_rect_02: rectified left color camera intrinsic matrix, arranged as a `(3, 3)` array, 258 | - P_rect_03: rectified right color camera intrinsic matrix, arranged as a `(3, 3)` array, 259 | - baseline: rectified camera baseline. 260 | """ 261 | 262 | param = {} 263 | 264 | with open(filename) as fp: 265 | 266 | while True: 267 | 268 | # Read a new line. 269 | line = fp.readline() 270 | 271 | # If the line is empty, the end of the file has been reached. 272 | if line == '': 273 | break 274 | 275 | # Split the line into parameter name and it value. 276 | param_name, data = line.split(':', maxsplit=1) 277 | 278 | # If the current line contains one of the parameters of interest, save it. 
279 | if param_name == 'P_rect_02':
280 | param[param_name] = np.reshape(
281 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
282 | (3, 4))
283 | elif param_name == 'T_02':
284 | param[param_name] = np.reshape(
285 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
286 | (3,))
287 | elif param_name == 'P_rect_03':
288 | param[param_name] = np.reshape(
289 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
290 | (3, 4))
291 | elif param_name == 'T_03':
292 | param[param_name] = np.reshape(
293 | np.fromstring(data.lstrip().rstrip(), dtype=np.float32, sep=' '),
294 | (3,))
295 | 
296 | # Check that all the parameters have been read.
297 | assert 'P_rect_02' in param, 'Could not read left camera intrinsic matrix.'
298 | assert 'T_02' in param, 'Could not read left camera translation vector.'
299 | assert 'P_rect_03' in param, 'Could not read right camera intrinsic matrix.'
300 | assert 'T_03' in param, 'Could not read right camera translation vector.'
301 | 
302 | # Compute the baseline.
303 | param['baseline'] = abs(
304 | (param['P_rect_02'][0, 3] / param['P_rect_02'][0, 0]) - (param['P_rect_03'][0, 3] / param['P_rect_03'][0, 0]))
305 | 
306 | return param
307 | 
308 | 
309 | def read_bin_file(file_name: str) -> np.array:
310 | """It reads a depth map or normal map in the COLMAP bin format.
311 | 
312 | It reads a depth map or normal map in the COLMAP bin format. In practice, it can read any 2D or 3D array.
313 | It is a modified version of the COLMAP `read_array()` Python function.
314 | 
315 | Args:
316 | file_name: source file.
317 | 
318 | Returns:
319 | A depth map or a normal map, arranged as an `(H, W)` or an `(H, W, 3)` array, respectively.
320 | """
321 | 
322 | with open(file_name, "rb") as fid:
323 | 
324 | # Read the file header. It is in the format 'width&height&channel_nb&' where channel_nb is 1 for 2D data.
325 | width, height, channel_nb = np.genfromtxt(fid, delimiter="&", max_rows=1, usecols=(0, 1, 2), dtype=int)
326 | fid.seek(0)
327 | num_delimiter = 0
328 | byte = fid.read(1)
329 | 
330 | while True:
331 | 
332 | if byte == b"&":
333 | num_delimiter += 1
334 | if num_delimiter >= 3:
335 | break
336 | 
337 | byte = fid.read(1)
338 | 
339 | # Read the data, stored as float32.
340 | data = np.fromfile(fid, np.float32)
341 | 
342 | # Reshape the read data into a (width, height, channel_nb) array.
343 | data = data.reshape((width, height, channel_nb), order='F')
344 | 
345 | # Transpose the data to get an array in the (height, width, channel_nb) format.
346 | data = np.transpose(data, (1, 0, 2))
347 | 
348 | # In the case of 2D data, remove the last dimension.
349 | if channel_nb == 1:
350 | data = data[:, :, 0]
351 | 
352 | return data
353 | 
354 | 
355 | def write_bin_file(data: np.array, file_name: str) -> None:
356 | """It writes a depth map or a normal map in the COLMAP bin format.
357 | 
358 | It writes a depth map or a normal map in the COLMAP bin format. In practice, it can write any 2D or 3D array.
359 | 
360 | Args:
361 | data: depth map or normal map, arranged as an `(H, W)` or an `(H, W, 3)` array, respectively.
362 | file_name: destination file name.
363 | """
364 | 
365 | # Check the input data.
366 | assert data.ndim == 2 or data.ndim == 3, 'The input data must be 2D or 3D.'
367 | 
368 | # If the input data are 2D, a fake 3D dimension is added. This makes it possible to treat 2D and 3D data uniformly.
369 | if data.ndim == 2:
370 | data = data[:, :, None]
371 | 
372 | # Number of color channels.
373 | channel_nb = data.shape[2]
374 | 
375 | with open(file_name, "wb") as file:
376 | 
377 | # Write the file header.
378 | file.write(bytearray(str(data.shape[1]), 'utf8'))
379 | file.write(bytearray('&', 'utf8'))
380 | file.write(bytearray(str(data.shape[0]), 'utf8'))
381 | file.write(bytearray('&', 'utf8'))
382 | file.write(bytearray(str(channel_nb), 'utf8'))
383 | file.write(bytearray('&', 'utf8'))
384 | 
385 | # Write the data.
386 | with open(file_name, "ab") as file:
387 | 
388 | for c in range(channel_nb):
389 | for y in range(data.shape[0]):
390 | for x in range(data.shape[1]):
391 | file.write(struct.pack('f', data[y, x, c]))
392 | 
393 | 
394 | def load_pfm(file_name: str) -> np.array:
395 | """It reads a PFM file.
396 | 
397 | It reads a PFM file. Adapted from the following web page:
398 | `https://stackoverflow.com/questions/48809433/read-pfm-format-in-python`
399 | 
400 | Args:
401 | file_name: PFM file name.
402 | 
403 | Returns:
404 | The PFM file, arranged as an `(H, W, C)` or an `(H, W)` array.
405 | """
406 | 
407 | with open(file_name, "rb") as f:
408 | 
409 | # Line 1: the number of channels.
410 | channel_type = f.readline().decode('latin-1')
411 | if "PF" in channel_type:
412 | channels = 3
413 | 
414 | elif "Pf" in channel_type:
415 | channels = 1
416 | 
417 | else:
418 | raise ValueError('Not a PFM file.')
419 | 
420 | # Line 2: width and height.
421 | line = f.readline().decode('latin-1')
422 | width, height = re.findall(r'\d+', line)
423 | width = int(width)
424 | height = int(height)
425 | 
426 | # Line 3: positive number means big endian, negative means little endian.
427 | line = f.readline().decode('latin-1')
428 | big_endian = True
429 | if "-" in line:
430 | big_endian = False
431 | 
432 | # Slurp all the binary data.
433 | samples = width * height * channels
434 | buffer = f.read(samples * 4)
435 | 
436 | # Unpack the floats with the appropriate endianness.
437 | if big_endian:
438 | fmt = ">"
439 | else:
440 | fmt = "<"
441 | 
442 | fmt = fmt + str(samples) + "f"
443 | data = struct.unpack(fmt, buffer)
444 | 
445 | # Reshape the data.
446 | data = np.reshape(np.array(data), (height, width, channels)).squeeze()
447 | 
448 | return data
449 | 
450 | 
451 | def colmap_camera_intrinsic(path: str) -> Dict[str, float]:
452 | """It reads the camera intrinsic parameters stored in COLMAP format.
453 | 
454 | It reads the intrinsic parameters of the first pinhole camera stored in a `.bin` COLMAP camera file.
455 | 
456 | Args:
457 | path: path to the COLMAP file.
458 | 
459 | Returns:
460 | A dictionary with the following fields:
461 | - the horizontal and vertical focal lengths `f_x` and `f_y`,
462 | - the horizontal and vertical coordinates of the camera center of projection `c_x` and `c_y`.
463 | """
464 | 
465 | camera = next(iter(read_cameras_binary(path).values()))  # COLMAP camera identifiers usually start at 1, not 0.
466 | 
467 | assert camera.model == 'PINHOLE', 'The input camera must refer to a pinhole model.'
468 | 
469 | focal_x, focal_y, center_x, center_y = camera.params
470 | 
471 | camera_intrinsic = dict(
472 | f_x=focal_x,
473 | f_y=focal_y,
474 | c_x=center_x,
475 | c_y=center_y)
476 | 
477 | return camera_intrinsic
478 | 
479 | 
480 | def read_camera_data_text(path: str) -> Dict:
481 | """It extracts the camera info stored in a text file (Andreas' format).
482 | 
483 | Args:
484 | path: path to the camera info file.
485 | 
486 | Returns:
487 | A dictionary containing all the camera info.
488 | """
489 | 
490 | # Dictionary containing the camera data.
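# The expected file layout, inferred from the parser below rather than from a formal specification, is one
# `camera.<name>=<value>` entry per line, with `#` marking comment lines. Hypothetical entries could read:
#
#     camera.width=1920
#     camera.height=1080
#     camera.A=[1500.0 0.0 960.0; 0.0 1500.0 540.0; 0.0 0.0 1.0]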
491 | camera = {
492 | 'height': None,
493 | 'width': None,
494 | 'A': None,
495 | 'k1': None,
496 | 'k2': None,
497 | 'R': None,
498 | 'T': None,
499 | 'zmin': None,
500 | 'zmax': None,
501 | 'match': None}
502 | 
503 | # Extract the camera data from the text file.
504 | with open(path, "r") as fid:
505 | while True:
506 | line = fid.readline()
507 | if not line:
508 | break
509 | line = line.strip()
510 | if len(line) > 0 and line[0] != "#":
511 | line_elems = line.split('=')
512 | if len(line_elems) != 2:
513 | continue
514 | param_name = line_elems[0]
515 | param_value = line_elems[1]
516 | param_name_elems = param_name.split('.')
517 | if (len(param_name_elems) != 2) or (param_name_elems[0] != 'camera'):
518 | continue
519 | param_name = param_name_elems[1]
520 | camera[param_name] = param_value
521 | 
522 | # Convert the camera height to `int`.
523 | if (camera['height'] is not None) and ('.' not in camera['height']):
524 | camera['height'] = int(camera['height'])
525 | else:
526 | camera['height'] = None
527 | 
528 | # Convert the camera width to `int`.
529 | if (camera['width'] is not None) and ('.' not in camera['width']):
530 | camera['width'] = int(camera['width'])
531 | else:
532 | camera['width'] = None
533 | 
534 | # Convert the camera `k1` parameter to `float`.
535 | if camera['k1'] is not None:
536 | camera['k1'] = float(camera['k1'])
537 | else:
538 | camera['k1'] = None
539 | 
540 | # Convert the camera `k2` parameter to `float`.
541 | if camera['k2'] is not None:
542 | camera['k2'] = float(camera['k2'])
543 | else:
544 | camera['k2'] = None
545 | 
546 | # Convert the camera intrinsic matrix to `np.array`.
547 | if camera['A'] is not None:
548 | mtx_intrinsic = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['A'])]
549 | mtx_intrinsic = np.asarray(mtx_intrinsic, dtype=float)
550 | if len(mtx_intrinsic) != 9:
551 | camera['A'] = None
552 | else:
553 | camera['A'] = np.reshape(mtx_intrinsic, (3, 3))
554 | 
555 | # Convert the camera rotation matrix to `np.array`.
556 | if camera['R'] is not None:
557 | mtx_rotation = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['R'])]
558 | mtx_rotation = np.asarray(mtx_rotation, dtype=float)
559 | if len(mtx_rotation) != 9:
560 | camera['R'] = None
561 | else:
562 | camera['R'] = np.reshape(mtx_rotation, (3, 3))
563 | 
564 | # Convert the camera translation vector to `np.array`.
565 | if camera['T'] is not None:
566 | vec_translation = [float(i) for i in re.findall(r'[-+]?\d*\.\d+|[-+]?\d+', camera['T'])]
567 | vec_translation = np.asarray(vec_translation, dtype=float)
568 | if len(vec_translation) != 3:
569 | camera['T'] = None
570 | else:
571 | camera['T'] = np.reshape(vec_translation, (3, 1))
572 | 
573 | return camera
574 | 
--------------------------------------------------------------------------------
/refine.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import argparse 27 | import os 28 | import numpy as np 29 | from cv2 import imread 30 | from iofuns import read_depth_map, read_normal_map, write_bin_file 31 | from misc import resize_map, depth_percentage_error 32 | from refinement import refine_depth 33 | from logger import Logger 34 | import torch.optim 35 | import time 36 | import math 37 | 38 | 39 | def read_param() -> argparse.Namespace: 40 | """It parses the command-line parameters. 41 | 42 | Returns: 43 | The input parameters. 44 | """ 45 | 46 | # Create the parser. 47 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 48 | 49 | # ========================================== SCALE-INDEPENDENT PARAMETERS ========================================== 50 | 51 | # Input/output paths. 52 | parser.add_argument( 53 | '--image', type=str, required=True, 54 | help='input image path') 55 | parser.add_argument( 56 | '--depth', type=str, required=True, 57 | help='input depth map path') 58 | parser.add_argument( 59 | '--normal', type=str, 60 | help='input normal map path') 61 | parser.add_argument( 62 | '--confidence', type=str, required=True, 63 | help='input (depth) confidence map path') 64 | parser.add_argument( 65 | '--depth_gt', type=str, 66 | help='ground truth depth map path') 67 | parser.add_argument( 68 | '--depth_out', type=str, required=True, 69 | help='refined depth map saving path') 70 | parser.add_argument( 71 | '--normal_out', type=str, required=True, 72 | help='estimated normal map saving path') 73 | 74 | # Camera parameters. 75 | parser.add_argument( 76 | '--cam_focal', type=float, nargs=2, required=True, 77 | help='camera focal lengths (f_x, f_y)') 78 | parser.add_argument( 79 | '--cam_center', type=float, nargs=2, required=True, 80 | help='camera principal point coordinates (c_x, c_y)') 81 | 82 | # Depth range. 83 | parser.add_argument( 84 | '--depth_min', type=float, default=1e-1, 85 | help='minimum depth value (in meters)') 86 | parser.add_argument( 87 | '--depth_max', type=float, default=100, 88 | help='maximum depth value (in meters)') 89 | 90 | # Confidence binarization. 91 | parser.add_argument( 92 | '--confidence_threshold', type=float, default=None, 93 | help='threshold for confidence binarization') 94 | 95 | # Plotting parameters. 
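# (The plotting parameters below configure a connection to a running `visdom` server; one can typically be
# launched with `python -m visdom.server -port 8097`, the port being illustrative. A non-positive
# `visdom_display_port` disables plotting altogether.)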
96 | parser.add_argument(
97 | '--visdom_display_port', type=int, default=-1,
98 | help='port to be used by the VISDOM server')
99 | parser.add_argument(
100 | '--visdom_base_url', type=str, default='refinement',
101 | help='string to customize the VISDOM server URL')
102 | parser.add_argument(
103 | '--plotting_step', type=int, default=500,
104 | help='number of steps between two plot updates in the VISDOM server')
105 | 
106 | # Device and precision.
107 | parser.add_argument(
108 | '--gpu_id', type=int, default=-1,
109 | help='gpu id (negative numbers trigger CPU computation)')
110 | parser.add_argument(
111 | '--precision', type=str, choices=['single', 'double'], default='single',
112 | help='computation precision (32 or 64 bits)')
113 | 
114 | # Error evaluation.
115 | parser.add_argument(
116 | '--depth_error_threshold', type=float, default=0.02,
117 | help='error threshold (in meters) to be used in the evaluation against the ground truth')
118 | 
119 | # Multi-scale strategy.
120 | parser.add_argument(
121 | '--scale_nb', type=int, default=1,
122 | help='number of scales in the multi-scale pyramid')
123 | 
124 | # Regularization.
125 | parser.add_argument(
126 | '--regularization', type=int, choices=[0, 1], default=1,
127 | help='regularization type (0 for NLTGV, 1 for our regularization)')
128 | 
129 | # =========================================== SCALE-DEPENDENT PARAMETERS ===========================================
130 | 
131 | # Loss parameters.
132 | parser.add_argument(
133 | '--lambda_depth_consistency', nargs='*', type=float, default=1.0,
134 | help='depth consistency term multiplier (one per scale)')
135 | parser.add_argument(
136 | '--lambda_normal_consistency', nargs='*', type=float, default=0.0,
137 | help='normal consistency term multiplier (one per scale)')
138 | parser.add_argument(
139 | '--lambda_regularization', nargs='*', type=float, default=7.5,
140 | help='depth regularization term multiplier (one per scale)')
141 | parser.add_argument(
142 | '--gamma_regularization', nargs='*', type=float, default=5.5,
143 | help='depth regularization term internal multiplier (one per scale)')
144 | 
145 | # Graph parameters.
146 | parser.add_argument(
147 | '--window_size', nargs='*', type=int, default=9,
148 | help='search window size (window_size x window_size) to be used in the graph construction (one per scale)')
149 | parser.add_argument(
150 | '--patch_size', nargs='*', type=int, default=3,
151 | help='patch size (patch_size x patch_size) to be used in the graph construction (one per scale)')
152 | parser.add_argument(
153 | '--sigma_int', nargs='*', type=float, default=0.07,
154 | help='color difference standard deviation for patch comparison in the graph construction (one per scale)')
155 | parser.add_argument(
156 | '--sigma_spa', nargs='*', type=float, default=3.0,
157 | help='euclidean distance standard deviation for patch comparison in the graph construction (one per scale)')
158 | parser.add_argument(
159 | '--degree_max', nargs='*', type=int, default=20,
160 | help='maximum number of per pixel neighbors in the graph (one per scale)')
161 | 
162 | # Stopping criteria.
163 | parser.add_argument(
164 | '--iter_max', nargs='*', type=int, default=4000,
165 | help='maximum number of iterations (one per scale)')
166 | parser.add_argument(
167 | '--eps_stop', nargs='*', type=float, default=1e-6,
168 | help=('minimum relative change between the current and the previous '
169 | 'iteration depth maps (one per scale)'))
170 | parser.add_argument(
171 | '--attempt_max', nargs='*', type=int, default=50,
172 | help='maximum number of iterations without improving the loss (one per scale)')
173 | 
174 | # Learning rate update policies.
175 | parser.add_argument(
176 | '--lr_start', nargs='*', type=float, default=1e-4,
177 | help='initial learning rate (one per scale)')
178 | parser.add_argument(
179 | '--lr_slot_nb', nargs='*', type=int, default=1,
180 | help=('number of partitions (one per scale); '
181 | 'each partition adopts a learning rate that is 1/10 of the one employed in the previous partition; '
182 | '0 excludes the relative depth map change stopping criterion.'))
183 | 
184 | # ==================================================================================================================
185 | 
186 | # Perform parsing.
187 | param = parser.parse_args()
188 | 
189 | # =================================== CHECK AND ADJUST THE INPUT PARAMETER FORMAT ==================================
190 | 
191 | # Cases:
192 | # - if the value for a parameter is provided, then this must be a list of length `param.scale_nb`.
193 | # - if the value is not provided, the default value is used to fill a list of length `param.scale_nb`.
194 | 
195 | # Check `lambda_depth_consistency`.
196 | if isinstance(param.lambda_depth_consistency, list):
197 | assert (len(param.lambda_depth_consistency) == param.scale_nb)
198 | else:
199 | param.lambda_depth_consistency = [param.lambda_depth_consistency] * param.scale_nb
200 | 
201 | # Check `lambda_normal_consistency`.
202 | if isinstance(param.lambda_normal_consistency, list):
203 | assert (len(param.lambda_normal_consistency) == param.scale_nb)
204 | else:
205 | param.lambda_normal_consistency = [param.lambda_normal_consistency] * param.scale_nb
206 | 
207 | # Check `lambda_regularization`.
208 | if isinstance(param.lambda_regularization, list):
209 | assert (len(param.lambda_regularization) == param.scale_nb)
210 | else:
211 | param.lambda_regularization = [param.lambda_regularization] * param.scale_nb
212 | 
213 | # Check `gamma_regularization`.
214 | if isinstance(param.gamma_regularization, list):
215 | assert (len(param.gamma_regularization) == param.scale_nb)
216 | else:
217 | param.gamma_regularization = [param.gamma_regularization] * param.scale_nb
218 | 
219 | # Check `window_size`.
220 | if isinstance(param.window_size, list):
221 | assert (len(param.window_size) == param.scale_nb)
222 | else:
223 | param.window_size = [param.window_size] * param.scale_nb
224 | 
225 | # Check `patch_size`.
226 | if isinstance(param.patch_size, list):
227 | assert (len(param.patch_size) == param.scale_nb)
228 | else:
229 | param.patch_size = [param.patch_size] * param.scale_nb
230 | 
231 | # Check `sigma_int`.
232 | if isinstance(param.sigma_int, list):
233 | assert (len(param.sigma_int) == param.scale_nb)
234 | else:
235 | param.sigma_int = [param.sigma_int] * param.scale_nb
236 | 
237 | # Check `sigma_spa`.
238 | if isinstance(param.sigma_spa, list):
239 | assert (len(param.sigma_spa) == param.scale_nb)
240 | else:
241 | param.sigma_spa = [param.sigma_spa] * param.scale_nb
242 | 
243 | # Check `degree_max`.
244 | if isinstance(param.degree_max, list):
245 | assert (len(param.degree_max) == param.scale_nb)
246 | else:
247 | param.degree_max = [param.degree_max] * param.scale_nb
248 | 
249 | # Check `iter_max`.
250 | if isinstance(param.iter_max, list):
251 | assert (len(param.iter_max) == param.scale_nb)
252 | else:
253 | param.iter_max = [param.iter_max] * param.scale_nb
254 | 
255 | # Check `eps_stop`.
256 | if isinstance(param.eps_stop, list):
257 | assert (len(param.eps_stop) == param.scale_nb)
258 | else:
259 | param.eps_stop = [param.eps_stop] * param.scale_nb
260 | 
261 | # Check `attempt_max`.
262 | if isinstance(param.attempt_max, list):
263 | assert (len(param.attempt_max) == param.scale_nb)
264 | else:
265 | param.attempt_max = [param.attempt_max] * param.scale_nb
266 | 
267 | # Check `lr_start`.
268 | if isinstance(param.lr_start, list):
269 | assert (len(param.lr_start) == param.scale_nb)
270 | else:
271 | param.lr_start = [param.lr_start] * param.scale_nb
272 | 
273 | # Check `lr_slot_nb`.
274 | if isinstance(param.lr_slot_nb, list):
275 | assert (len(param.lr_slot_nb) == param.scale_nb)
276 | else:
277 | param.lr_slot_nb = [param.lr_slot_nb] * param.scale_nb
278 | 
279 | return param
280 | 
281 | 
282 | def print_param(param: argparse.Namespace) -> None:
283 | """It prints the input parameters.
284 | 
285 | Args:
286 | param: parameters to be printed.
287 | """
288 | 
289 | # Organize the parameters into a single string.
290 | message = ''
291 | message += '---------------------- Options ----------------------\n'
292 | for k, v in sorted(vars(param).items()):
293 | 
294 | # Turn `v` into a string.
295 | if isinstance(v, list):
296 | v_str = ', '.join([str(item) for item in v])
297 | else:
298 | v_str = str(v)
299 | 
300 | # Write the current pair.
301 | message += '{:>30}: {:<30}\n'.format(str(k), v_str)
302 | message += '------------------------ End ------------------------'
303 | 
304 | # Print the options to standard output.
305 | print(message)
306 | 
307 | # Save the parameters to disk.
308 | file_name = 'param.txt'
309 | with open(file_name, 'wt') as param_file:
310 | param_file.write(message)
311 | param_file.write('\n')
312 | 
313 | 
314 | def main():
315 | # Read the input parameters.
316 | param = read_param()
317 | 
318 | # Interrupt the script if the refined depth and normal maps already exist.
319 | if os.path.exists(param.depth_out) and os.path.exists(param.normal_out):
320 | print('The refined depth and normal maps already exist.')
321 | return
322 | 
323 | # Organize the camera parameters in a dictionary.
324 | camera_param = {
325 | 'f_x': param.cam_focal[0],
326 | 'f_y': param.cam_focal[1],
327 | 'c_x': param.cam_center[0],
328 | 'c_y': param.cam_center[1]}
329 | 
330 | # Store the loss parameters as a list of dictionaries (one dictionary for each scale of the multi-scale pyramid).
331 | # The same approach is adopted for the optimization parameters.
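# For instance, invoking the script with `--scale_nb 2 --lambda_regularization 7.5 5.0` (illustrative values)
# yields `loss_param[0]['lambda_regularization'] == 7.5` and `loss_param[1]['lambda_regularization'] == 5.0`,
# i.e., one dictionary per scale.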
332 | loss_param = [None] * param.scale_nb 333 | opt_param = [None] * param.scale_nb 334 | for i in range(param.scale_nb): 335 | loss_param[i] = { 336 | 'lambda_depth_consistency': param.lambda_depth_consistency[i], 337 | 'lambda_normal_consistency': param.lambda_normal_consistency[i], 338 | 'lambda_regularization': param.lambda_regularization[i], 339 | 'gamma_regularization': param.gamma_regularization[i], 340 | 'window_size': param.window_size[i], 341 | 'patch_size': param.patch_size[i], 342 | 'sigma_intensity': param.sigma_int[i], 343 | 'sigma_spatial': param.sigma_spa[i], 344 | 'degree_max': param.degree_max[i], 345 | 'regularization': param.regularization} 346 | 347 | opt_param[i] = { 348 | 'iter_max': param.iter_max[i], 349 | 'plotting_step': param.plotting_step, 350 | 'eps_stop': param.eps_stop[i], 351 | 'attempt_max': param.attempt_max[i], 352 | 'learning_rate': {'lr_start': param.lr_start[i], 'lr_slot_nb': param.lr_slot_nb[i]}, 353 | 'depth_error_threshold': param.depth_error_threshold} 354 | 355 | # Set the device. 356 | if torch.cuda.is_available() and (param.gpu_id >= 0): 357 | device = torch.device('cuda:{}'.format(param.gpu_id)) 358 | else: 359 | device = torch.device('cpu') 360 | 361 | # Create the logger object for plotting. 362 | logger = None 363 | if param.visdom_display_port > 0: 364 | logger = Logger( 365 | param.depth_error_threshold, 366 | display_port=param.visdom_display_port, base_url=('/' + param.visdom_base_url)) 367 | 368 | ################################################## REFERENCE IMAGE ################################################# 369 | 370 | # Read the reference image. 371 | image = imread(param.image) 372 | if image is None: 373 | raise FileNotFoundError('The reference image could not be loaded.') 374 | 375 | # Convert the image to [0, 1] and flip the color channels, as OpenCV assumes that the image is in BGR format on disk. 376 | image = np.flip(image.astype(param.precision) / 255, axis=2) 377 | 378 | ############################################ NOISY/INCOMPLETE DEPTH MAP ############################################ 379 | 380 | # Read the noisy/incomplete depth map. 381 | depth = read_depth_map(param.depth, 'COLMAP').astype(param.precision) 382 | if depth is None: 383 | raise FileNotFoundError('The noisy/incomplete depth map to process could not be loaded.') 384 | 385 | # Clip the valid entries of the MVS depth map to `[param.depth_min, param.depth_max]`. 386 | mask = (depth > 0) & (depth < float('inf')) 387 | depth[~mask] = 0 # Non valid pixels are set to zero. 388 | depth[mask] = np.clip(depth[mask], param.depth_min, param.depth_max) # Valid pixels are clipped. 389 | 390 | ############################################ NOISY/INCOMPLETE NORMAL MAP ########################################### 391 | 392 | # Read the noisy/incomplete normal map. 393 | normal = read_normal_map(param.normal, 'COLMAP') 394 | if normal is None: 395 | print('WARNING: The noisy/incomplete normal map could not be loaded.') 396 | else: 397 | # Set to zero all the 3D normals without a corresponding depth value. 398 | normal[~mask] = 0 399 | 400 | # We do not care about normal having unitary norm at this stage. 401 | 402 | ################################################## CONFIDENCE MAP ################################################## 403 | 404 | # Read the confidence map associated to the noisy/incomplete depth map. 405 | depth_confidence = read_depth_map(param.confidence, 'COLMAP').astype(param.precision) 406 | 407 | # Check the confidence map. 
408 | assert (np.min(depth_confidence) >= 0) and (np.max(depth_confidence) <= 1), \
409 | 'Depth map confidence entries must be in [0, 1].'
410 | 
411 | # Make the confidence binary.
412 | if param.confidence_threshold is not None:
413 | if (param.confidence_threshold >= 0) and (param.confidence_threshold <= 1):
414 | mask_confidence = depth_confidence < param.confidence_threshold
415 | depth_confidence = np.ones_like(depth_confidence)
416 | depth_confidence[mask_confidence] = 0
417 | else:
418 | print('WARNING: the specified confidence threshold is outside [0, 1], therefore it will be ignored.')
419 | 
420 | ############################################## GROUND TRUTH DEPTH MAP ##############################################
421 | 
422 | # Read the ground truth depth map.
423 | depth_gt = None
424 | if param.depth_gt is not None:
425 | depth_gt = read_depth_map(param.depth_gt, 'COLMAP').astype(param.precision)
426 | if depth_gt is None:
427 | raise FileNotFoundError('The ground truth depth map could not be loaded.')
428 | 
429 | ############################################ ADJUST REFERENCE IMAGE SIZE ###########################################
430 | 
431 | # If the reference image size differs from the noisy/incomplete depth map one, then the image is resized.
432 | # Note that, since the camera parameters are associated to the reference image, the camera parameters must be
433 | # adjusted accordingly, if the reference image is resized.
434 | height = depth.shape[0]
435 | width = depth.shape[1]
436 | if (image.shape[0] != height) or (image.shape[1] != width):
437 | x_ratio = float(width) / float(image.shape[1])  # Horizontal scaling factor.
438 | y_ratio = float(height) / float(image.shape[0])  # Vertical scaling factor.
439 | camera_param['f_x'] = camera_param['f_x'] * x_ratio
440 | camera_param['f_y'] = camera_param['f_y'] * y_ratio
441 | camera_param['c_x'] = camera_param['c_x'] * x_ratio
442 | camera_param['c_y'] = camera_param['c_y'] * y_ratio
443 | image = resize_map(image, (height, width), order=1)
444 | print('WARNING: the reference image has been resized in order to match the input depth map height and width.')
445 | 
446 | # The other maps must have the same height and width of the noisy/incomplete depth map. No resizing for them.
447 | if normal is not None:
448 | assert normal.shape == (height, width, 3), \
449 | 'Input normal map size not compatible with the reference image one.'
450 | if depth_confidence is not None:
451 | assert depth_confidence.shape == (height, width), \
452 | 'Input depth map confidence size not compatible with the reference image one.'
453 | if depth_gt is not None:
454 | assert depth_gt.shape == (height, width), \
455 | 'Ground truth depth size not compatible with the reference image one.'
456 | 
457 | ####################################################################################################################
458 | #################################################### REFINEMENT ####################################################
459 | ####################################################################################################################
460 | 
461 | # Start measuring the processing time.
462 | time_start = time.time()
463 | 
464 | # Refine the noisy/incomplete depth map.
465 | depth_refined, normal_refined = refine_depth(
466 | image, depth, (param.depth_min, param.depth_max),
467 | camera_param, loss_param, opt_param,
468 | depth_confidence=depth_confidence,
469 | normal=normal,
470 | depth_gt=depth_gt,
471 | logger=logger,
472 | device=device)
473 | 
474 | # Check the processing time.
475 | time_elapsed = time.time() - time_start
476 | minute_elapsed = math.floor(time_elapsed / 60)
477 | print('Elapsed time: {}min {}sec'.format(
478 | minute_elapsed, math.ceil(time_elapsed - minute_elapsed * 60)), flush=True)
479 | 
480 | #################################################### EVALUATION ####################################################
481 | 
482 | # Compute the depth percentage error of the noisy/incomplete depth map.
483 | if depth_gt is not None:
484 | print('Percentage of input depth map values with error larger than {}: {:.2f}'.format(
485 | param.depth_error_threshold,
486 | depth_percentage_error(depth, depth_gt, param.depth_error_threshold)))
487 | 
488 | print('Percentage of refined depth map values with error larger than {}: {:.2f}'.format(
489 | param.depth_error_threshold,
490 | depth_percentage_error(depth_refined, depth_gt, param.depth_error_threshold)))
491 | 
492 | ###################################################### SAVING ######################################################
493 | 
494 | # Save the refined depth map.
495 | saving_path, _ = os.path.split(param.depth_out)
496 | os.makedirs(saving_path, exist_ok=True)
497 | write_bin_file(
498 | depth_refined, param.depth_out)
499 | 
500 | # Save the refined/estimated normal map.
501 | saving_path, _ = os.path.split(param.normal_out)
502 | os.makedirs(saving_path, exist_ok=True)
503 | write_bin_file(
504 | normal_refined, param.normal_out)
505 | 
506 | 
507 | if __name__ == '__main__':
508 | main()
509 | 
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020,
2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland,
3 | # Laboratoire de Traitement des Signaux 4 (LTS4).
4 | # All rights reserved.
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | # SOFTWARE.
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 | 
26 | import torch
27 | import numpy as np
28 | from importlib import import_module
29 | from misc import resize_map, depth_percentage_error
30 | from pltfuns import normal2rgb
31 | from typing import Tuple
32 | 
33 | 
34 | # Maximum height of any image (not heat map) plotted on the screen.
35 | HEIGHT_MAX = int(300) 36 | 37 | 38 | class Logger: 39 | 40 | def __init__(self, 41 | error_threshold: float, 42 | display_port: int = 8097, base_url: str = '/1234'): 43 | 44 | # Windows. 45 | self.texture_win = None # reference image. 46 | self.depth_win = None # noisy and possibly incomplete depth map. 47 | self.depth_init_win = None # initial depth map used in the refinement. 48 | self.depth_refined_win = None # refined depth map. 49 | self.normal_win = None # noisy and possibly incomplete normal map. 50 | self.normal_init_win = None # initial normal map used in the refinement. 51 | self.normal_refined_win = None # normal map associated to the refined depth map. 52 | self.depth_gt_win = None # ground truth depth map. 53 | self.depth_error_win = None # noisy and possibly incomplete depth map error. 54 | self.depth_refined_error_win = None # refined depth map error. 55 | 56 | # Windows associated to the partial and global losses (in the inverse depth domain). 57 | self.idepth_consistency_loss_win = None 58 | self.inormal_consistency_loss_win = None 59 | self.regularization_loss_win = None 60 | self.global_loss_win = None 61 | 62 | # Typically, the logger is called to plot a new complete depth map and to compute its error with respect to 63 | # the ground truth depth map, which does not change very often. Therefore, we store the ground truth depth map. 64 | self.depth_gt = None 65 | 66 | # Minimum and maximum depth values to be used in the plots. 67 | self.depth_min = None 68 | self.depth_max = None 69 | 70 | # Record the error threshold to be used in the percentage error computation. 71 | self.depth_error_threshold = error_threshold 72 | 73 | # Instantiate the online visualization tool. 74 | visdom = import_module('visdom') 75 | self.vis = visdom.Visdom(port=display_port, base_url=base_url) 76 | # The `visdom` module is imported here to avoid its installation in the case the user does not need the logger. 77 | 78 | # Environment default name. 79 | self.environment = 'main' 80 | 81 | def plot(self, 82 | texture: np.array = None, 83 | depth: np.array = None, 84 | depth_init: np.array = None, 85 | depth_refined: np.array = None, 86 | depth_gt: np.array = None, 87 | normal: np.array = None, 88 | normal_init: np.array = None, 89 | normal_refined: np.array = None, 90 | idepth_consistency_loss: np.array = None, 91 | inormal_consistency_loss: np.array = None, 92 | regularization_loss: np.array = None, 93 | global_loss: np.array = None) -> None: 94 | 95 | # ============================================================================================================== 96 | 97 | # Reference camera texture. 98 | if texture is not None: 99 | 100 | # Texture dimensions. 101 | aux = texture 102 | if texture.ndim == 2: 103 | height = texture.shape[0] 104 | width = texture.shape[1] 105 | aux = aux[:, :, None] 106 | elif texture.ndim == 3 and texture.shape[2] == 3: 107 | height = texture.shape[0] 108 | width = texture.shape[1] 109 | else: 110 | raise ValueError('The input texture must be gray scale or RGB.') 111 | 112 | # Resize the texture if too large. 113 | img_ratio = float(width) / float(height) 114 | if height > HEIGHT_MAX: 115 | aux = resize_map(aux, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)]) 116 | 117 | # Convert the texture to tensor. 118 | texture_t = torch.from_numpy(np.transpose(aux, axes=(2, 0, 1)).copy()) 119 | 120 | # Plot the texture. 
121 | if self.texture_win is not None: 122 | 123 | self.vis.image( 124 | texture_t, 125 | env=self.environment, 126 | win=self.texture_win, 127 | opts=dict(title='texture')) 128 | 129 | else: 130 | 131 | self.texture_win = self.vis.image( 132 | texture_t, 133 | env=self.environment, 134 | opts=dict(title='texture')) 135 | 136 | # ============================================================================================================== 137 | 138 | # Ground truth depth map. 139 | if depth_gt is not None: 140 | 141 | # Store the ground truth depth map. 142 | self.depth_gt = depth_gt 143 | 144 | # Set the minimum and maximum depth values. 145 | xmin = self.depth_min if self.depth_min is not None else np.min(self.depth_gt) 146 | xmax = self.depth_max if self.depth_max is not None else np.max(self.depth_gt) 147 | 148 | # Convert the ground truth depth map to tensor. 149 | depth_gt_t = torch.from_numpy(self.depth_gt).flip([0]) 150 | 151 | # Plot. 152 | if self.depth_gt_win is not None: 153 | 154 | self.vis.heatmap( 155 | depth_gt_t, 156 | env=self.environment, 157 | win=self.depth_gt_win, 158 | opts=dict( 159 | xmin=xmin, 160 | xmax=xmax, 161 | title='depth gt')) 162 | 163 | else: 164 | 165 | self.depth_gt_win = self.vis.heatmap( 166 | depth_gt_t, 167 | env=self.environment, 168 | opts=dict( 169 | xmin=xmin, 170 | xmax=xmax, 171 | title='depth gt')) 172 | 173 | # ============================================================================================================== 174 | 175 | # Noisy and possibly incomplete depth map. 176 | if depth is not None: 177 | 178 | # Set the minimum and maximum depth values. 179 | aux_min, aux_max = np.percentile(depth, [5, 95]) 180 | xmin = self.depth_min if self.depth_min is not None else aux_min 181 | xmax = self.depth_max if self.depth_max is not None else aux_max 182 | 183 | # Convert the depth map to tensor. 184 | depth_t = torch.from_numpy(depth).flip([0]) 185 | 186 | # Plot. 187 | if self.depth_win is not None: 188 | 189 | self.vis.heatmap( 190 | depth_t, 191 | env=self.environment, 192 | win=self.depth_win, 193 | opts=dict( 194 | xmin=xmin, 195 | xmax=xmax, 196 | title='input depth')) 197 | 198 | else: 199 | 200 | self.depth_win = self.vis.heatmap( 201 | depth_t, 202 | env=self.environment, 203 | opts=dict( 204 | xmin=xmin, 205 | xmax=xmax, 206 | title='input depth')) 207 | 208 | # Error map. 209 | if self.depth_gt is not None: 210 | 211 | # Detect the valid entries in `self.depth_gt`. 212 | mask = (self.depth_gt > 0) & (self.depth_gt < float('inf')) 213 | 214 | # Compute the error. 215 | error = np.abs(self.depth_gt - depth) 216 | error[~mask] = 0 217 | 218 | # Compute the percentage error. 219 | percentage_error = depth_percentage_error( 220 | depth, self.depth_gt, self.depth_error_threshold) 221 | 222 | # Convert the error to tensor. 223 | error_t = torch.from_numpy(error).flip([0]) 224 | 225 | # Plot the depth map error. 
226 | if self.depth_error_win is not None: 227 | 228 | self.vis.heatmap( 229 | error_t, 230 | env=self.environment, 231 | win=self.depth_error_win, 232 | opts=dict( 233 | xmin=0.0, 234 | xmax=self.depth_error_threshold, 235 | title='input depth error: {:.2f}% ({})'.format( 236 | percentage_error, self.depth_error_threshold))) 237 | 238 | else: 239 | 240 | self.depth_error_win = self.vis.heatmap( 241 | error_t, 242 | env=self.environment, 243 | opts=dict( 244 | xmin=0.0, 245 | xmax=self.depth_error_threshold, 246 | title='input depth error: {:.2f}% ({})'.format( 247 | percentage_error, self.depth_error_threshold))) 248 | 249 | # ============================================================================================================== 250 | 251 | # Initial depth map. 252 | if depth_init is not None: 253 | 254 | # Set the minimum and maximum depth values. 255 | aux_min, aux_max = np.percentile(depth_init, [5, 95]) 256 | xmin = self.depth_min if self.depth_min is not None else aux_min 257 | xmax = self.depth_max if self.depth_max is not None else aux_max 258 | 259 | # Convert the depth map to tensor. 260 | depth_init_t = torch.from_numpy(depth_init).flip([0]) 261 | 262 | # Plot the depth map. 263 | if self.depth_init_win is not None: 264 | 265 | self.vis.heatmap( 266 | depth_init_t, 267 | env=self.environment, 268 | win=self.depth_init_win, 269 | opts=dict( 270 | xmin=xmin, 271 | xmax=xmax, 272 | title='initial depth')) 273 | 274 | else: 275 | 276 | self.depth_init_win = self.vis.heatmap( 277 | depth_init_t, 278 | env=self.environment, 279 | opts=dict( 280 | xmin=xmin, 281 | xmax=xmax, 282 | title='initial depth')) 283 | 284 | # ============================================================================================================== 285 | 286 | # Refined depth map. 287 | if depth_refined is not None: 288 | 289 | # Set the minimum and maximum depth values. 290 | aux_min, aux_max = np.percentile(depth_refined, [5, 95]) 291 | xmin = self.depth_min if self.depth_min is not None else aux_min 292 | xmax = self.depth_max if self.depth_max is not None else aux_max 293 | 294 | # Convert the depth map to tensor. 295 | depth_refined_t = torch.from_numpy(depth_refined).flip([0]) 296 | 297 | # Plot the depth map. 298 | if self.depth_refined_win is not None: 299 | 300 | self.vis.heatmap( 301 | depth_refined_t, 302 | env=self.environment, 303 | win=self.depth_refined_win, 304 | opts=dict( 305 | xmin=xmin, 306 | xmax=xmax, 307 | title='refined depth')) 308 | 309 | else: 310 | 311 | self.depth_refined_win = self.vis.heatmap( 312 | depth_refined_t, 313 | env=self.environment, 314 | opts=dict( 315 | xmin=xmin, 316 | xmax=xmax, 317 | title='refined depth')) 318 | 319 | # Depth map error. 320 | if self.depth_gt is not None: 321 | 322 | # Detect the valid entries in `self.depth_gt`. 323 | mask = (self.depth_gt > 0) & (self.depth_gt < float('inf')) 324 | 325 | # Compute the error. 326 | error = np.abs(self.depth_gt - depth_refined) 327 | error[~mask] = 0 328 | 329 | # Compute the percentage error. 330 | percentage_error = depth_percentage_error( 331 | depth_refined, self.depth_gt, self.depth_error_threshold) 332 | 333 | # Convert the error to tensor. 334 | error_t = torch.from_numpy(error).flip([0]) 335 | 336 | # Plot the depth map error. 
337 | if self.depth_refined_error_win is not None: 338 | 339 | self.vis.heatmap( 340 | error_t, 341 | env=self.environment, 342 | win=self.depth_refined_error_win, 343 | opts=dict( 344 | xmin=0.0, 345 | xmax=self.depth_error_threshold, 346 | title='refined depth error: {:.2f}% ({})'.format( 347 | percentage_error, self.depth_error_threshold))) 348 | 349 | else: 350 | 351 | self.depth_refined_error_win = self.vis.heatmap( 352 | error_t, 353 | env=self.environment, 354 | opts=dict( 355 | xmin=0.0, 356 | xmax=self.depth_error_threshold, 357 | title='refined depth error: {:.2f}% ({})'.format( 358 | percentage_error, self.depth_error_threshold))) 359 | 360 | # ============================================================================================================== 361 | 362 | # Noisy/incomplete normal map. 363 | if normal is not None: 364 | 365 | if depth is not None: 366 | 367 | # Spatial dimensions. 368 | height = normal.shape[0] 369 | width = normal.shape[1] 370 | 371 | # Encode the 3D normals into an RGB image. 372 | normal_rgb = normal2rgb(normal) 373 | 374 | # Resize the normal map, if too large. 375 | img_ratio = float(width) / float(height) 376 | if height > HEIGHT_MAX: 377 | normal_rgb = resize_map( 378 | normal_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 379 | 380 | # Convert the normal map to tensor. 381 | normal_rgb_t = torch.from_numpy(np.transpose(normal_rgb, axes=(2, 0, 1))) 382 | 383 | # Plot the normal map. 384 | if self.normal_win is not None: 385 | 386 | self.vis.image( 387 | normal_rgb_t, 388 | env=self.environment, 389 | win=self.normal_win, 390 | opts=dict(title='input normal')) 391 | 392 | else: 393 | 394 | self.normal_win = self.vis.image( 395 | normal_rgb_t, 396 | env=self.environment, 397 | opts=dict(title='input normal')) 398 | 399 | # ============================================================================================================== 400 | 401 | # Initial normal map. 402 | if normal_init is not None: 403 | 404 | if depth_init is not None: 405 | 406 | # Spatial dimensions. 407 | height = normal_init.shape[0] 408 | width = normal_init.shape[1] 409 | 410 | # Encode the 3D normals into an RGB image. 411 | normal_init_rgb = normal2rgb(normal_init) 412 | 413 | # Resize the normal map, if too large. 414 | img_ratio = float(width) / float(height) 415 | if height > HEIGHT_MAX: 416 | normal_init_rgb = resize_map( 417 | normal_init_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 418 | 419 | # Convert the normal map to tensor. 420 | normal_init_rgb_t = torch.from_numpy(np.transpose(normal_init_rgb, axes=(2, 0, 1))) 421 | 422 | # Plot the normal map. 423 | if self.normal_init_win is not None: 424 | 425 | self.vis.image( 426 | normal_init_rgb_t, 427 | env=self.environment, 428 | win=self.normal_init_win, 429 | opts=dict(title='initial normal')) 430 | 431 | else: 432 | 433 | self.normal_init_win = self.vis.image( 434 | normal_init_rgb_t, 435 | env=self.environment, 436 | opts=dict(title='initial normal')) 437 | 438 | # ============================================================================================================== 439 | 440 | # Normal map associated to the refined depth map. 441 | if normal_refined is not None: 442 | 443 | if depth_refined is not None: 444 | 445 | # Spatial dimensions. 446 | height = normal_refined.shape[0] 447 | width = normal_refined.shape[1] 448 | 449 | # Encode the 3D normals into an RGB image. 450 | normal_refined_rgb = normal2rgb(normal_refined) 451 | 452 | # Resize the normal map, if too large. 
453 | img_ratio = float(width) / float(height) 454 | if height > HEIGHT_MAX: 455 | normal_refined_rgb = resize_map( 456 | normal_refined_rgb, [HEIGHT_MAX, int(HEIGHT_MAX * img_ratio)], order=0) 457 | 458 | # Convert the normal map to tensor. 459 | normal_refined_rgb_t = torch.from_numpy(np.transpose(normal_refined_rgb, axes=(2, 0, 1))) 460 | 461 | # Plot the normal map. 462 | if self.normal_refined_win is not None: 463 | 464 | self.vis.image( 465 | normal_refined_rgb_t, 466 | env=self.environment, 467 | win=self.normal_refined_win, 468 | opts=dict(title='refined normal')) 469 | 470 | else: 471 | 472 | self.normal_refined_win = self.vis.image( 473 | normal_refined_rgb_t, 474 | env=self.environment, 475 | opts=dict(title='refined normal')) 476 | 477 | # ============================================================================================================== 478 | 479 | # Depth consistency loss. 480 | if idepth_consistency_loss is not None: 481 | 482 | if self.idepth_consistency_loss_win is not None: 483 | 484 | self.vis.line( 485 | X=idepth_consistency_loss[0], 486 | Y=idepth_consistency_loss[1], 487 | env=self.environment, 488 | win=self.idepth_consistency_loss_win, 489 | update='append') 490 | 491 | else: 492 | 493 | self.idepth_consistency_loss_win = self.vis.line( 494 | X=idepth_consistency_loss[0], 495 | Y=idepth_consistency_loss[1], 496 | env=self.environment, 497 | opts=dict( 498 | xlabel='iterations', 499 | ylabel='loss', 500 | title='inverse depth consistency loss', 501 | markers=True, 502 | markersymbol='dot')) 503 | 504 | # ============================================================================================================== 505 | 506 | # Normal consistency loss. 507 | if inormal_consistency_loss is not None: 508 | 509 | if self.inormal_consistency_loss_win is not None: 510 | 511 | self.vis.line( 512 | X=inormal_consistency_loss[0], 513 | Y=inormal_consistency_loss[1], 514 | env=self.environment, 515 | win=self.inormal_consistency_loss_win, 516 | update='append') 517 | 518 | else: 519 | 520 | self.inormal_consistency_loss_win = self.vis.line( 521 | X=inormal_consistency_loss[0], 522 | Y=inormal_consistency_loss[1], 523 | env=self.environment, 524 | opts=dict( 525 | xlabel='iterations', 526 | ylabel='loss', 527 | title='2D normal consistency loss', 528 | markers=True, 529 | markersymbol='dot')) 530 | 531 | # ============================================================================================================== 532 | 533 | # Depth regularization loss. 534 | if regularization_loss is not None: 535 | 536 | if self.regularization_loss_win is not None: 537 | 538 | self.vis.line( 539 | X=regularization_loss[0], 540 | Y=regularization_loss[1], 541 | env=self.environment, 542 | win=self.regularization_loss_win, 543 | update='append') 544 | 545 | else: 546 | 547 | self.regularization_loss_win = self.vis.line( 548 | X=regularization_loss[0], 549 | Y=regularization_loss[1], 550 | env=self.environment, 551 | opts=dict( 552 | xlabel='iterations', 553 | ylabel='loss', 554 | title='regularization loss', 555 | markers=True, 556 | markersymbol='dot')) 557 | 558 | # ============================================================================================================== 559 | 560 | # Global loss. 
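# (As with the previous windows, the first call to `vis.line` below creates the plot and stores its handle,
# while the subsequent calls pass `win=...` together with `update='append'` to extend it in place.)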
561 | if global_loss is not None: 562 | 563 | if self.global_loss_win is not None: 564 | 565 | self.vis.line( 566 | X=global_loss[0], 567 | Y=global_loss[1], 568 | env=self.environment, 569 | win=self.global_loss_win, 570 | update='append') 571 | 572 | else: 573 | 574 | self.global_loss_win = self.vis.line( 575 | X=global_loss[0], 576 | Y=global_loss[1], 577 | env=self.environment, 578 | opts=dict( 579 | xlabel='iterations', 580 | ylabel='loss', 581 | title='global loss', 582 | markers=True, 583 | markersymbol='dot')) 584 | 585 | def setup(self, 586 | env_name: str = None, 587 | depth_range: Tuple[float, float] = None) -> None: 588 | 589 | # Reset the plot windows. 590 | self.texture_win = None 591 | self.depth_win = None 592 | self.depth_init_win = None 593 | self.depth_refined_win = None 594 | self.normal_win = None 595 | self.normal_init_win = None 596 | self.normal_refined_win = None 597 | self.depth_gt_win = None 598 | self.depth_error_win = None 599 | self.depth_refined_error_win = None 600 | 601 | # Reset the loss windows. 602 | self.idepth_consistency_loss_win = None 603 | self.inormal_consistency_loss_win = None 604 | self.regularization_loss_win = None 605 | self.global_loss_win = None 606 | 607 | # Reset the ground truth depth map. 608 | self.depth_gt = None 609 | 610 | # Reset the plotting depth range. 611 | self.depth_min = None 612 | self.depth_max = None 613 | 614 | # Set the new plotting environment name. 615 | if env_name is not None: 616 | self.environment = env_name 617 | 618 | # Set the plotting depth range. 619 | if depth_range is not None: 620 | self.depth_min = depth_range[0] 621 | self.depth_max = depth_range[1] 622 | -------------------------------------------------------------------------------- /refinement.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 
23 | #
24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com)
25 |
26 | import math
27 | import torch
28 | import torch.nn as nn
29 | import torch.nn.functional as fun
30 | from torch import device as dev
31 | import numpy as np
32 | from misc import resize_map, space2plane_normal, plane2space_normal, depth2normal
33 | from filters import gradient_filter
34 | from losses import DepthConsistencyL1, NormalConsistencyL1, PieceWisePlanarRegularization
35 | from cv2 import cvtColor, COLOR_RGB2GRAY
36 | from transforms import depth2depth_inv, depth_inv2depth, depth_range2depth_inv_range
37 | from logger import Logger
38 | from typing import Tuple, List, Dict
39 |
40 |
41 | class Loss(nn.Module):
42 |     """It creates a loss function consisting of an inverse depth map consistency loss, a 2D normal map consistency loss
43 |     and a joint inverse depth map and normal map regularization. The 2D normal map is a 2D vector field capturing the
44 |     orientation of the inverse depth map.
45 |
46 |     The independent variables of this loss are `self.idepth` and `self.inormal`.
47 |     """
48 |
49 |     def __init__(self,
50 |                  image: np.array, idepth: np.array, idepth_range: Tuple[float, float],
51 |                  loss_param: Dict[str, float],
52 |                  idepth_confidence: np.array = None,
53 |                  inormal: np.array = None,
54 |                  idepth_init: np.array = None,
55 |                  inormal_init: np.array = None,
56 |                  device: torch.device = torch.device('cpu')) -> None:
57 |         """`Loss` constructor. It considers the inverse depth map and the corresponding 2D normal map.
58 |
59 |         Args:
60 |             image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
61 |             idepth: inverse depth map to refine, arranged as an `(H, W)` array.
62 |             idepth_range: inverse depth values must belong to the interval `[idepth_range[0], idepth_range[1]]`.
63 |             loss_param: dictionary containing the loss parameters.
64 |             idepth_confidence: confidence map associated to the inverse depth map to refine.
65 |                 It must have entries in `[0, 1]`.
66 |             inormal: 2D normal map associated to the depth map to refine, arranged as an `(H, W, 2)` array.
67 |                 It is ignored if the normal consistency loss is off.
68 |             idepth_init: initial guess for the refined inverse depth map.
69 |             inormal_init: initial guess for the 2D normal map associated to the refined inverse depth map.
70 |             device: device on which the computation will take place.
71 |         """
72 |
73 |         # Call the parent constructor.
74 |         super(Loss, self).__init__()
75 |
76 |         # Convert the input data from `np.array` to `torch.Tensor`. In particular, arrays are converted into 4D tensors
77 |         # of size `(1, C, H, W)` with `H`, `W` and `C` representing the height, width and channel number, respectively.
78 |
79 |         # Check the inverse depth range and register it.
80 |         if idepth_range[0] <= 0 or idepth_range[1] == float('inf') or idepth_range[0] > idepth_range[1]:
81 |             raise ValueError('Invalid inverse depth range.')
82 |         self.idepth_min = idepth_range[0]
83 |         self.idepth_max = idepth_range[1]
84 |
85 |         # Register the first optimization variable, i.e., the refined inverse depth map, and initialize it.
86 |         if idepth_init is not None:
87 |             aux = torch.as_tensor(idepth_init[None, None, ])
88 |         else:
89 |             aux = torch.as_tensor(idepth[None, None, ])
90 |         self.idepth = nn.Parameter(aux.clone(), requires_grad=True)
91 |         # Note that the data passed to `self.idepth` is copied in order to avoid shared data between different tensors.
92 |
93 |         # Register the second optimization variable, i.e., the normal map, and initialize it.
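        # Three initialization sources are tried in order: an explicit initial guess, the input 2D normal
        # map and, as a fallback, the spatial gradient of the inverse depth map itself, computed below with
        # a smoothed derivative filter. The fallback is consistent with the local plane model
        # `idepth(x, y) ~ idepth(x_0, y_0) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`,
        # whose parameters `(w_0, w_1)` are exactly the two channels of the 2D normal map.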
94 |         if inormal_init is not None:
95 |             aux = torch.as_tensor((np.transpose(inormal_init, (2, 0, 1))[None, ]).copy())
96 |         elif inormal is not None:
97 |             aux = torch.as_tensor((np.transpose(inormal, (2, 0, 1))[None, ]).copy())
98 |         else:
99 |             with torch.no_grad():
100 |                 filter_size = 5
101 |                 filter_sigma = 5.0
102 |                 grad_filter = gradient_filter(filter_size, filter_sigma)
103 |                 pad = tuple([int((filter_size - 1) / 2)] * 4)
104 |                 aux = fun.conv2d(
105 |                     fun.pad(self.idepth, pad, mode='replicate'),
106 |                     grad_filter.to(self.idepth))
107 |         self.inormal = nn.Parameter(aux.clone(), requires_grad=True)
108 |         # The `torch.no_grad()` block prevents PyTorch from tracking the operation.
109 |
110 |         # Create the depth consistency loss.
111 |         self.idepth_consistency_loss = DepthConsistencyL1(
112 |             idepth, idepth_range,
113 |             depth_confidence=idepth_confidence,
114 |             multiplier=loss_param['lambda_depth_consistency'])
115 |
116 |         # Create the 2D normal consistency loss.
117 |         if loss_param['lambda_normal_consistency'] > 0:
118 |
119 |             assert inormal is not None, 'Cannot activate the normal consistency term with no input normal map.'
120 |
121 |             self.inormal_consistency_loss = NormalConsistencyL1(
122 |                 inormal,
123 |                 normal_confidence=idepth_confidence,
124 |                 multiplier=loss_param['lambda_normal_consistency'])
125 |
126 |         else:
127 |             self.inormal_consistency_loss = None
128 |
129 |         # Create the depth regularization loss.
130 |         self.regularization_loss = PieceWisePlanarRegularization(
131 |             image,
132 |             loss_param['gamma_regularization'],
133 |             window_size=loss_param['window_size'],
134 |             patch_size=loss_param['patch_size'],
135 |             sigma_intensity=loss_param['sigma_intensity'],
136 |             sigma_spatial=loss_param['sigma_spatial'],
137 |             degree_max=loss_param['degree_max'],
138 |             version=loss_param['regularization'],
139 |             multiplier=loss_param['lambda_regularization'],
140 |             device=device)
141 |
142 |     def forward(self) -> Tuple[torch.Tensor, float, float, float]:
143 |         """It evaluates the loss function at (`self.idepth`, `self.inormal`).
144 |
145 |         Returns:
146 |             the loss function value, and the value of its three terms, at (`self.idepth`, `self.inormal`).
147 |         """
148 |
149 |         # Inverse depth consistency loss.
150 |         idepth_consistency_loss = self.idepth_consistency_loss(self.idepth)
151 |
152 |         # 2D normal consistency loss.
153 |         if self.inormal_consistency_loss is not None:
154 |             inormal_consistency_loss = self.inormal_consistency_loss(self.inormal)
155 |         else:
156 |             inormal_consistency_loss = self.idepth.new_zeros(1, requires_grad=True)
157 |
158 |         # Regularization loss.
159 |         regularization_loss = self.regularization_loss(self.idepth, self.inormal)
160 |
161 |         # Assemble the full loss.
162 |         loss = idepth_consistency_loss + inormal_consistency_loss + regularization_loss
163 |
164 |         return loss, idepth_consistency_loss.item(), inormal_consistency_loss.item(), regularization_loss.item()
165 |
166 |
167 | def refine_depth(image: np.array, depth: np.array, depth_range: Tuple[float, float],
168 |                  camera_param: Dict[str, float], loss_param: List[Dict], opt_param: List[Dict],
169 |                  depth_confidence: np.array = None,
170 |                  normal: np.array = None,
171 |                  depth_init: np.array = None,
172 |                  normal_init: np.array = None,
173 |                  depth_gt: np.array = None,
174 |                  logger: Logger = None,
175 |                  device: dev = dev('cpu')) -> Tuple[np.array, np.array]:
176 |     """It refines the input depth map and estimates the corresponding normal map in a multi-scale fashion.
177 |
178 |     It refines the input depth map and estimates the corresponding normal map according to the method described
179 |     in the following article:
180 |
181 |     Mattia Rossi, Mireille El Gheche, Andreas Kuhn, Pascal Frossard,
182 |     "Joint Graph-based Depth Refinement and Normal Estimation",
183 |     in IEEE Computer Vision and Pattern Recognition Conference (CVPR), Seattle, WA, USA, 2020.
184 |
185 |     If the input depth map comes together with a normal map, the latter can be refined as well (rather than estimated)
186 |     by activating the normal consistency term (not described in the article).
187 |
188 |     The `loss_param` input parameter contains a list of dictionaries, one for each scale. Each dictionary must contain
189 |     the following keys:
190 |     - lambda_depth_consistency: depth consistency term multiplier.
191 |     - lambda_normal_consistency: normal consistency term multiplier.
192 |     - lambda_regularization: depth regularization term multiplier.
193 |     - gamma_regularization: depth regularization term internal multiplier.
194 |     - window_size: search window size (window_size x window_size) to be used in the graph construction.
195 |     - patch_size: patch size (patch_size x patch_size) to be used in the graph construction.
196 |     - sigma_intensity: color difference standard deviation for patch comparison in the graph construction.
197 |     - sigma_spatial: euclidean distance standard deviation for patch comparison in the graph construction.
198 |     - degree_max: maximum number of per pixel neighbors in the graph.
199 |     - regularization: regularization type (0 for NLTGV, 1 for our regularization).
200 |
201 |     The `opt_param` input parameter contains a list of dictionaries, one for each scale. Each dictionary must contain
202 |     the following keys:
203 |     - iter_max: maximum number of iterations.
204 |     - eps_stop: minimum relative change between the depth maps of two consecutive iterations; below this threshold the optimization stops.
205 |     - attempt_max: maximum number of iterations without improving the loss.
206 |     - learning_rate: dictionary containing the following keys:
207 |         - lr_start: initial learning rate.
208 |         - lr_slot_nb: number of partitions; each partition adopts a learning rate which is 1/10 of the one employed
209 |           in the previous partition; a value greater than or equal to 1 disables the relative depth map change and loss stagnation stopping criteria.
210 |     - plotting_step: number of steps between two plot updates of the logger.
211 |     - depth_error_threshold: error threshold (in meters) to be used in the evaluation against the ground truth.
212 |
213 |     Args:
214 |         image: reference image, arranged as an `(H, W)` or `(H, W, C)` array.
215 |         depth: depth map to refine, arranged as an `(H, W)` array.
216 |         depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
217 |         camera_param: dictionary containing `f_x`, `f_y`, `c_x`, `c_y`.
218 |         loss_param: list of dictionaries, each one containing the loss parameters for a given scale.
219 |         opt_param: list of dictionaries, each one containing the solver parameters for a given scale.
220 |         depth_confidence: confidence map associated to the depth map to refine. It must have entries in `[0, 1]`.
221 |         normal: 3D normal map to refine, arranged as an `(H, W, 3)` array. It is ignored if the normal consistency loss is off.
222 |         depth_init: initial guess for the refined depth map.
223 |         normal_init: initial guess for the 3D normal map associated to the refined depth map.
224 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
225 |         logger: logger to plot visual results and statistics at runtime.
226 | device: device on which the computation will take place. 227 | 228 | Returns: 229 | The refined depth map and the corresponding normal map. 230 | """ 231 | 232 | # Number of scales in the multi-scale pyramid. 233 | scale_nb = len(opt_param) 234 | 235 | # Allocate the multi-scale pyramid. 236 | scale_pyramid = [None] * scale_nb 237 | camera_param_pyramid = [None] * scale_nb 238 | image_pyramid = [None] * scale_nb 239 | depth_pyramid = [None] * scale_nb 240 | depth_confidence_pyramid = [None] * scale_nb 241 | normal_pyramid = [None] * scale_nb 242 | depth_init_pyramid = [None] * scale_nb 243 | normal_init_pyramid = [None] * scale_nb 244 | depth_gt_pyramid = [None] * scale_nb 245 | 246 | # Build the multi-scale pyramid. 247 | for i in range(scale_nb): 248 | 249 | if i > 0: 250 | 251 | # Compute the image dimensions for the current scale. 252 | height = int(round(scale_pyramid[i - 1][0] / 2.0)) 253 | width = int(round(scale_pyramid[i - 1][1] / 2.0)) 254 | scale_pyramid[i] = (height, width) 255 | 256 | # Compute the camera parameters for the current scale. 257 | x_ratio = scale_pyramid[i][1] / scale_pyramid[i - 1][1] 258 | y_ratio = scale_pyramid[i][0] / scale_pyramid[i - 1][0] 259 | camera_param_pyramid[i] = {'f_x': camera_param_pyramid[i - 1]['f_x'] * x_ratio, 260 | 'f_y': camera_param_pyramid[i - 1]['f_y'] * y_ratio, 261 | 'c_x': camera_param_pyramid[i - 1]['c_x'] * x_ratio, 262 | 'c_y': camera_param_pyramid[i - 1]['c_y'] * y_ratio} 263 | 264 | # Downscale the image. 265 | image_pyramid[i] = resize_map(image_pyramid[i - 1], scale_pyramid[i], order=1) 266 | 267 | # Downscale the noisy/incomplete depth map. 268 | depth_pyramid[i] = resize_map(depth_pyramid[i - 1], scale_pyramid[i], order=0) 269 | 270 | # Downscale the noisy/incomplete depth map confidence. 271 | if depth_confidence_pyramid[i - 1] is not None: 272 | depth_confidence_pyramid[i] = resize_map(depth_confidence_pyramid[i - 1], scale_pyramid[i], order=0) 273 | else: 274 | depth_confidence_pyramid[i] = None 275 | 276 | # Downscale the noisy/incomplete normal map. 277 | if normal_pyramid[i - 1] is not None: 278 | normal_pyramid[i] = resize_map(normal_pyramid[i - 1], scale_pyramid[i], order=0) 279 | 280 | else: 281 | normal_pyramid[i] = None 282 | 283 | # Downscale the initial depth map estimate (we need only the lowest scale). 284 | if depth_init_pyramid[i - 1] is not None: 285 | depth_init_pyramid[i] = resize_map(depth_init_pyramid[i - 1], scale_pyramid[i], order=0) 286 | depth_init_pyramid[i - 1] = None 287 | else: 288 | depth_init_pyramid[i] = None 289 | 290 | # Downscale the initial normal map estimate (we need only the lowest scale). 291 | if normal_init_pyramid[i - 1] is not None: 292 | normal_init_pyramid[i] = resize_map(normal_init_pyramid[i - 1], scale_pyramid[i], order=0) 293 | normal_init_pyramid[i - 1] = None 294 | else: 295 | normal_init_pyramid[i] = None 296 | 297 | # Downscale the ground truth depth map. 298 | if depth_gt_pyramid[i - 1] is not None: 299 | depth_gt_pyramid[i] = resize_map(depth_gt_pyramid[i - 1], scale_pyramid[i], order=0) 300 | else: 301 | depth_gt_pyramid[i] = None 302 | 303 | else: 304 | 305 | # Store the original image dimensions. 306 | scale_pyramid[i] = (image.shape[0], image.shape[1]) 307 | 308 | # Store the original camera parameters. 309 | camera_param_pyramid[i] = camera_param 310 | 311 | # The lowest scale hosts the original data. 
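            # As an illustration, with a 480x640 reference image and `scale_nb = 3`, this loop produces
            # `scale_pyramid = [(480, 640), (240, 320), (120, 160)]` (before the reversal performed below),
            # and the focal lengths and principal point are rescaled by the same per-axis ratios, so that
            # each scale remains geometrically consistent with its down-sampled image.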
312 | image_pyramid[i] = image 313 | depth_pyramid[i] = depth 314 | depth_confidence_pyramid[i] = depth_confidence 315 | normal_pyramid[i] = normal 316 | depth_init_pyramid[i] = depth_init 317 | normal_init_pyramid[i] = normal_init 318 | depth_gt_pyramid[i] = depth_gt 319 | 320 | # Reverse the multi-scale pyramid. 321 | scale_pyramid.reverse() 322 | camera_param_pyramid.reverse() 323 | image_pyramid.reverse() 324 | depth_pyramid.reverse() 325 | depth_confidence_pyramid.reverse() 326 | normal_pyramid.reverse() 327 | depth_init_pyramid.reverse() # It contains only the lowest scale. 328 | normal_init_pyramid.reverse() # It contains only the lowest scale. 329 | depth_gt_pyramid.reverse() 330 | 331 | # Perform the multi-scale depth refinement. 332 | scale_name_pyramid = [None] * scale_nb 333 | depth_refined_pyramid = [None] * scale_nb 334 | normal_refined_pyramid = [None] * scale_nb 335 | for i in range(scale_nb): 336 | 337 | scale_name_pyramid[i] = ('{} ({}x{})'.format(i, scale_pyramid[i][0], scale_pyramid[i][1])) 338 | print('Processing scale {}'.format(scale_name_pyramid[i])) 339 | 340 | # Setup a new plotting environment. 341 | if logger is not None: 342 | 343 | if depth_gt_pyramid[i] is not None: 344 | depth_plotting_range = (np.min(depth_gt_pyramid[i]).item(), np.max(depth_gt_pyramid[i]).item()) 345 | else: 346 | depth_plotting_range = np.percentile(depth, [5, 95]) 347 | logger.setup(env_name=scale_name_pyramid[i], depth_range=depth_plotting_range) 348 | 349 | # Initialize the next scale with the refined depth map and the corresponding normal map from the previous scale. 350 | # The two maps are up-sampled first. 351 | if i > 0: 352 | depth_init_pyramid[i] = resize_map(depth_refined_pyramid[i - 1], scale_pyramid[i], order=0) 353 | if normal_refined_pyramid[i - 1] is not None: 354 | normal_init_pyramid[i] = resize_map(normal_refined_pyramid[i - 1], scale_pyramid[i], order=0) 355 | 356 | # Refine the depth map of the current scale. 357 | depth_refined, normal_refined = refine( 358 | image_pyramid[i], depth_pyramid[i], depth_range, 359 | camera_param_pyramid[i], loss_param[i], opt_param[i], 360 | depth_confidence=depth_confidence_pyramid[i], 361 | depth_init=depth_init_pyramid[i], 362 | normal=normal_pyramid[i], 363 | normal_init=normal_init_pyramid[i], 364 | depth_gt=depth_gt_pyramid[i], 365 | logger=logger, 366 | device=device) 367 | 368 | depth_refined_pyramid[i] = depth_refined 369 | normal_refined_pyramid[i] = normal_refined 370 | 371 | # Extract the refined depth map and the corresponding normal map. 372 | depth_refined = depth_refined_pyramid[-1] 373 | normal_refined = normal_refined_pyramid[-1] 374 | 375 | # Delete all the plotting environments. 376 | if logger is not None: 377 | for i in range(scale_nb): 378 | logger.vis.delete_env(scale_name_pyramid[i]) 379 | 380 | return depth_refined, normal_refined 381 | 382 | 383 | def refine(image: np.array, depth: np.array, depth_range: Tuple[float, float], 384 | camera_param: Dict[str, float], loss_param: Dict, opt_param: Dict, 385 | depth_confidence: np.array = None, 386 | normal: np.array = None, 387 | depth_init: np.array = None, 388 | normal_init: np.array = None, 389 | depth_gt: np.array = None, 390 | logger: Logger = None, 391 | device: dev = dev('cpu')) -> Tuple[np.array, np.array]: 392 | """It implements one scale of the multi-scale pyramid of the function `refine_depth`. 393 | 394 | Args: 395 | image: reference image, arranged as an `(H, W)` or `(H, W, C)` array. 396 | depth: depth map to refine, arranged as an `(H, W)` array. 
397 |         depth_range: depth values must belong to the interval `[depth_range[0], depth_range[1]]`.
398 |         camera_param: dictionary containing `f_x`, `f_y`, `c_x`, `c_y`.
399 |         loss_param: dictionary containing the loss parameters.
400 |         opt_param: dictionary containing the solver parameters.
401 |         depth_confidence: confidence map associated to the depth map to refine. It must have entries in `[0, 1]`.
402 |         normal: 3D normal map to refine, arranged as an `(H, W, 3)` array. It is ignored if the normal consistency loss is off.
403 |         depth_init: initial guess for the refined depth map.
404 |         normal_init: initial guess for the 3D normal map associated to the refined depth map.
405 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
406 |         logger: logger to plot visual results and statistics at runtime.
407 |         device: device on which the computation will take place.
408 |
409 |     Returns:
410 |         The refined depth map and the corresponding normal map.
411 |     """
412 |
413 |     # Check that the input maps have the same height and width as the input reference image.
414 |     height = image.shape[0]
415 |     width = image.shape[1]
416 |     assert depth.shape == (height, width),\
417 |         'Input depth map size not compatible with the reference image size.'
418 |     if depth_confidence is not None:
419 |         assert depth_confidence.shape == (height, width),\
420 |             'Input depth map confidence size not compatible with the reference image size.'
421 |     if normal is not None:
422 |         assert normal.shape == (height, width, 3),\
423 |             'Input normal map size not compatible with the reference image size.'
424 |     if depth_init is not None:
425 |         assert depth_init.shape == (height, width),\
426 |             'Input initial depth map size not compatible with the reference image size.'
427 |     if normal_init is not None:
428 |         assert normal_init.shape == (height, width, 3),\
429 |             'Input initial normal map size not compatible with the reference image size.'
430 |     if depth_gt is not None:
431 |         assert depth_gt.shape == (height, width),\
432 |             'Ground truth depth size not compatible with the reference image size.'
433 |
434 |     # Check the depth map data type.
435 |     if depth.dtype == np.float32:
436 |         depth_dtype = torch.float
437 |     elif depth.dtype == np.float64:
438 |         depth_dtype = torch.double
439 |     else:
440 |         raise TypeError('The input depth map must be either of type double or float.')
441 |
442 |     # Convert the reference image to gray scale.
443 |     image_gray = image
444 |     if image_gray.ndim == 3:
445 |         image_gray = cvtColor(image_gray.astype(np.float32), COLOR_RGB2GRAY)
446 |         image_gray = image_gray.astype(image.dtype)
447 |     # The function `cvtColor` requires an input image of type uint8, uint16 or float32. Therefore, `image_gray` is
448 |     # first converted to float32 (to minimize the precision loss) and then back to its original data type.
449 |
450 |     # Plot.
451 |     if logger is not None:
452 |         logger.plot(
453 |             texture=image,
454 |             depth=depth,
455 |             depth_init=depth_init,
456 |             depth_gt=depth_gt,
457 |             normal=normal,
458 |             normal_init=normal_init)
459 |
460 |     # Convert the depth maps.
461 |     idepth = depth2depth_inv(depth)
462 |     idepth_init = depth2depth_inv(depth_init) if depth_init is not None else None
463 |     idepth_range = depth_range2depth_inv_range(depth_range)
464 |
465 |     # Convert the normal maps.
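    # The refinement operates in the inverse depth domain: `depth2depth_inv` maps the depth maps to
    # inverse depth (a disparity-like quantity), and the 3D unit normals are mapped below to the
    # equivalent 2D plane normals `(w_0, w_1)` by `space2plane_normal` (see `misc.py`), using the
    # pinhole camera intrinsics.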
466 |     inormal = None
467 |     inormal_init = None
468 |     if normal is not None:
469 |         inormal = space2plane_normal(
470 |             depth,
471 |             normal,
472 |             (camera_param['f_x'], camera_param['f_y']),
473 |             (camera_param['c_x'], camera_param['c_y']))
474 |     if normal_init is not None:
475 |         inormal_init = space2plane_normal(
476 |             depth_init if depth_init is not None else depth,
477 |             normal_init,
478 |             (camera_param['f_x'], camera_param['f_y']),
479 |             (camera_param['c_x'], camera_param['c_y']))
480 |
481 |     # Create the loss object.
482 |     loss = Loss(image_gray, idepth, idepth_range,
483 |                 loss_param,
484 |                 idepth_confidence=depth_confidence,
485 |                 inormal=inormal,
486 |                 idepth_init=idepth_init,
487 |                 inormal_init=inormal_init,
488 |                 device=device).to(device=device, dtype=depth_dtype)
489 |
490 |     # Set the maximum number of iterations.
491 |     assert 'iter_max' in opt_param, 'Missing \'iter_max\' in `opt_param`.'
492 |     iter_max = opt_param['iter_max']
493 |
494 |     # Set the learning rate and define the optimization policy (i.e., with or without a scheduler).
495 |     assert 'learning_rate' in opt_param, 'Missing \'learning_rate\' in `opt_param`.'
496 |     assert 'lr_start' in opt_param['learning_rate'], 'Missing \'lr_start\' in `opt_param[\'learning_rate\']`.'
497 |     assert 'lr_slot_nb' in opt_param['learning_rate'], 'Missing \'lr_slot_nb\' in `opt_param[\'learning_rate\']`.'
498 |     learning_rate_start = opt_param['learning_rate']['lr_start']
499 |     learning_rate_slot_nb = opt_param['learning_rate']['lr_slot_nb']
500 |
501 |     # Define the stopping condition.
502 |     if learning_rate_slot_nb < 1:
503 |
504 |         # The learning rate is kept constant.
505 |
506 |         # The optimization terminates if one of the following events occurs:
507 |         # - the relative depth change is smaller than `eps_stop`,
508 |         # - the loss is not improved for more than `attempt_max` consecutive iterations,
509 |         # - `iter_max` iterations have been performed.
510 |
511 |         assert 'eps_stop' in opt_param, 'Missing \'eps_stop\' in `opt_param`.'
512 |         assert 'attempt_max' in opt_param, 'Missing \'attempt_max\' in `opt_param`.'
513 |
514 |         eps_stop = opt_param['eps_stop']
515 |         attempt_max = opt_param['attempt_max']
516 |         scheduler_step_size = iter_max * 2
517 |
518 |     else:
519 |
520 |         # The learning rate is dynamically updated.
521 |
522 |         # The optimization terminates only when `iter_max` iterations have been performed.
523 |         # However, in this scenario the learning rate is progressively decreased:
524 |         # - the learning rate starts at `learning_rate_start`,
525 |         # - it is decreased `learning_rate_slot_nb - 1` times by a factor `10`.
526 |
527 |         eps_stop = 0.0
528 |         attempt_max = float('inf')
529 |         scheduler_step_size = int(math.ceil(float(iter_max) / float(learning_rate_slot_nb)))
530 |
531 |     # Set the plotting step.
532 |     assert 'plotting_step' in opt_param, 'Missing \'plotting_step\' in `opt_param`.'
533 |     plotting_step = opt_param['plotting_step']
534 |
535 |     # Allocate the arrays used to store the loss function values.
536 |     loss_history = np.zeros(iter_max + 1)
537 |     idepth_consistency_history = np.zeros(iter_max + 1)
538 |     inormal_consistency_history = np.zeros(iter_max + 1) if loss_param['lambda_normal_consistency'] > 0 else None
539 |     regularization_history = np.zeros(iter_max + 1)
540 |
541 |     # Create an ADAM optimizer.
542 |     optimizer = torch.optim.Adam(loss.parameters(), lr=learning_rate_start)
543 |
544 |     # Create a learning rate scheduler.
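    # Illustrative behavior of the scheduler created below (`gamma=0.1`): with `iter_max = 1000` and
    # `lr_slot_nb = 2`, `scheduler_step_size = 500`, so the learning rate equals `lr_start` for the first
    # 500 iterations and `lr_start / 10` afterwards; with `lr_slot_nb < 1`, `scheduler_step_size = 2 * iter_max`
    # and the learning rate therefore never decays within the run.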
545 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, scheduler_step_size, gamma=0.1) 546 | 547 | #################################################################################################################### 548 | ################################################# OPTIMIZATION ##################################################### 549 | #################################################################################################################### 550 | 551 | # Lowest minimum value of the loss encountered during the optimization. 552 | loss_value_min = float('inf') 553 | 554 | # Number of consecutive iterations without improving `loss_value_min`. 555 | attempt_counter = 0 556 | 557 | # Relative change of the depth map between two consecutive iterations. 558 | relative_depth_change = float('inf') 559 | 560 | ################################################# CASE `i == 0` #################################################### 561 | 562 | # Evaluate the loss function. 563 | optimizer.zero_grad() 564 | loss_value, idepth_consistency_value, inormal_consistency_value, regularization_value = loss.forward() 565 | 566 | # Log operations. 567 | with torch.no_grad(): 568 | 569 | # Store the current value of the loss. 570 | idepth_consistency_history[0] = idepth_consistency_value 571 | if inormal_consistency_history is not None: 572 | inormal_consistency_history[0] = inormal_consistency_value 573 | regularization_history[0] = regularization_value 574 | loss_history[0] = loss_value.item() 575 | 576 | # Log the optimization status to the standard output. 577 | print('Iteration: {:6}, Fails: {:3}, Rel. depth change: {:.6f}, Loss: {:.6f}'.format( 578 | 0, attempt_counter, relative_depth_change, loss_history[0]), flush=True) 579 | 580 | # Plot the optimization status. 581 | indexes = np.arange(0, 1) 582 | if logger is not None: 583 | depth_aux = depth_inv2depth( 584 | loss.idepth.data.to('cpu').squeeze().numpy(), depth_range) 585 | normal_aux = plane2space_normal( 586 | depth_aux, 587 | np.transpose(loss.inormal.data.to('cpu').squeeze().numpy(), (1, 2, 0)), 588 | (camera_param['f_x'], camera_param['f_y']), 589 | (camera_param['c_x'], camera_param['c_y'])) 590 | logger.plot( 591 | depth_refined=depth_aux, 592 | normal_refined=normal_aux, 593 | idepth_consistency_loss=(indexes, idepth_consistency_history[indexes]), 594 | inormal_consistency_loss=((indexes, inormal_consistency_history[indexes]) 595 | if inormal_consistency_history is not None else None), 596 | regularization_loss=(indexes, regularization_history[indexes]), 597 | global_loss=(indexes, loss_history[indexes])) 598 | 599 | ################################################# CASE `i > 0` ##################################################### 600 | 601 | for i in range(1, iter_max + 1): 602 | 603 | # Compute the gradient of each parameter of the loss (i.e., the depth map and the normal maps). 604 | loss_value.backward() 605 | 606 | # Store a copy of the old depth map. 607 | idepth_old = loss.idepth.clone().detach() 608 | 609 | # Update the old depth map. 610 | optimizer.step() 611 | 612 | # Update the optimizer learning rate. 613 | scheduler.step() 614 | 615 | # Without PyTorch tracking, project the new depth map into the specified depth range. 616 | with torch.no_grad(): 617 | loss.idepth.data = loss.idepth.data.clamp(idepth_range[0], idepth_range[1]) 618 | 619 | # Evaluate the loss function at the new depth map and normal map. 
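        # Together with the clamping above, each iteration amounts to a projected gradient step: ADAM
        # updates `(idepth, inormal)`, the inverse depth is projected back onto the box
        # `[idepth_range[0], idepth_range[1]]`, and the loss is then re-evaluated at the projected point.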
620 |         optimizer.zero_grad()
621 |         loss_value, idepth_consistency_value, inormal_consistency_value, regularization_value = loss.forward()
622 |
623 |         # Without PyTorch tracking, perform the logging and bookkeeping routines.
624 |         with torch.no_grad():
625 |
626 |             # Store the value of the loss evaluated at the new depth map.
627 |             idepth_consistency_history[i] = idepth_consistency_value
628 |             if inormal_consistency_history is not None:
629 |                 inormal_consistency_history[i] = inormal_consistency_value
630 |             regularization_history[i] = regularization_value
631 |             loss_history[i] = loss_value.item()
632 |
633 |             # Compute the relative depth map change.
634 |             relative_depth_change = torch.norm(
635 |                 (idepth_old - loss.idepth).view(-1, 1)) / torch.norm(idepth_old.view(-1, 1))
636 |
637 |             # Update the lowest encountered minimum.
638 |             if loss_history[i] >= loss_value_min:
639 |                 attempt_counter = attempt_counter + 1
640 |             else:
641 |                 attempt_counter = 0
642 |                 loss_value_min = loss_history[i]
643 |
644 |             # Evaluate the stopping condition.
645 |             stop_now = (relative_depth_change <= eps_stop) or (attempt_counter >= attempt_max)
646 |
647 |             if (i % plotting_step == 0) or stop_now or ((i + 1) > iter_max):
648 |
649 |                 # Log the optimization status to the standard output.
650 |                 print('Iteration: {:6}, Fails: {:3}, Rel. depth change: {:.6f}, Loss: {:.6f}'.format(
651 |                     i, attempt_counter, relative_depth_change, loss_history[i]), flush=True)
652 |
653 |                 # Plot the optimization status.
654 |                 indexes = np.arange(max(0, i - (plotting_step - 1)), i + 1)  # The index `i` is included; the lower bound is clamped at zero for early stops.
655 |                 if logger is not None:
656 |                     depth_aux = depth_inv2depth(
657 |                         loss.idepth.data.to('cpu').squeeze().numpy(), depth_range)
658 |                     normal_aux = plane2space_normal(
659 |                         depth_aux,
660 |                         np.transpose(loss.inormal.data.to('cpu').squeeze().numpy(), (1, 2, 0)),
661 |                         (camera_param['f_x'], camera_param['f_y']),
662 |                         (camera_param['c_x'], camera_param['c_y']))
663 |                     logger.plot(
664 |                         depth_refined=depth_aux,
665 |                         normal_refined=normal_aux,
666 |                         idepth_consistency_loss=(indexes, idepth_consistency_history[indexes]),
667 |                         inormal_consistency_loss=((indexes, inormal_consistency_history[indexes])
668 |                                                   if inormal_consistency_history is not None else None),
669 |                         regularization_loss=(indexes, regularization_history[indexes]),
670 |                         global_loss=(indexes, loss_history[indexes]))
671 |
672 |             # If the stopping condition is met, terminate.
673 |             if stop_now:
674 |                 break
675 |
676 | ####################################################################################################################
677 | ####################################################################################################################
678 | ####################################################################################################################
679 |
680 |     # Extract the refined depth map.
681 |     depth_refined = depth_inv2depth(
682 |         loss.idepth.detach().to('cpu').numpy().squeeze(), depth_range)
683 |
684 |     # Extract the normal map associated to the refined depth map.
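    # `plane2space_normal` inverts `space2plane_normal`: given the refined depth map and the optimized 2D
    # plane normals, it recovers the corresponding 3D unit normals.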
685 | normal_refined = plane2space_normal( 686 | depth_refined, 687 | np.transpose(loss.inormal.detach().to('cpu').numpy().squeeze(), (1, 2, 0)), 688 | (camera_param['f_x'], camera_param['f_y']), 689 | (camera_param['c_x'], camera_param['c_y'])) 690 | 691 | return depth_refined, normal_refined 692 | -------------------------------------------------------------------------------- /misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, 2 | # ECOLE POLYTECHNIQUE FEDERALE DE LAUSANNE, Switzerland, 3 | # Laboratoire de Traitement des Signaux 4 (LTS4). 4 | # All rights reserved. 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | # SOFTWARE. 23 | # 24 | # Author: Mattia Rossi (rossi-mattia-at-gmail-com) 25 | 26 | import math 27 | import numpy as np 28 | import torch 29 | from torch.nn import functional as fun 30 | from scipy.ndimage import map_coordinates 31 | from scipy.interpolate import griddata 32 | from scipy.signal import convolve2d 33 | from filters import gauss_filter_deriv_2d, diff_filter_bank 34 | from transforms import depth2depth_inv 35 | from typing import Tuple, Union 36 | 37 | 38 | def resize_map(data: np.array, size_new: Tuple[int, int], order: int = 0) -> np.array: 39 | """It re-sizes the input map. 40 | 41 | It up-samples or down-samples any map (e.g., an image) with one or more channels. 42 | 43 | Args: 44 | data: map to resize, arranged as an `(H, W)` or `(H, W, C)` array. 45 | size_new: 2-tuple specifying the new height and width. 46 | order: order of the spline to be used in the re-sizing. 47 | 48 | Returns: 49 | The re-sized map, with dimensions `size_new[0], size_new[1]` or `size_new[0], size_new[1], C`. The output 50 | data type reflects the input one. 51 | """ 52 | 53 | # Check that the data is either 2D or 3D. 54 | if (data.ndim != 2) & (data.ndim != 3): 55 | raise ValueError('Input data must be either 2D or 3D.') 56 | 57 | # Input data dimensions. 58 | height = data.shape[0] 59 | width = data.shape[1] 60 | 61 | # The target dimensions. 62 | height_new, width_new = size_new 63 | 64 | # We make the following assumptions: 65 | # - each pixel in the input data has height `1` and width `1`, 66 | # - `data[y, x]` is concentrated at the spatial coordinates `(y, x)`. 
67 |     # According to the previous two assumptions:
68 |     # - the top left corner of the pixel associated to `data[y, x]` is at spatial coordinates `(y - 0.5, x - 0.5)`,
69 |     # - the bottom right corner of the pixel associated to `data[y, x]` is at spatial coordinates `(y + 0.5, x + 0.5)`,
70 |     # - `data` has its top left corner at the spatial coordinates `(- 0.5, - 0.5)`,
71 |     # - `data` has its bottom right corner at the spatial coordinates `(height - 1 + 0.5, width - 1 + 0.5)`.
72 |
73 |     # NOTE:
74 |     # Re-sizing the input data means enlarging the pixel size, not decreasing the data (image or depth) area.
75 |     # After resizing, the top left and bottom right corners of `data` will still be located at spatial coordinates
76 |     # `(- 0.5, - 0.5)` and `(height - 1 + 0.5, width - 1 + 0.5)`, respectively.
77 |
78 |     # New pixel dimensions.
79 |     pixel_height_new = float(height) / height_new
80 |     pixel_width_new = float(width) / width_new
81 |
82 |     # Compute the coordinates of the center of the top left pixel in the re-sized data.
83 |     start_y = - 0.5 + (pixel_height_new / 2.0)
84 |     start_x = - 0.5 + (pixel_width_new / 2.0)
85 |
86 |     # Compute the coordinates of the center of the bottom right pixel in the new data.
87 |     end_y = height - 1 + 0.5 - (pixel_height_new / 2.0)
88 |     end_x = width - 1 + 0.5 - (pixel_width_new / 2.0)
89 |
90 |     # Compute the new sampling grid.
91 |     y_coord_new, x_coord_new = np.mgrid[start_y:end_y:(height_new * 1j), start_x:end_x:(width_new * 1j)]
92 |
93 |     # Organize the sampling grid in a single array.
94 |     points_new = np.stack((y_coord_new.flatten(), x_coord_new.flatten()), axis=1)
95 |
96 |     # Re-sample the input data.
97 |     if data.ndim == 2:
98 |
99 |         # Single channel input.
100 |
101 |         aux = map_coordinates(data, points_new.T, order=order, mode='nearest')
102 |         data_resized = np.reshape(aux, (height_new, width_new))
103 |
104 |     else:
105 |
106 |         # Multiple channel input.
107 |
108 |         # Number of channels.
109 |         channel_nb = data.shape[2]
110 |
111 |         aux = tuple(
112 |             map_coordinates(data[:, :, i], points_new.T, order=order, mode='nearest') for i in range(channel_nb))
113 |         aux = np.stack(aux, axis=1)
114 |         data_resized = np.reshape(aux, (height_new, width_new, channel_nb))
115 |
116 |     return data_resized
117 |
118 |
119 | def filler_1d(data: np.array, mask: np.array) -> np.array:
120 |     """It fills a sparse 1D array.
121 |
122 |     It fills the missing entries in the sparse 1D array `data` using linear interpolation.
123 |     A missing entry can be filled if and only if it is included between two available entries.
124 |
125 |     Args:
126 |         data: sparse array of dimension `(N,)`.
127 |         mask: array of dimension `(N,)` with `mask[i]` equal to `1` if the entry `data[i]` is available, equal to `0`
128 |             if `data[i]` needs to be filled.
129 |
130 |     Returns:
131 |         A new filled array with `nan` values at those entries that could not be filled.
132 |     """
133 |
134 |     # Array length.
135 |     length = len(data)
136 |
137 |     # Array support.
138 |     line = np.arange(0, length)
139 |
140 |     # Compute the coordinates of the available entries.
141 |     available_entries = line[mask.astype(bool)]
142 |
143 |     # Compute the coordinates of the non available entries.
144 |     target_entries = line[~mask.astype(bool)]
145 |
146 |     # Allocate the filled array.
147 |     data_filled = np.copy(data)
148 |
149 |     # Perform the filling.
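    # Illustrative example: with `data = [1., 0., 3., 0.]` and `mask = [1, 0, 1, 0]`, the available entries
    # sit at positions 0 and 2, so linear interpolation fills position 1 with `2.0`, while position 3,
    # which is not bracketed by two available entries, is assigned `nan`.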
150 |     if len(available_entries) <= 1:
151 |         data_filled[~mask.astype(bool)] = math.nan
152 |     else:
153 |         data_filled[target_entries] = griddata(
154 |             available_entries, (data[available_entries]), target_entries, method='linear')
155 |         # The target entries to the left of the left-most available entry or to the right of the right-most available
156 |         # entry are assigned the `nan` value.
157 |
158 |     return data_filled
159 |
160 |
161 | def filler_2d(data: np.array, mask: np.array) -> np.array:
162 |     """It fills a sparse 2D array.
163 |
164 |     It fills the missing entries in the sparse 2D array `data` using the following approach.
165 |     First, two candidates are computed:
166 |     - one obtained by interpolating linearly all the rows separately,
167 |     - one obtained by interpolating linearly all the columns separately.
168 |     Then, the derivatives of the two candidates are computed:
169 |     - the horizontal candidate derivative is obtained by deriving each row separately,
170 |     - the vertical candidate derivative is obtained by deriving each column separately.
171 |     For each missing entry, the candidate with the lowest derivative (in absolute value) is selected.
172 |
173 |     Missing entries with only one candidate are assigned that candidate. Missing entries without any candidate are
174 |     filled with nearest neighbor interpolation.
175 |
176 |     Args:
177 |         data: sparse array of dimensions `(H, W)`.
178 |         mask: array of dimension `(H, W)` with `mask[i, j]` equal to `1` if the entry `data[i, j]` is available,
179 |             equal to `0` if `data[i, j]` needs to be filled.
180 |
181 |     Returns:
182 |         A new filled array.
183 |     """
184 |
185 |     # Initialize the filled data with the input one.
186 |     data_filled = np.copy(data)
187 |
188 |     # Check whether there are entries to fill. If there are, then fill them.
189 |     if np.sum(mask) != data.size:
190 |
191 |         # Input data dimensions.
192 |         height, width = data.shape
193 |
194 |         # Perform the horizontal filling.
195 |         data_horiz = np.zeros_like(data)
196 |         for i in range(height):
197 |             data_horiz[i, :] = filler_1d(data[i, :], mask[i, :])
198 |
199 |         # Compute the horizontal derivative. `nan` derivatives are set to infinity.
200 |         derivative_horiz = np.abs(np.diff(np.append(data_horiz, data_horiz[:, -2:-1], axis=1), axis=1))
201 |         derivative_horiz[np.isnan(derivative_horiz)] = float('inf')
202 |
203 |         # Perform the vertical filling.
204 |         data_vert = np.zeros_like(data)
205 |         for i in range(width):
206 |             data_vert[:, i] = filler_1d(data[:, i], mask[:, i])
207 |
208 |         # Compute the vertical derivative. `nan` derivatives are set to infinity.
209 |         derivative_vert = np.abs(np.diff(np.append(data_vert, data_vert[-2:-1, :], axis=0), axis=0))
210 |         derivative_vert[np.isnan(derivative_vert)] = float('inf')
211 |
212 |         # Detect those pixels where the horizontal derivative is stronger than the vertical one, in absolute value.
213 |         mask_orientation = derivative_horiz > derivative_vert
214 |
215 |         # Perform the merging.
216 |         data_filled = np.copy(data_horiz)
217 |         data_filled[mask_orientation] = data_vert[mask_orientation]
218 |         # Entries where no estimate is available (if any) are equal to `nan`.
219 |
220 |         # Detect the entries where no estimate is available (if any), and fill them via nearest neighbor interpolation.
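        # An entry may end up with no estimate when neither its row nor its column brackets it between two
        # available samples (e.g., a border region with no valid entry on either side): both candidates are
        # `nan` there, so the nearest available value is used as a last resort.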
221 |         mask_unfilled = np.isnan(data_filled)
222 |         if np.sum(mask_unfilled) > 0:
223 |             i, j = np.mgrid[0:data.shape[0]:1, 0:data.shape[1]:1]
224 |             available_entries = np.stack((i[~mask_unfilled], j[~mask_unfilled]), axis=1)
225 |             target_entries = np.stack((i[mask_unfilled], j[mask_unfilled]), axis=1)
226 |             data_filled[mask_unfilled] = griddata(
227 |                 available_entries, (data_filled[~mask_unfilled]), target_entries, method='nearest')
228 |
229 |     return data_filled
230 |
231 |
232 | def filler_2d_nearest(data: np.array, mask: np.array) -> np.array:
233 |     """It fills a sparse 2D array using nearest neighbour interpolation.
234 |
235 |     Args:
236 |         data: sparse array of dimensions `(H, W)`.
237 |         mask: array of dimension `(H, W)` with `mask[i, j]` equal to `1` if the entry `data[i, j]` is available,
238 |             equal to `0` if `data[i, j]` needs to be filled.
239 |
240 |     Returns:
241 |         A new filled array.
242 |     """
243 |
244 |     # Initialize the filled data with the input one.
245 |     data_filled = np.copy(data)
246 |
247 |     mask_available = mask.astype(bool)
248 |
249 |     # Check whether there are entries to fill. If there are, then fill them.
250 |     if np.sum(mask) != data.size:
251 |
252 |         i, j = np.mgrid[0:data.shape[0]:1, 0:data.shape[1]:1]
253 |         available_entries = np.stack((i[mask_available], j[mask_available]), axis=1)
254 |         target_entries = np.stack((i[~mask_available], j[~mask_available]), axis=1)
255 |         data_filled[~mask_available] = griddata(
256 |             available_entries, (data_filled[mask_available]), target_entries, method='nearest')
257 |
258 |     return data_filled
259 |
260 |
261 | def similarity_graph(image: torch.Tensor,
262 |                      window_size: int = 9, patch_size: int = 7,
263 |                      sigma_intensity: float = 0.2, sigma_spatial: float = 3.0,
264 |                      degree_max: int = 15) -> Tuple[torch.Tensor, torch.Tensor]:
265 |     """It builds a similarity graph on the input image.
266 |
267 |     Args:
268 |         image: reference image, arranged as a `(1, C, H, W)` tensor.
269 |         window_size: edge size of the square searching window.
270 |         patch_size: edge size of the square patch used in the similarity computation.
271 |         sigma_intensity: intensity standard deviation for the gaussian similarity weights.
272 |         sigma_spatial: spatial standard deviation for the gaussian similarity weights.
273 |         degree_max: maximum number of neighbors for each node (pixel) in the similarity graph.
274 |
275 |     Returns:
276 |         A tuple containing two `(1, degree_max, H, W)` tensors. The entry `(0, k, i, j)` of the first tensor stores the
277 |         similarity weight between the pixel `(i, j)` of the input image and its k-th best neighbor.
278 |         The linear index of the k-th best neighbor is stored in the entry `(0, k, i, j)` of the second tensor.
279 |         A pixel `(i, j)` with fewer than `degree_max` neighbors has the unused entries of `(0, :, i, j)` in the first
280 |         tensor set to zero. The linear index, in the second tensor, associated to the aforementioned zero weights is
281 |         the linear index of the pixel `(i, j)` itself.
282 |     """
283 |
284 |     # Check the input image type.
285 |     assert image.is_floating_point(), "The input image must be of type float."
286 |
287 |     # Image dimensions.
288 |     channel_nb = image.size(1)
289 |     height = image.size(2)
290 |     width = image.size(3)
291 |
292 |     # Organize the channels in the batch dimension.
293 |     image_aux = image
294 |     if channel_nb > 1:
295 |         image_aux = image.transpose(0, 1).contiguous()
296 |
297 |     # Create the filters to be used to compute the patch similarity.
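    # The construction below compares each pixel with the other `(window_size ** 2) - 1` pixels in its
    # search window: per-pixel squared differences are computed with a bank of shift-and-subtract filters,
    # and an integral image turns them into `patch_size x patch_size` patch SSDs in constant time per
    # pixel. The resulting edge weight is
    # `exp(- ssd / (2 * sigma_intensity ** 2) - dist ** 2 / (2 * sigma_spatial ** 2))`,
    # i.e., a non-local-means-style patch similarity modulated by the spatial distance between the pixels.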
298 |     filter_bank = diff_filter_bank(window_size).to(image_aux)
299 |
300 |     # Compute the padding for the patch similarity computation.
301 |     window_radius = int((window_size - 1) / 2.0)
302 |     patch_radius = int((patch_size - 1) / 2.0)
303 |     pad = [window_radius + patch_radius] * 4
304 |
305 |     # Compute the pixel similarity.
306 |     pixel_similarity = fun.conv2d(
307 |         fun.pad(image_aux, pad, mode='replicate'), filter_bank).pow(2).sum(dim=0, keepdim=True)
308 |     # `pixel_similarity` is `(1, (window_size * window_size) - 1, height + (2 * patch_radius), width + (2 * patch_radius))`.
309 |
310 |     # Compute the integral image associated to `pixel_similarity`.
311 |     pad = (1, 0, 1, 0)  # (pad_left, pad_right, pad_top, pad_bottom)
312 |     integral = fun.pad(pixel_similarity, pad, mode='constant', value=0).cumsum(dim=2).cumsum(dim=3)
313 |     # `integral` is `(1, (window_size * window_size) - 1, height + (2 * patch_radius) + 1, width + (2 * patch_radius) + 1)`.
314 |
315 |     # Free the memory associated to `pixel_similarity`.
316 |     del pixel_similarity
317 |
318 |     # Exploit the integral image to compute the patch similarity in constant time.
319 |     integral_height = integral.size(2)
320 |     integral_width = integral.size(3)
321 |     bottom_right = integral.narrow(2, integral_height - height, height).narrow(3, integral_width - width, width)
322 |     bottom_left = integral.narrow(2, integral_height - height, height).narrow(3, 0, width)
323 |     top_right = integral.narrow(2, 0, height).narrow(3, integral_width - width, width)
324 |     top_left = integral.narrow(2, 0, height).narrow(3, 0, width)
325 |     patch_similarity = bottom_right.clone().add_(bottom_left, alpha=-1.0).add_(top_right, alpha=-1.0).add_(top_left)
326 |
327 |     # DEBUG.
328 |     # patch_similarity.sqrt_()
329 |
330 |     # Normalize the patch similarity.
331 |     patch_similarity.div_((- 2.0) * (sigma_intensity ** 2))
332 |
333 |     # Free the memory associated to `integral`.
334 |     del integral
335 |
336 |     # Define the window grid.
337 |     y_window, x_window = torch.meshgrid(
338 |         [torch.arange(- window_radius, window_radius + 1, dtype=torch.int16, device=image_aux.device),
339 |          torch.arange(- window_radius, window_radius + 1, dtype=torch.int16, device=image_aux.device)])
340 |     y_window = y_window.reshape(1, -1)
341 |     x_window = x_window.reshape(1, -1)
342 |
343 |     # Remove the entry `(0, 0)` from the window grid, as `filter_bank` does not contain any filter for this coordinate.
344 |     mask = (y_window == 0) & (x_window == 0)
345 |     y_window = y_window[~mask].reshape(1, -1, 1, 1)
346 |     x_window = x_window[~mask].reshape(1, -1, 1, 1)
347 |
348 |     # Compute the squared spatial distance.
349 |     spatial_weights = x_window.to(patch_similarity).pow_(2) + y_window.to(patch_similarity).pow_(2)
350 |
351 |     # Normalize the spatial distance.
352 |     spatial_weights.div_((- 2.0) * (sigma_spatial ** 2))
353 |
354 |     # Compute the global weights (based on both patch similarity and spatial distance).
355 |     weights = patch_similarity.add_(spatial_weights).exp_()
356 |     # weights = patch_similarity.exp_()  # DEBUG.
357 |
358 |     # Define the image grid.
359 |     y_source, x_source = torch.meshgrid(
360 |         [torch.arange(height, dtype=torch.int16, device=image_aux.device),
361 |          torch.arange(width, dtype=torch.int16, device=image_aux.device)])
362 |     y_source = y_source[None, None, ]
363 |     x_source = x_source[None, None, ]
364 |
365 |     # Detect and remove the non valid weights, i.e., those associated to pixels outside the actual image support.
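    # Out-of-image neighbors are handled in two stages: here their weights are zeroed out by multiplying
    # with the validity masks, and further below the linear index of any such neighbor that survives the
    # top-k selection is redirected to the pixel itself, so that a zero-weight self-loop replaces the
    # invalid edge.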
366 |     y_target = torch.zeros_like(y_source)
367 |     x_target = torch.zeros_like(x_source)
368 |     for i in range(weights.size(1)):
369 |
370 |         # Compute the neighbouring pixel coordinates.
371 |         torch.add(y_source, y_window.narrow(1, i, 1), out=y_target)
372 |         torch.add(x_source, x_window.narrow(1, i, 1), out=x_target)
373 |
374 |         # Detect the non valid coordinates and set them to zero.
375 |         weights.narrow(1, i, 1).mul_(
376 |             (y_target >= 0).to(weights)).mul_(
377 |             (y_target < height).to(weights)).mul_(
378 |             (x_target >= 0).to(weights)).mul_(
379 |             (x_target < width).to(weights))
380 |
381 |     # For each pixel, select the `degree_max` neighbours with the largest weights.
382 |     weights_top, indexes = torch.topk(weights, degree_max, dim=1)
383 |     # Note that, although the weights associated to non valid neighbours have been set equal to zero, some of these
384 |     # neighbours may still have been selected. This must be taken into account later.
385 |
386 |     # Free the memory associated to `weights`.
387 |     del weights
388 |
389 |     # Normalize the vector of weights associated to each pixel by its sum.
390 |     weights_top.div_(
391 |         torch.max(weights_top.sum(dim=1, keepdim=True).expand_as(weights_top), weights_top.new_ones(1) * 1e-12))
392 |
393 |     # Build the tensor `index_linear`.
394 |     index_linear = torch.zeros_like(weights_top, dtype=torch.long)
395 |     for i in range(degree_max):
396 |
397 |         # Flatten the spatial dimensions of `indexes`.
398 |         indexes_flattened = indexes.narrow(1, i, 1).view(1, -1, 1, 1)
399 |
400 |         # Compute the neighboring pixel coordinates.
401 |         torch.add(
402 |             y_source,
403 |             torch.gather(y_window, 1, indexes_flattened).view(y_source.size()),
404 |             out=y_target)
405 |         torch.add(
406 |             x_source,
407 |             torch.gather(x_window, 1, indexes_flattened).view(x_source.size()),
408 |             out=x_target)
409 |
410 |         # The coordinates of the non valid neighbors of a pixel `p` are set equal to the coordinates of `p` itself.
411 |         mask = None
412 |         if (y_target < 0).any() or (y_target >= height).any():
413 |             mask = (y_target < 0) | (y_target >= height)
414 |             y_target[mask] = y_source[mask]
415 |         if (x_target < 0).any() or (x_target >= width).any():
416 |             mask = (x_target < 0) | (x_target >= width)
417 |             x_target[mask] = x_source[mask]
418 |
419 |         # Convert the spatial indexes into linear ones.
420 |         torch.add(
421 |             x_target.to(index_linear),
422 |             y_target.to(index_linear),
423 |             alpha=width,
424 |             out=index_linear.narrow(1, i, 1))
425 |
426 |     # Free the memory associated to `y_target`, `x_target`, `mask`.
427 |     del y_target, x_target, mask
428 |
429 |     return weights_top, index_linear
430 |
431 |
432 | def unravel_index(index: Union[np.ndarray, torch.Tensor], size: Tuple[int, int])\
433 |         -> Union[Tuple[np.ndarray, np.ndarray], Tuple[torch.Tensor, torch.Tensor]]:
434 |     """It converts linear indexes into matrix indexes.
435 |
436 |     It converts each input linear index `i` into a pair `(row, col)` for the matrix whose shape is specified as input.
437 |
438 |     Args:
439 |         index: linear indexes, arranged as an `(N,)` array or tensor.
440 |         size: matrix shape.
441 |
442 |     Returns:
443 |         A tuple containing the row and column indexes, each one arranged as an `(N,)` array or tensor.
444 |     """
445 |
446 |     height, width = size
447 |
448 |     # # Check the class of the input data.
449 |     # index_class = type(index).__name__
450 |     #
451 |     # if index_class == 'ndarray':
452 |     #
453 |     #     row, col = np.divmod(index, width)
454 |     #
455 |     # elif index_class == 'Tensor':
456 |     #
457 |     #     row = (index.div(width)).floor_()
458 |     #     col = index.fmod(width)
459 |     #
460 |     # else:
461 |     #
462 |     #     raise TypeError('The input index data type must be ndarray or Tensor.')
463 |
464 |     row = index // width
465 |     col = index % width
466 |
467 |     return row, col
468 |
469 |
470 | def depth_percentage_error(depth: np.array, depth_gt: np.array, threshold: float):
471 |     """It computes the percentage of pixels whose depth has an error larger than a predefined threshold.
472 |
473 |     Args:
474 |         depth: depth map to check, arranged as an `(H, W)` array.
475 |         depth_gt: ground truth depth map, arranged as an `(H, W)` array.
476 |         threshold: error threshold.
477 |
478 |     Returns:
479 |         The percentage of pixels in the input depth map with an error larger than the specified threshold.
480 |     """
481 |
482 |     mask = (depth_gt > 0) & (depth_gt < float('inf'))
483 |     error = np.abs(depth_gt - depth)
484 |     error = (np.sum(error[mask] > threshold) / np.sum(mask)) * 100
485 |
486 |     return error
487 |
488 |
489 | def space2plane_normal(depth: np.array, normal: np.array,
490 |                        focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
491 |     """It computes the 2D normals associated to the inverse depth, starting from the 3D normals.
492 |
493 |     The unitary normal associated to a 3D point `(X_0, Y_0, Z_0)` defines a plane `P` that locally approximates the
494 |     surface around the point itself. Let us indicate with `(x_0, y_0)` the coordinates of the projection of
495 |     `(X_0, Y_0, Z_0)` onto the camera image plane. Assuming a pinhole camera model, the inverse depth of the points
496 |     of `P`, seen as a function over the image plane, is a plane as well, `P1` hereafter, passing through the point
497 |     `(x_0, y_0, 1 / depth[x_0, y_0])`. In particular, the plane `P1` is described by the following equation:
498 |
499 |     `(1 / depth[x, y]) = (1 / depth[x_0, y_0]) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`
500 |
501 |     where the direction of the (not necessarily unitary) vector `(w_0, w_1, -1)` defines the orientation of `P1`.
502 |     For each pixel in the input depth map, this function leverages the normal of the corresponding 3D point to compute
503 |     the corresponding vector `(w_0, w_1)`.
504 |
505 |     Input 3D normals perpendicular to the line of sight of the corresponding 3D point are mapped to the 2D zero vector.
506 |     Input 3D normals whose corresponding depth is not valid are mapped to the 2D zero vector.
507 |
508 |     Args:
509 |         depth: depth map, arranged as an `(H, W)` array.
510 |         normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
511 |         focal: tuple containing the camera focal lengths `(f_x, f_y)`.
512 |         center: tuple containing the camera principal point coordinates `(c_x, c_y)`.
513 |
514 |     Returns:
515 |         The 2D normals associated to the input 3D normals, arranged as an `(H, W, 2)` array.
516 |     """
517 |
518 |     # Define the data type to be used below: 64-bit precision is recommended.
519 |     dtype = np.float64
520 |
521 |     # Convert the input depth map to `dtype`.
522 |     d = depth.astype(dtype, copy=False)
523 |
524 |     # Depth map dimensions.
525 |     height = depth.shape[0]
526 |     width = depth.shape[1]
527 |
528 |     # Build the depth map grid.
529 |     x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))
530 |
531 |     # Extract the camera focal lengths and the coordinates of the camera center of projection.
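    # Sanity check for the mapping computed below: a fronto-parallel surface patch with unit normal
    # `(0, 0, -1)` at a pixel with valid depth `d` yields `rho = -d`, hence `w_0 = w_1 = 0`: the inverse
    # depth is locally constant, as expected for a plane parallel to the image plane.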
def space2plane_normal(depth: np.array, normal: np.array,
                       focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It computes the 2D normals associated to the inverse depth, starting from the 3D normals.

    The unitary normal associated to a 3D point `(X_0, Y_0, Z_0)` defines a plane `P` that locally approximates
    the surface around the point itself. Let us indicate with `(x_0, y_0)` the coordinates of the projection of
    `(X_0, Y_0, Z_0)` onto the camera image plane. Assuming a pinhole camera model, the inverse depth associated
    to the plane `P` is a plane as well, `P1` hereafter, passing through the point `(x_0, y_0, 1 / depth[x_0, y_0])`.
    In particular, the plane `P1` is described by the following equation:

    `(1 / depth[x, y]) = (1 / depth[x_0, y_0]) + (w_0 * (x - x_0)) + (w_1 * (y - y_0))`

    where the direction of the (not necessarily unitary) vector `(w_0, w_1, -1)` defines the orientation of `P1`.
    For each pixel in the input depth map, this function leverages the normal of the corresponding 3D point to
    compute the corresponding vector `(w_0, w_1)`.

    Input 3D normals orthogonal to the line of sight of the corresponding 3D point are mapped to the 2D zero vector.
    Input 3D normals whose corresponding depth is not valid are mapped to the 2D zero vector as well.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The 2D normals associated to the input 3D normals, arranged as an `(H, W, 2)` array.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Create a copy of the 3D normals where those associated to non available depth entries are set to zero.
    normal_new = np.zeros_like(normal, dtype=dtype)
    normal_new[mask] = normal[mask]

    # Re-normalize the normals.
    normal_norm = np.linalg.norm(normal_new, axis=2)
    mask_nnz = (normal_norm > 0)
    for i in range(3):
        normal_new[:, :, i][mask_nnz] = normal_new[:, :, i][mask_nnz] / normal_norm[mask_nnz]

    # Name the 3D normal components as in the report.
    a = normal_new[:, :, 0]
    b = normal_new[:, :, 1]
    c = normal_new[:, :, 2]

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros_like(depth, dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).
    #
    # As the normals `n` and `-n` are both projected to the same 2D vector, it is not necessary to flip the
    # normals corresponding to Case 2.

    # Allocate the space for the 2D normals and name their components as in the report.
    plane_normal = np.zeros((height, width, 2), dtype=dtype)
    w_0 = plane_normal[:, :, 0]
    w_1 = plane_normal[:, :, 1]

    # Compute the 2D normals associated to the available 3D normals.
    mask = (mask & (rho != 0))
    w_0[mask] = a[mask] / (rho[mask] * focal_x)
    w_1[mask] = b[mask] / (rho[mask] * focal_y)

    # The 3D normals with a valid depth, but corresponding to Case 3 (i.e., `rho == 0`), are not valid.
    # These 3D normals are arbitrarily mapped to the 2D normal `[0, 0]`.

    return plane_normal

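
# A minimal illustrative sketch of `space2plane_normal` (the helper name `_demo_space2plane_normal`
# and the toy camera parameters are assumptions): for a fronto-parallel surface, whose unitary normal
# is `(0, 0, -1)` everywhere, the inverse depth is locally constant, so the associated 2D normal
# `(w_0, w_1)` should be zero at every valid pixel.
def _demo_space2plane_normal() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # All the normals point towards the camera, along the negative `Z` axis.
    normal = np.zeros((height, width, 3))
    normal[:, :, 2] = -1.0

    plane_normal = space2plane_normal(depth, normal, focal=(100.0, 100.0), center=(2.0, 2.0))
    assert np.allclose(plane_normal, 0.0)
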
def plane2space_normal(depth: np.array, normal: np.array,
                       focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It reverts the operation performed by `space2plane_normal`.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: 2D normal map, arranged as an `(H, W, 2)` array.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The 3D normals associated to the input 2D normals, arranged as an `(H, W, 3)` array.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Name the 2D normal components as in the report.
    w_0 = normal[:, :, 0].astype(dtype, copy=False)
    w_1 = normal[:, :, 1].astype(dtype, copy=False)

    # Compute the coefficients of the first linear equation.
    alpha = np.zeros_like(depth, dtype=dtype)
    beta = np.zeros_like(depth, dtype=dtype)
    gamma = np.zeros_like(depth, dtype=dtype)
    alpha[mask] = (w_0[mask] * (x[mask] - center_x) * d[mask] * focal_y) - focal_y
    beta[mask] = w_0[mask] * (y[mask] - center_y) * d[mask] * focal_x
    gamma[mask] = w_0[mask] * d[mask] * (focal_x * focal_y)

    # Compute the coefficients of the second linear equation.
    delta = np.zeros_like(depth, dtype=dtype)
    epsilon = np.zeros_like(depth, dtype=dtype)
    phi = np.zeros_like(depth, dtype=dtype)
    delta[mask] = w_1[mask] * (x[mask] - center_x) * d[mask] * focal_y
    epsilon[mask] = (w_1[mask] * (y[mask] - center_y) * d[mask] * focal_x) - focal_x
    phi[mask] = w_1[mask] * d[mask] * (focal_x * focal_y)

    # Allocate the space for the 3D normals and name their components as in the report.
    space_normal = np.zeros((height, width, 3), dtype=dtype)
    a = space_normal[:, :, 0]
    b = space_normal[:, :, 1]
    c = space_normal[:, :, 2]

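    # A reconstruction of the reasoning behind the case analysis below (the wording is an editor's
    # assumption; the algebra follows from the coefficients defined above): each case solves, for the
    # unitary normal `(a, b, c)`, the system
    #
    #     `(alpha * a) + (beta * b) + (gamma * c) = 0`
    #     `(delta * a) + (epsilon * b) + (phi * c) = 0`,    with `a^2 + b^2 + c^2 = 1`.
    #
    # For instance, when both `w_0` and `w_1` are non-zero, combining the two equations cancels the `c`
    # terms (since `gamma * w_1 == phi * w_0`) and yields `b = kappa * a`, with
    # `kappa = (w_1 * focal_y) / (w_0 * focal_x)`; substituting into the first equation gives `c`, and
    # the unit-norm constraint fixes `|a|`. The sign of `a` is chosen so that the recovered normal
    # points towards the camera (negative `rho`, checked at the end of the function).
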
    # ==== CASE w_0(x, y) NOT ZERO AND w_1(x, y) NOT ZERO ==============================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 != 0) & (w_1 != 0) & mask

    # Auxiliary variables.
    kappa = np.zeros_like(depth, dtype=dtype)
    alpha_beta_kappa = np.zeros_like(depth, dtype=dtype)
    one_plus_kappa_sq = np.zeros_like(depth, dtype=dtype)
    kappa[mask_case] = (w_1[mask_case] * focal_y) / (w_0[mask_case] * focal_x)
    alpha_beta_kappa[mask_case] = alpha[mask_case] + (beta[mask_case] * kappa[mask_case])
    one_plus_kappa_sq[mask_case] = 1.0 + (kappa[mask_case] ** 2)

    a[mask_case] = - (np.sign(w_0[mask_case]) * np.abs(gamma[mask_case])) / np.sqrt(
        (alpha_beta_kappa[mask_case] ** 2) + ((gamma[mask_case] ** 2) * one_plus_kappa_sq[mask_case]))
    b[mask_case] = kappa[mask_case] * a[mask_case]
    c[mask_case] = - ((alpha[mask_case] * a[mask_case]) + (beta[mask_case] * b[mask_case])) / gamma[mask_case]

    # ==== CASE w_0(x, y) NOT ZERO AND w_1(x, y) EQUAL TO ZERO =========================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 != 0) & (w_1 == 0) & mask

    a[mask_case] = - (np.sign(w_0[mask_case]) * np.abs(gamma[mask_case])) / np.sqrt(
        (alpha[mask_case] ** 2) + (gamma[mask_case] ** 2))
    c[mask_case] = - (alpha[mask_case] / gamma[mask_case]) * a[mask_case]

    # ==== CASE w_0(x, y) EQUAL TO ZERO AND w_1(x, y) NOT ZERO =========================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 == 0) & (w_1 != 0) & mask

    b[mask_case] = - (np.sign(w_1[mask_case]) * np.abs(phi[mask_case])) / np.sqrt(
        (epsilon[mask_case] ** 2) + (phi[mask_case] ** 2))
    c[mask_case] = - (epsilon[mask_case] / phi[mask_case]) * b[mask_case]

    # ==== CASE w_0(x, y) EQUAL TO ZERO AND w_1(x, y) EQUAL TO ZERO ====================================================

    # Detect the entries associated to the current case.
    mask_case = (w_0 == 0) & (w_1 == 0) & mask

    c[mask_case] = - 1.0

    # ==================================================================================================================

    # Check the normal orientations and invalidate those incompatible with a visible 3D point.

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros((height, width), dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).

    # The sign choices in the case analysis above guarantee that no normal falls in Case 2.
    assert np.sum(rho > 0) == 0, 'Error in the normal map correction.'

    # Detect the 3D normals whose orientation is not compatible with a visible point (Case 3) and set them to zero.
    mask = (rho == 0)
    space_normal[mask] = 0

    return space_normal

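
# An illustrative round-trip check (the helper name `_demo_normal_round_trip` and the toy camera
# parameters are assumptions): projecting a unitary, camera-facing 3D normal map with
# `space2plane_normal` and lifting it back with `plane2space_normal` should recover the input
# normals at every valid pixel.
def _demo_normal_round_trip() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # A tilted normal pointing towards the camera (negative `Z` component), normalized to unit norm.
    n = np.array([0.3, -0.2, -0.9])
    n = n / np.linalg.norm(n)
    normal = np.broadcast_to(n, (height, width, 3)).copy()

    focal = (100.0, 100.0)
    center = (2.0, 2.0)
    plane_normal = space2plane_normal(depth, normal, focal, center)
    normal_back = plane2space_normal(depth, plane_normal, focal, center)

    print(np.abs(normal_back - normal).max())  # Should be close to zero.
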
def depth2normal(depth: np.array,
                 focal: Tuple[float, float], center: Tuple[float, float],
                 filter_size: int = 7, filter_sigma: float = 5.0) -> np.array:
    """It computes the 3D normals associated to the 3D points described by the input depth map.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.
        filter_size: height (and width) of the derivative filters.
        filter_sigma: standard deviation (in pixels) of the Gaussian filter underneath the derivative filters.

    Returns:
        The 3D normals associated to the 3D points in the input depth map, arranged as an `(H, W, 3)` array.
    """

    # Build the vertical (y) derivative filter.
    d_gauss_dy = gauss_filter_deriv_2d(filter_size, filter_sigma)

    # Build the gradient filter: the x and y derivative filters are encoded in the real and imaginary parts,
    # respectively, so that a single complex convolution computes both derivatives.
    grad_filter = d_gauss_dy.T + (1j * d_gauss_dy)

    # Compute the inverse depth.
    depth_inv = depth2depth_inv(depth)

    # Compute the inverse depth gradient.
    depth_inv_grad = convolve2d(depth_inv, grad_filter, mode='same', boundary='symm')
    depth_inv_grad = np.stack((np.real(depth_inv_grad), np.imag(depth_inv_grad)), axis=2)

    # Convert the inverse depth gradient field into 3D normals.
    normal = plane2space_normal(depth, depth_inv_grad, focal, center)

    return normal


def check_normal(depth: np.array, normal: np.array,
                 focal: Tuple[float, float], center: Tuple[float, float]) -> np.array:
    """It computes the inner product between the 3D point associated to each pixel and the corresponding 3D normal.

    Args:
        depth: depth map, arranged as an `(H, W)` array.
        normal: normal map, arranged as an `(H, W, 3)` array. Normals must be unitary.
        focal: tuple containing the camera focal lengths `(f_x, f_y)`.
        center: tuple containing the camera principal point coordinates `(c_x, c_y)`.

    Returns:
        The inner product, arranged as an `(H, W)` array, between the 3D point associated to each pixel and the
        corresponding 3D normal. Entries equal to zero correspond either to pixels with no available normal or
        to pixels whose corresponding 3D point is not visible by the camera.
    """

    # Define the data type to be used below: 64-bit precision is recommended.
    dtype = np.float64

    # Convert the input depth map to `dtype`.
    d = depth.astype(dtype, copy=False)

    # Depth map dimensions.
    height = depth.shape[0]
    width = depth.shape[1]

    # Build the depth map grid.
    x, y = np.meshgrid(np.arange(width, dtype=dtype), np.arange(height, dtype=dtype))

    # Extract the camera focal lengths and the coordinates of the camera center of projection.
    focal_x, focal_y = focal
    center_x, center_y = center

    # Detect the entries of the grid where the depth is available.
    mask = (d > 0) & (d < float('inf'))

    # Name the 3D normal components as in the report.
    a = normal[:, :, 0]
    b = normal[:, :, 1]
    c = normal[:, :, 2]

    # Compute the inner product between each 3D point and its normal: its sign is the sign of the cosine of the
    # angle between the normal and the line of sight of the point.
    rho = np.zeros_like(depth, dtype=dtype)
    rho[mask] = d[mask] * (
        ((a[mask] * (x[mask] - center_x)) / focal_x) +
        ((b[mask] * (y[mask] - center_y)) / focal_y) +
        c[mask])

    # Cases:
    # 1. A 3D normal with negative `rho` indicates a 3D point on the side of a plane visible by the camera.
    # 2. A 3D normal with positive `rho` indicates a 3D point on the side of a plane hidden to the camera.
    #    However, it is sufficient to flip the normal orientation in order to associate the point to the side of
    #    the plane visible by the camera.
    # 3. A 3D normal with zero `rho` indicates a 3D point on a plane aligned with the line of sight of the point
    #    and therefore not visible by the camera (regardless of the side of the plane).

    return rho
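
# A final illustrative sketch (the helper name `_demo_check_normal` and the toy camera parameters
# are assumptions): for camera-facing normals, the inner product returned by `check_normal` should
# be strictly negative at every valid pixel (Case 1 above).
def _demo_check_normal() -> None:
    import numpy as np

    height, width = 5, 5
    depth = np.full((height, width), 2.0)

    # Camera-facing normals: the inner product with the line of sight must be negative.
    normal = np.zeros((height, width, 3))
    normal[:, :, 2] = -1.0

    rho = check_normal(depth, normal, focal=(100.0, 100.0), center=(2.0, 2.0))
    assert (rho < 0).all()

--------------------------------------------------------------------------------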