├── .gitignore ├── LICENSE ├── README.md ├── img ├── teaser1.gif └── teaser2.gif └── python ├── colmap_helpers.py ├── colmap_helpers_for_bin.py ├── download_trained_model.py ├── download_training_datasets.py ├── eval.py ├── generate_volume_test.py ├── generate_volume_train.py ├── model.py ├── test.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # compiled pyc file 104 | *.pyc 105 | 106 | # datasets 107 | dataset/ 108 | model/ 109 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Po-Han Huang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepMVS: Learning Multi-View Stereopsis
2 |
3 | [![License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg)](https://opensource.org/licenses/BSD-2-Clause)
4 |
5 | **DeepMVS** is a Deep Convolutional Neural Network which learns to estimate pixel-wise disparity maps from an arbitrary number of unordered images with known or estimated camera poses.
6 |
7 | ![teaser1](img/teaser1.gif)
8 | ![teaser2](img/teaser2.gif)
9 |
10 | If you use our code or datasets in your work, please cite:
11 | ```
12 | @inproceedings{DeepMVS,
13 | author = "Huang, Po-Han and Matzen, Kevin and Kopf, Johannes and Ahuja, Narendra and Huang, Jia-Bin",
14 | title = "DeepMVS: Learning Multi-View Stereopsis",
15 | booktitle = "IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
16 | year = "2018"
17 | }
18 | ```
19 |
20 | For the paper and other details of DeepMVS or the MVS-Synth Dataset, please see our [project webpage](https://phuang17.github.io/DeepMVS/index.html).
21 |
22 |
23 | ## Training
24 |
25 | ### Requirements
26 |
27 | - **python 2.7**
28 | - **numpy 1.13.1**
29 | - **pytorch 0.3.0** and **torchvision**: Follow the instructions from [their website](http://pytorch.org/).
30 | - **opencv 3.1.0**: Run ``conda install -c menpo opencv`` or ``pip install opencv-python``.
31 | - **imageio 2.2.0** (with freeimage plugin): Run ``conda install -c conda-forge imageio`` or ``pip install imageio``. To install the freeimage plugin, run the following Python script once:
32 | ```python
33 | import imageio
34 | imageio.plugins.freeimage.download()
35 | ```
36 | - **h5py 2.7.0**: Run ``conda install h5py`` or ``pip install h5py``.
37 | - **lz4 0.23.1**: Run ``pip install lz4``.
38 | - **cuda 8.0.61** and **16GB GPU RAM** (required for gpu support): The training code uses up to 14GB of GPU RAM with the default configuration. We train our model with an NVIDIA Tesla P100 GPU. To reduce GPU RAM usage, feel free to try smaller ``--patch_width``, ``--patch_height``, ``--num_depths``, and ``--max_num_neighbors``. However, the resulting model may not achieve the accuracy reported in our paper.
39 |
40 | ### Instructions
41 |
42 | 1. Download the training datasets.
43 | ```bash
44 | python python/download_training_datasets.py # This may take up to 1-2 days to complete.
45 | ```
46 | **Update: The training datasets were updated on May 18, 2018 to fix errors in the camera poses. If you downloaded the old version, please remove the files and download them again.**
47 | 2. Train the network.
48 | ```bash
49 | python python/train.py # This may take up to 4-6 days to complete, depending on which GPU is used.
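# To reduce GPU RAM usage (see the Requirements above), smaller --patch_width,
# --patch_height, --num_depths, and --max_num_neighbors values may be passed,
# at some cost in accuracy.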
50 | ```
51 |
52 | ## Testing
53 |
54 | ### Requirements
55 |
56 | - **python 2.7**
57 | - **numpy 1.13.1**
58 | - **pytorch 0.3.0** and **torchvision**: Follow the instructions from [their website](http://pytorch.org/).
59 | - **opencv 3.1.0**: Run ``conda install -c menpo opencv`` or ``pip install opencv-python``.
60 | - **imageio 2.2.0**: Run ``conda install -c conda-forge imageio`` or ``pip install imageio``.
61 | - **pyquaternion 0.9.0**: Run ``pip install pyquaternion``.
62 | - **pydensecrf**: Run ``pip install pydensecrf``.
63 | - **cuda 8.0.61** and **6GB GPU RAM** (required for gpu support): The testing code uses up to 4GB of GPU RAM with the default configuration.
64 | - **COLMAP 3.2**: Follow the instructions from [their website](https://colmap.github.io/).
65 |
66 | ### Instructions
67 |
68 | 1. Download the trained model.
69 | ```bash
70 | python python/download_trained_model.py
71 | ```
72 |
73 | 2. Run the sparse reconstruction and the ``image_undistorter`` using [COLMAP](https://colmap.github.io/); example commands are sketched at the end of this section. The ``image_undistorter`` will generate an ``images`` folder which contains the undistorted images and a ``sparse`` folder which contains three ``.bin`` files.
74 |
75 | 3. Run the testing script with the paths to the undistorted images and the sparse reconstruction.
76 | ```bash
77 | python python/test.py --load_bin --image_path path/to/images --sparse_path path/to/sparse --output_path path/to/output/directory
78 | ```
79 | By default, the script resizes the images to 540px in height to reduce the running time. If you would like to run the model at other resolutions, please pass the arguments `--image_width XXX` and `--image_height XXX`.
80 | If your COLMAP outputs ``.txt`` files instead of ``.bin`` files for the sparse reconstruction, simply remove the `--load_bin` flag.
81 |
82 | 4. To evaluate the predicted results, run
83 | ```bash
84 | python python/eval.py --load_bin --image_path path/to/images --sparse_path path/to/sparse --output_path path/to/output/directory --gt_path path/to/gt/directory --image_width 810 --image_height 540 --size_mismatch crop_pad
85 | ```
86 | In ``gt_path``, the ground truth disparity maps should be stored in ``.npy`` format with filenames being ``<image_filename>.depth.npy``, matching the undistorted image filenames. If the ground truths are depth maps instead of disparity maps, please add the ``--gt_type depth`` flag.
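For reference, a typical COLMAP command sequence for step 2 might look like the sketch below. This is only an illustration, not part of DeepMVS: the ``$DATASET`` layout is hypothetical, and flag names vary slightly across COLMAP versions (e.g., older releases use ``--export_path`` instead of ``--output_path`` for ``mapper``), so consult the documentation of your installed version.

```bash
DATASET=path/to/dataset  # hypothetical layout with the raw photos in $DATASET/raw_images
colmap feature_extractor --database_path $DATASET/database.db --image_path $DATASET/raw_images
colmap exhaustive_matcher --database_path $DATASET/database.db
mkdir -p $DATASET/sparse
colmap mapper --database_path $DATASET/database.db --image_path $DATASET/raw_images --output_path $DATASET/sparse
colmap image_undistorter --image_path $DATASET/raw_images --input_path $DATASET/sparse/0 --output_path $DATASET/undistorted --output_type COLMAP
```

The ``images`` and ``sparse`` folders under ``$DATASET/undistorted`` are then the paths to pass to ``test.py`` and ``eval.py``.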
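The predicted maps store disparity (1/depth), and a predicted disparity of 0 is a valid value meaning "infinitely far" (see ``eval.py``). Below is a minimal sketch for inspecting one prediction; it assumes the ``<image_filename>.output.npy`` naming that ``eval.py`` expects, and the image filename itself is hypothetical.

```python
import numpy as np

# Hypothetical filename; one .output.npy is written per undistorted image.
disparity = np.load("path/to/output/directory/0001.png.output.npy")
# Convert disparity back to depth; disparity 0 maps to an infinitely distant point.
with np.errstate(divide="ignore"):
    depth = np.where(disparity > 0.0, 1.0 / disparity, np.inf)
print "disparity range: {:f} to {:f}".format(disparity.min(), disparity.max())
```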
87 |
88 | ## License
89 |
90 | DeepMVS is licensed under the [BSD 2-Clause License](LICENSE).
--------------------------------------------------------------------------------
/img/teaser1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phuang17/DeepMVS/4276bf17f965d08c31faf814fc54c80883e3373f/img/teaser1.gif
--------------------------------------------------------------------------------
/img/teaser2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phuang17/DeepMVS/4276bf17f965d08c31faf814fc54c80883e3373f/img/teaser2.gif
--------------------------------------------------------------------------------
/python/colmap_helpers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import numpy as np
5 | import cv2
6 | import imageio
7 | from pyquaternion import Quaternion
8 |
9 | class PointList:
10 |
11 | class Point:
12 | def __init__(self, id, coord):
13 | self.id = id
14 | self.coord = coord
15 |
16 | def __init__(self, line):
17 | words = line.split()
18 | self.points = []
19 | for i in range(0, len(words) / 3):
20 | id = int(words[3 * i + 2])
21 | if id == -1:
22 | continue
23 | coord = np.array([float(words[3 * i + 0]), float(words[3 * i + 1])])
24 | self.points.append(self.Point(id, coord))
25 | self.length = len(self.points)
26 |
27 | def get_by_id(self, id):
28 | for idx in range(0, self.length):
29 | if id == self.points[idx].id:
30 | return self.points[idx]
31 | return None
32 |
33 | class ImageList:
34 |
35 | class Image:
36 | def __init__(self, id, extrinsic, camera_id, filename, point_list):
37 | self.id = id
38 | self.extrinsic = extrinsic
39 | self.camera_id = camera_id
40 | self.filename = filename
41 | self.point_list = point_list
42 |
43 | def __init__(self, path):
44 | self.images = []
45 | with open(path) as f:
46 | lines = f.readlines()
47 | self.length = int(re.match(r"# Number of images: (\d+),",lines[3]).groups()[0])
48 | for i in range(0, self.length):
49 | words = lines[4 + i * 2].split()
50 | id = int(words[0])
51 | extrinsic = Quaternion(float(words[1]), float(words[2]), float(words[3]), float(words[4])).transformation_matrix
52 | extrinsic[0,3] = float(words[5])
53 | extrinsic[1,3] = float(words[6])
54 | extrinsic[2,3] = float(words[7])
55 | camera_id = int(words[8])
56 | filename = words[9]
57 | point_list = PointList(lines[4 + i * 2 + 1])
58 | self.images.append(self.Image(id, extrinsic, camera_id, filename, point_list))
59 |
60 | def get_by_id(self, id):
61 | for idx in range(0, self.length):
62 | if id == self.images[idx].id:
63 | return self.images[idx]
64 | return None
65 |
66 | class CameraList:
67 |
68 | class Camera:
69 | def __init__(self, id, width, height, fx, fy, cx, cy):
70 | self.id = id
71 | self.width = width
72 | self.height = height
73 | self.fx = fx
74 | self.fy = fy
75 | self.cx = cx
76 | self.cy = cy
77 |
78 | def __init__(self, path):
79 | self.cameras = []
80 | with open(path) as f:
81 | lines = f.readlines()
82 | self.length = int(re.match(r"# Number of cameras: (\d+)",lines[2]).groups()[0])
83 | for camera_idx in range(0, self.length):
84 | words = lines[3 + camera_idx].split()
85 | id = int(words[0])
86 | width = int(words[2])
87 | height = int(words[3])
88 | fx = float(words[4])
89 | fy = float(words[5])
90 | cx = float(words[6])
91 | cy = float(words[7])
92 | self.cameras.append(self.Camera(id, width, height, fx, fy, cx, cy))
93 | 94 | def get_by_id(self, id): 95 | for idx in range(0, self.length): 96 | if id == self.cameras[idx].id: 97 | return self.cameras[idx] 98 | return None 99 | 100 | class PointCloud: 101 | 102 | class Point: 103 | def __init__(self, id, coord): 104 | self.id = id 105 | self.coord = coord 106 | 107 | def __init__(self, path): 108 | self.points = [] 109 | with open(path) as f: 110 | lines = f.readlines() 111 | self.length = int(re.match(r"# Number of points: (\d+),",lines[2]).groups()[0]) 112 | for i in range(0, self.length): 113 | words = lines[3 + i].split() 114 | id = int(words[0]) 115 | x = float(words[1]) 116 | y = float(words[2]) 117 | z = float(words[3]) 118 | coord = np.array([x, y, z, 1.0]) 119 | self.points.append(self.Point(id, coord)) 120 | 121 | def get_by_id(self, id): 122 | for idx in range(0, self.length): 123 | if id == self.points[idx].id: 124 | return self.points[idx] 125 | return None 126 | 127 | class ColmapSparse: 128 | def __init__(self, sparse_path, image_path, image_width = -1, image_height = -1, max_num_neighbors = 16): 129 | image_list_path = os.path.join(sparse_path, "images.txt") 130 | camera_list_path = os.path.join(sparse_path, "cameras.txt") 131 | point_cloud_path = os.path.join(sparse_path, "points3D.txt") 132 | if not os.path.exists(image_list_path): 133 | raise ValueError("{:} does not exist.".format(image_list_path)) 134 | if not os.path.exists(camera_list_path): 135 | raise ValueError("{:} does not exist.".format(camera_list_path)) 136 | if not os.path.exists(point_cloud_path): 137 | raise ValueError("{:} does not exist.".format(point_cloud_path)) 138 | self.image_list = ImageList(image_list_path) 139 | self.camera_list = CameraList(camera_list_path) 140 | self.point_cloud = PointCloud(point_cloud_path) 141 | self.load_images(image_path) 142 | self.resize(image_width, image_height) 143 | self.estimate_max_disparities() 144 | self.generate_neighbor_list(max_num_neighbors) 145 | 146 | def load_images(self, image_path): 147 | for image_idx in range(self.image_list.length): 148 | self.image_list.images[image_idx].rgb = imageio.imread(os.path.join(image_path, self.image_list.images[image_idx].filename)).astype(np.float32) / 255.0 149 | 150 | def resize(self, image_width, image_height): 151 | if image_width < 0 and image_height < 0: 152 | return 153 | for camera_idx in range(self.camera_list.length): 154 | orig_image_width = self.camera_list.cameras[camera_idx].width 155 | orig_image_height = self.camera_list.cameras[camera_idx].height 156 | if image_width < 0: 157 | target_image_width = image_height * orig_image_width / orig_image_height 158 | target_image_height = image_height 159 | elif image_height < 0: 160 | target_image_width = image_width 161 | target_image_height = image_width * orig_image_height / orig_image_width 162 | else: 163 | target_image_width = image_width 164 | target_image_height = image_height 165 | width_ratio = float(target_image_width) / orig_image_width 166 | height_ratio = float(target_image_height) / orig_image_height 167 | self.camera_list.cameras[camera_idx].width = target_image_width 168 | self.camera_list.cameras[camera_idx].height = target_image_height 169 | self.camera_list.cameras[camera_idx].fx *= width_ratio 170 | self.camera_list.cameras[camera_idx].fy *= height_ratio 171 | self.camera_list.cameras[camera_idx].cx *= width_ratio 172 | self.camera_list.cameras[camera_idx].cy *= height_ratio 173 | self.camera_list.cameras[camera_idx].width_ratio = width_ratio 174 | self.camera_list.cameras[camera_idx].height_ratio = 
height_ratio 175 | 176 | for image_idx in range(self.image_list.length): 177 | camera = self.camera_list.get_by_id(self.image_list.images[image_idx].camera_id) 178 | self.image_list.images[image_idx].rgb = cv2.resize(self.image_list.images[image_idx].rgb, (camera.width, camera.height), interpolation = cv2.INTER_AREA) 179 | for point_idx in range(self.image_list.images[image_idx].point_list.length): 180 | self.image_list.images[image_idx].point_list.points[point_idx].coord[0] *= camera.width_ratio 181 | self.image_list.images[image_idx].point_list.points[point_idx].coord[1] *= camera.height_ratio 182 | 183 | def estimate_max_disparities(self, percentile = 0.99, stretch = 1.333333): 184 | for (img_idx, image) in enumerate(self.image_list.images): 185 | camera = self.camera_list.get_by_id(image.camera_id) 186 | disparity_list = [] 187 | for (point_idx, point) in enumerate(self.point_cloud.points): 188 | coord = image.extrinsic.dot(point.coord) 189 | new_x = (coord[0] / coord[2] * camera.fx + camera.cx) 190 | new_y = (coord[1] / coord[2] * camera.fy + camera.cy) 191 | new_d = 1.0 / coord[2] 192 | if new_x >= 0.0 and new_x < camera.width and new_y >= 0.0 and new_y < camera.height and new_d > 0.0: 193 | disparity_list.append(new_d) 194 | disparity_list = np.sort(np.array(disparity_list)) 195 | self.image_list.images[img_idx].estimated_max_disparity = disparity_list[int(disparity_list.shape[0] * percentile)] * stretch 196 | 197 | def generate_neighbor_list(self, max_num_neighbors): 198 | point_id_list = [] 199 | for (ref_idx, ref_image) in enumerate(self.image_list.images): 200 | point_id_set = set() 201 | for (ref_point_idx, ref_point) in enumerate(ref_image.point_list.points): 202 | point_id_set.add(ref_point.id) 203 | point_id_list.append(point_id_set) 204 | for (ref_idx, ref_image) in enumerate(self.image_list.images): 205 | shared_feature_list = [] 206 | for (n_idx, n_image) in enumerate(self.image_list.images): 207 | if n_idx == ref_idx: 208 | shared_feature_list.append(0) 209 | continue 210 | shared_feature_list.append(len(point_id_list[ref_idx] & point_id_list[n_idx])) 211 | index_order = np.argsort(np.array(shared_feature_list))[::-1] 212 | neighbor_list = [] 213 | for idx in index_order: 214 | if shared_feature_list[idx] == 0: 215 | break 216 | neighbor_list.append(idx) 217 | if len(neighbor_list) == max_num_neighbors: 218 | break 219 | self.image_list.images[ref_idx].neighbor_list = neighbor_list 220 | -------------------------------------------------------------------------------- /python/colmap_helpers_for_bin.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import cv2 5 | import imageio 6 | from pyquaternion import Quaternion 7 | import struct 8 | 9 | 10 | class PointList: 11 | 12 | class Point: 13 | def __init__(self, id, coord): 14 | self.id = id 15 | self.coord = coord 16 | 17 | def __init__(self, num_points2D, x_y_id_s): 18 | self.points = [] 19 | for i in range(0, num_points2D): 20 | id = int(x_y_id_s[3 * i + 2]) 21 | if id == -1: 22 | continue 23 | coord = np.array([float(x_y_id_s[3 * i + 0]), float(x_y_id_s[3 * i + 1])]) 24 | self.points.append(self.Point(id, coord)) 25 | self.length = len(self.points) 26 | 27 | def get_by_id(self, id): 28 | for idx in range(0, self.length): 29 | if id == self.points[idx].id: 30 | return self.points[idx] 31 | return None 32 | 33 | class ImageList: 34 | 35 | class Image: 36 | def __init__(self, id, extrinsic, camera_id, filename, point_list): 37 | self.id = id 38 
| self.extrinsic = extrinsic 39 | self.camera_id = camera_id 40 | self.filename = filename 41 | self.point_list = point_list 42 | 43 | def __init__(self, path): 44 | self.images = [] 45 | 46 | with open(path, "rb") as fid: 47 | self.length = read_next_bytes(fid, 8, "Q")[0] 48 | for image_index in range(self.length): 49 | binary_image_properties = read_next_bytes( 50 | fid, num_bytes=64, format_char_sequence="idddddddi") 51 | id = binary_image_properties[0] 52 | qvec = np.array(binary_image_properties[1:5]) 53 | tvec = np.array(binary_image_properties[5:8]) 54 | extrinsic = Quaternion(float(qvec[0]), float(qvec[1]), float(qvec[2]), 55 | float(qvec[3])).transformation_matrix 56 | extrinsic[0, 3] = float(tvec[0]) 57 | extrinsic[1, 3] = float(tvec[1]) 58 | extrinsic[2, 3] = float(tvec[2]) 59 | camera_id = binary_image_properties[8] 60 | image_name = "" 61 | current_char = read_next_bytes(fid, 1, "c")[0] 62 | while current_char != b"\x00": # look for the ASCII 0 entry 63 | image_name += current_char.decode("utf-8") 64 | current_char = read_next_bytes(fid, 1, "c")[0] 65 | filename = image_name 66 | num_points2D = read_next_bytes(fid, num_bytes=8, 67 | format_char_sequence="Q")[0] 68 | x_y_id_s = read_next_bytes(fid, num_bytes=24 * num_points2D, 69 | format_char_sequence="ddq" * num_points2D) 70 | point_list = PointList(num_points2D, x_y_id_s) 71 | self.images.append(self.Image(id, extrinsic, camera_id, filename, point_list)) 72 | 73 | def get_by_id(self, id): 74 | for idx in range(0, self.length): 75 | if id == self.images[idx].id: 76 | return self.images[idx] 77 | return None 78 | 79 | class CameraList: 80 | 81 | class Camera: 82 | def __init__(self, id, width, height, fx, fy, cx, cy): 83 | self.id = id 84 | self.width = width 85 | self.height = height 86 | self.fx = fx 87 | self.fy = fy 88 | self.cx = cx 89 | self.cy = cy 90 | 91 | def __init__(self, path): 92 | self.cameras = [] 93 | 94 | with open(path, "rb") as fid: 95 | self.length = read_next_bytes(fid, 8, "Q")[0] 96 | for camera_line_index in range(self.length): 97 | camera_properties = read_next_bytes( 98 | fid, num_bytes=24, format_char_sequence="iiQQ") 99 | camera_id = camera_properties[0] 100 | model_id = camera_properties[1] 101 | width = camera_properties[2] 102 | height = camera_properties[3] 103 | # assuming the pinhole model 104 | num_params = 4 105 | params = read_next_bytes(fid, num_bytes=8 * num_params, 106 | format_char_sequence="d" * num_params) 107 | 108 | self.cameras.append(self.Camera(camera_id, width, height, params[0], params[1], params[2], params[3])) 109 | 110 | def get_by_id(self, id): 111 | for idx in range(0, self.length): 112 | if id == self.cameras[idx].id: 113 | return self.cameras[idx] 114 | return None 115 | 116 | class PointCloud: 117 | 118 | class Point: 119 | def __init__(self, id, coord): 120 | self.id = id 121 | self.coord = coord 122 | 123 | def __init__(self, path): 124 | self.points = [] 125 | with open(path, "rb") as fid: 126 | self.length = read_next_bytes(fid, 8, "Q")[0] 127 | for point_line_index in range(self.length): 128 | binary_point_line_properties = read_next_bytes( 129 | fid, num_bytes=43, format_char_sequence="QdddBBBd") 130 | point3D_id = binary_point_line_properties[0] 131 | xyz = np.array(binary_point_line_properties[1:4]) 132 | x = xyz[0] 133 | y = xyz[1] 134 | z = xyz[2] 135 | coord = np.array([x, y, z, 1.0]) 136 | 137 | track_length = read_next_bytes( 138 | fid, num_bytes=8, format_char_sequence="Q")[0] 139 | track_elems = read_next_bytes( 140 | fid, num_bytes=8 * track_length, 141 
| format_char_sequence="ii" * track_length) 142 | 143 | self.points.append(self.Point(point3D_id, coord)) 144 | 145 | 146 | def get_by_id(self, id): 147 | for idx in range(0, self.length): 148 | if id == self.points[idx].id: 149 | return self.points[idx] 150 | return None 151 | 152 | class ColmapSparse: 153 | def __init__(self, sparse_path, image_path, image_width = -1, image_height = -1, max_num_neighbors = 16): 154 | image_list_path = os.path.join(sparse_path, "images.bin") 155 | camera_list_path = os.path.join(sparse_path, "cameras.bin") 156 | point_cloud_path = os.path.join(sparse_path, "points3D.bin") 157 | if not os.path.exists(image_list_path): 158 | raise ValueError("{:} does not exist.".format(image_list_path)) 159 | if not os.path.exists(camera_list_path): 160 | raise ValueError("{:} does not exist.".format(camera_list_path)) 161 | if not os.path.exists(point_cloud_path): 162 | raise ValueError("{:} does not exist.".format(point_cloud_path)) 163 | self.image_list = ImageList(image_list_path) 164 | self.camera_list = CameraList(camera_list_path) 165 | self.point_cloud = PointCloud(point_cloud_path) 166 | self.load_images(image_path) 167 | self.resize(image_width, image_height) 168 | self.estimate_max_disparities() 169 | self.generate_neighbor_list(max_num_neighbors) 170 | 171 | def load_images(self, image_path): 172 | for image_idx in range(self.image_list.length): 173 | self.image_list.images[image_idx].rgb = imageio.imread(os.path.join(image_path, self.image_list.images[image_idx].filename)).astype(np.float32) / 255.0 174 | 175 | def resize(self, image_width, image_height): 176 | if image_width < 0 and image_height < 0: 177 | return 178 | for camera_idx in range(self.camera_list.length): 179 | orig_image_width = self.camera_list.cameras[camera_idx].width 180 | orig_image_height = self.camera_list.cameras[camera_idx].height 181 | if image_width < 0: 182 | target_image_width = image_height * orig_image_width / orig_image_height 183 | target_image_height = image_height 184 | elif image_height < 0: 185 | target_image_width = image_width 186 | target_image_height = image_width * orig_image_height / orig_image_width 187 | else: 188 | target_image_width = image_width 189 | target_image_height = image_height 190 | width_ratio = float(target_image_width) / orig_image_width 191 | height_ratio = float(target_image_height) / orig_image_height 192 | self.camera_list.cameras[camera_idx].width = target_image_width 193 | self.camera_list.cameras[camera_idx].height = target_image_height 194 | self.camera_list.cameras[camera_idx].fx *= width_ratio 195 | self.camera_list.cameras[camera_idx].fy *= height_ratio 196 | self.camera_list.cameras[camera_idx].cx *= width_ratio 197 | self.camera_list.cameras[camera_idx].cy *= height_ratio 198 | self.camera_list.cameras[camera_idx].width_ratio = width_ratio 199 | self.camera_list.cameras[camera_idx].height_ratio = height_ratio 200 | 201 | for image_idx in range(self.image_list.length): 202 | camera = self.camera_list.get_by_id(self.image_list.images[image_idx].camera_id) 203 | self.image_list.images[image_idx].rgb = cv2.resize(self.image_list.images[image_idx].rgb, (camera.width, camera.height), interpolation = cv2.INTER_AREA) 204 | for point_idx in range(self.image_list.images[image_idx].point_list.length): 205 | self.image_list.images[image_idx].point_list.points[point_idx].coord[0] *= camera.width_ratio 206 | self.image_list.images[image_idx].point_list.points[point_idx].coord[1] *= camera.height_ratio 207 | 208 | def estimate_max_disparities(self, 
percentile = 0.99, stretch = 1.333333): 209 | for (img_idx, image) in enumerate(self.image_list.images): 210 | camera = self.camera_list.get_by_id(image.camera_id) 211 | disparity_list = [] 212 | for (point_idx, point) in enumerate(self.point_cloud.points): 213 | coord = image.extrinsic.dot(point.coord) 214 | new_x = (coord[0] / coord[2] * camera.fx + camera.cx) 215 | new_y = (coord[1] / coord[2] * camera.fy + camera.cy) 216 | new_d = 1.0 / coord[2] 217 | if new_x >= 0.0 and new_x < camera.width and new_y >= 0.0 and new_y < camera.height and new_d > 0.0: 218 | disparity_list.append(new_d) 219 | disparity_list = np.sort(np.array(disparity_list)) 220 | self.image_list.images[img_idx].estimated_max_disparity = disparity_list[int(disparity_list.shape[0] * percentile)] * stretch 221 | 222 | def generate_neighbor_list(self, max_num_neighbors): 223 | point_id_list = [] 224 | for (ref_idx, ref_image) in enumerate(self.image_list.images): 225 | point_id_set = set() 226 | for (ref_point_idx, ref_point) in enumerate(ref_image.point_list.points): 227 | point_id_set.add(ref_point.id) 228 | point_id_list.append(point_id_set) 229 | for (ref_idx, ref_image) in enumerate(self.image_list.images): 230 | shared_feature_list = [] 231 | for (n_idx, n_image) in enumerate(self.image_list.images): 232 | if n_idx == ref_idx: 233 | shared_feature_list.append(0) 234 | continue 235 | shared_feature_list.append(len(point_id_list[ref_idx] & point_id_list[n_idx])) 236 | index_order = np.argsort(np.array(shared_feature_list))[::-1] 237 | neighbor_list = [] 238 | for idx in index_order: 239 | if shared_feature_list[idx] == 0: 240 | break 241 | neighbor_list.append(idx) 242 | if len(neighbor_list) == max_num_neighbors: 243 | break 244 | self.image_list.images[ref_idx].neighbor_list = neighbor_list 245 | 246 | 247 | 248 | def read_next_bytes(fid, num_bytes, format_char_sequence, endian_character="<"): 249 | """Read and unpack the next bytes from a binary file. 250 | :param fid: 251 | :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. 252 | :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. 253 | :param endian_character: Any of {@, =, <, >, !} 254 | :return: Tuple of read and unpacked values. 255 | """ 256 | data = fid.read(num_bytes) 257 | return struct.unpack(endian_character + format_char_sequence, data) -------------------------------------------------------------------------------- /python/download_trained_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | 5 | def download_trained_model(path = None): 6 | if path is None: 7 | path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "model") 8 | if not os.path.isdir(path): 9 | os.mkdir(path) 10 | print "Downloading trained model..." 11 | subprocess.call( 12 | "cd {:} ;".format(path) + 13 | "wget -O DeepMVS_final.model \"https://drive.google.com/u/0/uc?id=1DVhvFot_ePiHNTrAiGSVegFRz_FCYiCY&export=download\" ;", 14 | shell = True 15 | ) 16 | print "Successfully downloaded trained model." 
17 | 18 | if __name__ == "__main__": 19 | download_trained_model() 20 | 21 | -------------------------------------------------------------------------------- /python/download_training_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import json 5 | 6 | import numpy as np 7 | import imageio 8 | from imageio.plugins import freeimage 9 | import h5py 10 | from lz4.block import decompress 11 | 12 | def download_demon_datasets(path): 13 | print """ 14 | =================================== 15 | Note: DeMoN dataset is not created by us. If you use these datasets in your work, please visit the url below for information about citations. 16 | https://github.com/lmb-freiburg/demon/blob/master/datasets/download_traindata.sh 17 | =================================== 18 | """ 19 | DATASET_NAMES = ["sun3d", "rgbd", "mvs", "scenes11"] 20 | for dataset_name in DATASET_NAMES: 21 | if not os.path.exists(os.path.join(path, "{:}_train.tgz".format(dataset_name))): 22 | print "Downloading {:} dataset...".format(dataset_name) 23 | subprocess.call( 24 | "cd {:} ;".format(path) + 25 | "wget https://lmb.informatik.uni-freiburg.de/data/demon/traindata/{:}_train.tgz ;".format(dataset_name) + 26 | "tar -xvzf {:}_train.tgz ;".format(dataset_name), 27 | shell = True 28 | ) 29 | 30 | print "Converting DeMoN dataset into the format required by DeepMVS..." 31 | SUB_DATASET_NAMES = ([ 32 | "mvs_achteck_turm", "mvs_breisach", "mvs_citywall", 33 | "rgbd_10_to_20_3d_train", "rgbd_10_to_20_handheld_train", "rgbd_10_to_20_simple_train", "rgbd_20_to_inf_3d_train", "rgbd_20_to_inf_handheld_train", "rgbd_20_to_inf_simple_train", 34 | "scenes11_train", 35 | "sun3d_train_0.01m_to_0.1m", "sun3d_train_0.1m_to_0.2m", "sun3d_train_0.2m_to_0.4m", "sun3d_train_0.4m_to_0.8m", "sun3d_train_0.8m_to_1.6m", "sun3d_train_1.6m_to_infm" 36 | ]) 37 | for dataset_name in SUB_DATASET_NAMES: 38 | print "Converting {:}.h5 ...".format(dataset_name) 39 | if not os.path.isdir(os.path.join(path, dataset_name)): 40 | os.mkdir(os.path.join(path, dataset_name)) 41 | file = h5py.File(os.path.join(path, "{:}.h5".format(dataset_name)), "r") 42 | 43 | num_images = [] 44 | for (seq_idx, seq_name) in enumerate(file): 45 | print "Processing sequence {:d}/{:d}".format(seq_idx, len(file)) 46 | if not os.path.isdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx))): 47 | os.mkdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx))) 48 | if not os.path.isdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "images")): 49 | os.mkdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "images")) 50 | if not os.path.isdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "depths")): 51 | os.mkdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "depths")) 52 | if not os.path.isdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "poses")): 53 | os.mkdir(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "poses")) 54 | sequence = file[seq_name]["frames"]["t0"] 55 | num_images.append(len(sequence)) 56 | for (f_idx, f_name) in enumerate(sequence): 57 | frame = sequence[f_name] 58 | for dt_type in frame: 59 | dataset = frame[dt_type] 60 | img = dataset[...] 
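# Each frame in the DeMoN .h5 files stores one dataset per type: "camera" (the
# intrinsics and extrinsics packed into a single vector), "depth" (an lz4-compressed
# float16 buffer), and "image" (an encoded RGB image); each branch below unpacks one kind.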
61 | if dt_type == "camera": 62 | camera = ({ 63 | "extrinsic": [[img[5],img[8],img[11],img[14]], [img[6],img[9],img[12],img[15]], [img[7],img[10],img[13],img[16]], [0.0,0.0,0.0,1.0]], 64 | "f_x": img[0], 65 | "f_y": img[1], 66 | "c_x": img[3], 67 | "c_y": img[4] 68 | }) 69 | with open(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "poses", "{:04d}.json".format(f_idx)), "w") as output_file: 70 | json.dump(camera, output_file) 71 | elif dt_type == "depth": 72 | dimension = dataset.attrs["extents"] 73 | depth = np.array(np.frombuffer(decompress(img.tobytes(), dimension[0] * dimension[1] * 2), dtype = np.float16)).astype(np.float32) 74 | depth = depth.reshape(dimension[0], dimension[1]) 75 | imageio.imwrite(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "depths", "{:04d}.exr".format(f_idx)), depth, flags = freeimage.IO_FLAGS.EXR_ZIP) 76 | elif dt_type == "image": 77 | try: 78 | img = imageio.imread(img.tobytes(), format = "RAW-FI") 79 | except: 80 | img = imageio.imread(img.tobytes()) 81 | imageio.imwrite(os.path.join(path, dataset_name, "{:04d}".format(seq_idx), "images", "{:04d}.png".format(f_idx)), img) 82 | with open(os.path.join(path, dataset_name, "num_images.json"), "w") as output_file: 83 | json.dump(num_images, output_file) 84 | 85 | def download_GTAV_datasets(path): 86 | if not os.path.exists(os.path.join(path, "GTAV_720.tar.gz")): 87 | print "Downloading GTAV_720 dataset..." 88 | subprocess.call( 89 | "cd {:} ;".format(path) + 90 | "wget -O GTAV_720.tar.gz https://filebox.ece.vt.edu/~jbhuang/project/deepmvs/mvs-syn/GTAV_720.tar.gz ;" + 91 | "tar -xvzf GTAV_720.tar.gz ;", 92 | shell = True 93 | ) 94 | if not os.path.exists(os.path.join(path, "GTAV_540.tar.gz")): 95 | print "Downloading GTAV_540 dataset..." 96 | subprocess.call( 97 | "cd {:} ;".format(path) + 98 | "wget -O GTAV_540.tar.gz https://filebox.ece.vt.edu/~jbhuang/project/deepmvs/mvs-syn/GTAV_540.tar.gz ;" + 99 | "tar -xvzf GTAV_540.tar.gz ;", 100 | shell = True 101 | ) 102 | 103 | def download_training_datasets(path = None): 104 | if path is None: 105 | ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..") 106 | if not os.path.isdir(os.path.join(ROOT_DIR, "dataset")): 107 | os.mkdir(os.path.join(ROOT_DIR, "dataset")) 108 | if not os.path.isdir(os.path.join(ROOT_DIR, "dataset", "train")): 109 | os.mkdir(os.path.join(ROOT_DIR, "dataset", "train")) 110 | download_demon_datasets(os.path.join(ROOT_DIR, "dataset", "train")) 111 | download_GTAV_datasets(os.path.join(ROOT_DIR, "dataset", "train")) 112 | else: 113 | download_demon_datasets(path) 114 | download_GTAV_datasets(path) 115 | print "Finished downloading training datasets." 116 | 117 | if __name__ == "__main__": 118 | download_training_datasets() 119 | -------------------------------------------------------------------------------- /python/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import cv2 4 | import imageio 5 | import numpy as np 6 | from scipy import linalg as la 7 | 8 | # Parse arguments. 
9 | parser = argparse.ArgumentParser(description = "Evaluate predicted disparity maps.")
10 | parser.add_argument("--image_path", dest = "image_path", help = "Path to the images.", required = True)
11 | parser.add_argument("--sparse_path", dest = "sparse_path", help = "Path to the sparse reconstruction.", required = True)
12 | parser.add_argument("--output_path", dest = "output_path", help = "Path to store the predicted results.", required = True)
13 | parser.add_argument("--gt_path", dest = "gt_path", help = "Path to the ground truth maps. The gt depth filenames should be <image_filename>.depth.npy", required = True)
14 | parser.add_argument("--load_bin", dest = "load_bin", action = "store_true", default = False, help = "Set if you want to load COLMAP .bin files") # store_true makes --load_bin a flag, matching the README usage; the old type = bool parsed any non-empty string as True
15 | # TODO: Support --gt_type = "colmap_bin"
16 | parser.add_argument("--gt_type", dest = "gt_type", choices = ["depth", "disparity"], default = "disparity", help = "Specify whether the ground truth depth files store depth or disparity (=1/depth).")
17 | parser.add_argument("--output_type", dest = "output_type", choices = ["depth", "disparity"], default = "disparity", help = "Specify whether the predicted depth files store depth value or disparity (=1/depth).")
18 | parser.add_argument("--image_width", dest = "image_width", type = int, help = "Image width (>0).", required = True)
19 | parser.add_argument("--image_height", dest = "image_height", type = int, help = "Image height (>0).", required = True)
20 | parser.add_argument("--skip_rephoto", dest = "skip_rephoto", action = "store_true", default = False, help = "Skip rephoto error to speed up evaluation.")
21 | parser.add_argument("--store_rephoto", dest = "store_rephoto", action = "store_true", default = False, help = "Store the rephotography result using the predicted depths to /rephoto.")
22 | # Set border = 6 since COLMAP produces unreliable values near the image borders.
23 | parser.add_argument("--border", dest = "border", type = int, default = 6, help = "Width of the borders to ignore from evaluation.")
24 | parser.add_argument("--size_mismatch", dest = "size_mismatch", choices = ["throw", "crop_pad", "resize"], default = "throw",
25 | help =
26 | """Specify what to do if the size of the depth maps does not match the specified image_width and image_height.
27 | "throw": throw an error.
28 | "crop_pad": crop or pad the depth maps.
29 | "resize": resample the depth maps using nearest neighbor sampling.
30 | """
31 | )
32 |
33 | args = parser.parse_args()
34 |
35 | image_path = args.image_path
36 | sparse_path = args.sparse_path
37 | output_path = args.output_path
38 | gt_path = args.gt_path
39 | load_bin = args.load_bin
40 | gt_type = args.gt_type
41 | output_type = args.output_type
42 | image_width = args.image_width
43 | image_height = args.image_height
44 | skip_rephoto = args.skip_rephoto
45 | store_rephoto = args.store_rephoto
46 | border = args.border
47 | size_mismatch = args.size_mismatch
48 |
49 | if args.load_bin:
50 | from colmap_helpers_for_bin import ColmapSparse
51 | else:
52 | from colmap_helpers import ColmapSparse
53 |
54 | # Crop or pad the depth map to a specific size.
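# Illustration (not from the original source): with image_width = 810 and
# image_height = 540, a ground-truth map of shape (600, 900) is center-cropped to
# (540, 810), while a (500, 700) map is edge-padded equally on both sides instead.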
55 | def crop_pad(img, w, h): 56 | img_w = img.shape[1] 57 | img_h = img.shape[0] 58 | if img_w > w: 59 | padding = (img_w - w) / 2 60 | img = img[:,padding:padding+w] 61 | elif img_w < w: 62 | padding_pre = (w - img_w) / 2 63 | padding_post = w - img_w - padding_pre 64 | img = np.pad(img, ((0,0), (padding_pre,padding_post)), "edge") 65 | if img_h > h: 66 | padding = (img_h - h) / 2 67 | img = img[padding:padding+h,:] 68 | elif img_h < h: 69 | padding_pre = (h - img_h) / 2 70 | padding_post = h - img_h - padding_pre 71 | img = np.pad(img, ((padding_pre,padding_post), (0,0)), "edge") 72 | return img 73 | 74 | # Resize the depth map to specific size. 75 | def resize(img, w, h): 76 | img = cv2.resize(img, (w, h), interpolation = cv2.INTER_NEAREST) 77 | return img 78 | 79 | # Compute rephoto error. 80 | def get_rephoto_diff(rephoto_path, sparse_model, frame_idx, predict_depth, mask = None): 81 | 82 | # Get reference image and camera pose. 83 | target_image = sparse_model.image_list.images[frame_idx] 84 | target_camera = sparse_model.camera_list.get_by_id(target_image.camera_id) 85 | num_neighbors = len(target_image.neighbor_list) 86 | 87 | # Ignore borders. 88 | gt_img = target_image.rgb 89 | if border > 0: 90 | gt_img = gt_img[border:-border, border:-border, :] 91 | input_width = gt_img.shape[1] 92 | input_height = gt_img.shape[0] 93 | 94 | # Store color candidates for each pixel. 95 | rephoto_volume = -np.ones((num_neighbors, input_height, input_width, 3), dtype = np.float32) 96 | 97 | # Loop through all neighbors. 98 | for (idx, neighbor_idx) in enumerate(target_image.neighbor_list): 99 | print " Warping neighbor No. {:d}/{:d}".format(idx, num_neighbors) 100 | # Get neighbor image and camera pose. 101 | n_image = sparse_model.image_list.images[neighbor_idx] 102 | n_camera = sparse_model.camera_list.get_by_id(n_image.camera_id) 103 | rgb_dst = n_image.rgb 104 | trans_matrix = n_image.extrinsic.dot(la.inv(target_image.extrinsic)) 105 | # Warp the neighbor image to reference view. 106 | coord_buffer = np.zeros((input_height, input_width, 4), dtype = np.float32) 107 | coord_buffer[..., 0] = (border + np.mgrid[0:input_height, 0:input_width][1] - target_camera.cx) / target_camera.fx 108 | coord_buffer[..., 1] = (border + np.mgrid[0:input_height, 0:input_width][0] - target_camera.cy) / target_camera.fy 109 | coord_buffer[..., 2] = 1.0 110 | coord_buffer[..., 3] = np.where(predict_depth == 0.0, 0.0, 1.0) 111 | coord_buffer[..., 0:3] = np.where(predict_depth[..., np.newaxis] == 0.0, coord_buffer[..., 0:3], coord_buffer[..., 0:3] / predict_depth[..., np.newaxis]) 112 | coord_buffer[...] = np.moveaxis(trans_matrix.dot(coord_buffer[..., np.newaxis])[..., 0], 0, -1) 113 | x_map = np.where(coord_buffer[..., 2] > 0.0, 114 | coord_buffer[..., 0] / coord_buffer[..., 2] * n_camera.fx + n_camera.cx, -1) 115 | y_map = np.where(coord_buffer[..., 2] > 0.0, 116 | coord_buffer[..., 1] / coord_buffer[..., 2] * n_camera.fy + n_camera.cy, -1) 117 | cv2.remap(rgb_dst, x_map, y_map, cv2.INTER_LINEAR, rephoto_volume[idx, ...], cv2.BORDER_CONSTANT, (-1.0, -1.0, -1.0)) 118 | 119 | # Select the median color for each pixel. 120 | rephoto_volume = np.sort(rephoto_volume, axis = 0)[::-1, ...] 
121 | valid_count = num_neighbors - np.sum(rephoto_volume < 0.0, axis = 0) 122 | valid_mask = valid_count != 0 123 | chosen_indices = (valid_count) / 2 124 | rephoto_img = np.zeros((input_height, input_width, 3), dtype = np.float32) 125 | row_grid = np.mgrid[0:input_height, 0:input_width, 0:3][0] 126 | col_grid = np.mgrid[0:input_height, 0:input_width, 0:3][1] 127 | channel_grid = np.mgrid[0:input_height, 0:input_width, 0:3][2] 128 | rephoto_img = rephoto_volume[chosen_indices, row_grid, col_grid, channel_grid] 129 | rephoto_img[np.logical_not(valid_mask)] = 0.5 130 | 131 | # Compute rephoto error. 132 | valid_mask = valid_mask[..., 0] 133 | diff = np.sum(np.abs(gt_img - rephoto_img), axis = -1) 134 | rephoto_img = np.where(valid_mask[..., np.newaxis], rephoto_img, np.array([0.0, 1.0, 0.0])[np.newaxis, np.newaxis, :]) 135 | 136 | # Apply masks if needed. 137 | if not mask is None: 138 | rephoto_img = np.where(mask[..., np.newaxis], rephoto_img, np.array([1.0, 0.0, 0.0])[np.newaxis, np.newaxis, :]) 139 | valid_mask = np.logical_and(valid_mask, mask) 140 | 141 | # Store rephoto images if needed. 142 | if store_rephoto: 143 | output_dir = os.path.dirname(os.path.join(rephoto_path, target_image.filename)) 144 | if not os.path.exists(output_dir): 145 | os.makedirs(output_dir) 146 | imageio.imwrite("{:}/{:}.rephoto.png".format(rephoto_path, target_image.filename), rephoto_img.clip(0.0, 1.0)) 147 | imageio.imwrite("{:}/{:}.gt_rgb.png".format(rephoto_path, target_image.filename), gt_img.clip(0.0, 1.0)) 148 | 149 | return diff[valid_mask] 150 | 151 | # Load COLMAP sparse model. 152 | print "Loading the sparse model..." 153 | sparse_model = ColmapSparse(sparse_path, image_path, image_width, image_height) 154 | print "Successfully loaded the sparse model." 155 | 156 | # Loop through all reference images. 157 | errors_L1 = [] 158 | errors_rephoto = [] 159 | for (frame_idx, frame) in enumerate(sparse_model.image_list.images): 160 | print "Processing reference image No. {:d}/{:d}".format(frame_idx, sparse_model.image_list.length) 161 | 162 | # Load ground truth depths. 163 | gt_depth = np.load("{:}/{:}.depth.npy".format(gt_path, frame.filename)) 164 | gt_depth = np.pad(gt_depth, ((0,0), (15,15)), "edge") 165 | if gt_depth.shape[0] != image_height or gt_depth.shape[1] != image_width: 166 | if size_mismatch == "throw": 167 | raise RuntimeError("Invalid size of gt_depth. gt_depth has size = {:} but the specified image size = ({:d}, {:d}).".format(gt_depth.shape, image_height, image_width)) 168 | elif size_mismatch == "crop_pad": 169 | gt_depth = crop_pad(gt_depth, image_width, image_height) 170 | elif size_mismatch == "resize": 171 | gt_depth = resize(gt_depth, image_width, image_height) 172 | else: 173 | raise ValueError("size_mismatch is not supported") 174 | if border > 0: 175 | gt_depth = gt_depth[border:-border, border:-border] 176 | if gt_type == "depth": 177 | gt_valid = gt_depth > 0.0 178 | gt_depth = np.where(gt_valid, 1.0 / gt_depth, 0.0) 179 | elif gt_type == "disparity": 180 | # In ground truth, disparity = 0 represents invalid values. 181 | gt_valid = gt_depth > 0.0 182 | else: 183 | raise ValueError("gt_type is not supported") 184 | 185 | # Load predicted depths. 186 | output_depth = np.load("{:}/{:}.output.npy".format(output_path, frame.filename)) 187 | if output_depth.shape[0] != image_height or output_depth.shape[1] != image_width: 188 | if size_mismatch == "throw": 189 | raise RuntimeError("Invalid size of output_depth. 
output_depth has size = {:} but the specified image size = ({:d}, {:d}).".format(output_depth.shape, image_height, image_width))
190 | elif size_mismatch == "crop_pad":
191 | output_depth = crop_pad(output_depth, image_width, image_height)
192 | elif size_mismatch == "resize":
193 | output_depth = resize(output_depth, image_width, image_height)
194 | else:
195 | raise ValueError("size_mismatch is not supported")
196 | if border > 0:
197 | output_depth = output_depth[border:-border, border:-border]
198 | if output_type == "depth":
199 | output_valid = output_depth > 0.0
200 | output_depth = np.where(output_valid, 1.0 / output_depth, 0.0)
201 | elif output_type == "disparity":
202 | # In predicted results, disparity = 0 is a valid value.
203 | output_valid = output_depth >= 0.0
204 | else:
205 | raise ValueError("output_type is not supported")
206 |
207 |
208 |
209 | # Compute L1 error.
210 | valid_mask = np.logical_and(gt_valid, output_valid)
211 | error_L1 = np.abs(output_depth - gt_depth)[valid_mask]
212 | errors_L1.extend(error_L1.flatten().tolist())
213 |
214 | # Compute rephoto error.
215 | if not skip_rephoto:
216 | error_rephoto = get_rephoto_diff(os.path.join(output_path, "rephoto"), sparse_model, frame_idx, output_depth, output_valid)
217 | errors_rephoto.extend(error_rephoto.tolist())
218 |
219 | # Report errors.
220 | mean_L1 = np.mean(errors_L1)
221 | print "Disparity L1 error = {:f}, number of valid pixels = {:d}".format(mean_L1, len(errors_L1))
222 | if not skip_rephoto:
223 | mean_rephoto = np.mean(errors_rephoto)
224 | print "Rephoto error = {:f}, number of valid pixels = {:d}".format(mean_rephoto, len(errors_rephoto))
225 |
--------------------------------------------------------------------------------
/python/generate_volume_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 |
5 | import numpy as np
6 | from numpy import linalg as la
7 | import cv2
8 | import imageio
9 |
10 | def generate_volume_test(shared_data):
11 | # Get event handles.
12 | ready_e = shared_data["ready_e"]
13 | start_e = shared_data["start_e"]
14 | # Read parameters.
15 | patch_width = shared_data["patch_width"]
16 | patch_height = shared_data["patch_height"]
17 | num_depths = shared_data["num_depths"]
18 | max_num_neighbors = shared_data["max_num_neighbors"]
19 | sparse_model = shared_data["sparse_model"]
20 | # Keep generating plane-sweep volumes until told to stop.
21 | while True:
22 | start_e.wait()
23 | start_e.clear()
24 | if shared_data["stop"]:
25 | break
26 | # Get reference image index and location of the patch.
27 | ref_image_idx = shared_data["image_idx"]
28 | target_x = shared_data["target_x"]
29 | target_y = shared_data["target_y"]
30 | # Read the reference RGB image.
31 | ref_image = sparse_model.image_list.images[ref_image_idx]
32 | ref_img_full = ref_image.rgb
33 | ref_img = ref_img_full[target_y:target_y + patch_height, target_x:target_x + patch_width, :] - 0.5
34 | # Get neighbor images and disparity step.
35 | ref_camera = sparse_model.camera_list.get_by_id(ref_image.camera_id)
36 | neighbor_list = ref_image.neighbor_list
37 | num_neighbors = len(neighbor_list)
38 | d_step = ref_image.estimated_max_disparity / num_depths
39 | # Allocate buffers.
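# sweep_volume holds one warped RGB patch per (neighbor, disparity-level) pair,
# i.e. shape num_neighbors x num_depths x patch_height x patch_width x 3; the x/y
# maps and the homogeneous coord_buffer below are scratch space reused for every warp.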
40 | sweep_volume = np.zeros((num_neighbors, num_depths, patch_height, patch_width, 3), dtype = np.float32) 41 | x_map = np.zeros((patch_height, patch_width), dtype = np.float32) 42 | y_map = np.zeros((patch_height, patch_width), dtype = np.float32) 43 | coord_buffer = np.zeros((patch_height, patch_width, 4), dtype = np.float32) 44 | # Loop through neighbor images. 45 | n_idx = 0 46 | for n_image_idx in neighbor_list: 47 | # Load the neighbor image. 48 | n_image = sparse_model.image_list.images[n_image_idx] 49 | n_camera = sparse_model.camera_list.get_by_id(n_image.camera_id) 50 | n_img = n_image.rgb - 0.5 51 | trans_matrix = n_image.extrinsic.dot(la.inv(ref_image.extrinsic)) 52 | # Warp the neighbor image for each disparity level. 53 | for d_idx in range(0, num_depths): 54 | disparity = d_idx * d_step 55 | coord_buffer[..., 0] = (np.mgrid[0:patch_height, 0:patch_width][1] + target_x - ref_camera.cx) / ref_camera.fx 56 | coord_buffer[..., 1] = (np.mgrid[0:patch_height, 0:patch_width][0] + target_y - ref_camera.cy) / ref_camera.fy 57 | coord_buffer[..., 2] = 1.0 58 | if disparity == 0.0: 59 | coord_buffer[..., 3] = 0.0 60 | else: 61 | coord_buffer[..., 0:3] /= disparity 62 | coord_buffer[..., 3] = 1.0 63 | coord_buffer[...] = np.moveaxis(trans_matrix.dot(coord_buffer[..., np.newaxis])[..., 0], 0, -1) 64 | x_map[...] = np.where(coord_buffer[..., 2] > 0.0, coord_buffer[..., 0] / coord_buffer[..., 2] * n_camera.fx + n_camera.cx, -1.0) 65 | y_map[...] = np.where(coord_buffer[..., 2] > 0.0, coord_buffer[..., 1] / coord_buffer[..., 2] * n_camera.fy + n_camera.cy, -1.0) 66 | cv2.remap(n_img, x_map, y_map, cv2.INTER_LINEAR, sweep_volume[n_idx, d_idx, ...], cv2.BORDER_CONSTANT, (0.0, 0.0, 0.0)) 67 | # Make sure there is overlap between reference image patch and this neighbor image. 68 | if np.any(sweep_volume[n_idx, ...] != 0.0): 69 | n_idx += 1 70 | # Update num_neighbors based on actual overlapping neighbor images. 71 | num_neighbors = n_idx 72 | # If there is no overlap at all, pass zeros as the plane-sweep volume and hope for the best. 73 | if num_neighbors == 0: 74 | num_neighbors = 1 75 | # Send the data to main thread. 76 | shared_data["ref_img"] = ref_img 77 | shared_data["ref_img_full"] = ref_img_full 78 | shared_data["sweep_volume"] = sweep_volume[0:num_neighbors, ...] 
79 | shared_data["num_neighbors"] = num_neighbors
80 | ready_e.set()
81 |
82 |
--------------------------------------------------------------------------------
/python/generate_volume_train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import json
4 |
5 | import numpy as np
6 | from numpy import linalg as la
7 | import cv2
8 | import imageio
9 |
10 | def max_disparity_adjust(d, max_disparity_mu = 1.2, max_disparity_sigma = 0.2):
11 | return d * np.random.normal(max_disparity_mu, max_disparity_sigma)
12 |
13 | def generate_volume_train(shared_data, max_num_neighbors, num_depths, patch_height, patch_width, dataset_path = None):
14 | ready_e = shared_data["ready_e"]
15 | start_e = shared_data["start_e"]
16 | DATASET_LIST = ([
17 | "GTAV_540", "GTAV_720",
18 | "mvs_achteck_turm", "mvs_breisach", "mvs_citywall",
19 | "rgbd_10_to_20_3d_train", "rgbd_10_to_20_handheld_train", "rgbd_10_to_20_simple_train", "rgbd_20_to_inf_3d_train", "rgbd_20_to_inf_handheld_train", "rgbd_20_to_inf_simple_train",
20 | "scenes11_train",
21 | "sun3d_train_0.01m_to_0.1m", "sun3d_train_0.1m_to_0.2m", "sun3d_train_0.2m_to_0.4m", "sun3d_train_0.4m_to_0.8m", "sun3d_train_0.8m_to_1.6m", "sun3d_train_1.6m_to_infm"
22 | ])
23 | if dataset_path is None:
24 | DATASET_DIR_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "dataset", "train")
25 | else:
26 | DATASET_DIR_ROOT = dataset_path
27 | # Assign probability weights to each sequence according to the total number of images.
28 | d_idx_choices = [[] for i in range(0, max_num_neighbors)]
29 | s_idx_choices = [[] for i in range(0, max_num_neighbors)]
30 | weights = [[] for i in range(0, max_num_neighbors)]
31 | num_images = []
32 | for (d_idx, dataset) in enumerate(DATASET_LIST):
33 | with open(os.path.join(DATASET_DIR_ROOT, dataset, "num_images.json")) as f:
34 | num_images.append(json.load(f))
35 | for neighbor_idx in range(0, max_num_neighbors):
36 | # Select only the sequences with more than (neighbor_idx + 1) frames, i.e. enough for a reference image plus (neighbor_idx + 1) neighbors.
37 | choices = np.flatnonzero(np.greater(num_images[-1], neighbor_idx + 1))
38 | s_idx_choices[neighbor_idx].extend(choices)
39 | d_idx_choices[neighbor_idx].extend([d_idx for i in range(0, len(choices))])
40 | weights[neighbor_idx].extend(np.array(num_images[-1])[choices])
41 | # Normalize probability distribution.
42 | for neighbor_idx in range(0, max_num_neighbors):
43 | weights[neighbor_idx] = np.array(weights[neighbor_idx]).astype(np.float32) / np.sum(np.array(weights[neighbor_idx]))
44 | # Allocate buffers.
45 | ref_img = np.zeros((patch_height, patch_width, 3), dtype = np.float32)
46 | sweep_volume = np.zeros((max_num_neighbors, num_depths, patch_height, patch_width, 3), dtype = np.float32)
47 | ref_depth = np.zeros((patch_height, patch_width), dtype = np.float32)
48 | valid_mask = np.zeros((patch_height, patch_width), dtype = np.uint8)
49 | x_map = np.zeros((patch_height, patch_width), dtype = np.float32)
50 | y_map = np.zeros((patch_height, patch_width), dtype = np.float32)
51 | coord_buffer = np.zeros((patch_height, patch_width, 4), dtype = np.float32)
52 | # Fetch image dimensions of each dataset.
53 | actual_width_list = []
54 | actual_height_list = []
55 | for dataset in DATASET_LIST:
56 | img = imageio.imread(os.path.join(DATASET_DIR_ROOT, dataset, "{:04d}".format(0), "images", "{:04d}.png".format(0)))
57 | actual_width_list.append(img.shape[1])
58 | actual_height_list.append(img.shape[0])
59 | # Keep generating until stop signal.
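# Producer/consumer hand-off: the main training thread sets start_e to request a
# sample; this worker builds one plane-sweep volume, publishes it through
# shared_data, and then sets ready_e so the main thread can consume it.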
60 | while not shared_data["stop"]: 61 | # Wait for start_flag. 62 | start_e.wait() 63 | start_e.clear() 64 | if shared_data["stop"]: 65 | break 66 | # Randomly choose number of neighbors from [1, max_num_neighbors]. 67 | num_neighbors = np.random.randint(0, max_num_neighbors) 68 | # Randomly select a sequence until finding an available one. 69 | while True: 70 | # Select a dataset. 71 | random_idx = np.random.choice(len(d_idx_choices[num_neighbors]), p = weights[num_neighbors]) 72 | d_idx = d_idx_choices[num_neighbors][random_idx] 73 | DATASET_DIR = os.path.join(DATASET_DIR_ROOT, DATASET_LIST[d_idx]) 74 | actual_width = actual_width_list[d_idx] 75 | actual_height = actual_height_list[d_idx] 76 | # Select a sequence, a reference image, and location of the patch. 77 | s_idx = s_idx_choices[num_neighbors][random_idx] 78 | r_idx = np.random.randint(0, num_images[d_idx][s_idx]) 79 | target_x = np.random.randint(0, actual_width - patch_width) 80 | target_y = np.random.randint(0, actual_height - patch_height) 81 | # Load ground truth depths. 82 | ref_depth_full = imageio.imread(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "depths", "{:04d}.exr".format(r_idx))) 83 | valid_mask_full = ref_depth_full != 0.0 84 | ref_depth_full = np.where(valid_mask_full, 1.0 / ref_depth_full, 0.0) 85 | max_disparity = max_disparity_adjust(np.max(ref_depth_full)) 86 | ref_depth[...] = ref_depth_full[target_y:target_y + patch_height, target_x:target_x + patch_width] 87 | valid_mask[...] = valid_mask_full[target_y:target_y + patch_height, target_x:target_x + patch_width] 88 | # If too many invalid ground truths, skip it. 89 | if np.count_nonzero(valid_mask) < patch_width * patch_height * 0.80: 90 | continue 91 | # Choose a random pixel to determine overlapping neighbor images. 92 | while True: 93 | sample_x = np.random.randint(0, patch_width) 94 | sample_y = np.random.randint(0, patch_height) 95 | if valid_mask[sample_y, sample_x]: 96 | break 97 | # Load camera pose of reference image. 98 | with open(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "poses", "{:04d}.json".format(r_idx))) as f: 99 | r_info = json.load(f) 100 | r_c_x = r_info["c_x"] 101 | r_c_y = r_info["c_y"] 102 | r_f_x = r_info["f_x"] 103 | r_f_y = r_info["f_y"] 104 | r_extrinsic = np.array(r_info["extrinsic"]) 105 | # Select neighbors which have overlap with reference image. 
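# Overlap test: back-project the sampled reference pixel using its ground-truth
# depth, reproject it into each randomly drawn candidate view, and keep the
# candidate only if the point lands inside that image with positive depth.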
106 | selected_neighbor_count = 0 107 | valid_neighbors = np.flatnonzero(np.not_equal(np.array(range(0, num_images[d_idx][s_idx])), r_idx)) 108 | n_idx_list = [] 109 | while len(valid_neighbors) > 0: 110 | n_idx = valid_neighbors[np.random.randint(0, len(valid_neighbors))] 111 | valid_neighbors = valid_neighbors[valid_neighbors != n_idx] 112 | with open(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "poses", "{:04d}.json".format(n_idx))) as f: 113 | n_info = json.load(f) 114 | n_c_x = n_info["c_x"] 115 | n_c_y = n_info["c_y"] 116 | n_f_x = n_info["f_x"] 117 | n_f_y = n_info["f_y"] 118 | n_extrinsic = np.array(n_info["extrinsic"]) 119 | x = (target_x + sample_x - r_c_x) / r_f_x 120 | y = (target_y + sample_y - r_c_y) / r_f_y 121 | d = ref_depth[sample_y, sample_x] 122 | if d == 0.0: 123 | coord = np.array([x, y, 1.0, 0.0]) 124 | else: 125 | coord = np.array([x/d, y/d, 1.0/d, 1.0]) 126 | coord = n_extrinsic.dot(la.inv(r_extrinsic)).dot(coord) 127 | x = coord[0] / coord[2] * n_f_x + n_c_x 128 | y = coord[1] / coord[2] * n_f_y + n_c_y 129 | if x >= 0 and x < actual_width and y >= 0 and y < actual_height and coord[2] > 0: 130 | n_idx_list.append(n_idx) 131 | if len(n_idx_list) == num_neighbors + 1: 132 | break 133 | # Check if enough number of neighbors are collected. 134 | if len(n_idx_list) < num_neighbors + 1: 135 | continue 136 | else: 137 | break 138 | # Load reference RGB image. 139 | d_step = max_disparity / (num_depths - 1) 140 | d_list = [d_step * i for i in range(0, num_depths)] 141 | ref_img_full = imageio.imread(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "images", "{:04d}.png".format(r_idx))).astype(np.float32) / 255.0 142 | ref_img[...] = ref_img_full[target_y:target_y + patch_height, target_x:target_x + patch_width, ...] - 0.5 143 | # Generate plane-sweep volume. 144 | for neighbor_idx in range(0, num_neighbors + 1): 145 | # Load camera pose of neighbor image. 146 | n_idx = n_idx_list[neighbor_idx] 147 | with open(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "poses", "{:04d}.json".format(n_idx))) as f: 148 | n_info = json.load(f) 149 | n_c_x = n_info["c_x"] 150 | n_c_y = n_info["c_y"] 151 | n_f_x = n_info["f_x"] 152 | n_f_y = n_info["f_y"] 153 | n_extrinsic = np.array(n_info["extrinsic"]) 154 | neighbor_img = imageio.imread(os.path.join(DATASET_DIR, "{:04d}".format(s_idx), "images", "{:04d}.png".format(n_idx))).astype(np.float32) / 255.0 - 0.5 155 | trans_matrix = n_extrinsic.dot(la.inv(r_extrinsic)) 156 | for (disparity_idx, disparity) in enumerate(d_list): 157 | coord_buffer[..., 0] = (np.mgrid[0:patch_height, 0:patch_width][1] + target_x - r_c_x) / r_f_x 158 | coord_buffer[..., 1] = (np.mgrid[0:patch_height, 0:patch_width][0] + target_y - r_c_y) / r_f_y 159 | coord_buffer[..., 2] = 1.0 160 | if disparity == 0.0: 161 | coord_buffer[..., 3] = 0.0 162 | else: 163 | coord_buffer[..., 0:3] /= disparity 164 | coord_buffer[..., 3] = 1.0 165 | coord_buffer[...] = np.moveaxis(trans_matrix.dot(coord_buffer[..., np.newaxis])[..., 0], 0, -1) 166 | x_map[...] = np.where(coord_buffer[..., 2] >= 0.0, coord_buffer[..., 0] / coord_buffer[..., 2] * n_f_x + n_c_x, -1.0) 167 | y_map[...] = np.where(coord_buffer[..., 2] >= 0.0, coord_buffer[..., 1] / coord_buffer[..., 2] * n_f_y + n_c_y, -1.0) 168 | cv2.remap(neighbor_img, x_map, y_map, cv2.INTER_LINEAR, sweep_volume[neighbor_idx, disparity_idx, ...], cv2.BORDER_CONSTANT, (0.0, 0.0, 0.0)) 169 | # Send plane-sweep volume to main thread. 
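		# Note that ref_depth is divided by max_disparity before the hand-off, so
		# the trainer can quantize it directly into num_depths disparity bins, and
		# num_neighbors is published as the actual neighbor count (num_neighbors + 1).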
170 | shared_data["ref_img"] = ref_img 171 | shared_data["ref_img_full"] = ref_img_full 172 | shared_data["target_x"] = target_x 173 | shared_data["target_y"] = target_y 174 | shared_data["sweep_volume"] = sweep_volume[0:num_neighbors + 1, ...] 175 | shared_data["ref_depth"] = ref_depth / max_disparity 176 | shared_data["valid_mask"] = valid_mask 177 | shared_data["num_neighbors"] = num_neighbors + 1 178 | ready_e.set() 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /python/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class DeepMVS_PT(nn.Module): 6 | def __init__(self, num_depths, use_gpu = True): 7 | super(DeepMVS_PT, self).__init__() 8 | self.layer_0 = nn.Sequential( 9 | nn.Conv2d(3, 64, (5, 5), stride = (1, 1), padding = (2, 2)), 10 | nn.SELU() 11 | ) 12 | self.layer_1 = nn.Sequential( 13 | nn.Conv2d(128, 96, (5, 5), stride = (1, 1), padding = (2, 2)), 14 | nn.SELU(), 15 | nn.Conv2d(96, 32, (5, 5), stride = (1, 1), padding = (2, 2)), 16 | nn.SELU(), 17 | nn.Conv2d(32, 4, (5, 5), stride = (1, 1), padding = (2, 2)), 18 | nn.SELU() 19 | ) 20 | self.layer_PT = nn.Sequential( 21 | nn.Conv2d(4 * num_depths, 400, (3, 3), stride = (1, 1), padding = (1, 1)), 22 | nn.SELU(), 23 | nn.Conv2d(400, 800, (3, 3), stride = (1, 1), padding = (1, 1)), 24 | nn.SELU() 25 | ) 26 | self.layer_3 = nn.Sequential( 27 | nn.Conv2d(800, 400, (3, 3), stride = (1, 1), padding = (1, 1)), 28 | nn.SELU(), 29 | nn.Conv2d(400, num_depths, (3, 3), stride = (1, 1), padding = (1, 1)) 30 | ) 31 | self.layer_loss = nn.CrossEntropyLoss(ignore_index=-1) 32 | 33 | if use_gpu: 34 | self.layer_0 = self.layer_0.cuda() 35 | self.layer_1 = self.layer_1.cuda() 36 | self.layer_PT = self.layer_PT.cuda() 37 | self.layer_3 = self.layer_3.cuda() 38 | self.layer_loss = self.layer_loss.cuda() 39 | 40 | # Shape of 'volume_input': batch_size * num_neighbors (or num_sources) * num_depths * 2 * num_channels * height * width 41 | def forward(self, volume_input): 42 | (aggregated_feature, _) = torch.max(self.forward_feature(volume_input), 1) 43 | return self.forward_predict(aggregated_feature) 44 | 45 | def forward_feature(self, volume_input): 46 | if volume_input.dim() != 7 or volume_input.size(3) != 2: 47 | raise ValueError("'volume_input' must be a tensor of shape: batch_size * num_neighbors (or num_sources) * num_depths * 2 * num_channels * height * width") 48 | batch_size = volume_input.size(0) 49 | num_neighbors = volume_input.size(1) 50 | num_depths = volume_input.size(2) 51 | num_channels = volume_input.size(4) 52 | height = volume_input.size(5) 53 | width = volume_input.size(6) 54 | layer_0_out = self.layer_0( 55 | volume_input.view(batch_size * num_neighbors * num_depths * 2, num_channels, height, width)) 56 | layer_1_out = self.layer_1( 57 | layer_0_out.view(batch_size * num_neighbors * num_depths, 2 * 64, height, width)) 58 | layer_PT_out = self.layer_PT(layer_1_out.view(batch_size * num_neighbors, num_depths * 4, height, width)) 59 | return layer_PT_out.view(batch_size, num_neighbors, 800, height, width) 60 | 61 | def forward_predict(self, aggregated_feature): 62 | layer_3_output = self.layer_3(aggregated_feature) 63 | return layer_3_output 64 | 65 | class DeepMVS(nn.Module): 66 | def __init__(self, num_depths, use_gpu = True): 67 | super(DeepMVS, self).__init__() 68 | # Patch Matching 69 | self.layer_0 = nn.Sequential( 70 | nn.Conv2d(3, 64, (5, 
5), stride = (1, 1), padding = (2, 2)), 71 | nn.SELU() 72 | ) 73 | self.layer_1 = nn.Sequential( 74 | nn.Conv2d(128, 96, (5, 5), stride = (1, 1), padding = (2, 2)), 75 | nn.SELU(), 76 | nn.Conv2d(96, 32, (5, 5), stride = (1, 1), padding = (2, 2)), 77 | nn.SELU(), 78 | nn.Conv2d(32, 4, (5, 5), stride = (1, 1), padding = (2, 2)), 79 | nn.SELU() 80 | ) 81 | # Encoder 82 | self.layer_2_e1x = nn.Sequential( 83 | nn.Conv2d(4 * num_depths, 200, (3, 3), stride = (1, 1), padding = (1, 1)), 84 | nn.SELU(), 85 | nn.Conv2d(200, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 86 | nn.SELU() 87 | ) 88 | self.layer_2_e2x = nn.Sequential( 89 | nn.Conv2d(100, 100, (2, 2), stride = (2, 2), padding = (0, 0)), 90 | nn.SELU(), 91 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 92 | nn.SELU() 93 | ) 94 | self.layer_2_e4x = nn.Sequential( 95 | nn.Conv2d(100, 100, (2, 2), stride = (2, 2), padding = (0, 0)), 96 | nn.SELU(), 97 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 98 | nn.SELU(), 99 | ) 100 | self.layer_2_e8x = nn.Sequential( 101 | nn.Conv2d(100, 100, (2, 2), stride = (2, 2), padding = (0, 0)), 102 | nn.SELU(), 103 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 104 | nn.SELU(), 105 | ) 106 | self.layer_2_e16x = nn.Sequential( 107 | nn.Conv2d(100, 100, (2, 2), stride = (2, 2), padding = (0, 0)), 108 | nn.SELU(), 109 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 110 | nn.SELU() 111 | ) 112 | # Buffer layers for VGG features 113 | self.layer_b1x = nn.Sequential( 114 | nn.Conv2d(64, 64, (1, 1), stride = (1, 1), padding = (0, 0)), 115 | nn.SELU(), 116 | ) 117 | self.layer_b2x = nn.Sequential( 118 | nn.Conv2d(128, 100, (1, 1), stride = (1, 1), padding = (0, 0)), 119 | nn.SELU(), 120 | ) 121 | self.layer_b4x = nn.Sequential( 122 | nn.Conv2d(256, 100, (1, 1), stride = (1, 1), padding = (0, 0)), 123 | nn.SELU(), 124 | ) 125 | self.layer_b8x = nn.Sequential( 126 | nn.Conv2d(512, 100, (1, 1), stride = (1, 1), padding = (0, 0)), 127 | nn.SELU(), 128 | ) 129 | self.layer_b16x = nn.Sequential( 130 | nn.Conv2d(512, 100, (1, 1), stride = (1, 1), padding = (0, 0)), 131 | nn.SELU(), 132 | ) 133 | # Decoder 134 | self.layer_2_d16x = nn.Sequential( 135 | nn.Conv2d(200, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 136 | nn.SELU(), 137 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 138 | nn.SELU(), 139 | ) 140 | self.layer_2_d8x = nn.Sequential( 141 | nn.Conv2d(300, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 142 | nn.SELU(), 143 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 144 | nn.SELU() 145 | ) 146 | self.layer_2_d4x = nn.Sequential( 147 | nn.Conv2d(300, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 148 | nn.SELU(), 149 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 150 | nn.SELU() 151 | ) 152 | self.layer_2_d2x = nn.Sequential( 153 | nn.Conv2d(300, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 154 | nn.SELU(), 155 | nn.Conv2d(100, 100, (3, 3), stride = (1, 1), padding = (1, 1)), 156 | nn.SELU() 157 | ) 158 | self.layer_2_d1x = nn.Sequential( 159 | nn.Conv2d(264, 400, (3, 3), stride = (1, 1), padding = (1, 1)), 160 | nn.SELU(), 161 | nn.Conv2d(400, 800, (3, 3), stride = (1, 1), padding = (1, 1)), 162 | nn.SELU() 163 | ) 164 | # Inter-Volume Aggregation 165 | self.layer_3 = nn.Sequential( 166 | nn.Conv2d(800, 400, (3, 3), stride = (1, 1), padding = (1, 1)), 167 | nn.SELU(), 168 | nn.Conv2d(400, num_depths, (3, 3), stride = (1, 1), padding = (1, 1)) 169 | ) 170 | 
self.layer_loss = nn.CrossEntropyLoss(ignore_index=-1) 171 | 172 | if use_gpu: 173 | self.layer_0 = self.layer_0.cuda() 174 | self.layer_1 = self.layer_1.cuda() 175 | self.layer_2_e1x = self.layer_2_e1x.cuda() 176 | self.layer_2_e2x = self.layer_2_e2x.cuda() 177 | self.layer_2_e4x = self.layer_2_e4x.cuda() 178 | self.layer_2_e8x = self.layer_2_e8x.cuda() 179 | self.layer_2_e16x = self.layer_2_e16x.cuda() 180 | self.layer_b1x = self.layer_b1x.cuda() 181 | self.layer_b2x = self.layer_b2x.cuda() 182 | self.layer_b4x = self.layer_b4x.cuda() 183 | self.layer_b8x = self.layer_b8x.cuda() 184 | self.layer_b16x = self.layer_b16x.cuda() 185 | self.layer_2_d16x = self.layer_2_d16x.cuda() 186 | self.layer_2_d8x = self.layer_2_d8x.cuda() 187 | self.layer_2_d4x = self.layer_2_d4x.cuda() 188 | self.layer_2_d2x = self.layer_2_d2x.cuda() 189 | self.layer_2_d1x = self.layer_2_d1x.cuda() 190 | self.layer_3 = self.layer_3.cuda() 191 | self.layer_loss = self.layer_loss.cuda() 192 | 193 | # Shape of 'volume_input': batch_size * num_neighbors (or num_sources) * num_depths * 2 * num_channels * height * width 194 | # 'feature_inputs' is a list of five VGG feature tensors, each of shape: batch_size * num_features * height * width 195 | def forward(self, volume_input, feature_inputs): 196 | (aggregated_feature, _) = torch.max(self.forward_feature(volume_input, feature_inputs), 1) 197 | return self.forward_predict(aggregated_feature) 198 | 199 | def forward_feature(self, volume_input, feature_inputs): 200 | if volume_input.dim() != 7 or volume_input.size(3) != 2: 201 | raise ValueError("'volume_input' must be a tensor of shape: batch_size * num_neighbors (or num_sources) * num_depths * 2 * num_channels * height * width") 202 | if len(feature_inputs) != 5: 203 | raise ValueError("'feature_inputs' is a list of five VGG feature tensors of shape: batch_size * num_features * height * width") 204 | for feature in feature_inputs: 205 | if feature.dim() != 4: 206 | raise ValueError("'feature_inputs' is a list of five VGG feature tensors of shape: batch_size * num_features * height * width") 207 | batch_size = volume_input.size(0) 208 | num_neighbors = volume_input.size(1) 209 | num_depths = volume_input.size(2) 210 | num_channels = volume_input.size(4) 211 | height = volume_input.size(5) 212 | width = volume_input.size(6) 213 | layer_0_output = self.layer_0( 214 | volume_input.view(batch_size * num_neighbors * num_depths * 2, num_channels, height, width)) 215 | layer_1_output = self.layer_1( 216 | layer_0_output.view(batch_size * num_neighbors * num_depths, 2 * 64, height, width)) 217 | layer_2_e1x_out = self.layer_2_e1x(layer_1_output.view(batch_size * num_neighbors, num_depths * 4, height, width)) 218 | layer_2_e2x_out = self.layer_2_e2x(layer_2_e1x_out) 219 | layer_2_e4x_out = self.layer_2_e4x(layer_2_e2x_out) 220 | layer_2_e8x_out = self.layer_2_e8x(layer_2_e4x_out) 221 | layer_2_e16x_out = self.layer_2_e16x(layer_2_e8x_out) 222 | layer_b1x_out = self.layer_b1x(feature_inputs[0]) 223 | layer_b2x_out = self.layer_b2x(feature_inputs[1]) 224 | layer_b4x_out = self.layer_b4x(feature_inputs[2]) 225 | layer_b8x_out = self.layer_b8x(feature_inputs[3]) 226 | layer_b16x_out = self.layer_b16x(feature_inputs[4]) 227 | if num_neighbors != 1: 228 | # We need to copy the features for each neighbor image. When batch_size = 1, use expand() instead of repeat() to save memory. 
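			# Caveat: the encoder output above was flattened batch-major by view(),
			# i.e. rows are ordered (b0, n0), (b0, n1), ..., whereas repeat() tiles
			# the VGG features copy-major, (b0, b1, ..., b0, b1, ...). The two
			# layouts only agree when batch_size == 1, which is what train.py and
			# test.py always use; a larger batch size would need an interleaving
			# unsqueeze/expand/view instead of repeat().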
229 | if batch_size == 1: 230 | layer_b1x_out = layer_b1x_out.expand(batch_size * num_neighbors, -1, -1, -1) 231 | layer_b2x_out = layer_b2x_out.expand(batch_size * num_neighbors, -1, -1, -1) 232 | layer_b4x_out = layer_b4x_out.expand(batch_size * num_neighbors, -1, -1, -1) 233 | layer_b8x_out = layer_b8x_out.expand(batch_size * num_neighbors, -1, -1, -1) 234 | layer_b16x_out = layer_b16x_out.expand(batch_size * num_neighbors, -1, -1, -1) 235 | else: 236 | layer_b1x_out = layer_b1x_out.repeat(num_neighbors, 1, 1, 1) 237 | layer_b2x_out = layer_b2x_out.repeat(num_neighbors, 1, 1, 1) 238 | layer_b4x_out = layer_b4x_out.repeat(num_neighbors, 1, 1, 1) 239 | layer_b8x_out = layer_b8x_out.repeat(num_neighbors, 1, 1, 1) 240 | layer_b16x_out = layer_b16x_out.repeat(num_neighbors, 1, 1, 1) 241 | layer_2_d16x_out = self.layer_2_d16x(torch.cat((layer_2_e16x_out, layer_b16x_out), 1)) 242 | layer_2_d8x_out = self.layer_2_d8x(torch.cat((layer_2_e8x_out, F.upsample(layer_2_d16x_out, scale_factor=2, mode='bilinear'), layer_b8x_out), 1)) 243 | layer_2_d4x_out = self.layer_2_d4x(torch.cat((layer_2_e4x_out, F.upsample(layer_2_d8x_out, scale_factor=2, mode='bilinear'), layer_b4x_out), 1)) 244 | layer_2_d2x_out = self.layer_2_d2x(torch.cat((layer_2_e2x_out, F.upsample(layer_2_d4x_out, scale_factor=2, mode='bilinear'), layer_b2x_out), 1)) 245 | layer_2_d1x_out = self.layer_2_d1x(torch.cat((layer_2_e1x_out, F.upsample(layer_2_d2x_out, scale_factor=2, mode='bilinear'), layer_b1x_out), 1)) 246 | return layer_2_d1x_out.view(batch_size, num_neighbors, 800, height, width) 247 | 248 | def forward_predict(self, aggregated_feature): 249 | layer_3_output = self.layer_3(aggregated_feature) 250 | return layer_3_output 251 | 252 | def weights_init(m): 253 | if isinstance(m, nn.Conv2d): 254 | nn.init.xavier_normal(m.weight.data) 255 | m.bias.data.fill_(0) -------------------------------------------------------------------------------- /python/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import threading 5 | 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.autograd import Variable 11 | import torchvision as vision 12 | import pydensecrf.densecrf as dcrf 13 | import imageio 14 | 15 | from model import DeepMVS 16 | from generate_volume_test import generate_volume_test 17 | 18 | 19 | # Parse arguments 20 | parser = argparse.ArgumentParser(description = "Run DeepMVS on a sequence.") 21 | parser.add_argument("--image_width", dest = "image_width", type = int, default = -1, help = "Image width (<0 means to derive from image_height).") 22 | parser.add_argument("--image_height", dest = "image_height", type = int, default = 540, help = "Image height (<0 means to derive from image_width).") 23 | parser.add_argument("--patch_width", dest = "patch_width", type = int, default = 128, help = "Width of patches.") 24 | parser.add_argument("--patch_height", dest = "patch_height", type = int, default = 128, help = "Height of patches.") 25 | parser.add_argument("--stride_width", dest = "stride_width", type = int, default = 64, help = "Width of the stride.") 26 | parser.add_argument("--stride_height", dest = "stride_height", type = int, default = 64, help = "Height of the stride.") 27 | parser.add_argument("--num_depths", dest = "num_depths", type = int, default = 100, help = "Number of disparity levels.") 28 | parser.add_argument("--max_num_neighbors", dest = "max_num_neighbors", 
type = int, default = 16, help = "Maximum number of neighbor images.")
29 | parser.add_argument("--no_gpu", dest = "use_gpu", action = "store_false", default = True, help = "Disable use of GPU.")
30 | parser.add_argument("--image_path", dest = "image_path", help = "Path to the images.", required = True)
31 | parser.add_argument("--sparse_path", dest = "sparse_path", help = "Path to the sparse reconstruction.", required = True)
32 | parser.add_argument("--output_path", dest = "output_path", help = "Path to store the results.", required = True)
33 | parser.add_argument("--load_bin", dest = "load_bin", action = "store_true", default = False, help = "Set if you want to load COLMAP .bin files.")
34 | parser.add_argument("--model_path", dest = "model_path", help = "Path to the trained model.", default = (
35 | 	os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "model", "DeepMVS_final.model")
36 | ))
37 | parser.add_argument("--overwrite", dest = "overwrite", action = "store_true", default = False, help = "Overwrite existing results.")
38 | parser.add_argument("--log_file", dest = "log_file", default = None, help = "Path to log file. (Default: sys.stdout)")
39 | 
40 | args = parser.parse_args()
41 | 
42 | image_width = args.image_width
43 | image_height = args.image_height
44 | patch_width = args.patch_width
45 | patch_height = args.patch_height
46 | stride_width = args.stride_width
47 | stride_height = args.stride_height
48 | num_depths = args.num_depths
49 | max_num_neighbors = args.max_num_neighbors
50 | use_gpu = args.use_gpu
51 | image_path = args.image_path
52 | sparse_path = args.sparse_path
53 | output_path = args.output_path
54 | model_path = args.model_path
55 | overwrite = args.overwrite
56 | log_file = args.log_file
57 | batch_size = 1
58 | 
59 | if args.load_bin:
60 | 	from colmap_helpers_for_bin import ColmapSparse
61 | else:
62 | 	from colmap_helpers import ColmapSparse
63 | 
64 | # Constants for DenseCRF.
65 | sigma_xy = 80.0
66 | sigma_rgb = 15.0
67 | sigma_d = 10.0
68 | iteration_num = 5
69 | compat = np.zeros((num_depths, num_depths), dtype = np.float32)
70 | for row in range(0, num_depths):
71 | 	for col in range(0, num_depths):
72 | 		compat[row, col] = (row - col) ** 2 / sigma_d ** 2 / 2
73 | 
74 | # Create log file and output directory if needed.
75 | if log_file is None:
76 | 	log_file = sys.stdout
77 | else:
78 | 	log_file = open(log_file, "w")
79 | if not os.path.exists(output_path):
80 | 	os.makedirs(output_path)
81 | 
82 | # Check if model exists.
83 | if not os.path.exists(model_path):
84 | 	raise ValueError("Cannot find the trained model. Please download it first or specify the path to the model.")
85 | 
86 | # Load trained model.
87 | print >> log_file, "Loading the trained model..."
88 | log_file.flush()
89 | model = DeepMVS(num_depths, use_gpu)
90 | model.load_state_dict(torch.load(model_path))
91 | print >> log_file, "Successfully loaded the trained model."
92 | log_file.flush()
93 | 
94 | # Load COLMAP sparse model.
95 | print >> log_file, "Loading the sparse model..."
96 | log_file.flush()
97 | sparse_model = ColmapSparse(sparse_path, image_path, image_width, image_height)
98 | print >> log_file, "Successfully loaded the sparse model."
99 | log_file.flush()
100 | 
101 | # Launch plane-sweep volume generating thread.
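# A sketch of the consumer side of the worker protocol: the main loop below
# writes a request (image_idx, target_x, target_y) into shared_data, clears
# ready_e and sets start_e; generate_volume_test builds the plane-sweep volume
# for that patch, writes it back into shared_data, and sets ready_e. Setting
# shared_data["stop"] = True and then start_e.set() shuts the worker down.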
102 | shared_data = ({ 103 | "ready_e": threading.Event(), 104 | "start_e": threading.Event(), 105 | "stop": False, 106 | "patch_width": patch_width, 107 | "patch_height": patch_height, 108 | "num_depths": num_depths, 109 | "max_num_neighbors": max_num_neighbors, 110 | "sparse_model": sparse_model 111 | }) 112 | worker_thread = threading.Thread(name = "generate_volume", target = generate_volume_test, args = (shared_data,)) 113 | worker_thread.start() 114 | 115 | # Prepare VGG model and normalizer. 116 | print >> log_file, "Creating VGG model..." 117 | log_file.flush() 118 | if use_gpu: 119 | VGG_model = vision.models.vgg19(pretrained = True).cuda() 120 | else: 121 | VGG_model = vision.models.vgg19(pretrained = True) 122 | VGG_normalize = vision.transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]) 123 | print >> log_file, "Successfully created VGG model." 124 | log_file.flush() 125 | 126 | # Loop through all images. 127 | for (ref_image_idx, ref_image) in enumerate(sparse_model.image_list.images): 128 | # Check if output already exists. 129 | if not overwrite and os.path.exists(os.path.join(output_path, "{:}.output.npy".format(ref_image.filename))): 130 | print >> log_file, "Skipped {:} since the output already exists.".format(ref_image.filename) 131 | log_file.flush() 132 | continue 133 | # Start generating plane-sweep volume of the first patch. 134 | print >> log_file, "Start working on image {:d}/{:d}.".format(ref_image_idx, sparse_model.image_list.length) 135 | log_file.flush() 136 | shared_data["image_idx"] = ref_image_idx 137 | shared_data["target_x"] = 0 138 | shared_data["target_y"] = 0 139 | shared_data["ready_e"].clear() 140 | shared_data["start_e"].set() 141 | # Generate VGG features. 142 | ref_camera = sparse_model.camera_list.get_by_id(ref_image.camera_id) 143 | image_width = ref_camera.width 144 | image_height = ref_camera.height 145 | shared_data["ready_e"].wait() 146 | ref_img_full = shared_data["ref_img_full"] 147 | VGG_tensor = Variable(VGG_normalize(torch.FloatTensor(ref_img_full).permute(2, 0, 1)).unsqueeze(0), volatile = True) 148 | if use_gpu: 149 | VGG_tensor = VGG_tensor.cuda() 150 | VGG_scaling_factor = 0.01 151 | for i in range(0, 4): 152 | VGG_tensor = VGG_model.features[i].forward(VGG_tensor) 153 | if use_gpu: 154 | feature_input_1x_whole = VGG_tensor.data.cpu().clone() * VGG_scaling_factor 155 | else: 156 | feature_input_1x_whole = VGG_tensor.data.clone() * VGG_scaling_factor 157 | for i in range(4, 9): 158 | VGG_tensor = VGG_model.features[i].forward(VGG_tensor) 159 | if use_gpu: 160 | feature_input_2x_whole = VGG_tensor.data.cpu().clone() * VGG_scaling_factor 161 | else: 162 | feature_input_2x_whole = VGG_tensor.data.clone() * VGG_scaling_factor 163 | for i in range(9, 14): 164 | VGG_tensor = VGG_model.features[i].forward(VGG_tensor) 165 | if use_gpu: 166 | feature_input_4x_whole = VGG_tensor.data.cpu().clone() * VGG_scaling_factor 167 | else: 168 | feature_input_4x_whole = VGG_tensor.data.clone() * VGG_scaling_factor 169 | for i in range(14, 23): 170 | VGG_tensor = VGG_model.features[i].forward(VGG_tensor) 171 | if use_gpu: 172 | feature_input_8x_whole = VGG_tensor.data.cpu().clone() * VGG_scaling_factor 173 | else: 174 | feature_input_8x_whole = VGG_tensor.data.clone() * VGG_scaling_factor 175 | for i in range(23, 32): 176 | VGG_tensor = VGG_model.features[i].forward(VGG_tensor) 177 | if use_gpu: 178 | feature_input_16x_whole = VGG_tensor.data.cpu().clone() * VGG_scaling_factor 179 | else: 180 | feature_input_16x_whole = 
VGG_tensor.data.clone() * VGG_scaling_factor 181 | del VGG_tensor 182 | # Stride through entire reference image. 183 | predict_raw = torch.zeros(num_depths, image_height, image_width) 184 | border_x = (patch_width - stride_width) / 2 185 | border_y = (patch_height - stride_height) / 2 186 | col_total = (image_width - 2 * border_x - 1) / stride_width + 1 187 | row_total = (image_height - 2 * border_y - 1) / stride_height + 1 188 | for row_idx in range(0, row_total): 189 | for col_idx in range(0, col_total): 190 | print >> log_file, "Working on patch at row = {:d}/{:d} col = {:d}/{:d}".format(row_idx, row_total, col_idx, col_total) 191 | log_file.flush() 192 | # Compute patch location for this patch and next patch. 193 | if col_idx != col_total - 1: 194 | start_x = col_idx * stride_width 195 | else: 196 | start_x = image_width - patch_width 197 | if row_idx != row_total - 1: 198 | start_y = row_idx * stride_height 199 | else: 200 | start_y = image_height - patch_height 201 | next_col_idx = (col_idx + 1) % col_total 202 | next_row_idx = row_idx if col_idx != col_total - 1 else row_idx + 1 203 | if next_col_idx != col_total - 1: 204 | next_start_x = next_col_idx * stride_width 205 | else: 206 | next_start_x = image_width - patch_width 207 | if next_row_idx != row_total - 1: 208 | next_start_y = next_row_idx * stride_height 209 | else: 210 | next_start_y = image_height - patch_height 211 | # Read plane-sweep volume and start next patch. 212 | shared_data["ready_e"].wait() 213 | ref_img = shared_data["ref_img"].copy() 214 | sweep_volume = shared_data["sweep_volume"].copy() 215 | num_neighbors = shared_data["num_neighbors"] 216 | if next_row_idx < row_total: 217 | shared_data["target_x"] = next_start_x 218 | shared_data["target_y"] = next_start_y 219 | shared_data["ready_e"].clear() 220 | shared_data["start_e"].set() 221 | # Prepare the inputs. 222 | data_in_tensor = torch.FloatTensor(batch_size, 1, num_depths, 2, 3, patch_height, patch_width) 223 | ref_img_tensor = torch.FloatTensor(ref_img).permute(2, 0, 1).unsqueeze(0) 224 | data_in_tensor[0, 0, :, 0, ...] = ref_img_tensor.expand(num_depths, -1, -1, -1) 225 | feature_input_1x = Variable(feature_input_1x_whole[... , start_y:start_y + patch_height, start_x:start_x + patch_width], volatile = True) 226 | feature_input_2x = Variable(feature_input_2x_whole[... , start_y / 2:start_y / 2 + patch_height / 2, start_x / 2:start_x / 2 + patch_width / 2], volatile = True) 227 | feature_input_4x = Variable(feature_input_4x_whole[... , start_y / 4:start_y / 4 + patch_height / 4, start_x / 4:start_x / 4 + patch_width / 4], volatile = True) 228 | feature_input_8x = Variable(feature_input_8x_whole[... , start_y / 8:start_y / 8 + patch_height / 8, start_x / 8:start_x / 8 + patch_width / 8], volatile = True) 229 | feature_input_16x = Variable(feature_input_16x_whole[... , start_y / 16:start_y / 16 + patch_height / 16, start_x / 16:start_x / 16 + patch_width / 16], volatile = True) 230 | if use_gpu: 231 | feature_input_1x = feature_input_1x.cuda() 232 | feature_input_2x = feature_input_2x.cuda() 233 | feature_input_4x = feature_input_4x.cuda() 234 | feature_input_8x = feature_input_8x.cuda() 235 | feature_input_16x = feature_input_16x.cuda() 236 | # Loop through all neighbor images. 237 | for neighbor_idx in range(0, num_neighbors): 238 | data_in_tensor[0, 0, :, 1, ...] 
= torch.FloatTensor(np.moveaxis(sweep_volume[neighbor_idx, ...], -1, -3)) 239 | data_in = Variable(data_in_tensor, volatile = True) 240 | if use_gpu: 241 | data_in = data_in.cuda() 242 | if neighbor_idx == 0: 243 | cost_volume = model.forward_feature(data_in, [feature_input_1x, feature_input_2x, feature_input_4x, feature_input_8x, feature_input_16x]).data[...] 244 | else: 245 | cost_volume = torch.max(cost_volume, model.forward_feature(data_in, [feature_input_1x, feature_input_2x, feature_input_4x, feature_input_8x, feature_input_16x]).data[...]) 246 | # Make final prediction. 247 | predict = model.forward_predict(Variable(cost_volume[:, 0, ...], volatile = True)) 248 | # Compute copy range. 249 | if col_idx == 0: 250 | copy_x_start = 0 251 | copy_x_end = patch_width - border_x 252 | elif col_idx == col_total - 1: 253 | copy_x_start = border_x + col_idx * stride_width 254 | copy_x_end = image_width 255 | else: 256 | copy_x_start = border_x + col_idx * stride_width 257 | copy_x_end = copy_x_start + stride_width 258 | if row_idx == 0: 259 | copy_y_start = 0 260 | copy_y_end = patch_height - border_y 261 | elif row_idx == row_total - 1: 262 | copy_y_start = border_y + row_idx * stride_height 263 | copy_y_end = image_height 264 | else: 265 | copy_y_start = border_y + row_idx * stride_height 266 | copy_y_end = copy_y_start + stride_height 267 | # Copy the prediction to buffer. 268 | predict_raw[..., copy_y_start:copy_y_end, copy_x_start:copy_x_end] = predict.data[0, :, copy_y_start - start_y:copy_y_end - start_y, copy_x_start - start_x:copy_x_end - start_x] 269 | # Pass through DenseCRF. 270 | print >> log_file, "Running DenseCRF..." 271 | log_file.flush() 272 | unary_energy = F.log_softmax(Variable(predict_raw, volatile = True), dim = 0).data.numpy() 273 | crf = dcrf.DenseCRF2D(image_width, image_height, num_depths) 274 | crf.setUnaryEnergy(-unary_energy.reshape(num_depths, image_height * image_width)) 275 | ref_img_full = (ref_img_full * 255.0).astype(np.uint8) 276 | crf.addPairwiseBilateral(sxy=(sigma_xy, sigma_xy), srgb=(sigma_rgb, sigma_rgb, sigma_rgb), rgbim=ref_img_full, compat=compat, kernel=dcrf.FULL_KERNEL, normalization=dcrf.NORMALIZE_SYMMETRIC) 277 | new_raw = crf.inference(iteration_num) 278 | new_raw = np.array(new_raw).reshape(num_depths, image_height, image_width) 279 | new_predict = np.argmax(new_raw, 0).astype(np.float32) / (num_depths - 1) * ref_image.estimated_max_disparity 280 | # Store the results. 281 | output_dir = os.path.dirname(os.path.join(output_path, ref_image.filename)) 282 | if not os.path.exists(output_dir): 283 | os.makedirs(output_dir) 284 | np.save(os.path.join(output_path, "{:}.output.npy".format(ref_image.filename)), new_predict) 285 | imageio.imwrite(os.path.join(output_path, "{:}.output.png".format(ref_image.filename)), (new_predict / ref_image.estimated_max_disparity).clip(0.0, 1.0)) 286 | print >> log_file, "Result has been saved to {:}.".format(os.path.join(output_path, "{:}.output.npy".format(ref_image.filename))) 287 | log_file.flush() 288 | 289 | # Terminate worker threads. 290 | shared_data["stop"] = True 291 | shared_data["start_e"].set() 292 | 293 | # Finished. 294 | print >> log_file, "Finished running DeepMVS." 
295 | print >> log_file, "Results can be found in {:}".format(output_path)
--------------------------------------------------------------------------------
/python/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import threading
5 | 
6 | import numpy as np
7 | import torch
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | from torch.autograd import Variable
11 | import torchvision as vision
12 | 
13 | from model import DeepMVS_PT, DeepMVS, weights_init
14 | from generate_volume_train import generate_volume_train
15 | 
16 | # Parse arguments
17 | parser = argparse.ArgumentParser(description = "Train DeepMVS.")
18 | parser.add_argument("--patch_width", dest = "patch_width", type = int, default = 64, help = "Width of patches.")
19 | parser.add_argument("--patch_height", dest = "patch_height", type = int, default = 64, help = "Height of patches.")
20 | parser.add_argument("--num_depths", dest = "num_depths", type = int, default = 100, help = "Number of disparity levels.")
21 | parser.add_argument("--max_num_neighbors", dest = "max_num_neighbors", type = int, default = 4, help = "Maximum number of neighbor images.")
22 | parser.add_argument("--num_threads", dest = "num_threads", type = int, default = 4, help = "Number of threads for plane-sweep volume generation.")
23 | parser.add_argument("--no_gpu", dest = "use_gpu", action = "store_false", default = True, help = "Disable use of GPU.")
24 | parser.add_argument("--dataset_path", dest = "dataset_path", default = (
25 | 	os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "dataset", "train")
26 | ), help = "Path to the training dataset.")
27 | parser.add_argument("--model_path", dest = "model_path", default = (
28 | 	os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "model")
29 | ), help = "Path to store models.")
30 | parser.add_argument("--snapshot_period", dest = "snapshot_period", type = int, default = 10000, help = "Take snapshot every n iterations. (0: no snapshot)")
31 | parser.add_argument("--retrain", dest = "retrain", action = "store_true", default = False, help = "Retrain the network from scratch.")
32 | parser.add_argument("--log_file", dest = "log_file", default = None, help = "Path to log file. (Default: sys.stdout)")
33 | 
34 | args = parser.parse_args()
35 | 
36 | patch_width = args.patch_width
37 | patch_height = args.patch_height
38 | num_depths = args.num_depths
39 | max_num_neighbors = args.max_num_neighbors
40 | num_threads = args.num_threads
41 | use_gpu = args.use_gpu
42 | dataset_path = args.dataset_path
43 | model_path = args.model_path
44 | snapshot_period = args.snapshot_period
45 | retrain = args.retrain
46 | log_file = args.log_file
47 | batch_size = 1
48 | 
49 | # Create the model directory and open the log file before either is used below.
50 | if not os.path.exists(model_path):
51 | 	os.makedirs(model_path)
52 | if log_file is None:
53 | 	log_file = sys.stdout
54 | else:
55 | 	log_file = open(log_file, "w")
56 | 
57 | # Check if the training datasets have been downloaded.
58 | dataset_downloaded = True
59 | if os.path.exists(dataset_path):
60 | 	DATASET_LIST = ([
61 | 		"GTAV_540", "GTAV_720",
62 | 		"mvs_achteck_turm", "mvs_breisach", "mvs_citywall",
63 | 		"rgbd_10_to_20_3d_train", "rgbd_10_to_20_handheld_train", "rgbd_10_to_20_simple_train", "rgbd_20_to_inf_3d_train", "rgbd_20_to_inf_handheld_train", "rgbd_20_to_inf_simple_train",
64 | 		"scenes11_train",
65 | 		"sun3d_train_0.01m_to_0.1m", "sun3d_train_0.1m_to_0.2m", "sun3d_train_0.2m_to_0.4m", "sun3d_train_0.4m_to_0.8m", "sun3d_train_0.8m_to_1.6m", "sun3d_train_1.6m_to_infm"
66 | 	])
67 | 	for dataset in DATASET_LIST:
68 | 		if not os.path.exists(os.path.join(dataset_path, dataset)):
69 | 			print >> log_file, "Cannot find dataset '{:}'".format(dataset)
70 | 			dataset_downloaded = False
71 | 			break
72 | else:
73 | 	os.makedirs(dataset_path)
74 | 	dataset_downloaded = False
75 | 
76 | # Ask the user to download the training datasets if they are missing.
77 | if not dataset_downloaded:
78 | 	print >> log_file, "Training datasets must be downloaded before training DeepMVS."
79 | 	print >> log_file, "Run 'python python/download_training_datasets.py' to download the training datasets."
80 | 	sys.exit()
81 | 
82 | # Create worker threads for volume generation.
83 | shared_datas = []
84 | threads = []
85 | for i in range(0, num_threads):
86 | 	shared_datas.append({
87 | 		"ready_e": threading.Event(),
88 | 		"start_e": threading.Event(),
89 | 		"stop": False
90 | 	})
91 | 	threads.append(threading.Thread(name = "generate_volume_{:d}".format(i), target = generate_volume_train, args = 
92 | 		(shared_datas[i], max_num_neighbors, num_depths, patch_height, patch_width, dataset_path)
93 | 	))
94 | 	threads[i].start()
95 | 	shared_datas[i]["ready_e"].clear()
96 | 	shared_datas[i]["start_e"].set()
97 | 
98 | # Check if we can resume from last snapshot.
99 | iteration_stop = 320000
100 | if retrain:
101 | 	need_pretrain = True
102 | 	iteration_start = 0
103 | elif os.path.exists(os.path.join(model_path, "DeepMVS_PT_final.model")):
104 | 	need_pretrain = False
105 | elif snapshot_period != 0:
106 | 	need_pretrain = True
107 | 	iter_idx = 0
108 | 	while os.path.exists(os.path.join(model_path, "DeepMVS_PT_snapshot_{:d}.model".format(iter_idx + snapshot_period))):
109 | 		iter_idx += snapshot_period
110 | 	iteration_start = iter_idx
111 | else:
112 | 	need_pretrain = True
113 | 	iteration_start = 0
114 | 
115 | # Train the DeepMVS pre-train network.
116 | # Use function to ensure the local variables are cleaned up upon exiting the scope.
117 | def train_DeepMVS_PT():
118 | 	# Initialization.
119 | 	model = DeepMVS_PT(num_depths, use_gpu)
120 | 	lr = 1e-5
121 | 	grad_clip = 1.0
122 | 	optimizer = torch.optim.Adam(model.parameters(), lr = lr)
123 | 	if iteration_start == 0:
124 | 		model.apply(weights_init)
125 | 	else:
126 | 		model.load_state_dict(torch.load(os.path.join(model_path, "DeepMVS_PT_snapshot_{:d}.model".format(iteration_start))))
127 | 		optimizer.load_state_dict(torch.load(os.path.join(model_path, "DeepMVS_PT_snapshot_{:d}.optimizer".format(iteration_start))))
128 | 	# Start training.
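	# Layout of the input tensor allocated below (see the comment in model.py):
	# data_in is indexed as [batch, neighbor, depth, {0: reference patch,
	# 1: warped neighbor}, channel, y, x]. Slot 0 holds the reference patch
	# broadcast across all neighbors and disparity levels; slot 1 holds the
	# plane-sweep warp of each neighbor at each disparity level.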
129 | data_in = torch.FloatTensor(batch_size, max_num_neighbors, num_depths, 2, 3, patch_height, patch_width) 130 | data_gt = torch.LongTensor(batch_size, patch_height, patch_width) 131 | invalid_mask = torch.ByteTensor(batch_size, patch_height, patch_width) 132 | thread_idx = 0 133 | print >> log_file, "Start training DeepMVS_PT from iteration {:d}.".format(iteration_start) 134 | for iteration_idx in range(iteration_start, iteration_stop): 135 | # Load a plane-sweep volume. 136 | while not shared_datas[thread_idx]["ready_e"].wait(1e-3): 137 | thread_idx = (thread_idx + 1) % num_threads 138 | ref_img = shared_datas[thread_idx]["ref_img"].copy() 139 | sweep_volume = shared_datas[thread_idx]["sweep_volume"].copy() 140 | ref_depth = shared_datas[thread_idx]["ref_depth"].copy() 141 | valid_mask = shared_datas[thread_idx]["valid_mask"].copy() 142 | num_neighbors = shared_datas[thread_idx]["num_neighbors"] 143 | shared_datas[thread_idx]["ready_e"].clear() 144 | shared_datas[thread_idx]["start_e"].set() 145 | # Prepare inputs. 146 | optimizer.zero_grad() 147 | ref_img_tensor = torch.FloatTensor(np.moveaxis(ref_img, -1, 0)) 148 | data_in[0, 0:num_neighbors, :, 0, ...] = ref_img_tensor.expand(num_neighbors, num_depths, -1, -1, -1) 149 | data_in[0, 0:num_neighbors, :, 1, ...] = torch.FloatTensor(np.moveaxis(sweep_volume, -1, -3)) 150 | data_gt[...] = torch.LongTensor(np.round(ref_depth * (num_depths - 1.0)).clip(0, num_depths - 1.0).astype(int)) 151 | invalid_mask[...] = torch.ByteTensor(1 - valid_mask) 152 | data_gt.masked_fill_(invalid_mask, -1) 153 | if use_gpu: 154 | data_in_var = Variable(data_in[:, 0:num_neighbors, ...], requires_grad = True).cuda() 155 | data_gt_var = Variable(data_gt, requires_grad = False).cuda() 156 | else: 157 | data_in_var = Variable(data_in[:, 0:num_neighbors, ...], requires_grad = True) 158 | data_gt_var = Variable(data_gt, requires_grad = False) 159 | # Update parameters. 160 | predict = model.forward(data_in_var) 161 | loss = model.layer_loss(predict, data_gt_var) 162 | loss.backward() 163 | nn.utils.clip_grad_norm(model.parameters(), grad_clip) 164 | optimizer.step() 165 | # Save snapshot if needed. 166 | if snapshot_period != 0: 167 | if((iteration_idx + 1) % snapshot_period == 0): 168 | torch.save(model.state_dict(), os.path.join(model_path, "DeepMVS_PT_snapshot_{:d}.model".format(iteration_idx + 1))) 169 | torch.save(optimizer.state_dict(), os.path.join(model_path, "DeepMVS_PT_snapshot_{:d}.optimizer".format(iteration_idx + 1))) 170 | # Print loss to log file. 171 | print >> log_file, "Iter {:d}: loss = {:.6e}".format(iteration_idx, loss.data[0]) 172 | log_file.flush() 173 | # Save final trained model. 174 | torch.save(model.state_dict(), os.path.join(model_path, "DeepMVS_PT_final.model")) 175 | torch.save(optimizer.state_dict(), os.path.join(model_path, "DeepMVS_PT_final.optimizer")) 176 | 177 | if need_pretrain: 178 | train_DeepMVS_PT() 179 | 180 | # Check if we can resume from last snapshot. 181 | iteration_stop = 320000 182 | if retrain or snapshot_period == 0: 183 | iteration_start = 0 184 | else: 185 | iter_idx = 0 186 | while os.path.exists(os.path.join(model_path, "DeepMVS_snapshot_{:d}.model".format(iter_idx + snapshot_period))): 187 | iter_idx += snapshot_period 188 | iteration_start = iter_idx 189 | 190 | # Train the complete DeepMVS network. 191 | # Use function to ensure the local variables are cleaned up upon exiting the scope. 192 | def train_DeepMVS(): 193 | # Initialization. 
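	# When training from scratch, the shared patch-matching and prediction layers
	# are initialized from the pre-trained DeepMVS_PT weights; strict = False is
	# needed because the full model adds the encoder/decoder and VGG buffer
	# layers, which have no counterpart in the pre-trained checkpoint and keep
	# their fresh weights_init values.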
194 | 	model = DeepMVS(num_depths, use_gpu)
195 | 	lr = 1e-6
196 | 	grad_clip = 0.1
197 | 	optimizer = torch.optim.Adam(model.parameters(), lr = lr)
198 | 	if iteration_start == 0:
199 | 		model.apply(weights_init)
200 | 		model.load_state_dict(torch.load(os.path.join(model_path, "DeepMVS_PT_final.model")), strict = False)
201 | 	else:
202 | 		model.load_state_dict(torch.load(os.path.join(model_path, "DeepMVS_snapshot_{:d}.model".format(iteration_start))))
203 | 		optimizer.load_state_dict(torch.load(os.path.join(model_path, "DeepMVS_snapshot_{:d}.optimizer".format(iteration_start))))
204 | 	if use_gpu:
205 | 		VGG_model = vision.models.vgg19(pretrained = True).cuda()
206 | 	else:
207 | 		VGG_model = vision.models.vgg19(pretrained = True)
208 | 	VGG_normalize = vision.transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
209 | 	# Start training.
210 | 	data_in = torch.FloatTensor(batch_size, max_num_neighbors, num_depths, 2, 3, patch_height, patch_width)
211 | 	data_gt = torch.LongTensor(batch_size, patch_height, patch_width)
212 | 	invalid_mask = torch.ByteTensor(batch_size, patch_height, patch_width)
213 | 	thread_idx = 0
214 | 	print >> log_file, "Start training DeepMVS from iteration {:d}.".format(iteration_start)
215 | 	for iteration_idx in range(iteration_start, iteration_stop):
216 | 		# Load a plane-sweep volume.
217 | 		while not shared_datas[thread_idx]["ready_e"].wait(1e-3):
218 | 			thread_idx = (thread_idx + 1) % num_threads
219 | 		ref_img = shared_datas[thread_idx]["ref_img"].copy()
220 | 		ref_img_full = shared_datas[thread_idx]["ref_img_full"].copy()
221 | 		target_x = shared_datas[thread_idx]["target_x"]
222 | 		target_y = shared_datas[thread_idx]["target_y"]
223 | 		sweep_volume = shared_datas[thread_idx]["sweep_volume"].copy()
224 | 		ref_depth = shared_datas[thread_idx]["ref_depth"].copy()
225 | 		valid_mask = shared_datas[thread_idx]["valid_mask"].copy()
226 | 		num_neighbors = shared_datas[thread_idx]["num_neighbors"]
227 | 		shared_datas[thread_idx]["ready_e"].clear()
228 | 		shared_datas[thread_idx]["start_e"].set()
229 | 		# Prepare inputs.
230 | 		optimizer.zero_grad()
231 | 		ref_img_tensor = torch.FloatTensor(np.moveaxis(ref_img, -1, 0))
232 | 		data_in[0, 0:num_neighbors, :, 0, ...] = ref_img_tensor.expand(num_neighbors, num_depths, -1, -1, -1)
233 | 		data_in[0, 0:num_neighbors, :, 1, ...] = torch.FloatTensor(np.moveaxis(sweep_volume, -1, -3))
234 | 		data_gt[...] = torch.LongTensor(np.round(ref_depth * (num_depths - 1.0)).clip(0, num_depths - 1.0).astype(int))
235 | 		invalid_mask[...] = torch.ByteTensor(1 - valid_mask)
236 | 		data_gt.masked_fill_(invalid_mask, -1)
237 | 		if use_gpu:
238 | 			data_in_var = Variable(data_in[:, 0:num_neighbors, ...], requires_grad = True).cuda()
239 | 			data_gt_var = Variable(data_gt, requires_grad = False).cuda()
240 | 		else:
241 | 			data_in_var = Variable(data_in[:, 0:num_neighbors, ...], requires_grad = True)
242 | 			data_gt_var = Variable(data_gt, requires_grad = False)
243 | 		# Compute VGG features. Normalize expects a C x H x W tensor, so permute before normalizing (as in test.py).
244 | 		VGG_scaling_factor = 0.01
245 | 		if use_gpu:
246 | 			VGG_temp_var = Variable(VGG_normalize(torch.FloatTensor(ref_img_full).permute(2, 0, 1)).unsqueeze(0), volatile = True).cuda()
247 | 		else:
248 | 			VGG_temp_var = Variable(VGG_normalize(torch.FloatTensor(ref_img_full).permute(2, 0, 1)).unsqueeze(0), volatile = True)
249 | 		for i in range(0, 4): # conv_1_2
250 | 			VGG_temp_var = VGG_model.features[i].forward(VGG_temp_var)
251 | 		feature_input_1x = Variable(VGG_temp_var.data[...
, target_y:target_y + patch_height, target_x:target_x + patch_width].clone() * VGG_scaling_factor, requires_grad = True) 252 | for i in range(4, 9): # conv_2_2 253 | VGG_temp_var = VGG_model.features[i].forward(VGG_temp_var) 254 | feature_input_2x = Variable(VGG_temp_var.data[... , target_y / 2:target_y / 2 + patch_height / 2, target_x / 2:target_x / 2 + patch_width / 2].clone() * VGG_scaling_factor, requires_grad = True) 255 | for i in range(9, 14): # conv_3_2 256 | VGG_temp_var = VGG_model.features[i].forward(VGG_temp_var) 257 | feature_input_4x = Variable(VGG_temp_var.data[... , target_y / 4:target_y / 4 + patch_height / 4, target_x / 4:target_x / 4 + patch_width / 4].clone() * VGG_scaling_factor, requires_grad = True) 258 | for i in range(14, 23): # conv_4_2 259 | VGG_temp_var = VGG_model.features[i].forward(VGG_temp_var) 260 | feature_input_8x = Variable(VGG_temp_var.data[... , target_y / 8:target_y / 8 + patch_height / 8, target_x / 8:target_x / 8 + patch_width / 8].clone() * VGG_scaling_factor, requires_grad = True) 261 | for i in range(23, 32): # conv_5_2 262 | VGG_temp_var = VGG_model.features[i].forward(VGG_temp_var) 263 | feature_input_16x = Variable(VGG_temp_var.data[... , target_y / 16:target_y / 16 + patch_height / 16, target_x / 16:target_x / 16 + patch_width / 16].clone() * VGG_scaling_factor, requires_grad = True) 264 | del VGG_temp_var 265 | # Update parameters. 266 | predict = model.forward(data_in_var, [feature_input_1x, feature_input_2x, feature_input_4x, feature_input_8x, feature_input_16x]) 267 | loss = model.layer_loss(predict, data_gt_var) 268 | loss.backward() 269 | nn.utils.clip_grad_norm(model.parameters(), grad_clip) 270 | optimizer.step() 271 | # Save snapshot if needed. 272 | if snapshot_period != 0: 273 | if((iteration_idx + 1) % snapshot_period == 0): 274 | torch.save(model.state_dict(), os.path.join(model_path, "DeepMVS_snapshot_{:d}.model".format(iteration_idx + 1))) 275 | torch.save(optimizer.state_dict(), os.path.join(model_path, "DeepMVS_snapshot_{:d}.optimizer".format(iteration_idx + 1))) 276 | # Print loss to log file. 277 | print >> log_file, "Iter {:d}: loss = {:.6e}".format(iteration_idx, loss.data[0]) 278 | log_file.flush() 279 | # Save final trained model. 280 | torch.save(model.state_dict(), os.path.join(model_path, "DeepMVS_final.model")) 281 | torch.save(optimizer.state_dict(), os.path.join(model_path, "DeepMVS_final.optimizer")) 282 | 283 | train_DeepMVS() 284 | 285 | # Terminate worker threads. 286 | for i in range(0, num_threads): 287 | shared_datas[i]["stop"] = True 288 | shared_datas[i]["start_e"].set() 289 | # Finished. 290 | print >> log_file, "Finished training DeepMVS." 291 | print >> log_file, "Trained model can be found at {:}".format(os.path.join(model_path, "DeepMVS_final.model")) 292 | --------------------------------------------------------------------------------