├── common
│   ├── loss
│   │   ├── __init__.py
│   │   ├── loss_functions.py
│   │   └── inverse_warp.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── logger.py
│   │   ├── utils.py
│   │   └── custom_transforms.py
│   ├── data_prepare
│   │   ├── __init__.py
│   │   ├── well_lit_from_varying.txt
│   │   ├── prepare_train_data_VIVID.py
│   │   └── VIVID_raw_loader.py
│   └── models
│       ├── __init__.py
│       ├── ResDispPoseNet.py
│       ├── PoseResNet.py
│       ├── DispResNet.py
│       └── resnet_encoder.py
├── eval_vivid
│   ├── __init__.py
│   ├── pose_evaluation_utils.py
│   └── eval_depth.py
├── media
│   ├── overview_a.png
│   ├── overview_b.png
│   ├── qualitative_1.png
│   ├── qualitative_2.png
│   └── qualitative_3.png
├── scripts
│   ├── prepare_vivid_data.sh
│   ├── train_vivid_resnet18_indoor.sh
│   ├── train_vivid_resnet18_outdoor.sh
│   ├── run_vivid_inference.sh
│   ├── display_result.sh
│   ├── test_vivid_outdoor.sh
│   └── test_vivid_indoor.sh
├── .gitignore
├── dataloader
│   ├── VIVID_sequence_folders.py
│   └── VIVID_validation_folders.py
├── environment.yml
├── test_disp.py
├── run_inference.py
├── test_pose.py
├── README.md
└── train.py

/common/loss/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/eval_vivid/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/data_prepare/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/media/overview_a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UkcheolShin/ThermalMonoDepth/HEAD/media/overview_a.png
--------------------------------------------------------------------------------
/media/overview_b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UkcheolShin/ThermalMonoDepth/HEAD/media/overview_b.png
--------------------------------------------------------------------------------
/media/qualitative_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UkcheolShin/ThermalMonoDepth/HEAD/media/qualitative_1.png
--------------------------------------------------------------------------------
/media/qualitative_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UkcheolShin/ThermalMonoDepth/HEAD/media/qualitative_2.png
--------------------------------------------------------------------------------
/media/qualitative_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UkcheolShin/ThermalMonoDepth/HEAD/media/qualitative_3.png
--------------------------------------------------------------------------------
/common/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .DispResNet import DispResNet
2 | from .PoseResNet import PoseResNet
3 | from .ResDispPoseNet import DispPoseResNet
--------------------------------------------------------------------------------
/scripts/prepare_vivid_data.sh:
-------------------------------------------------------------------------------- 1 | # for VIVID rgb-thermal dataset 2 | DATASET=/HDD/Dataset/KAIST_VIVID 3 | TRAIN_SET=/HDD/Dataset_processed/VIVID_256/ 4 | mkdir -p $TRAIN_SET 5 | python common/data_prepare/prepare_train_data_VIVID.py $DATASET --dump-root $TRAIN_SET --width 320 --height 256 --num-threads 16 --with-depth --with-pose 6 | -------------------------------------------------------------------------------- /scripts/train_vivid_resnet18_indoor.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/HDD/Dataset_processed 2 | TRAIN_SET=$DATA_ROOT/VIVID_256 3 | GPU_ID=1 4 | 5 | CUDA_VISIBLE_DEVICES=${GPU_ID} \ 6 | python train.py $TRAIN_SET \ 7 | --resnet-layers 18 \ 8 | --num-scales 1 \ 9 | --scene_type indoor \ 10 | -b 4 \ 11 | --sequence-length 3 \ 12 | --with-ssim 1 \ 13 | --with-auto-mask 1 \ 14 | --with-pretrain 1 \ 15 | --rearrange-bin 30 \ 16 | --clahe-clip 3.0 \ 17 | --log-output \ 18 | --name T_vivid_resnet18_indoor 19 | -------------------------------------------------------------------------------- /scripts/train_vivid_resnet18_outdoor.sh: -------------------------------------------------------------------------------- 1 | DATA_ROOT=/HDD/Dataset_processed 2 | TRAIN_SET=$DATA_ROOT/VIVID_256 3 | GPU_ID=2 4 | 5 | CUDA_VISIBLE_DEVICES=${GPU_ID} \ 6 | python train.py $TRAIN_SET \ 7 | --resnet-layers 18 \ 8 | --num-scales 1 \ 9 | --scene_type outdoor \ 10 | -b 4 \ 11 | --sequence-length 3 \ 12 | --with-ssim 1 \ 13 | --with-auto-mask 1 \ 14 | --with-pretrain 1 \ 15 | --rearrange-bin 30 \ 16 | --clahe-clip 2.0 \ 17 | --log-output \ 18 | --name T_vivid_resnet18_outdoor 19 | -------------------------------------------------------------------------------- /scripts/run_vivid_inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # run script : bash run_vivid_inference.sh 3 | 4 | NAMES=("T_vivid_resnet18_indoor") 5 | 6 | DATA_ROOT=/HDD/Dataset_processed/VIVID_256/ 7 | RESNET=18 8 | IMG_H=256 9 | IMG_W=320 10 | GPU_ID=0 11 | 12 | for NAME in ${NAMES[@]}; do 13 | echo "Name : ${NAME}" 14 | RESULTS_DIR=results/${NAME} 15 | POSE_NET=checkpoints/${NAME}/exp_pose_disp_model_best.pth.tar 16 | DISP_NET=checkpoints/${NAME}/dispnet_disp_model_best.pth.tar 17 | 18 | SEQS=("indoor_robust_dark" "indoor_robust_varying" "indoor_aggresive_dark" 19 | "indoor_aggresive_local" "indoor_unstable_dark" "indoor_robust_varying_well_lit") 20 | 21 | for SEQ in ${SEQS[@]}; do 22 | echo "Seq_name : ${SEQ}" 23 | 24 | CUDA_VISIBLE_DEVICES=${GPU_ID} python run_inference.py ${DATA_ROOT} --resnet-layers $RESNET \ 25 | --output-dir $RESULTS_DIR --sequence ${SEQ} --scene_type indoor --pretrained-model $DISP_NET 26 | done 27 | 28 | SEQS=("outdoor_robust_night1" "outdoor_robust_night2") 29 | 30 | for SEQ in ${SEQS[@]}; do 31 | echo "Seq_name : ${SEQ}" 32 | 33 | CUDA_VISIBLE_DEVICES=${GPU_ID} python run_inference.py ${DATA_ROOT} --resnet-layers $RESNET \ 34 | --output-dir $RESULTS_DIR --sequence ${SEQ} --scene_type outdoor --pretrained-model $DISP_NET 35 | done 36 | done 37 | -------------------------------------------------------------------------------- /scripts/display_result.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # run script : bash 3 | 4 | NAMES=("T_vivid_resnet18_indoor") 5 | RESULTS_DIR=results 6 | 7 | for NAME in ${NAMES[@]}; do 8 | echo "NAME : ${NAME}" 9 | 10 | # depth 11 | echo "Indoor Depth : 
well-lit results"
12 |   SEQS=("indoor_aggresive_local" "indoor_robust_varying_well_lit")
13 |   for SEQ in ${SEQS[@]}; do
14 |     echo "Seq_name : ${SEQ}"
15 |     cat ${RESULTS_DIR}/${NAME}/Depth/${SEQ}/eval_depth.txt
16 |   done
17 |
18 |   echo "Indoor Depth : low-light results"
19 |   SEQS=("indoor_robust_dark" "indoor_unstable_dark" "indoor_aggresive_dark")
20 |   for SEQ in ${SEQS[@]}; do
21 |     echo "Seq_name : ${SEQ}"
22 |     cat ${RESULTS_DIR}/${NAME}/Depth/${SEQ}/eval_depth.txt
23 |   done
24 |
25 |   # pose
26 |   SEQS=("indoor_aggresive_local" "indoor_robust_varying" "indoor_robust_dark" "indoor_unstable_dark" "indoor_aggresive_dark")
27 |   echo "Indoor Pose : All results"
28 |   for SEQ in ${SEQS[@]}; do
29 |     echo "Seq_name : ${SEQ}"
30 |     cat ${RESULTS_DIR}/${NAME}/POSE/${SEQ}/eval_pose.txt
31 |   done
32 | done
33 |
34 | NAMES=("T_vivid_resnet18_outdoor")
35 | for NAME in ${NAMES[@]}; do
36 |   echo "NAME : ${NAME}"
37 |
38 |   # depth
39 |   echo "Outdoor Depth : night-time results"
40 |   SEQS=("outdoor_robust_night1" "outdoor_robust_night2")
41 |   for SEQ in ${SEQS[@]}; do
42 |     echo "Seq_name : ${SEQ}"
43 |     cat ${RESULTS_DIR}/${NAME}/Depth/${SEQ}/eval_depth.txt
44 |   done
45 |
46 |   # pose
47 |   SEQS=("outdoor_robust_night1" "outdoor_robust_night2")
48 |   echo "Outdoor Pose : night-time results"
49 |   for SEQ in ${SEQS[@]}; do
50 |     echo "Seq_name : ${SEQ}"
51 |     cat ${RESULTS_DIR}/${NAME}/POSE/${SEQ}/eval_pose.txt
52 |   done
53 | done
54 |
55 |
--------------------------------------------------------------------------------
/scripts/test_vivid_outdoor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # run script : bash test_vivid_outdoor.sh
3 |
4 | GPU_ID=0
5 | DATA_ROOT=/HDD/Dataset_processed/VIVID_256/
6 | RESULTS_DIR=results/
7 |
8 | RESNET=18
9 | IMG_H=256
10 | IMG_W=320
11 | DATASET=VIVID
12 | INPUT_TYPE=T
13 | DEPTH_GT_DIR=Depth_T
14 | POSE_GT=poses_T.txt
15 |
16 | # outdoor testset
17 | NAMES=("T_vivid_resnet18_outdoor")
18 |
19 | for NAME in ${NAMES[@]}; do
20 |   SEQS=("outdoor_robust_night1" "outdoor_robust_night2")
21 |   POSE_NET=checkpoints/${NAME}/exp_pose_pose_model_best.pth.tar
22 |   DISP_NET=checkpoints/${NAME}/dispnet_disp_model_best.pth.tar
23 |   echo "${NAME}"
24 |
25 |   for SEQ in ${SEQS[@]}; do
26 |     echo "Seq_name : ${SEQ}"
27 |     SCENE=outdoor
28 |
29 |     #mkdir -p ${RESULTS_DIR}
30 |     DATA_DIR=${DATA_ROOT}/${SEQ}/
31 |     OUTPUT_DEPTH_DIR=${RESULTS_DIR}/${NAME}/Depth/${SEQ}/
32 |     OUTPUT_POSE_DIR=${RESULTS_DIR}/${NAME}/POSE/${SEQ}/
33 |     mkdir -p ${OUTPUT_DEPTH_DIR}
34 |     mkdir -p ${OUTPUT_POSE_DIR}
35 |
36 |     # Depth Evaluation
37 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python test_disp.py \
38 |       --resnet-layers $RESNET --pretrained-dispnet $DISP_NET \
39 |       --img-height $IMG_H --img-width $IMG_W \
40 |       --dataset-dir ${DATA_DIR} --output-dir $OUTPUT_DEPTH_DIR >> ${OUTPUT_DEPTH_DIR}/disp.txt
41 |
42 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python eval_vivid/eval_depth.py \
43 |       --dataset $DATASET --pred_depth ${OUTPUT_DEPTH_DIR}/predictions.npy \
44 |       --gt_depth ${DATA_DIR}/${DEPTH_GT_DIR} --scene outdoor >> ${OUTPUT_DEPTH_DIR}/eval_depth.txt
45 |
46 |     rm ${OUTPUT_DEPTH_DIR}/predictions.npy
47 |
48 |
49 |     # Pose Evaluation
50 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python test_pose.py \
51 |       --resnet-layers $RESNET --pretrained-posenet $POSE_NET \
52 |       --img-height $IMG_H --img-width $IMG_W \
53 |       --dataset-dir ${DATA_ROOT} --output-dir ${OUTPUT_POSE_DIR} \
54 |       --sequences ${SEQ} >> ${OUTPUT_POSE_DIR}/eval_pose.txt
55 |   done
56 | done
57 |
58 |
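59 | # For reference, a minimal sketch of what one iteration of the loops above
60 | # expands to; the checkpoint and output paths below are illustrative and
61 | # assume the default layout used throughout these scripts:
62 | #
63 | # CUDA_VISIBLE_DEVICES=0 python test_disp.py --resnet-layers 18 \
64 | #   --pretrained-dispnet checkpoints/T_vivid_resnet18_outdoor/dispnet_disp_model_best.pth.tar \
65 | #   --img-height 256 --img-width 320 \
66 | #   --dataset-dir /HDD/Dataset_processed/VIVID_256/outdoor_robust_night1/ \
67 | #   --output-dir results/T_vivid_resnet18_outdoor/Depth/outdoor_robust_night1/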
--------------------------------------------------------------------------------
/common/models/ResDispPoseNet.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | import numpy as np
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | from .resnet_encoder import *
9 | from .PoseResNet import PoseDecoder
10 | from .DispResNet import DepthDecoder
11 |
12 | class DispPoseResNet(nn.Module):
13 |     def __init__(self, num_layers=18, pretrained=True, num_channel=1):
14 |         super(DispPoseResNet, self).__init__()
15 |         self.encoder = ResnetEncoder(num_layers=num_layers, pretrained=pretrained, num_input_images=1, num_channel=num_channel)  # single encoder shared by the depth and pose branches
16 |         self.depth_decoder = DepthDecoder(self.encoder.num_ch_enc)
17 |         self.pose_decoder = PoseDecoder(self.encoder.num_ch_enc*2)  # *2: the pose decoder consumes features of two frames concatenated channel-wise
18 |
19 |         self.DispResNet = DispResNet(self.encoder, self.depth_decoder)
20 |         self.PoseResNet = PoseResNet(self.encoder, self.pose_decoder)
21 |
22 |     def init_weights(self):
23 |         pass
24 |
25 | class DispResNet(nn.Module):
26 |
27 |     def __init__(self, encoder, decoder):
28 |         super(DispResNet, self).__init__()
29 |         self.encoder = encoder
30 |         self.decoder = decoder
31 |
32 |     def init_weights(self):
33 |         pass
34 |
35 |     def forward(self, x):
36 |         enc_features = self.encoder(x)
37 |         outputs = self.decoder(enc_features)
38 |
39 |         if self.training:
40 |             return outputs
41 |         else:
42 |             return outputs[0]
43 |
44 | class PoseResNet(nn.Module):
45 |
46 |     def __init__(self, encoder, decoder):
47 |         super(PoseResNet, self).__init__()
48 |         self.encoder = encoder
49 |         self.decoder = decoder
50 |
51 |     def init_weights(self):
52 |         pass
53 |
54 |     def forward(self, img1, img2):
55 |         features1 = self.encoder(img1)
56 |         features2 = self.encoder(img2)
57 |
58 |         features = []
59 |         for k in range(0, len(features1)):
60 |             features.append(torch.cat([features1[k], features2[k]], dim=1))
61 |         pose = self.decoder([features])
62 |         return pose
--------------------------------------------------------------------------------
/scripts/test_vivid_indoor.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # run script : bash test_vivid_indoor.sh
3 | GPU_ID=0
4 | DATA_ROOT=/HDD/Dataset_processed/VIVID_256/
5 | RESULTS_DIR=results/
6 |
7 | RESNET=18
8 | IMG_H=256
9 | IMG_W=320
10 | DATASET=VIVID
11 | DEPTH_GT_DIR=Depth_T
12 | POSE_GT=poses_T.txt
13 |
14 | NAMES=("T_vivid_resnet18_indoor")
15 |
16 | for NAME in ${NAMES[@]}; do
17 |   # indoor testset
18 |   SEQS=("indoor_robust_dark" "indoor_aggresive_dark" "indoor_aggresive_local" "indoor_unstable_dark" "indoor_robust_varying_well_lit")
19 |   POSE_NET=checkpoints/${NAME}/exp_pose_pose_model_best.pth.tar
20 |   DISP_NET=checkpoints/${NAME}/dispnet_disp_model_best.pth.tar
21 |   echo "${NAME}"
22 |
23 |   # depth
24 |   for SEQ in ${SEQS[@]}; do
25 |     echo "Seq_name : ${SEQ}"
26 |
27 |     #mkdir -p ${RESULTS_DIR}
28 |     DATA_DIR=${DATA_ROOT}/${SEQ}/
29 |     OUTPUT_DEPTH_DIR=${RESULTS_DIR}/${NAME}/Depth/${SEQ}/
30 |     mkdir -p ${OUTPUT_DEPTH_DIR}
31 |
32 |     # Depth Evaluation
33 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python test_disp.py \
34 |       --resnet-layers $RESNET --pretrained-dispnet $DISP_NET \
35 |       --img-height $IMG_H --img-width $IMG_W \
36 |       --dataset-dir ${DATA_DIR} --output-dir $OUTPUT_DEPTH_DIR >> ${OUTPUT_DEPTH_DIR}/disp.txt
37 |
38 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python eval_vivid/eval_depth.py \
39 |       --dataset $DATASET --pred_depth ${OUTPUT_DEPTH_DIR}/predictions.npy \
40 |       --gt_depth ${DATA_DIR}/${DEPTH_GT_DIR} --scene indoor >> ${OUTPUT_DEPTH_DIR}/eval_depth.txt
41 |
42 |     rm ${OUTPUT_DEPTH_DIR}/predictions.npy
43 |   done
44 |
45 |   # pose
46 |   SEQS=("indoor_robust_dark" "indoor_robust_varying" "indoor_aggresive_dark" "indoor_aggresive_local" "indoor_unstable_dark")
47 |   for SEQ in ${SEQS[@]}; do
48 |     echo "Seq_name : ${SEQ}"
49 |
50 |     #mkdir -p ${RESULTS_DIR}
51 |     DATA_DIR=${DATA_ROOT}/${SEQ}/
52 |     OUTPUT_POSE_DIR=${RESULTS_DIR}/${NAME}/POSE/${SEQ}/
53 |     mkdir -p ${OUTPUT_POSE_DIR}
54 |
55 |     # Pose Evaluation
56 |     CUDA_VISIBLE_DEVICES=${GPU_ID} python test_pose.py \
57 |       --resnet-layers $RESNET --pretrained-posenet $POSE_NET \
58 |       --img-height $IMG_H --img-width $IMG_W \
59 |       --dataset-dir ${DATA_ROOT} --output-dir ${OUTPUT_POSE_DIR} \
60 |       --sequences ${SEQ} >> ${OUTPUT_POSE_DIR}/eval_pose.txt
61 |   done
62 | done
63 |
--------------------------------------------------------------------------------
/eval_vivid/pose_evaluation_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from path import Path
3 | from imageio import imread
4 | from tqdm import tqdm
5 |
6 |
7 | class test_framework_VIVID(object):
8 |     def __init__(self, root, sequence_set, seq_length=3, step=1):
9 |         self.root = root
10 |         self.img_files, self.poses, self.sample_indices = read_scene_data(self.root, sequence_set, seq_length, step)
11 |
12 |     def generator(self):
13 |         for img_list, pose_list, sample_list in zip(self.img_files, self.poses, self.sample_indices):
14 |             for snippet_indices in sample_list:
15 |                 imgs = [np.expand_dims(imread(img_list[i]).astype(np.float32), axis=2) for i in snippet_indices]
16 |
17 |                 poses = np.stack([pose_list[i] for i in snippet_indices])
18 |                 first_pose = poses[0]
19 |                 poses[:,:,-1] -= first_pose[:,-1]  # shift translations so the first frame sits at the origin
20 |                 compensated_poses = np.linalg.inv(first_pose[:,:3]) @ poses  # rotate all poses into the first frame's coordinate system
21 |
22 |                 yield {'imgs': imgs,
23 |                        'path': img_list[0],
24 |                        'poses': compensated_poses
25 |                        }
26 |
27 |     def __iter__(self):
28 |         return self.generator()
29 |
30 |     def __len__(self):
31 |         return sum(len(imgs) for imgs in self.img_files)
32 |
33 |
34 | def read_scene_data(data_root, sequence_set, seq_length=3, step=1):
35 |     data_root = Path(data_root)
36 |     im_sequences = []
37 |     poses_sequences = []
38 |     indices_sequences = []
39 |     demi_length = (seq_length - 1) // 2
40 |     shift_range = np.array([step*i for i in range(-demi_length, demi_length + 1)]).reshape(1, -1)
41 |
42 |     sequences = []
43 |     for seq in sequence_set:
44 |         sequences.append((data_root/seq))
45 |
46 |     print('getting test metadata for these sequences : {}'.format(sequences))
47 |     for sequence in tqdm(sequences):
48 |         imgs = sorted((sequence/'Thermal').files('*.png'))
49 |         poses = np.genfromtxt(sequence/'poses_T.txt').astype(np.float64).reshape(-1, 3, 4)
50 |
51 |         # construct snippet sequences of seq_length frames around each target index
52 |         tgt_indices = np.arange(demi_length, len(imgs) - demi_length).reshape(-1, 1)
53 |         snippet_indices = shift_range + tgt_indices
54 |         im_sequences.append(imgs)
55 |         poses_sequences.append(poses)
56 |         indices_sequences.append(snippet_indices)
57 |     return im_sequences, poses_sequences, indices_sequences
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # User defined
7 | checkpoints
8 | results
9 |
10 |
11 | # C
extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | -------------------------------------------------------------------------------- /common/models/PoseResNet.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import torch 10 | import torch.nn as nn 11 | from collections import OrderedDict 12 | from .resnet_encoder import * 13 | 14 | class PoseDecoder(nn.Module): 15 | def __init__(self, num_ch_enc, num_input_features=1, num_frames_to_predict_for=1, stride=1): 16 | super(PoseDecoder, self).__init__() 17 | 18 | self.num_ch_enc = num_ch_enc 19 | self.num_input_features = num_input_features 20 | 21 | if num_frames_to_predict_for is None: 22 | num_frames_to_predict_for = num_input_features - 1 23 | self.num_frames_to_predict_for = num_frames_to_predict_for 24 | 25 | self.convs = OrderedDict() 26 | self.convs[("squeeze")] = nn.Conv2d(self.num_ch_enc[-1], 256, 1) 27 | self.convs[("pose", 0)] = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1) 28 | self.convs[("pose", 1)] = nn.Conv2d(256, 256, 3, stride, 1) 29 | self.convs[("pose", 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, 1) 30 | 31 | self.relu = nn.ReLU() 32 | 33 | self.net = nn.ModuleList(list(self.convs.values())) 34 | 35 | def forward(self, input_features): 36 | last_features = [f[-1] for f in input_features] 37 | 38 | cat_features = [self.relu(self.convs["squeeze"](f)) for f in last_features] 39 | cat_features = torch.cat(cat_features, 1) 40 | 41 | out = cat_features 42 | for i in range(3): 43 | out = self.convs[("pose", i)](out) 44 | if i != 2: 45 | out = self.relu(out) 46 | 47 | out = out.mean(3).mean(2) 48 | 49 | pose = 0.01 * out.view(-1, 6) 50 | 51 | return pose 52 | 53 | 54 | class PoseResNet(nn.Module): 55 | 56 | def __init__(self, num_layers = 18, pretrained = True, num_channel=3): 57 | super(PoseResNet, self).__init__() 58 | self.encoder = ResnetEncoder(num_layers = num_layers, pretrained = pretrained, num_input_images=2, num_channel=num_channel) 59 | self.decoder = PoseDecoder(self.encoder.num_ch_enc) 60 | 61 | def init_weights(self): 62 | pass 63 | 64 | def forward(self, img1, img2): 65 | x = torch.cat([img1,img2],1) 66 | features = self.encoder(x) 67 | pose = self.decoder([features]) 68 | return pose 69 | -------------------------------------------------------------------------------- /common/utils/logger.py: -------------------------------------------------------------------------------- 1 | from blessings import Terminal 2 | import progressbar 3 | import sys 4 | 5 | 6 | class TermLogger(object): 7 | def __init__(self, n_epochs, train_size, valid_size): 8 | self.n_epochs = n_epochs 9 | self.train_size = train_size 10 | self.valid_size = valid_size 11 | self.t = Terminal() 12 | s = 10 13 | e = 1 # epoch bar position 14 | tr = 3 # train bar position 15 | ts = 6 # valid bar position 16 | value = self.t.height 17 | h = int(0 if value is None else value) 18 | 19 | for i in range(10): 20 | print('') 21 | self.epoch_bar = progressbar.ProgressBar( 22 | max_value=n_epochs, fd=Writer(self.t, (0, h-s+e))) 23 | 24 | self.train_writer = Writer(self.t, (0, h-s+tr)) 25 | self.train_bar_writer = Writer(self.t, (0, h-s+tr+1)) 26 | 27 | self.valid_writer = Writer(self.t, (0, h-s+ts-1)) 28 | self.valid_writer2 = Writer(self.t, (0, h-s+ts)) 29 | self.valid_bar_writer = Writer(self.t, (0, h-s+ts+1)) 30 | 31 | self.reset_train_bar() 32 | self.reset_valid_bar() 33 | 34 | def reset_train_bar(self): 35 | self.train_bar = progressbar.ProgressBar( 36 | max_value=self.train_size, fd=self.train_bar_writer) 37 | 38 | def reset_valid_bar(self): 39 | self.valid_bar = progressbar.ProgressBar( 40 | max_value=self.valid_size, fd=self.valid_bar_writer) 41 | 42 | 43 | class 
Writer(object): 44 | """Create an object with a write method that writes to a 45 | specific place on the screen, defined at instantiation. 46 | 47 | This is the glue between blessings and progressbar. 48 | """ 49 | 50 | def __init__(self, t, location): 51 | """ 52 | Input: location - tuple of ints (x, y), the position 53 | of the bar in the terminal 54 | """ 55 | self.location = location 56 | self.t = t 57 | 58 | def write(self, string): 59 | with self.t.location(*self.location): 60 | sys.stdout.write("\033[K") 61 | print(string) 62 | 63 | def flush(self): 64 | return 65 | 66 | 67 | class AverageMeter(object): 68 | """Computes and stores the average and current value""" 69 | 70 | def __init__(self, i=1, precision=3): 71 | self.meters = i 72 | self.precision = precision 73 | self.reset(self.meters) 74 | 75 | def reset(self, i): 76 | self.val = [0]*i 77 | self.avg = [0]*i 78 | self.sum = [0]*i 79 | self.count = 0 80 | 81 | def update(self, val, n=1): 82 | if not isinstance(val, list): 83 | val = [val] 84 | assert(len(val) == self.meters) 85 | self.count += n 86 | for i, v in enumerate(val): 87 | self.val[i] = v 88 | self.sum[i] += v * n 89 | self.avg[i] = self.sum[i] / self.count 90 | 91 | def __repr__(self): 92 | val = ' '.join(['{:.{}f}'.format(v, self.precision) for v in self.val]) 93 | avg = ' '.join(['{:.{}f}'.format(a, self.precision) for a in self.avg]) 94 | return '{} ({})'.format(val, avg) 95 | -------------------------------------------------------------------------------- /common/data_prepare/well_lit_from_varying.txt: -------------------------------------------------------------------------------- 1 | 000020 2 | 000116 3 | 000168 4 | 000216 5 | 000302 6 | 000325 7 | 000376 8 | 000420 9 | 000443 10 | 000492 11 | 000538 12 | 000590 13 | 000023 14 | 000118 15 | 000169 16 | 000217 17 | 000303 18 | 000326 19 | 000377 20 | 000421 21 | 000444 22 | 000493 23 | 000539 24 | 000591 25 | 000042 26 | 000120 27 | 000170 28 | 000218 29 | 000304 30 | 000327 31 | 000378 32 | 000422 33 | 000445 34 | 000494 35 | 000540 36 | 000592 37 | 000044 38 | 000121 39 | 000171 40 | 000219 41 | 000305 42 | 000328 43 | 000379 44 | 000423 45 | 000446 46 | 000495 47 | 000541 48 | 000593 49 | 000045 50 | 000122 51 | 000172 52 | 000220 53 | 000306 54 | 000329 55 | 000380 56 | 000424 57 | 000447 58 | 000496 59 | 000542 60 | 000594 61 | 000046 62 | 000123 63 | 000173 64 | 000247 65 | 000307 66 | 000330 67 | 000381 68 | 000425 69 | 000448 70 | 000497 71 | 000543 72 | 000595 73 | 000047 74 | 000124 75 | 000174 76 | 000248 77 | 000308 78 | 000331 79 | 000382 80 | 000426 81 | 000449 82 | 000498 83 | 000544 84 | 000596 85 | 000065 86 | 000125 87 | 000175 88 | 000249 89 | 000309 90 | 000332 91 | 000383 92 | 000427 93 | 000450 94 | 000499 95 | 000545 96 | 000597 97 | 000067 98 | 000126 99 | 000176 100 | 000250 101 | 000310 102 | 000333 103 | 000385 104 | 000428 105 | 000477 106 | 000500 107 | 000546 108 | 000598 109 | 000069 110 | 000127 111 | 000177 112 | 000251 113 | 000311 114 | 000334 115 | 000387 116 | 000429 117 | 000478 118 | 000501 119 | 000547 120 | 000599 121 | 000071 122 | 000128 123 | 000178 124 | 000252 125 | 000312 126 | 000335 127 | 000389 128 | 000430 129 | 000479 130 | 000502 131 | 000548 132 | 000600 133 | 000073 134 | 000129 135 | 000179 136 | 000253 137 | 000313 138 | 000336 139 | 000392 140 | 000431 141 | 000480 142 | 000503 143 | 000549 144 | 000601 145 | 000076 146 | 000130 147 | 000180 148 | 000254 149 | 000314 150 | 000337 151 | 000394 152 | 000432 153 | 000481 154 | 000504 155 | 000550 156 | 000602 
157 | 000078 158 | 000131 159 | 000181 160 | 000255 161 | 000315 162 | 000338 163 | 000396 164 | 000433 165 | 000482 166 | 000505 167 | 000551 168 | 000603 169 | 000079 170 | 000159 171 | 000182 172 | 000257 173 | 000316 174 | 000339 175 | 000397 176 | 000434 177 | 000483 178 | 000506 179 | 000552 180 | 000604 181 | 000080 182 | 000160 183 | 000183 184 | 000259 185 | 000317 186 | 000340 187 | 000398 188 | 000435 189 | 000484 190 | 000507 191 | 000553 192 | 000081 193 | 000161 194 | 000184 195 | 000261 196 | 000318 197 | 000369 198 | 000399 199 | 000436 200 | 000485 201 | 000508 202 | 000554 203 | 000082 204 | 000162 205 | 000185 206 | 000263 207 | 000319 208 | 000370 209 | 000400 210 | 000437 211 | 000486 212 | 000509 213 | 000581 214 | 000083 215 | 000163 216 | 000186 217 | 000266 218 | 000320 219 | 000371 220 | 000401 221 | 000438 222 | 000487 223 | 000510 224 | 000583 225 | 000084 226 | 000164 227 | 000187 228 | 000269 229 | 000321 230 | 000372 231 | 000402 232 | 000439 233 | 000488 234 | 000534 235 | 000585 236 | 000111 237 | 000165 238 | 000213 239 | 000274 240 | 000322 241 | 000373 242 | 000403 243 | 000440 244 | 000489 245 | 000535 246 | 000587 247 | 000112 248 | 000166 249 | 000214 250 | 000280 251 | 000323 252 | 000374 253 | 000404 254 | 000441 255 | 000490 256 | 000536 257 | 000588 258 | 000114 259 | 000167 260 | 000215 261 | 000282 262 | 000324 263 | 000375 264 | 000419 265 | 000442 266 | 000491 267 | 000537 268 | 000589 -------------------------------------------------------------------------------- /dataloader/VIVID_sequence_folders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | import numpy as np 4 | 5 | import math 6 | import random 7 | from imageio import imread 8 | from path import Path 9 | 10 | def load_as_float(path): 11 | return np.expand_dims(imread(path).astype(np.float32), axis=2) # HW -> HW1 12 | 13 | class SequenceFolder(data.Dataset): 14 | """A sequence data loader where the files are arranged in this way: 15 | root/scene_1/Thermal/0000000.png 16 | root/scene_1/Thermal/0000001.png 17 | .. 18 | root/scene_1/cam.txt 19 | root/scene_2/Thermal/0000000.png 20 | . 
21 |     transform functions must take in a list of images and a numpy array (usually the intrinsics matrix)
22 |     """
23 |     def __init__(self, root, seed=None, train=True, sequence_length=3,\
24 |                  tf_share=None, tf_input=None, tf_loss=None, \
25 |                  scene_type='indoor', interval=1):
26 |         np.random.seed(seed)
27 |         random.seed(seed)
28 |
29 |         self.root = Path(root)
30 |         if scene_type == 'indoor':
31 |             folder_list_path = self.root/'train_indoor.txt' if train else self.root/'val_indoor.txt'
32 |         elif scene_type == 'outdoor':
33 |             folder_list_path = self.root/'train_outdoor.txt' if train else self.root/'val_outdoor.txt'
34 |
35 |         self.folders = [self.root/folder[:-1] for folder in open(folder_list_path)]  # folder[:-1] strips the trailing newline
36 |         self.tf_share = tf_share
37 |         self.tf_input = tf_input
38 |         self.tf_loss = tf_loss
39 |         self.crawl_folders(sequence_length, interval)
40 |
41 |     def crawl_folders(self, sequence_length, interval):
42 |         sequence_set = []
43 |         demi_length = (sequence_length-1)//2 + interval - 1
44 |         shifts = list(range(-demi_length, demi_length + 1))
45 |         for i in range(1, 2*demi_length):
46 |             shifts.pop(1)  # drop the intermediate offsets (including 0) so only the two outermost reference frames remain
47 |
48 |         for folder in self.folders:
49 |             imgs = sorted((folder/"Thermal").files('*.png'))
50 |             intrinsics = np.genfromtxt(folder/'cam_T.txt').astype(np.float32).reshape((3, 3))
51 |
52 |             for i in range(demi_length, len(imgs)-demi_length):
53 |                 sample = {'intrinsics': intrinsics, 'tgt': imgs[i], 'ref_imgs': []}
54 |                 for j in shifts:
55 |                     sample['ref_imgs'].append(imgs[i+j])
56 |                 sequence_set.append(sample)
57 |
58 |         random.shuffle(sequence_set)
59 |         self.samples = sequence_set
60 |
61 |     def __getitem__(self, index):
62 |         sample = self.samples[index]
63 |
64 |         # Read target and reference thermal images
65 |         tgt_img = load_as_float(sample['tgt'])
66 |         ref_imgs = [load_as_float(ref_img) for ref_img in sample['ref_imgs']]
67 |
68 |         # Pre-process thermal images for network input & loss calculation
69 |         imgs, intrinsics = self.tf_share([tgt_img] + ref_imgs, np.expand_dims(np.copy(sample['intrinsics']), axis=0))
70 |         imgs_input, _ = self.tf_input(imgs, None)
71 |         imgs_loss, _ = self.tf_loss(imgs, None)
72 |
73 |         tgt_img_input = imgs_input[0]
74 |         ref_imgs_input = imgs_input[1:]
75 |         tgt_img_loss = imgs_loss[0]
76 |         ref_imgs_loss = imgs_loss[1:]
77 |
78 |         return tgt_img_input, ref_imgs_input, tgt_img_loss, ref_imgs_loss, intrinsics.squeeze()
79 |
80 |     def __len__(self):
81 |         return len(self.samples)
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: ThermalDepth
2 | channels:
3 |   - pytorch
4 |   - conda-forge
5 |   - defaults
6 | dependencies:
7 |   - _libgcc_mutex=0.1=main
8 |   - blas=1.0=mkl
9 |   - ca-certificates=2020.6.20=hecda079_0
10 |   - certifi=2020.6.20=py37hc8dfbb8_0
11 |   - cudatoolkit=10.2.89=hfd86e86_1
12 |   - freetype=2.10.2=h5ab3b9f_0
13 |   - intel-openmp=2020.1=217
14 |   - jpeg=9b=h024ee3a_2
15 |   - ld_impl_linux-64=2.33.1=h53a641e_7
16 |   - libedit=3.1.20191231=h7b6447c_0
17 |   - libffi=3.2.1=hd88cf55_4
18 |   - libgcc-ng=9.1.0=hdf63c60_0
19 |   - libgfortran-ng=7.3.0=hdf63c60_0
20 |   - libpng=1.6.37=hbc83047_0
21 |   - libprotobuf=3.12.3=h8b12597_1
22 |   - libstdcxx-ng=9.1.0=hdf63c60_0
23 |   - libtiff=4.1.0=h2733197_1
24 |   - lz4-c=1.9.2=he6710b0_0
25 |   - mkl=2020.1=217
26 |   - mkl-service=2.3.0=py37he904b0f_0
27 |   - mkl_fft=1.1.0=py37h23d657b_0
28 |   - mkl_random=1.1.1=py37h0573a6f_0
29 |   - ncurses=6.2=he6710b0_1
30 |   - ninja=1.9.0=py37hfd86e86_0
31 |   - numpy=1.18.5=py37ha1c710e_0
32 |   -
numpy-base=1.18.5=py37hde5b4d6_0 33 | - olefile=0.46=py37_0 34 | - openssl=1.1.1g=h516909a_0 35 | - pillow=7.1.2=py37hb39fc2d_0 36 | - pip=20.1.1=py37_1 37 | - python=3.7.6=h0371630_2 38 | - python_abi=3.7=1_cp37m 39 | - pytorch=1.5.1=py3.7_cuda10.2.89_cudnn7.6.5_0 40 | - pyyaml=5.3.1=py37hb5d75c8_1 41 | - readline=7.0=h7b6447c_5 42 | - setuptools=47.3.1=py37_0 43 | - six=1.15.0=py_0 44 | - sqlite=3.32.3=h62c20be_0 45 | - tensorboardx=2.1=py_0 46 | - termcolor=1.1.0=py_2 47 | - tk=8.6.10=hbc83047_0 48 | - torchvision=0.6.1=py37_cu102 49 | - wheel=0.34.2=py37_0 50 | - xz=5.2.5=h7b6447c_0 51 | - yacs=0.1.6=py_0 52 | - yaml=0.2.5=h516909a_0 53 | - zlib=1.2.11=h7b6447c_3 54 | - zstd=1.4.4=h0b5b093_3 55 | - pip: 56 | - absl-py==0.9.0 57 | - argparse==1.4.0 58 | - astunparse==1.6.3 59 | - blessings==1.7 60 | - cachetools==4.1.1 61 | - catkin-pkg==0.4.22 62 | - cffi==1.14.4 63 | - chardet==3.0.4 64 | - click==7.1.2 65 | - cloudpickle==1.6.0 66 | - configparser==5.0.1 67 | - cycler==0.10.0 68 | - decorator==4.4.2 69 | - distro==1.5.0 70 | - docker-pycreds==0.4.0 71 | - docutils==0.16 72 | - future==0.18.2 73 | - gast==0.3.3 74 | - gitdb==4.0.5 75 | - gitpython==3.1.11 76 | - google-auth==1.18.0 77 | - google-auth-oauthlib==0.4.1 78 | - google-pasta==0.2.0 79 | - grpcio==1.30.0 80 | - h5py==2.10.0 81 | - horovod==0.21.0 82 | - idna==2.10 83 | - imageio==2.9.0 84 | - importlib-metadata==1.7.0 85 | - keras-preprocessing==1.1.2 86 | - kiwisolver==1.2.0 87 | - markdown==3.2.2 88 | - matplotlib==3.2.2 89 | - networkx==2.4 90 | - oauthlib==3.1.0 91 | - opencv-python==4.3.0.36 92 | - opt-einsum==3.2.1 93 | - path==14.0.1 94 | - pebble==4.5.3 95 | - progressbar2==3.51.4 96 | - promise==2.3 97 | - protobuf==3.12.2 98 | - psutil==5.8.0 99 | - pyasn1==0.4.8 100 | - pyasn1-modules==0.2.8 101 | - pycparser==2.20 102 | - pyparsing==2.4.7 103 | - python-dateutil==2.8.1 104 | - python-utils==2.4.0 105 | - pywavelets==1.1.1 106 | - requests==2.24.0 107 | - requests-oauthlib==1.3.0 108 | - rospkg==1.2.8 109 | - rsa==4.6 110 | - scikit-image==0.17.2 111 | - scipy==1.1.0 112 | - sentry-sdk==0.19.5 113 | - shortuuid==1.0.1 114 | - smmap==3.0.4 115 | - subprocess32==3.5.4 116 | - tensorboard==2.2.2 117 | - tensorboard-plugin-wit==1.7.0 118 | - tensorflow==2.2.0 119 | - tensorflow-estimator==2.2.0 120 | - tifffile==2020.7.4 121 | - torchgeometry==0.1.2 122 | - tqdm==4.47.0 123 | - urllib3==1.25.9 124 | - wandb==0.10.12 125 | - watchdog==1.0.2 126 | - werkzeug==1.0.1 127 | - wrapt==1.12.1 128 | - zipp==3.1.0 129 | prefix: /home/user/anaconda3/envs/ThermalDepth 130 | 131 | -------------------------------------------------------------------------------- /common/models/DispResNet.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import numpy as np 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from .resnet_encoder import * 8 | 9 | 10 | import numpy as np 11 | from collections import OrderedDict 12 | 13 | class ConvBlock(nn.Module): 14 | """Layer to perform a convolution followed by ELU 15 | """ 16 | def __init__(self, in_channels, out_channels): 17 | super(ConvBlock, self).__init__() 18 | 19 | self.conv = Conv3x3(in_channels, out_channels) 20 | self.nonlin = nn.ELU(inplace=True) 21 | 22 | def forward(self, x): 23 | out = self.conv(x) 24 | out = self.nonlin(out) 25 | return out 26 | 27 | class Conv3x3(nn.Module): 28 | """Layer to pad and convolve input 29 | """ 30 | def 
__init__(self, in_channels, out_channels, use_refl=True):
31 |         super(Conv3x3, self).__init__()
32 |
33 |         if use_refl:
34 |             self.pad = nn.ReflectionPad2d(1)
35 |         else:
36 |             self.pad = nn.ZeroPad2d(1)
37 |         self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3)
38 |
39 |     def forward(self, x):
40 |         out = self.pad(x)
41 |         out = self.conv(out)
42 |         return out
43 |
44 | def upsample(x):
45 |     """Upsample input tensor by a factor of 2
46 |     """
47 |     return F.interpolate(x, scale_factor=2, mode="nearest")
48 |
49 | class DepthDecoder(nn.Module):
50 |     def __init__(self, num_ch_enc, scales=range(4), num_output_channels=1, use_skips=True):
51 |         super(DepthDecoder, self).__init__()
52 |
53 |         self.alpha = 10
54 |         self.beta = 0.01  # output disparity is alpha*sigmoid(x)+beta, i.e. constrained to (0.01, 10.01)
55 |
56 |         self.num_output_channels = num_output_channels
57 |         self.use_skips = use_skips
58 |         self.upsample_mode = 'nearest'
59 |         self.scales = scales
60 |
61 |         self.num_ch_enc = num_ch_enc
62 |         self.num_ch_dec = np.array([16, 32, 64, 128, 256])
63 |
64 |         # decoder
65 |         self.convs = OrderedDict()
66 |         for i in range(4, -1, -1):
67 |             # upconv_0
68 |             num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + 1]
69 |             num_ch_out = self.num_ch_dec[i]
70 |             self.convs[("upconv", i, 0)] = ConvBlock(num_ch_in, num_ch_out)
71 |
72 |             # upconv_1
73 |             num_ch_in = self.num_ch_dec[i]
74 |             if self.use_skips and i > 0:
75 |                 num_ch_in += self.num_ch_enc[i - 1]
76 |             num_ch_out = self.num_ch_dec[i]
77 |             self.convs[("upconv", i, 1)] = ConvBlock(num_ch_in, num_ch_out)
78 |
79 |         for s in self.scales:
80 |             self.convs[("dispconv", s)] = Conv3x3(self.num_ch_dec[s], self.num_output_channels)
81 |
82 |         self.decoder = nn.ModuleList(list(self.convs.values()))
83 |         self.sigmoid = nn.Sigmoid()
84 |
85 |     def forward(self, input_features):
86 |         self.outputs = []
87 |
88 |         # decoder
89 |         x = input_features[-1]
90 |         for i in range(4, -1, -1):
91 |             x = self.convs[("upconv", i, 0)](x)
92 |             x = [upsample(x)]
93 |             if self.use_skips and i > 0:
94 |                 x += [input_features[i - 1]]
95 |             x = torch.cat(x, 1)
96 |             x = self.convs[("upconv", i, 1)](x)
97 |             if i in self.scales:
98 |                 self.outputs.append(self.alpha * self.sigmoid(self.convs[("dispconv", i)](x)) + self.beta)
99 |
100 |         self.outputs = self.outputs[::-1]  # reorder so the finest scale comes first
101 |
102 |         return self.outputs
103 |
104 | class DispResNet(nn.Module):
105 |
106 |     def __init__(self, num_layers=18, pretrained=True, num_channel=3):
107 |         super(DispResNet, self).__init__()
108 |         self.encoder = ResnetEncoder(num_layers=num_layers, pretrained=pretrained, num_input_images=1, num_channel=num_channel)
109 |         self.decoder = DepthDecoder(self.encoder.num_ch_enc)
110 |     def init_weights(self):
111 |         pass
112 |
113 |     def forward(self, x):
114 |         features = self.encoder(x)
115 |         outputs = self.decoder(features)
116 |
117 |         if self.training:
118 |             return outputs
119 |         else:
120 |             return outputs[0]
--------------------------------------------------------------------------------
/test_disp.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from skimage.transform import resize as imresize
3 | from imageio import imread
4 | import numpy as np
5 | from path import Path
6 | import argparse
7 | from tqdm import tqdm
8 | import time
9 |
10 | import sys
11 | sys.path.append('./common/')
12 | import models
13 |
14 | parser = argparse.ArgumentParser(description='Script for DispNet testing with corresponding ground truth',
15 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument("--pretrained-dispnet", required=True, type=str, help="pretrained DispNet path")
17 | parser.add_argument("--img-height", default=256, type=int, help="Image height")  # 256 (kitti)
18 | parser.add_argument("--img-width", default=320, type=int, help="Image width")  # 832 (kitti)
19 | parser.add_argument("--min-depth", default=1e-3)
20 | parser.add_argument("--max-depth", default=80)
21 | parser.add_argument("--dataset-dir", default='.', type=str, help="Dataset directory")
22 | parser.add_argument("--dataset-list", default=None, type=str, help="Dataset list file")
23 | parser.add_argument("--output-dir", default=None, required=True, type=str, help="Output directory for saving predictions in a big 3D numpy file")
24 | parser.add_argument('--resnet-layers', required=True, type=int, default=18, choices=[18, 50], help='depth network architecture.')
25 |
26 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
27 |
28 | def load_tensor_image(filename, args):
29 |     img = np.expand_dims(imread(filename).astype(np.float32), axis=2)
30 |     h, w, _ = img.shape
31 |     if (h != args.img_height or w != args.img_width):
32 |         img = imresize(img, (args.img_height, args.img_width)).astype(np.float32)
33 |     img = np.transpose(img, (2, 0, 1))
34 |     img = (torch.from_numpy(img).float() / 2**14)  # VIVID thermal images are stored as 14-bit raw values
35 |     tensor_img = ((img.unsqueeze(0)-0.45)/0.225).to(device)  # same normalization as training
36 |     return tensor_img
37 |
38 | @torch.no_grad()
39 | def main():
40 |     args = parser.parse_args()
41 |
42 |     # load models
43 |     disp_pose_net = models.DispPoseResNet(args.resnet_layers, False, num_channel=1).to(device)
44 |     disp_net = disp_pose_net.DispResNet
45 |
46 |     weights = torch.load(args.pretrained_dispnet)
47 |     disp_net.load_state_dict(weights['state_dict'])
48 |     disp_net.eval()
49 |
50 |     dataset_dir = Path(args.dataset_dir)
51 |
52 |     # read file list
53 |     if args.dataset_list is not None:
54 |         with open(args.dataset_list, 'r') as f:
55 |             test_files = list(f.read().splitlines())
56 |     else:
57 |         test_files = sorted((dataset_dir/'Thermal').files('*.png'))  # join with '/' so a missing trailing slash in --dataset-dir still works
58 |
59 |     print('{} files to test'.format(len(test_files)))
60 |
61 |     output_dir = Path(args.output_dir)
62 |     output_dir.makedirs_p()
63 |
64 |     test_disp_avg = 0
65 |     test_disp_std = 0
66 |     test_depth_avg = 0
67 |     test_depth_std = 0
68 |
69 |     avg_time = 0
70 |     for j in tqdm(range(len(test_files))):
71 |         tgt_img = load_tensor_image(test_files[j], args)
72 |
73 |         # compute speed
74 |         torch.cuda.synchronize()
75 |         t_start = time.time()
76 |
77 |         output = disp_net(tgt_img)
78 |
79 |         torch.cuda.synchronize()
80 |         elapsed_time = time.time() - t_start
81 |
82 |         avg_time += elapsed_time
83 |
84 |         pred_disp = output.squeeze().cpu().numpy()
85 |
86 |         if j == 0:
87 |             predictions = np.zeros((len(test_files), *pred_disp.shape))
88 |         predictions[j] = 1/pred_disp
89 |
90 |         test_disp_avg += pred_disp.mean()
91 |         test_disp_std += pred_disp.std()
92 |         test_depth_avg += predictions[j].mean()  # accumulate statistics of the current depth map (1/disparity)
93 |         test_depth_std += predictions[j].std()
94 |
95 |     np.save(output_dir/'predictions.npy', predictions)
96 |
97 |     avg_time /= len(test_files)
98 |     print('Avg Time: ', avg_time, ' seconds.')
99 |     print('Avg Speed: ', 1.0 / avg_time, ' fps')
100 |
101 |     print('Avg disp : {0:0.3f}, std disp : {1:0.5f}'.format(test_disp_avg/len(test_files), test_disp_std/len(test_files)))
102 |     print('Avg depth: {0:0.3f}, std depth: {1:0.5f}'.format(test_depth_avg/len(test_files), test_depth_std/len(test_files)))
103 |
104 |
105 | if __name__ == '__main__':
106 |     main()
--------------------------------------------------------------------------------
/common/utils/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import shutil
3 | import numpy as np
4 | import torch
5 | from path import Path
6 | import datetime
7 | from collections import OrderedDict
8 | from matplotlib import cm
9 | from matplotlib.colors import ListedColormap, LinearSegmentedColormap
10 | import matplotlib.pyplot as plt
11 |
12 | def high_res_colormap(low_res_cmap, resolution=1000, max_value=1):
13 |     # Construct the listed colormap, with interpolated values for higher resolution
14 |     # For a linear segmented colormap, you can just specify the number of points in
15 |     # cm.get_cmap(name, lutsize) with the parameter lutsize
16 |     x = np.linspace(0, 1, low_res_cmap.N)
17 |     low_res = low_res_cmap(x)
18 |     new_x = np.linspace(0, max_value, resolution)
19 |     high_res = np.stack([np.interp(new_x, x, low_res[:, i])
20 |                          for i in range(low_res.shape[1])], axis=1)
21 |     return ListedColormap(high_res)
22 |
23 | def opencv_rainbow(resolution=1000):
24 |     # Construct the opencv equivalent of Rainbow
25 |     opencv_rainbow_data = (
26 |         (0.000, (1.00, 0.00, 0.00)),
27 |         (0.400, (1.00, 1.00, 0.00)),
28 |         (0.600, (0.00, 1.00, 0.00)),
29 |         (0.800, (0.00, 0.00, 1.00)),
30 |         (1.000, (0.60, 0.00, 1.00))
31 |     )
32 |
33 |     return LinearSegmentedColormap.from_list('opencv_rainbow', opencv_rainbow_data, resolution)
34 |
35 | def opencv_rainbow_inv(resolution=1000):
36 |     # Construct the opencv equivalent of Rainbow, reversed
37 |     opencv_rainbow_data = (
38 |         (0.000, (0.60, 0.00, 1.00)),
39 |         (0.400, (0.00, 0.00, 0.10)),
40 |         (0.600, (0.00, 1.00, 0.00)),
41 |         (0.800, (1.00, 1.00, 0.00)),
42 |         (1.000, (1.00, 0.00, 0.00))
43 |     )
44 |
45 |     return LinearSegmentedColormap.from_list('opencv_rainbow', opencv_rainbow_data, resolution)
46 |
47 | COLORMAPS = {'rainbow': opencv_rainbow(),
48 |              'rainbow_inv': plt.get_cmap('jet'),  # opencv_rainbow_inv(),
49 |              'magma': high_res_colormap(cm.get_cmap('magma')),
50 |              'magma_inv': high_res_colormap(cm.get_cmap('magma').reversed()),
51 |              'bone': cm.get_cmap('bone', 10000),
52 |              'bone_inv': cm.get_cmap('bone', 10000).reversed()}
53 |
54 | def ind2rgb(im):
55 |     cmap = plt.get_cmap('jet')
56 |     im = im.cpu().squeeze().numpy()
57 |     im = cmap(im)
58 |     im = im[:, :, 0:3]
59 |     # put it from HWC to CHW format
60 |     im = np.transpose(im, (2, 0, 1))
61 |     return torch.from_numpy(im).float()
62 |
63 | def tensor2array(tensor, max_value=None, colormap='rainbow'):
64 |     tensor = tensor.detach().cpu()
65 |     if max_value is None:
66 |         max_value = tensor[~np.isinf(tensor).type(torch.bool)].max()
67 |         tensor[np.isinf(tensor).type(torch.bool)] = max_value  # clamp infs to the finite maximum before colorizing
68 |         max_value = tensor.max().item()
69 |     if tensor.ndimension() == 2 or tensor.size(0) == 1:
70 |         norm_array = tensor.squeeze().numpy()/max_value
71 |         array = COLORMAPS[colormap](norm_array).astype(np.float32)
72 |         array = array.transpose(2, 0, 1)
73 |
74 |     elif tensor.ndimension() == 3:
75 |         if (tensor.size(0) == 3):
76 |             array = 0.45 + tensor.numpy()*0.225  # undo the training normalization
77 |         elif (tensor.size(0) == 2):
78 |             array = tensor.numpy()
79 |
80 |     return array
81 |
82 | def tensor2array_thermal(tensor):
83 |     tensor = tensor.detach().cpu()
84 |     array = np.expand_dims(0.45 + tensor.detach().cpu().squeeze().numpy()*0.225, 0)
85 |     return array
86 |
87 | def save_checkpoint(save_path, dispnet_state, exp_pose_state, is_depth_best, is_pose_best, filename='checkpoint.pth.tar'):
88 |     file_prefixes = ['dispnet', 'exp_pose']
89 |     states = [dispnet_state, exp_pose_state]
90 |     for (prefix, state) in
zip(file_prefixes, states): 91 | torch.save(state, save_path/'{}_{}'.format(prefix, filename)) 92 | 93 | if (is_depth_best&is_pose_best): 94 | for prefix in file_prefixes: 95 | shutil.copyfile(save_path/'{}_{}'.format(prefix, filename), 96 | save_path/'{}_both_model_best.pth.tar'.format(prefix)) 97 | elif is_depth_best : 98 | for prefix in file_prefixes: 99 | shutil.copyfile(save_path/'{}_{}'.format(prefix, filename), 100 | save_path/'{}_disp_model_best.pth.tar'.format(prefix)) 101 | elif is_pose_best : 102 | for prefix in file_prefixes: 103 | shutil.copyfile(save_path/'{}_{}'.format(prefix, filename), 104 | save_path/'{}_pose_model_best.pth.tar'.format(prefix)) 105 | -------------------------------------------------------------------------------- /common/models/resnet_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright Niantic 2019. Patent Pending. All rights reserved. 2 | # 3 | # This software is licensed under the terms of the Monodepth2 licence 4 | # which allows for non-commercial use only, the full terms of which are made 5 | # available in the LICENSE file. 6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torchvision.models as models 14 | import torch.utils.model_zoo as model_zoo 15 | 16 | class ResNetMultiImageInput(models.ResNet): 17 | """Constructs a resnet model with varying number of input images. 18 | Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 19 | """ 20 | def __init__(self, block, layers, num_classes=1000, num_input_images=1, num_channel=3): 21 | super(ResNetMultiImageInput, self).__init__(block, layers) 22 | self.inplanes = 64 23 | self.conv1 = nn.Conv2d(num_input_images * num_channel, 64, kernel_size=7, stride=2, padding=3, bias=False) 24 | self.bn1 = nn.BatchNorm2d(64) 25 | self.relu = nn.ReLU(inplace=True) 26 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 27 | self.layer1 = self._make_layer(block, 64, layers[0]) 28 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 29 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 30 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 31 | 32 | for m in self.modules(): 33 | if isinstance(m, nn.Conv2d): 34 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 35 | elif isinstance(m, nn.BatchNorm2d): 36 | nn.init.constant_(m.weight, 1) 37 | nn.init.constant_(m.bias, 0) 38 | 39 | 40 | 41 | def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1, num_channel=3): 42 | """Constructs a ResNet model. 43 | Args: 44 | num_layers (int): Number of resnet layers. 
Must be 18 or 50 45 | pretrained (bool): If True, returns a model pre-trained on ImageNet 46 | num_input_images (int): Number of frames stacked as input 47 | """ 48 | assert num_layers in [18, 50], "Can only run with 18 or 50 layer resnet" 49 | blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] 50 | block_type = {18: models.resnet.BasicBlock, 50: models.resnet.Bottleneck}[num_layers] 51 | model = ResNetMultiImageInput(block_type, blocks, num_input_images=num_input_images, num_channel=num_channel) 52 | 53 | if pretrained: 54 | loaded = model_zoo.load_url(models.resnet.model_urls['resnet{}'.format(num_layers)]) 55 | if num_channel == 3: 56 | loaded['conv1.weight'] = torch.cat([loaded['conv1.weight']] * num_input_images, 1) / num_input_images 57 | else : 58 | loaded['conv1.weight'] = torch.cat([loaded['conv1.weight'].mean(1).unsqueeze(1)] * num_input_images, 1) / num_input_images 59 | model.load_state_dict(loaded) 60 | return model 61 | 62 | 63 | class ResnetEncoder(nn.Module): 64 | """Pytorch module for a resnet encoder 65 | """ 66 | def __init__(self, num_layers, pretrained, num_input_images=1, num_channel=3): 67 | super(ResnetEncoder, self).__init__() 68 | 69 | self.num_ch_enc = np.array([64, 64, 128, 256, 512]) 70 | 71 | resnets = {18: models.resnet18, 72 | 34: models.resnet34, 73 | 50: models.resnet50, 74 | 101: models.resnet101, 75 | 152: models.resnet152} 76 | 77 | if num_layers not in resnets: 78 | raise ValueError("{} is not a valid number of resnet layers".format(num_layers)) 79 | 80 | self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images, num_channel) 81 | """ 82 | if num_input_images > 1: 83 | self.encoder = resnet_multiimage_input(num_layers, pretrained, num_input_images) 84 | else: 85 | self.encoder = resnets[num_layers](pretrained) 86 | """ 87 | if num_layers > 34: 88 | self.num_ch_enc[1:] *= 4 89 | 90 | def forward(self, input_image): 91 | self.features = [] 92 | x = input_image 93 | x = self.encoder.conv1(x) 94 | x = self.encoder.bn1(x) 95 | self.features.append(self.encoder.relu(x)) 96 | self.features.append(self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) 97 | self.features.append(self.encoder.layer2(self.features[-1])) 98 | self.features.append(self.encoder.layer3(self.features[-1])) 99 | self.features.append(self.encoder.layer4(self.features[-1])) 100 | 101 | return self.features -------------------------------------------------------------------------------- /eval_vivid/eval_depth.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import matplotlib as mpl 4 | import matplotlib.cm as cm 5 | import numpy as np 6 | import os 7 | from tqdm import tqdm 8 | from path import Path 9 | 10 | parser = argparse.ArgumentParser(description="Depth options") 11 | parser.add_argument("--dataset", required=True, help="VIVID", choices=['VIVID'], type=str) 12 | parser.add_argument("--scene", required=True, help="scene type for VIVID", choices=['indoor', 'outdoor'], type=str) 13 | parser.add_argument("--pred_depth", required=True, help="depth predictions npy", type=str) 14 | parser.add_argument("--gt_depth", required=True, help="gt depth folder for VIVID", type=str) 15 | parser.add_argument("--ratio_name", help="names for saving ratios", type=str) 16 | 17 | 18 | args = parser.parse_args() 19 | 20 | def compute_depth_errors(gt, pred): 21 | """Computation of error metrics between predicted and ground truth depths 22 | Args: 23 | gt (N): ground truth depth 24 | pred (N): 
predicted depth 25 | """ 26 | thresh = np.maximum((gt / pred), (pred / gt)) 27 | a1 = (thresh < 1.25).mean() 28 | a2 = (thresh < 1.25 ** 2).mean() 29 | a3 = (thresh < 1.25 ** 3).mean() 30 | 31 | rmse = (gt - pred) ** 2 32 | rmse = np.sqrt(rmse.mean()) 33 | rmse_log = (np.log(gt) - np.log(pred)) ** 2 34 | rmse_log = np.sqrt(rmse_log.mean()) 35 | 36 | log10 = np.mean(np.abs((np.log10(gt) - np.log10(pred)))) 37 | 38 | abs_rel = np.mean(np.abs(gt - pred) / gt) 39 | sq_rel = np.mean(((gt - pred) ** 2) / gt) 40 | 41 | return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 42 | 43 | class DepthEval(): 44 | def __init__(self): 45 | 46 | self.min_depth = 1e-3 47 | 48 | if args.dataset == 'VIVID': 49 | if args.scene == 'indoor' : 50 | self.max_depth = 10. 51 | elif args.scene == 'outdoor' : 52 | self.max_depth = 80. 53 | 54 | def main(self): 55 | pred_depths = [] 56 | 57 | """ Get result """ 58 | # Read precomputed result 59 | pred_depths = np.load(os.path.join(args.pred_depth)) 60 | 61 | """ Evaluation """ 62 | if args.dataset == 'VIVID': 63 | gt_depths = [] 64 | for gt_f in sorted(Path(args.gt_depth).files("*.npy")): 65 | gt_depths.append(np.load(gt_f)) 66 | 67 | pred_depths = self.evaluate_depth(gt_depths, pred_depths, eval_mono=True) 68 | 69 | 70 | def evaluate_depth(self, gt_depths, pred_depths, eval_mono=True): 71 | """evaluate depth result 72 | Args: 73 | gt_depths (NxHxW): gt depths 74 | pred_depths (NxHxW): predicted depths 75 | split (str): data split for evaluation 76 | - depth_eigen 77 | eval_mono (bool): use median scaling if True 78 | """ 79 | errors = [] 80 | ratios = [] 81 | resized_pred_depths = [] 82 | 83 | print("==> Evaluating depth result...") 84 | for i in tqdm(range(pred_depths.shape[0])): 85 | if pred_depths[i].mean() != -1: 86 | gt_depth = gt_depths[i] 87 | gt_height, gt_width = gt_depth.shape[:2] 88 | 89 | # resizing prediction (based on inverse depth) 90 | pred_inv_depth = 1 / (pred_depths[i] + 1e-6) 91 | pred_inv_depth = cv2.resize(pred_inv_depth, (gt_width, gt_height)) 92 | pred_depth = 1 / (pred_inv_depth + 1e-6) 93 | 94 | mask = np.logical_and(gt_depth > self.min_depth, gt_depth < self.max_depth) 95 | val_pred_depth = pred_depth[mask] 96 | val_gt_depth = gt_depth[mask] 97 | 98 | # median scaling is used for monocular evaluation 99 | ratio = 1 100 | if eval_mono: 101 | ratio = np.median(val_gt_depth) / np.median(val_pred_depth) 102 | ratios.append(ratio) 103 | val_pred_depth *= ratio 104 | 105 | resized_pred_depths.append(pred_depth * ratio) 106 | 107 | val_pred_depth[val_pred_depth < self.min_depth] = self.min_depth 108 | val_pred_depth[val_pred_depth > self.max_depth] = self.max_depth 109 | 110 | errors.append(compute_depth_errors(val_gt_depth, val_pred_depth)) 111 | 112 | if eval_mono: 113 | ratios = np.array(ratios) 114 | med = np.median(ratios) 115 | print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med))) 116 | print(" Scaling ratios | mean: {:0.3f} +- std: {:0.3f}".format(np.mean(ratios), np.std(ratios))) 117 | if args.ratio_name: 118 | np.savetxt(args.ratio_name, ratios, fmt='%.4f') 119 | 120 | mean_errors = np.array(errors).mean(0) 121 | print("\n " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3")) 122 | print(("&{: 8.3f} " * 7).format(*mean_errors.tolist()) + "\\\\") 123 | 124 | return resized_pred_depths 125 | 126 | 127 | eval = DepthEval() 128 | eval.main() 129 | -------------------------------------------------------------------------------- /run_inference.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.backends.cudnn as cudnn 3 | import torch.utils.data 4 | 5 | from imageio import imsave 6 | import numpy as np 7 | from path import Path 8 | import argparse 9 | 10 | import sys 11 | sys.path.append('./common/') 12 | 13 | import models 14 | import utils.custom_transforms as custom_transforms 15 | from utils.utils import tensor2array 16 | 17 | import matplotlib as mpl 18 | import matplotlib.cm as cm 19 | 20 | parser = argparse.ArgumentParser(description='Inference script for a DispNet trained with \ 21 | the Structure from Motion Learner, run on the VIVID dataset', 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | 24 | parser.add_argument('data', metavar='DIR', help='path to dataset') 25 | parser.add_argument("--sequence", default='.', type=str, help="sequence folder to run inference on") 26 | parser.add_argument("--output-dir", default='output', type=str, help="Output directory") 27 | parser.add_argument("--img-exts", default='jpg', choices=['png', 'jpg', 'bmp'], nargs='*', type=str, help="image extensions to glob") 28 | 29 | parser.add_argument('--resnet-layers', required=True, type=int, default=18, choices=[18, 50], 30 | help='depth network architecture.') 31 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers') 32 | parser.add_argument('-b', '--batch-size', default=1, type=int, metavar='N', help='mini-batch size') 33 | 34 | parser.add_argument('--interval', type=int, help='Interval of sequence', metavar='N', default=1) 35 | parser.add_argument('--sequence_length', type=int, help='Length of sequence', metavar='N', default=3) 36 | parser.add_argument('--pretrained-model', dest='pretrained_model', default=None, metavar='PATH', help='path to pre-trained model') 37 | parser.add_argument('--scene_type', type=str, choices=['indoor', 'outdoor'], default='indoor', required=True) 38 | 39 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 40 | 41 | def depth_visualizer(inv_depth): 42 | """ 43 | Args: 44 | inv_depth (HxW): inverse depth data 45 | Returns: 46 | vis_data (HxWx3): depth visualization (RGB) 47 | """ 48 | inv_depth = inv_depth.squeeze().detach().cpu() 49 | vmax = np.percentile(inv_depth, 98) 50 | normalizer = mpl.colors.Normalize(vmin=inv_depth.min(), vmax=vmax) 51 | mapper = cm.ScalarMappable(norm=normalizer, cmap='magma') 52 | vis_data = (mapper.to_rgba(inv_depth)[:, :, :3] * 255).astype(np.uint8) 53 | return vis_data 54 | 55 | @torch.no_grad() 56 | def main(): 57 | args = parser.parse_args() 58 | 59 | ArrToTen = custom_transforms.ArrayToTensor(max_value=2**14) 60 | ArrToTen4Loss = custom_transforms.ArrayToTensorWithLocalWindow() 61 | TenThrRearran = custom_transforms.TensorThermalRearrange(bin_num=30, CLAHE_clip=2) 62 | normalize = custom_transforms.Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225]) 63 | 64 | transform4input = custom_transforms.Compose([ArrToTen, normalize]) 65 | transform4loss = custom_transforms.Compose([ArrToTen4Loss, TenThrRearran, normalize]) 66 | 67 | from dataloader.VIVID_validation_folders import ValidationSet 68 | val_set = ValidationSet( 69 | args.data, 70 | tf_input = transform4input, 71 | tf_loss = transform4loss, 72 | sequence_length = args.sequence_length, 73 | interval = args.interval, 74 | scene_type = args.scene_type, 75 | inference_folder=args.sequence, 76 | ) 77 | 78 | val_loader = torch.utils.data.DataLoader( 79 | val_set, batch_size=args.batch_size,
shuffle=False, 80 | num_workers=args.workers, pin_memory=True) 81 | 82 | # 1. Load models 83 | # create model 84 | print("=> creating model") 85 | disp_pose_net = models.DispPoseResNet(args.resnet_layers, False, num_channel=1).to(device) 86 | disp_net = disp_pose_net.DispResNet 87 | 88 | # load parameters 89 | print("=> using pre-trained weights for DispResNet") 90 | weights = torch.load(args.pretrained_model) 91 | disp_net.load_state_dict(weights['state_dict']) 92 | disp_net.eval() 93 | 94 | # 2. Load dataset 95 | output_dir = Path(args.output_dir+'/'+args.sequence) 96 | output_dir.makedirs_p() 97 | (output_dir/'thr_img').makedirs_p() 98 | (output_dir/'thr_invdepth').makedirs_p() 99 | 100 | disp_net.eval() 101 | 102 | for idx, (tgt_img, tgt_img_vis, depth_gt) in enumerate(val_loader): 103 | 104 | # original validate_with_gt param 105 | tgt_img = tgt_img.to(device) 106 | depth_gt = depth_gt.to(device) 107 | 108 | # compute output 109 | output_disp = disp_net(tgt_img) 110 | 111 | tgt_img_vis = (255*(0.45 + tgt_img_vis.squeeze().detach().cpu().numpy()*0.225)).astype(np.uint8) 112 | tgt_disp = depth_visualizer(output_disp) 113 | 114 | # Save images 115 | file_name = '{:06d}'.format(idx) 116 | imsave(output_dir/'thr_img'/'{}.{}'.format(file_name, args.img_exts), tgt_img_vis) 117 | imsave(output_dir/'thr_invdepth'/'{}.{}'.format(file_name, args.img_exts), tgt_disp) 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /dataloader/VIVID_validation_folders.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | import numpy as np 4 | 5 | import math 6 | import random 7 | from imageio import imread 8 | from path import Path 9 | 10 | def load_as_float(path): 11 | return np.expand_dims(imread(path).astype(np.float32), axis=2) # HW -> HW1 12 | 13 | class ValidationSet(data.Dataset): 14 | """A sequence data loader where the files are arranged in this way: 15 | root/scene_1/Thermal/0000000.jpg 16 | root/scene_1/Depth/0000000.npy 17 | root/scene_1/Thermal/0000001.jpg 18 | root/scene_1/Depth/0000001.npy 19 | .. 20 | root/scene_2/0000000.jpg 21 | root/scene_2/0000000.npy 22 | . 
23 | 24 | transform functions must take in a list a images and a numpy array which can be None 25 | """ 26 | 27 | def __init__(self, root, tf_input=None, tf_loss=None, inference_folder = '', sequence_length=3, interval=1, scene_type='indoor'): 28 | self.root = Path(root) 29 | 30 | if inference_folder == '' : 31 | if scene_type == 'indoor': 32 | folder_list_path = self.root/'val_indoor.txt' 33 | elif scene_type == 'outdoor': 34 | folder_list_path = self.root/'val_outdoor.txt' 35 | self.folders = [self.root/folder[:-1] for folder in open(folder_list_path)] 36 | else: 37 | self.folders = [self.root/inference_folder] 38 | 39 | self.tf_input = tf_input 40 | self.tf_loss = tf_loss 41 | 42 | self.crawl_folders(sequence_length, interval) 43 | 44 | def crawl_folders(self, sequence_length=3, interval=1): 45 | sequence_set = [] 46 | demi_length = (sequence_length-1)//2 + interval - 1 47 | shifts = list(range(-demi_length, demi_length + 1)) 48 | for i in range(1, 2*demi_length): 49 | shifts.pop(1) 50 | 51 | for folder in self.folders: 52 | imgs = sorted((folder/"Thermal").files('*.png')) 53 | for i in range(demi_length, len(imgs)-demi_length): 54 | depth = folder/"Depth_T"/(imgs[i].name[:-4] + '.npy') 55 | sample = {'tgt_img': imgs[i], 'tgt_depth': depth } 56 | sequence_set.append(sample) 57 | 58 | self.samples = sequence_set 59 | 60 | def __getitem__(self, index): 61 | sample = self.samples[index] 62 | tgt_img = load_as_float(sample['tgt_img']) 63 | depth = np.load(sample['tgt_depth']).astype(np.float32) 64 | 65 | img_input, _ = self.tf_input([tgt_img], None) 66 | img_loss, _ = self.tf_loss([tgt_img], None) # used for visualization only 67 | 68 | tgt_img_input = img_input[0] 69 | tgt_img_loss = img_loss[0] 70 | 71 | return tgt_img_input, tgt_img_loss, depth 72 | 73 | def __len__(self): 74 | return len(self.samples) 75 | 76 | 77 | class ValidationSetPose(data.Dataset): 78 | """A sequence data loader where the files are arranged in this way: 79 | root/scene_1/Thermal/0000000.jpg 80 | root/scene_1/Depth/0000000.npy 81 | root/scene_1/Thermal/0000001.jpg 82 | root/scene_1/Depth/0000001.npy 83 | .. 84 | root/scene_2/0000000.jpg 85 | root/scene_2/0000000.npy 86 | . 
87 | 88 | transform functions must take in a list of images and a numpy array which can be None 89 | """ 90 | def __init__(self, root, tf_input=None, sequence_length=3, interval=1, scene_type='indoor'): 91 | self.root = Path(root) 92 | if scene_type == 'indoor': 93 | scene_list_path = self.root/'val_indoor.txt' 94 | elif scene_type == 'outdoor': 95 | scene_list_path = self.root/'val_outdoor.txt' 96 | 97 | self.folders = [self.root/folder[:-1] for folder in open(scene_list_path)] 98 | self.tf_input = tf_input 99 | self.crawl_folders(sequence_length, step=1) 100 | 101 | def crawl_folders(self, sequence_length=3, step=1): 102 | sequence_set = [] 103 | demi_length = (sequence_length - 1) // 2 104 | shift_range = np.array([step*i for i in range(-demi_length, demi_length + 1)]).reshape(1, -1) 105 | 106 | for folder in self.folders: 107 | imgs = sorted((folder/"Thermal").files('*.png')) 108 | poses = np.genfromtxt(folder/'poses_T.txt').astype(np.float64).reshape(-1, 3, 4) 109 | 110 | # construct snippet sequences (sequence_length frames each; 5 at test time) 111 | tgt_indices = np.arange(demi_length, len(imgs) - demi_length).reshape(-1, 1) 112 | snippet_indices = shift_range + tgt_indices 113 | 114 | for indices in snippet_indices : 115 | sample = {'imgs' : [], 'poses' : []} 116 | for i in indices : 117 | sample['imgs'].append(imgs[i]) 118 | sample['poses'].append(poses[i]) 119 | sequence_set.append(sample) 120 | 121 | self.samples = sequence_set 122 | 123 | def __getitem__(self, index): 124 | sample = self.samples[index] 125 | imgs = [load_as_float(img) for img in sample['imgs']] 126 | imgs, _ = self.tf_input(imgs, None) 127 | 128 | poses = np.stack([pose for pose in sample['poses']]) 129 | first_pose = poses[0] 130 | poses[:,:,-1] -= first_pose[:,-1] 131 | compensated_poses = np.linalg.inv(first_pose[:,:3]) @ poses 132 | 133 | return imgs, compensated_poses 134 | 135 | def __len__(self): 136 | return len(self.samples) 137 | -------------------------------------------------------------------------------- /test_pose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from skimage.transform import resize as imresize 3 | import numpy as np 4 | from path import Path 5 | import argparse 6 | from tqdm import tqdm 7 | # from imageio import imread 8 | 9 | import sys 10 | sys.path.append('./common/') 11 | import models 12 | from loss.inverse_warp import pose_vec2mat 13 | 14 | parser = argparse.ArgumentParser(description='Script for PoseNet testing with corresponding ground truth from the VIVID dataset', 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument("--pretrained-posenet", required=True, type=str, help="pretrained PoseNet path") 17 | parser.add_argument("--img-height", default=256, type=int, help="Image height") 18 | parser.add_argument("--img-width", default=320, type=int, help="Image width") 19 | parser.add_argument("--no-resize", action='store_true', help="no resizing is done") 20 | 21 | parser.add_argument("--dataset-dir", type=str, help="Dataset directory") 22 | parser.add_argument('--sequence-length', type=int, metavar='N', help='sequence length for testing', default=5) 23 | parser.add_argument("--sequences", default=['indoor_aggresive_dark'], type=str, nargs='*', help="sequences to test") 24 | parser.add_argument("--output-dir", default=None, type=str, help="Output directory for saving predictions in a big 3D numpy file") 25 | parser.add_argument('--resnet-layers', required=True, type=int, default=18, choices=[18, 50], help='depth network architecture.')
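# Example invocation (a sketch only; the paths below are placeholders, not
# files shipped with this repository):
#   python test_pose.py --resnet-layers 18 \
#       --pretrained-posenet /path/to/pose_checkpoint.pth.tar \
#       --dataset-dir /path/to/VIVID_256 \
#       --sequences indoor_robust_dark outdoor_robust_night1 \
#       --output-dir results/pose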
26 | 27 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 28 | 29 | def load_tensor_image(img, args): 30 | h,w,_ = img.shape 31 | if (h != args.img_height or w != args.img_width): 32 | img = imresize(img, (args.img_height, args.img_width)).astype(np.float32) 33 | img = np.transpose(img, (2, 0, 1)) 34 | img = (torch.from_numpy(img).float() / 2**14) 35 | tensor_img = ((img.unsqueeze(0)-0.45)/0.225).to(device) 36 | return tensor_img 37 | 38 | @torch.no_grad() 39 | def main(): 40 | args = parser.parse_args() 41 | 42 | # load models 43 | disp_pose_net = models.DispPoseResNet(args.resnet_layers, False, num_channel=1).to(device) 44 | pose_net = disp_pose_net.PoseResNet 45 | 46 | weights = torch.load(args.pretrained_posenet) 47 | pose_net.load_state_dict(weights['state_dict'], strict=False) 48 | pose_net.eval() 49 | 50 | seq_length = 5 51 | 52 | # load data loader 53 | from eval_vivid.pose_evaluation_utils import test_framework_VIVID as test_framework 54 | dataset_dir = Path(args.dataset_dir) 55 | framework = test_framework(dataset_dir, args.sequences, seq_length=seq_length, step=1) 56 | 57 | print('{} snippets to test'.format(len(framework))) 58 | errors = np.zeros((len(framework), 2), np.float32) 59 | if args.output_dir is not None: 60 | output_dir = Path(args.output_dir) 61 | output_dir.makedirs_p() 62 | predictions_array = np.zeros((len(framework), seq_length, 3, 4)) 63 | 64 | for j, sample in enumerate(tqdm(framework)): 65 | imgs = sample['imgs'] 66 | squence_imgs = [] 67 | for i, img in enumerate(imgs): 68 | img = load_tensor_image(img, args) 69 | squence_imgs.append(img) 70 | 71 | global_pose = np.eye(4) 72 | poses = [] 73 | poses.append(global_pose[0:3, :]) 74 | 75 | for iter in range(seq_length - 1): 76 | pose = pose_net(squence_imgs[iter], squence_imgs[iter + 1]) 77 | pose_mat = pose_vec2mat(pose).squeeze(0).cpu().numpy() 78 | 79 | pose_mat = np.vstack([pose_mat, np.array([0, 0, 0, 1])]) 80 | global_pose = global_pose @ np.linalg.inv(pose_mat) 81 | poses.append(global_pose[0:3, :]) 82 | 83 | final_poses = np.stack(poses, axis=0) 84 | 85 | if args.output_dir is not None: 86 | predictions_array[j] = final_poses 87 | 88 | ATE, RE = compute_pose_error(sample['poses'], final_poses) 89 | errors[j] = ATE, RE 90 | 91 | mean_errors = errors.mean(0) 92 | std_errors = errors.std(0) 93 | error_names = ['ATE', 'RE'] 94 | print('') 95 | print("Results") 96 | print("\t {:>10}, {:>10}".format(*error_names)) 97 | print("mean \t {:10.4f}, {:10.4f}".format(*mean_errors)) 98 | print("std \t {:10.4f}, {:10.4f}".format(*std_errors)) 99 | 100 | if args.output_dir is not None: 101 | np.save(output_dir/'predictions.npy', predictions_array) 102 | 103 | def compute_pose_error(gt, pred): 104 | RE = 0 105 | snippet_length = gt.shape[0] 106 | scale_factor = np.sum(gt[:, :, -1] * pred[:, :, -1])/np.sum(pred[:, :, -1] ** 2) 107 | ATE = np.linalg.norm((gt[:, :, -1] - scale_factor * pred[:, :, -1]).reshape(-1)) 108 | for gt_pose, pred_pose in zip(gt, pred): 109 | # Residual matrix to which we compute angle's sin and cos 110 | R = gt_pose[:, :3] @ np.linalg.inv(pred_pose[:, :3]) 111 | s = np.linalg.norm([R[0, 1]-R[1, 0], 112 | R[1, 2]-R[2, 1], 113 | R[0, 2]-R[2, 0]]) 114 | c = np.trace(R) - 1 115 | # Note: we actually compute double of cos and sin, but arctan2 is invariant to scale 116 | RE += np.arctan2(s, c) 117 | 118 | return ATE/snippet_length, RE/snippet_length 119 | 120 | 121 | def compute_pose(pose_net, tgt_img, ref_imgs): 122 | poses = [] 123 | for ref_img in ref_imgs: 124 | 
pose = pose_net(tgt_img, ref_img).unsqueeze(1) 125 | poses.append(pose) 126 | poses = torch.cat(poses, 1) 127 | return poses 128 | 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /common/utils/custom_transforms.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import random 4 | import numpy as np 5 | import cv2 6 | 7 | '''Set of tranform random routines that takes list of inputs as arguments, 8 | in order to have random but coherent transformations.''' 9 | 10 | class Compose(object): 11 | def __init__(self, transforms): 12 | self.transforms = transforms 13 | 14 | def __call__(self, images, intrinsics): 15 | for t in self.transforms: 16 | images, intrinsics = t(images, intrinsics) 17 | return images, intrinsics 18 | 19 | class Normalize(object): 20 | def __init__(self, mean, std): 21 | self.mean = mean 22 | self.std = std 23 | 24 | def __call__(self, images, intrinsics): 25 | for tensor in images: 26 | for t, m, s in zip(tensor, self.mean, self.std): 27 | t.sub_(m).div_(s) 28 | return images, intrinsics 29 | 30 | class ArrayToTensor(object): 31 | """Converts a list of numpy.ndarray (H x W x C) along with a intrinsics matrix to a list of torch.FloatTensor of shape (C x H x W) with a intrinsics tensor.""" 32 | def __init__(self, max_value=255): 33 | self.max_value = max_value 34 | 35 | def __call__(self, images, intrinsics): 36 | tensors = [] 37 | for im in images: 38 | # put it from HWC to CHW format 39 | im = np.transpose(im, (2, 0, 1)) 40 | # handle numpy array 41 | tensors.append(torch.from_numpy(im).float()/self.max_value) 42 | return tensors, intrinsics 43 | 44 | 45 | class ArrayToTensorWithLocalWindow(object): 46 | """Converts a list of numpy.ndarray (H x W x C) along with a intrinsics matrix to a list of torch.FloatTensor of shape (C x H x W) with a intrinsics tensor.""" 47 | def __init__(self, trust_ratio=1.00): 48 | self.max_ratio = trust_ratio 49 | self.min_ratio = 1.0-trust_ratio 50 | 51 | def __call__(self, images, intrinsics): 52 | tensors = [] 53 | # Decide min-max values of local image window 54 | tmin = 0 55 | tmax = 0 56 | for im in images : 57 | im = im.squeeze() #HW 58 | im_srt = np.sort(im.reshape(-1)) 59 | tmax += im_srt[round(len(im_srt)*self.max_ratio)-1] 60 | tmin += im_srt[round(len(im_srt)*self.min_ratio)] 61 | tmax /= len(images) 62 | tmin /= len(images) 63 | 64 | for im in images: 65 | # put it from HWC to CHW format 66 | im = np.transpose(im, (2, 0, 1)) 67 | im = torch.clamp(torch.from_numpy(im).float(), tmin, tmax) 68 | tensors.append((im - tmin)/(tmax - tmin)) #CHW 69 | return tensors, intrinsics 70 | 71 | class TensorThermalRearrange(object): 72 | def __init__(self, bin_num = 30, CLAHE_clip = 2, CLAHE_tilesize = 8): 73 | self.bins = bin_num 74 | self.CLAHE = cv2.createCLAHE(clipLimit=CLAHE_clip, tileGridSize=(CLAHE_tilesize,CLAHE_tilesize)) 75 | def __call__(self, images, intrinsics): 76 | imgs = [] 77 | 78 | tmp_img = torch.cat(images, axis=0) 79 | hist = torch.histc(tmp_img, bins=self.bins) 80 | imgs_max = tmp_img.max() 81 | imgs_min = tmp_img.min() 82 | itv = (imgs_max - imgs_min)/self.bins 83 | total_num = hist.sum() 84 | 85 | for im in images : #CHW 86 | _,H,W = im.shape 87 | mul_mask_ = torch.zeros((self.bins,H,W)) 88 | sub_mask_ = torch.zeros((self.bins,H,W)) 89 | subhist_new_min = imgs_min.clone() 90 | 91 | for x in range(0,self.bins) : 92 | subhist = (im > imgs_min+itv*x) 
& (im <= imgs_min+itv*(x+1)) 93 | if (subhist.sum() == 0): 94 | continue 95 | subhist_new_itv = hist[x]/total_num 96 | mul_mask_[x,...] = subhist * (subhist_new_itv / itv) 97 | sub_mask_[x,...] = subhist * (subhist_new_itv / itv * -(imgs_min+itv*x) + subhist_new_min) 98 | subhist_new_min += subhist_new_itv 99 | 100 | mul_mask = mul_mask_.sum(axis=0, keepdim=True).detach() 101 | sub_mask = sub_mask_.sum(axis=0, keepdim=True).detach() 102 | im_ = mul_mask*im + sub_mask 103 | 104 | im_ = self.CLAHE.apply((im_.squeeze()*255).numpy().astype(np.uint8)).astype(np.float32) 105 | im_ = np.expand_dims(im_, axis=2) 106 | img_out = torch.from_numpy(np.transpose(im_/255., (2, 0, 1))) 107 | imgs.append(img_out) #CHW 108 | return imgs, intrinsics 109 | 110 | class RandomHorizontalFlip(object): 111 | """Randomly horizontally flips the given numpy array with a probability of 0.5""" 112 | 113 | def __call__(self, images, intrinsics): 114 | assert intrinsics is not None 115 | if random.random() < 0.5: 116 | output_intrinsics = np.copy(intrinsics) 117 | output_images = [np.copy(np.fliplr(im)) for im in images] 118 | w = output_images[0].shape[1] 119 | output_intrinsics[:, 0, 2] = w - output_intrinsics[:, 0, 2] 120 | else: 121 | output_images = images 122 | output_intrinsics = intrinsics 123 | return output_images, output_intrinsics 124 | 125 | class RandomScaleCrop(object): 126 | """Randomly zooms images up to 15% and crop them to keep same size as before.""" 127 | 128 | def __call__(self, images, intrinsics): 129 | assert intrinsics is not None 130 | output_intrinsics = np.copy(intrinsics) 131 | 132 | in_h, in_w, ch = images[0].shape 133 | x_scaling, y_scaling = np.random.uniform(1, 1.15, 2) 134 | scaled_h, scaled_w = int(in_h * y_scaling), int(in_w * x_scaling) 135 | 136 | output_intrinsics[:,0] *= x_scaling 137 | output_intrinsics[:,1] *= y_scaling 138 | 139 | if ch == 1: 140 | scaled_images = [np.expand_dims(cv2.resize(im, (scaled_w, scaled_h)), axis=2) for im in images] 141 | else : 142 | scaled_images = [cv2.resize(im, (scaled_w, scaled_h)) for im in images] 143 | 144 | offset_y = np.random.randint(scaled_h - in_h + 1) 145 | offset_x = np.random.randint(scaled_w - in_w + 1) 146 | cropped_images = [im[offset_y:offset_y + in_h, offset_x:offset_x + in_w] for im in scaled_images] 147 | 148 | output_intrinsics[:, 0, 2] -= offset_x 149 | output_intrinsics[:, 1, 2] -= offset_y 150 | 151 | return cropped_images, output_intrinsics -------------------------------------------------------------------------------- /common/data_prepare/prepare_train_data_VIVID.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import scipy.misc 3 | import numpy as np 4 | from pebble import ProcessPool 5 | import sys 6 | from tqdm import tqdm 7 | from path import Path 8 | import cv2 9 | import shutil 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("dataset_dir", metavar='DIR', 13 | help='path to original dataset') 14 | parser.add_argument("--with-depth", action='store_true', 15 | help="If available, will store depth ground truth along with images, for validation") 16 | parser.add_argument("--with-pose", action='store_true', 17 | help="If available, will store pose ground truth along with images, for validation") 18 | parser.add_argument("--no-train-gt", action='store_true', 19 | help="If selected, will delete ground truth depth to save space") 20 | parser.add_argument("--dump-root", type=str, default='dump', help="Where to dump the data") 21 | 
parser.add_argument("--height", type=int, default=256, help="image height") 22 | parser.add_argument("--width", type=int, default=320, help="image width") 23 | parser.add_argument("--num-threads", type=int, default=4, help="number of threads to use") 24 | 25 | args = parser.parse_args() 26 | 27 | def dump_example(args, scene): 28 | scene_name = scene.split('/')[-1] 29 | scene_data = data_loader.collect_scenes(scene,) 30 | dump_dir = args.dump_root/scene_data['rel_path'] 31 | dump_dir.makedirs_p() 32 | dump_dir_ther = dump_dir/"Thermal" 33 | dump_dir_rgb = dump_dir/"RGB" 34 | dump_dir_depth_T = dump_dir/"Depth_T" 35 | dump_dir_depth_RGB = dump_dir/"Depth_RGB" 36 | dump_dir_ther.makedirs_p() 37 | dump_dir_rgb.makedirs_p() 38 | dump_dir_depth_T.makedirs_p() 39 | dump_dir_depth_RGB.makedirs_p() 40 | 41 | # save intrinsic param 42 | intrinsics_T = scene_data['intrinsics_T'] 43 | dump_cam_file = dump_dir/'cam_T.txt' 44 | np.savetxt(dump_cam_file, intrinsics_T) 45 | 46 | intrinsics_RGB = scene_data['intrinsics_RGB'] 47 | dump_cam_file = dump_dir/'cam_RGB.txt' 48 | np.savetxt(dump_cam_file, intrinsics_RGB) 49 | 50 | extrinsics_T2RGB = scene_data['Tr_T2RGB'] 51 | dump_cam_file = dump_dir/'Tr_T2RGB.txt' 52 | np.savetxt(dump_cam_file, extrinsics_T2RGB) 53 | 54 | poses_T_file = dump_dir/'poses_T.txt' 55 | poses_RGB_file = dump_dir/'poses_RGB.txt' 56 | 57 | poses_T = [] 58 | poses_RGB = [] 59 | 60 | # save each files + pose + depth 61 | for sample in data_loader.get_scene_imgs(scene_data): 62 | frame_nb = sample["id"] 63 | cv2.imwrite(dump_dir_rgb/'{}.png'.format(frame_nb), sample['Img_RGB']) 64 | cv2.imwrite(dump_dir_ther/'{}.png'.format(frame_nb), sample['Img_Ther']) 65 | 66 | if "pose_T" in sample.keys(): 67 | poses_T.append(sample["pose_T"].tolist()) 68 | poses_RGB.append(sample["pose_RGB"].tolist()) 69 | if "depth_T" in sample.keys(): 70 | dump_depth_T_file = dump_dir_depth_T/'{}.npy'.format(frame_nb) 71 | np.save(dump_depth_T_file, sample["depth_T"]) 72 | dump_depth_RGB_file = dump_dir_depth_RGB/'{}.npy'.format(frame_nb) 73 | np.save(dump_depth_RGB_file, sample["depth_RGB"]) 74 | 75 | if len(poses_T) != 0: 76 | np.savetxt(poses_T_file, np.array(poses_T).reshape(-1, 12), fmt='%.6e') 77 | np.savetxt(poses_RGB_file, np.array(poses_RGB).reshape(-1, 12), fmt='%.6e') 78 | 79 | if len(dump_dir_rgb.files('*.png')) < 3: 80 | dump_dir.rmtree() 81 | 82 | def extract_well_lit_images(args): 83 | tgt_dir = args.dump_root/'indoor_robust_varying' 84 | dump_dir = args.dump_root/'indoor_robust_varying_well_lit' 85 | dump_dir.makedirs_p() 86 | dump_dir_ther = dump_dir/"Thermal" 87 | dump_dir_rgb = dump_dir/"RGB" 88 | dump_dir_depth_T = dump_dir/"Depth_T" 89 | dump_dir_depth_RGB = dump_dir/"Depth_RGB" 90 | dump_dir_ther.makedirs_p() 91 | dump_dir_rgb.makedirs_p() 92 | dump_dir_depth_T.makedirs_p() 93 | dump_dir_depth_RGB.makedirs_p() 94 | 95 | # read well-lit image list 96 | img_list = np.genfromtxt('./common/data_prepare/well_lit_from_varying.txt').astype(int) # 97 | 98 | for frame_nb in img_list : 99 | dump_img_T_file = dump_dir_ther/'{:06d}.png'.format(frame_nb) 100 | dump_img_RGB_file = dump_dir_rgb/'{:06d}.png'.format(frame_nb) 101 | dump_depth_T_file = dump_dir_depth_T/'{:06d}.npy'.format(frame_nb) 102 | dump_depth_RGB_file = dump_dir_depth_RGB/'{:06d}.npy'.format(frame_nb) 103 | 104 | shutil.copy(tgt_dir/"Thermal"/'{:06d}.png'.format(frame_nb), dump_img_T_file) 105 | shutil.copy(tgt_dir/"RGB"/'{:06d}.png'.format(frame_nb), dump_img_RGB_file) 106 | shutil.copy(tgt_dir/"Depth_T"/'{:06d}.npy'.format(frame_nb), 
dump_depth_T_file) 107 | shutil.copy(tgt_dir/"Depth_RGB"/'{:06d}.npy'.format(frame_nb), dump_depth_RGB_file) 108 | 109 | def main(): 110 | args.dump_root = Path(args.dump_root) 111 | args.dump_root.mkdir_p() 112 | 113 | global data_loader 114 | 115 | from VIVID_raw_loader import VIVIDRawLoader 116 | data_loader = VIVIDRawLoader(args.dataset_dir, 117 | img_height=args.height, 118 | img_width=args.width, 119 | get_depth=args.with_depth, 120 | get_pose=args.with_pose) 121 | 122 | n_scenes = len(data_loader.scenes) 123 | print('Found {} potential scenes'.format(n_scenes)) 124 | print('Retrieving frames') 125 | if args.num_threads == 1: 126 | for scene in tqdm(data_loader.scenes): 127 | dump_example(args, scene) 128 | else: 129 | with ProcessPool(max_workers=args.num_threads) as pool: 130 | tasks = pool.map(dump_example, [args]*n_scenes, data_loader.scenes) 131 | try: 132 | for _ in tqdm(tasks.result(), total=n_scenes): 133 | pass 134 | except KeyboardInterrupt as e: 135 | tasks.cancel() 136 | raise e 137 | 138 | print('Extracting well-lit images from the varying illumination set') 139 | extract_well_lit_images(args) 140 | 141 | print('Generating train/val/test lists') 142 | with open(args.dump_root / 'train_indoor.txt', 'w') as tf: 143 | for seq in data_loader.indoor_train_list : 144 | tf.write('{}\n'.format(seq)) 145 | with open(args.dump_root / 'val_indoor.txt', 'w') as tf: 146 | for seq in data_loader.indoor_val_list : 147 | tf.write('{}\n'.format(seq)) 148 | with open(args.dump_root / 'test_indoor.txt', 'w') as tf: 149 | for seq in data_loader.indoor_test_list : 150 | tf.write('{}\n'.format(seq)) 151 | with open(args.dump_root / 'train_outdoor.txt', 'w') as tf: 152 | for seq in data_loader.outdoor_train_list : 153 | tf.write('{}\n'.format(seq)) 154 | with open(args.dump_root / 'val_outdoor.txt', 'w') as tf: 155 | for seq in data_loader.outdoor_val_list : 156 | tf.write('{}\n'.format(seq)) 157 | with open(args.dump_root / 'test_outdoor.txt', 'w') as tf: 158 | for seq in data_loader.outdoor_test_list : 159 | tf.write('{}\n'.format(seq)) 160 | 161 | print('Done!') 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maximizing Self-supervision from Thermal Image for Effective Self-supervised Learning of Depth and Ego-motion 2 | 3 | This repository is the official implementation of the paper: 4 | 5 | >Maximizing Self-supervision from Thermal Image for Effective Self-supervised Learning of Depth and Ego-motion 6 | > 7 | >[Ukcheol Shin](https://ukcheolshin.github.io/), [Kyunghyun Lee](https://scholar.google.co.kr/citations?user=WOBfTQoAAAAJ&hl=ko&oi=ao), [Byeong-Uk Lee](https://sites.google.com/view/bulee), In So Kweon 8 | > 9 | >**IEEE Robotics and Automation Letters 2022 & IROS 2022** 10 | > 11 | >[[PDF](https://arxiv.org/abs/2201.04387)] [[Project webpage](https://sites.google.com/view/thermal-monodepth-project-page)] [[Full paper](https://arxiv.org/abs/2201.04387)] [[Youtube](https://youtu.be/qIBcOuLYr70)] 12 | 13 | ## Introduction 14 | Recently, self-supervised learning of depth and ego-motion from thermal images has shown strong robustness and reliability under challenging lighting and weather conditions. 15 | However, inherent thermal image properties such as weak contrast, blurry edges, and noise hinder the generation of effective self-supervision from thermal images.
16 | Therefore, most previous research relies on additional self-supervisory sources such as RGB video, generative models, and Lidar information. 17 | In this paper, we conduct an in-depth analysis of the thermal image characteristics that degrade self-supervision from thermal images. 18 | Based on this analysis, we propose an effective thermal image mapping method that significantly increases image information, such as overall structure, contrast, and details, while preserving temporal consistency. 19 | By resolving this fundamental problem of thermal images, our depth and pose networks trained only with thermal images achieve state-of-the-art results without utilizing any extra self-supervisory source. 20 | To the best of our knowledge, this work is the first self-supervised learning approach to train monocular depth and relative pose networks with only thermal images. 21 |
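At a glance, the proposed mapping redistributes the raw 14-bit thermal intensities so that densely populated temperature ranges receive a wider output range, and then applies CLAHE for local contrast. The snippet below is a minimal, single-image sketch of that idea, not the repository's exact transform: the actual training transform (`TensorThermalRearrange` in `common/utils/custom_transforms.py`) computes the histogram jointly over a temporal window of frames so the mapping stays temporally consistent. Function and variable names here are illustrative only; the bin count (30) and CLAHE clip (2.0) mirror the defaults used in `run_inference.py`.

```python
# Minimal sketch (not the repository's exact transform): piecewise-linear,
# histogram-based remapping of a raw thermal image, followed by CLAHE.
import cv2
import numpy as np

def map_thermal_sketch(img_raw, bins=30, clahe_clip=2.0, tile=8):
    """img_raw: HxW uint16 raw thermal frame -> HxW uint8 enhanced frame."""
    img = img_raw.astype(np.float32)
    lo, hi = img.min(), img.max()
    hist, edges = np.histogram(img, bins=bins, range=(lo, hi))
    total = hist.sum()
    cdf = np.cumsum(hist) / total  # cumulative share of pixels per bin
    # Each input bin is mapped to an output interval proportional to its
    # pixel count, which stretches crowded temperature ranges.
    idx = np.clip(((img - lo) / (hi - lo + 1e-6) * bins).astype(int), 0, bins - 1)
    start = np.concatenate(([0.0], cdf))[idx]                   # CDF before the bin
    frac = (img - edges[idx]) / (edges[idx + 1] - edges[idx] + 1e-6)
    mapped = start + frac * hist[idx] / total                   # values in [0, 1]
    # Local contrast enhancement, mirroring the CLAHE step of the transform.
    clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(tile, tile))
    return clahe.apply((mapped * 255).astype(np.uint8))
```

In training, the same histogram (and hence the same mapping) is shared across all frames of a snippet, which is what keeps photometric differences between remapped frames meaningful for the self-supervised loss.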
34 | Please refer to the video for more descriptions and visual results. 35 | 36 | [![Video Label](https://img.youtube.com/vi/oNu_YVf8sCc/0.jpg)](https://youtu.be/oNu_YVf8sCc) 37 | 38 | ## Main Results 39 | 40 | ### Depth Results 41 | 42 | #### Indoor test set (Well-lit) 43 | 44 | | Models | Abs Rel | Sq Rel | RMSE | RMSE(log) | Acc.1 | Acc.2 | Acc.3 | 45 | |------------|---------|--------|-------|-----------|-------|-------|-------| 46 | | Shin(T) | 0.225 | 0.201 | 0.709 | 0.262 | 0.620 | 0.920 | 0.993 | 47 | | Shin(MS) | 0.156 | 0.111 | 0.527 | 0.197 | 0.783 | 0.975 | 0.997 | 48 | | Ours | 0.152 | 0.121 | 0.538 | 0.196 | 0.814 | 0.965 | 0.992 | 49 | 50 | #### Indoor test set (Low-/Zero-light) 51 | 52 | | Models | Abs Rel | Sq Rel | RMSE | RMSE(log) | Acc.1 | Acc.2 | Acc.3 | 53 | |------------|---------|--------|-------|-----------|-------|-------|-------| 54 | | Shin(T) | 0.232 | 0.222 | 0.740 | 0.268 | 0.618 | 0.907 | 0.987 | 55 | | Shin(MS) | 0.166 | 0.129 | 0.566 | 0.207 | 0.768 | 0.967 | 0.994 | 56 | | Ours | 0.149 | 0.109 | 0.517 | 0.192 | 0.813 | 0.969 | 0.994 | 57 | 58 | #### Outdoor test set (Night-time) 59 | 60 | | Models | Abs Rel | Sq Rel | RMSE | RMSE(log) | Acc.1 | Acc.2 | Acc.3 | 61 | |------------|---------|--------|-------|-----------|-------|-------|-------| 62 | | Shin(T) | 0.157 | 1.179 | 5.802 | 0.211 | 0.750 | 0.948 | 0.985 | 63 | | Shin(MS) | 0.146 | 0.873 | 4.697 | 0.184 | 0.801 | 0.973 | 0.993 | 64 | | Ours | 0.109 | 0.703 | 4.132 | 0.150 | 0.887 | 0.980 | 0.994 | 65 | 66 | 67 | ### Pose Estimation Results 68 | 69 | #### Indoor-static-dark 70 | 71 | |Metric | ATE | RE | 72 | |---------------------|---------|---------| 73 | | Shin(T) | 0.0063 | 0.0092 | 74 | | Shin(MS) | 0.0057 | 0.0089 | 75 | | Ours | 0.0059 | 0.0082 | 76 | 77 | #### Outdoor-night1 78 | 79 | |Metric | ATE | RE | 80 | |---------------------|---------|---------| 81 | | Shin(T) | 0.0571 | 0.0280 | 82 | | Shin(MS) | 0.0562 | 0.0287 | 83 | | Ours | 0.0546 | 0.0287 | 84 | 85 | ## Getting Started 86 | 87 | ### Prerequisite 88 | This codebase was developed and tested with Python 3.7, PyTorch 1.5.1, and CUDA 10.2 on Ubuntu 16.04. 89 | 90 | ```bash 91 | conda env create --file environment.yml 92 | ``` 93 | 94 | ### Pre-trained Model 95 | Our pre-trained models are available at this [link](https://drive.google.com/drive/folders/1ne2yyGMifciHG3uXj6xjc4q_KtX8z4Ki?usp=sharing). 96 | 97 | ### Datasets 98 | 99 | For the raw ViViD dataset, download the data provided on the [official website](https://sites.google.com/view/dgbicra2019-vivid/). 100 | 101 | For our post-processed dataset, please refer to [this GitHub page](https://github.com/UkcheolShin/ThermalSfMLearner-MS). 102 | 103 | After downloading our post-processed dataset, unzip the files to form the structure below. 104 | 105 | #### Expected dataset structure for the post-processed ViViD dataset: 106 | ``` 107 | KAIST_VIVID/ 108 | calibration/ 109 | cali_ther_to_rgb.yaml, ... 110 | indoor_aggressive_local/ 111 | RGB/ 112 | data/ 113 | 000001.png, 000002.png, ... 114 | timestamps.txt 115 | Thermal/ 116 | data/ 117 | timestamps.txt 118 | Lidar/ 119 | data/ 120 | timestamps.txt 121 | Warped_Depth/ 122 | data/ 123 | timestamps.txt 124 | avg_velocity_thermal.txt 125 | poses_thermal.txt 126 | ... 127 | indoor_aggressive_global/ 128 | ... 129 | outdoor_robust_day1/ 130 | ... 131 | outdoor_robust_night1/ 132 | ... 133 | ``` 134 | 135 | Once your data matches the structure above, you can generate the training/testing dataset by running the script below.
136 | ```bash 137 | sh scripts/prepare_vivid_data.sh 138 | ``` 139 | 140 | ### Training 141 | 142 | The "scripts" folder provides several examples for training, testing, and visualization. 143 | 144 | You can train the depth and pose model on vivid dataset by running 145 | ```bash 146 | sh scripts/train_vivid_resnet18_indoor.sh 147 | sh scripts/train_vivid_resnet18_outdoor.sh 148 | ``` 149 | Then you can start a `tensorboard` session in this folder by 150 | ```bash 151 | tensorboard --logdir=checkpoints/ 152 | ``` 153 | and visualize the training progress by opening [https://localhost:6006](https://localhost:6006) on your browser. 154 | 155 | 156 | ### Evaluation 157 | 158 | You can evaluate depth and pose by running 159 | ```bash 160 | bash scripts/test_vivid_indoor.sh 161 | bash scripts/test_vivid_outdoor.sh 162 | ``` 163 | and visualize depth by running 164 | ```bash 165 | bash scripts/run_vivid_inference.sh 166 | ``` 167 | You can comprehensively see the overall results by running 168 | ```bash 169 | bash scripts/display_result.sh 170 | ``` 171 | 172 | ## Citation 173 | Please cite the following paper if you use our work, parts of this code, and pre-processed dataset in your research. 174 | 175 | @ARTICLE{shin2022maximize, 176 | author={Shin, Ukcheol and Lee, Kyunghyun and Lee, Byeong-Uk and Kweon, In So}, 177 | journal={IEEE Robotics and Automation Letters}, 178 | title={Maximizing Self-Supervision From Thermal Image for Effective Self-Supervised Learning of Depth and Ego-Motion}, 179 | year={2022}, 180 | volume={7}, 181 | number={3}, 182 | pages={7771-7778}, 183 | doi={10.1109/LRA.2022.3185382} 184 | } 185 | 186 | ## Related projects 187 | 188 | * [SfMLearner-Pytorch](https://github.com/ClementPinard/SfmLearner-Pytorch) (CVPR 2017) 189 | * [SC-SfMLearner-Pytorch](https://github.com/JiawangBian/SC-SfMLearner-Release) (NeurIPS 2019) 190 | * [Thermal-SfMLearner-Pytorch](https://github.com/UkcheolShin/ThermalSfMLearner-MS) (RA-L 2021 & ICRA 2022) 191 | -------------------------------------------------------------------------------- /common/loss/loss_functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | from torch import nn 4 | import torch.nn.functional as F 5 | from .inverse_warp import inverse_warp2 6 | import math 7 | import numpy as np 8 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 9 | 10 | class SSIM(nn.Module): 11 | """Layer to compute the SSIM loss between a pair of images 12 | """ 13 | def __init__(self): 14 | super(SSIM, self).__init__() 15 | k=3 16 | self.mu_x_pool = nn.AvgPool2d(k, 1) 17 | self.mu_y_pool = nn.AvgPool2d(k, 1) 18 | self.sig_x_pool = nn.AvgPool2d(k, 1) 19 | self.sig_y_pool = nn.AvgPool2d(k, 1) 20 | self.sig_xy_pool = nn.AvgPool2d(k, 1) 21 | 22 | self.refl = nn.ReflectionPad2d(k//2) 23 | 24 | self.C1 = 0.01 ** 2 25 | self.C2 = 0.03 ** 2 26 | 27 | def forward(self, x, y): 28 | x = self.refl(x) 29 | y = self.refl(y) 30 | 31 | mu_x = self.mu_x_pool(x) 32 | mu_y = self.mu_y_pool(y) 33 | 34 | sigma_x = self.sig_x_pool(x ** 2) - mu_x ** 2 35 | sigma_y = self.sig_y_pool(y ** 2) - mu_y ** 2 36 | sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y 37 | 38 | SSIM_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2) 39 | SSIM_d = (mu_x ** 2 + mu_y ** 2 + self.C1) * (sigma_x + sigma_y + self.C2) 40 | 41 | return torch.clamp((1 - SSIM_n / SSIM_d) / 2, 0, 1) 42 | 43 | compute_ssim_loss = SSIM().to(device) 44 | 45 | def gradient(D) : 46 | 
D_dx = torch.abs(D[:, :, :, :-1] - D[:, :, :, 1:]) 47 | D_dy = torch.abs(D[:, :, :-1, :] - D[:, :, 1:, :]) 48 | return D_dx, D_dy 49 | 50 | def gradient_img(I) : 51 | I_dx = torch.mean(torch.abs(I[:, :, :, :-1] - I[:, :, :, 1:]), 1, keepdim=True) 52 | I_dy = torch.mean(torch.abs(I[:, :, :-1, :] - I[:, :, 1:, :]), 1, keepdim=True) 53 | return I_dx, I_dy 54 | 55 | def compute_smooth_loss(tgt_depth, tgt_img, ref_depths, ref_imgs): 56 | def get_smooth_loss(disp, img): 57 | """Computes the smoothness loss for a disparity image 58 | The color image is used for edge-aware smoothness 59 | """ 60 | 61 | # normalize 62 | mean_disp = disp.mean(2, True).mean(3, True) 63 | norm_disp = disp / (mean_disp + 1e-7) 64 | disp = norm_disp 65 | 66 | grad_disp_x = torch.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:]) 67 | grad_disp_y = torch.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :]) 68 | 69 | grad_img_x = torch.mean(torch.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), 1, keepdim=True) 70 | grad_img_y = torch.mean(torch.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), 1, keepdim=True) 71 | 72 | grad_disp_x *= torch.exp(-grad_img_x) 73 | grad_disp_y *= torch.exp(-grad_img_y) 74 | 75 | return grad_disp_x.mean() + grad_disp_y.mean() 76 | 77 | loss = get_smooth_loss(tgt_depth[0], tgt_img) 78 | 79 | for ref_depth, ref_img in zip(ref_depths, ref_imgs): 80 | loss += get_smooth_loss(ref_depth[0], ref_img) 81 | 82 | return loss 83 | 84 | def compute_photo_and_geometry_loss(tgt_img, ref_imgs, intrinsics, tgt_depth, ref_depths, poses, poses_inv, max_scales, ssim_param, with_ssim, with_mask, with_auto_mask, padding_mode): 85 | """Compute the image reconstruction loss and geometric consistency loss between a pair of images 86 | """ 87 | 88 | photo_loss = 0 89 | geometry_loss = 0 90 | 91 | num_scales = min(len(tgt_depth), max_scales) 92 | for ref_img, ref_depth, pose, pose_inv in zip(ref_imgs, ref_depths, poses, poses_inv): 93 | for s in range(num_scales): 94 | 95 | # upsample depth 96 | b, _, h, w = tgt_img.size() 97 | tgt_img_scaled = tgt_img 98 | ref_img_scaled = ref_img 99 | intrinsic_scaled = intrinsics 100 | if s == 0: 101 | tgt_depth_scaled = tgt_depth[s] 102 | ref_depth_scaled = ref_depth[s] 103 | else: 104 | tgt_depth_scaled = F.interpolate(tgt_depth[s], (h, w), mode='nearest') 105 | ref_depth_scaled = F.interpolate(ref_depth[s], (h, w), mode='nearest') 106 | 107 | photo_loss1, geometry_loss1 = compute_pairwise_loss(tgt_img_scaled, ref_img_scaled, tgt_depth_scaled, ref_depth_scaled, pose, 108 | intrinsic_scaled, ssim_param, with_ssim, with_mask, with_auto_mask, padding_mode) 109 | photo_loss2, geometry_loss2 = compute_pairwise_loss(ref_img_scaled, tgt_img_scaled, ref_depth_scaled, tgt_depth_scaled, pose_inv, 110 | intrinsic_scaled, ssim_param, with_ssim, with_mask, with_auto_mask, padding_mode) 111 | 112 | photo_loss += (photo_loss1 + photo_loss2) 113 | geometry_loss += (geometry_loss1 + geometry_loss2) 114 | 115 | return photo_loss, geometry_loss 116 | 117 | 118 | def compute_pairwise_loss(tgt_img, ref_img, tgt_depth, ref_depth, pose, intrinsic, ssim_param, with_ssim, with_mask, with_auto_mask, padding_mode): 119 | """Compute the image reconstruction loss and geometric consistency loss between a pair of images 120 | """ 121 | ref_img_warped, valid_mask, projected_depth, computed_depth = inverse_warp2(ref_img, tgt_depth, ref_depth, pose, intrinsic, padding_mode) 122 | 123 | diff_img = (tgt_img - ref_img_warped).abs().clamp(0, 1) 124 | diff_depth = ((computed_depth - projected_depth).abs() / (computed_depth + 
projected_depth)).clamp(0,1) 125 | 126 | if with_ssim == True: 127 | ssim_map = compute_ssim_loss(tgt_img, ref_img_warped) 128 | diff_img = ((1 - ssim_param) * diff_img + ssim_param * ssim_map) 129 | 130 | if with_auto_mask == True: 131 | diff_img_ = (tgt_img - ref_img).abs().clamp(0,1) 132 | ssim_map_ = compute_ssim_loss(tgt_img, ref_img) 133 | diff_img_ = ((1 - ssim_param) * diff_img_ + ssim_param * ssim_map_) # we follow original automask implementation of Monodepth v2 134 | auto_mask = (diff_img.mean(dim=1, keepdim=True) < diff_img_.mean(dim=1, keepdim=True)).float() 135 | valid_mask *= auto_mask 136 | 137 | if with_mask == True: 138 | weight_mask = (1 - diff_depth) 139 | diff_img = diff_img * weight_mask 140 | 141 | # compute all loss 142 | reconstruction_loss = mean_on_mask(diff_img, valid_mask) 143 | geometry_consistency_loss = mean_on_mask(diff_depth, valid_mask) 144 | 145 | return reconstruction_loss, geometry_consistency_loss 146 | 147 | # compute mean value given a binary mask 148 | def mean_on_mask(diff, valid_mask): 149 | 150 | if valid_mask.shape != diff.shape : 151 | mask = valid_mask.expand_as(diff) 152 | else: 153 | mask = valid_mask 154 | 155 | if mask.sum() > 10000: 156 | mean_value = (diff * mask).sum() / mask.sum() 157 | else: 158 | mean_value = torch.tensor(0).float().to(device) 159 | return mean_value 160 | 161 | @torch.no_grad() 162 | def compute_errors(gt, pred, scene): 163 | abs_diff, abs_rel, sq_rel, a1, a2, a3 = 0, 0, 0, 0, 0, 0 164 | batch_size, h, w = gt.size() 165 | 166 | ''' 167 | crop used by Garg ECCV16 to reprocude Eigen NIPS14 results 168 | construct a mask of False values, with the same size as target 169 | and then set to True values inside the crop 170 | ''' 171 | crop_mask = gt[0] != gt[0] 172 | crop_mask[:,:] = 1 173 | if scene == 'indoor' : 174 | max_depth = 10 175 | elif scene == 'outdoor': 176 | max_depth = 80 177 | 178 | for current_gt, current_pred in zip(gt, pred): 179 | valid = (current_gt > 0.1) & (current_gt < max_depth) 180 | valid = valid & crop_mask 181 | 182 | valid_gt = current_gt[valid] 183 | valid_pred = current_pred[valid].clamp(1e-3, max_depth) 184 | 185 | valid_pred = valid_pred * torch.median(valid_gt)/torch.median(valid_pred) 186 | 187 | thresh = torch.max((valid_gt / valid_pred), (valid_pred / valid_gt)) 188 | a1 += (thresh < 1.25).float().mean() 189 | a2 += (thresh < 1.25 ** 2).float().mean() 190 | a3 += (thresh < 1.25 ** 3).float().mean() 191 | 192 | abs_diff += torch.mean(torch.abs(valid_gt - valid_pred)) 193 | abs_rel += torch.mean(torch.abs(valid_gt - valid_pred) / valid_gt) 194 | 195 | sq_rel += torch.mean(((valid_gt - valid_pred)**2) / valid_gt) 196 | 197 | return [metric.item() / batch_size for metric in [abs_diff, abs_rel, sq_rel, a1, a2, a3]] -------------------------------------------------------------------------------- /common/data_prepare/VIVID_raw_loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from path import Path 4 | import cv2 5 | 6 | def transform_from_rot_trans(R, t): 7 | """Transforation matrix from rotation matrix and translation vector.""" 8 | R = R.reshape(3, 3) 9 | t = t.reshape(3, 1) 10 | return np.vstack((np.hstack([R, t]), [0, 0, 0, 1])) 11 | 12 | class VIVIDRawLoader(object): 13 | def __init__(self, 14 | dataset_dir, 15 | img_height=256, 16 | img_width=320, 17 | min_speed=0.15, # m/sec 18 | get_depth=False, 19 | get_pose=False): 20 | dir_path = Path(__file__).realpath().dirname() 21 | 22 | 
self.dataset_dir = Path(dataset_dir) 23 | self.img_height = img_height 24 | self.img_width = img_width 25 | self.folder_list = ['RGB', 'Thermal', 'Warped_Depth'] 26 | self.date_list = ['indoor_aggresive_global', 'indoor_robust_dark', 'indoor_robust_varying', 'indoor_unstable_local', 'outdoor_robust_night1', 'indoor_aggresive_dark', 'indoor_robust_global', 'indoor_unstable_dark', 'outdoor_robust_day1', 'outdoor_robust_night2', 'indoor_aggresive_local', 'indoor_robust_local', 'indoor_unstable_global', 'outdoor_robust_day2'] 27 | self.indoor_train_list = ['indoor_aggresive_global', 'indoor_unstable_local', 'indoor_robust_global', 'indoor_robust_local', 'indoor_unstable_global'] 28 | self.indoor_val_list = ['indoor_robust_dark', 'indoor_aggresive_local'] 29 | self.indoor_test_list = ['indoor_robust_dark', 'indoor_robust_varying', 'indoor_aggresive_dark', 'indoor_unstable_dark', 'indoor_aggresive_local'] 30 | self.outdoor_train_list = ['outdoor_robust_day1', 'outdoor_robust_day2'] 31 | self.outdoor_val_list = ['outdoor_robust_night1'] 32 | self.outdoor_test_list = ['outdoor_robust_night1', 'outdoor_robust_night2'] 33 | self.min_speed = min_speed 34 | self.get_depth = get_depth 35 | self.get_pose = get_pose 36 | self.collect_train_folders() 37 | 38 | # collect dataset sequence path. 39 | def collect_train_folders(self): 40 | self.scenes = [] 41 | for date in self.date_list: 42 | self.scenes.append((self.dataset_dir/date)) 43 | 44 | # collect subset from target 45 | def collect_scenes(self, drive): 46 | scene_data = {'dir': drive, 'speed': [], 'frame_id': [], 'pose_T':[], 'pose_RGB':[], 'rel_path': drive.name} 47 | ther2rgb = self.read_raw_calib_file(drive.parent/'calibration'/'calib_ther_to_rgb.yaml') 48 | # event2rgb = self.read_raw_calib_file(drive.parent/'calibration'/'calib_event_to_rgb.yaml') 49 | # lidar2rgb = self.read_raw_calib_file(drive.parent/'calibration'/'calib_lidar_to_rgb.yaml') 50 | # rgb2imu = self.read_raw_calib_file(drive.parent/'calibration'/'calib_rgb_to_imu.yaml') 51 | # vicon2rgb = self.read_raw_calib_file(drive.parent/'calibration'/'calib_vicon_to_rgb.yaml') 52 | # ther2rgb_mat = transform_from_rot_trans(ther2rgb['R'], ther2rgb['T']) 53 | 54 | # read pose in target coord 55 | TgtPoseVec_T = np.genfromtxt(drive/'poses_thermal.txt') # len x 12 (3x4 matrix) 56 | TgtPoseMat_T = TgtPoseVec_T.reshape(-1,3,4) 57 | 58 | TgtPoseVec_RGB = np.genfromtxt(drive/'poses_RGB.txt') # len x 12 (3x4 matrix) 59 | TgtPoseMat_RGB = TgtPoseVec_RGB.reshape(-1,3,4) 60 | 61 | # read avg speed in target coor 62 | TgtAvgVelo_T = np.genfromtxt(drive/'avg_velocity_thermal.txt') # len x 1 63 | 64 | for n in range(len(TgtAvgVelo_T)): 65 | scene_data['pose_T'].append(TgtPoseMat_T[n]) 66 | scene_data['pose_RGB'].append(TgtPoseMat_RGB[n]) 67 | scene_data['frame_id'].append('{:06d}'.format(n+1)) # files are start from 1, but python indexing start from 0 68 | scene_data['speed'].append(TgtAvgVelo_T[n]) 69 | 70 | sample_T = self.load_image(scene_data, 1, 'T') 71 | if sample_T is None: 72 | return [] 73 | sample_RGB = self.load_image(scene_data, 1, 'RGB') 74 | if sample_RGB is None: 75 | return [] 76 | 77 | scene_data['P_rect_T'] = self.get_P_rect(scene_data, sample_T[1], sample_T[2], 'T') 78 | scene_data['P_rect_RGB'] = self.get_P_rect(scene_data, sample_RGB[1], sample_RGB[2], 'RGB') 79 | scene_data['Tr_T2RGB'] = self.get_T_T2RGB(scene_data) 80 | scene_data['intrinsics_T'] = scene_data['P_rect_T'][:,:3] 81 | scene_data['intrinsics_RGB'] = scene_data['P_rect_RGB'][:,:3] 82 | scene_data['FolderList'] = 
self.folder_list 83 | 84 | return scene_data 85 | 86 | # sampling according to its moving velocity 87 | def get_scene_imgs(self, scene_data): 88 | def construct_sample(scene_data, i, frame_id): 89 | sample = {"id":frame_id} 90 | Img_RGB, Img_Ther = self.load_All_image(scene_data, i) 91 | sample['Img_RGB'] = Img_RGB 92 | sample['Img_Ther'] = Img_Ther 93 | 94 | if self.get_depth: 95 | sample['depth_T'] = self.load_Depth(scene_data, i, 'T') 96 | sample['depth_RGB'] = self.load_Depth(scene_data, i, 'RGB') 97 | if self.get_pose: 98 | sample['pose_T'] = scene_data['pose_T'][i] 99 | sample['pose_RGB'] = scene_data['pose_RGB'][i] 100 | 101 | return sample 102 | 103 | cum_speed = np.zeros(1) 104 | for i, speed in enumerate(scene_data['speed']): 105 | cum_speed += speed 106 | speed_mag = np.linalg.norm(cum_speed) 107 | if speed_mag > self.min_speed: 108 | frame_id = scene_data['frame_id'][i] 109 | yield construct_sample(scene_data, i, frame_id) 110 | cum_speed *= 0 111 | 112 | def get_P_rect(self, scene_data, zoom_x, zoom_y, Dtype): 113 | calib_file = scene_data['dir'].parent/'calibration'/'calib_ther_to_rgb.yaml' 114 | 115 | filedata = self.read_raw_calib_file(calib_file) 116 | if Dtype == 'T': 117 | K_ = np.reshape(filedata['K_Thermal'], (3, 3)) 118 | elif Dtype == 'RGB': 119 | K_ = np.reshape(filedata['K_RGB'], (3, 3)) 120 | P_rect = K_@np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0]]) 121 | P_rect[0] *= zoom_x 122 | P_rect[1] *= zoom_y 123 | return P_rect 124 | 125 | def get_T_T2RGB(self, scene_data): 126 | calib_file = scene_data['dir'].parent/'calibration'/'calib_ther_to_rgb.yaml' 127 | filedata = self.read_raw_calib_file(calib_file) 128 | R = filedata['R'] 129 | T = filedata['T'] 130 | T_T2RGB = transform_from_rot_trans(R,T) 131 | return T_T2RGB 132 | 133 | def load_image(self, scene_data, tgt_idx, Dtype): 134 | if Dtype == 'T': 135 | img_file = scene_data['dir']/'Thermal'/'data'/scene_data['frame_id'][tgt_idx]+'.png' 136 | elif Dtype == 'RGB': 137 | img_file = scene_data['dir']/'RGB'/'data'/scene_data['frame_id'][tgt_idx]+'.png' 138 | 139 | if not img_file.isfile(): 140 | return None 141 | img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) 142 | zoom_y = self.img_height/img.shape[0] 143 | zoom_x = self.img_width/img.shape[1] 144 | img = cv2.resize(img, (self.img_width, self.img_height)) # shape is width,height order 145 | return img, zoom_x, zoom_y 146 | 147 | def load_All_image(self, scene_data, tgt_idx): 148 | img_file0 = scene_data['dir']/'RGB'/'data'/scene_data['frame_id'][tgt_idx]+'.png' # uint8, 3ch 149 | img_file1 = scene_data['dir']/'Thermal'/'data'/scene_data['frame_id'][tgt_idx]+'.png' # uint16, 1ch 150 | if not img_file1.isfile(): 151 | return None 152 | 153 | img0 = cv2.imread(img_file0, cv2.IMREAD_UNCHANGED) # RGB 154 | img0 = cv2.resize(img0, (self.img_width, self.img_height)) # shape is width,height order 155 | 156 | img1 = cv2.imread(img_file1, cv2.IMREAD_UNCHANGED) 157 | img1 = cv2.resize(img1, (self.img_width, self.img_height)) # shape is width,height order 158 | 159 | return img0, img1 160 | 161 | def load_Depth(self, scene_data, tgt_idx, Dtype): 162 | if Dtype == 'T': 163 | depth_file = scene_data['dir']/'Warped_Depth'/'data_THERMAL'/scene_data['frame_id'][tgt_idx]+'.npy' 164 | if Dtype == 'RGB': 165 | depth_file = scene_data['dir']/'Warped_Depth'/'data_RGB'/scene_data['frame_id'][tgt_idx]+'.npy' 166 | if not depth_file.isfile(): 167 | return None 168 | depth = np.load(depth_file) 169 | 170 | return depth 171 | 172 | def read_raw_calib_file(self, filepath): 173 | # From 
https://github.com/utiasSTARS/pykitti/blob/master/pykitti/utils.py 174 | """Read in a calibration file and parse into a dictionary.""" 175 | data = {} 176 | 177 | with open(filepath, 'r') as f: 178 | for line in f.readlines(): 179 | key, value = line.split(':', 1) 180 | # The only non-float values in these files are dates, which 181 | # we don't care about anyway 182 | try: 183 | data[key] = np.array([float(x) for x in value.split()]) 184 | except ValueError: 185 | pass 186 | return data 187 | -------------------------------------------------------------------------------- /common/loss/inverse_warp.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | pixel_coords = None 6 | 7 | def set_id_grid(depth): 8 | global pixel_coords 9 | b, h, w = depth.size() 10 | i_range = torch.arange(0, h).view(1, h, 1).expand( 11 | 1, h, w).type_as(depth) # [1, H, W] 12 | j_range = torch.arange(0, w).view(1, 1, w).expand( 13 | 1, h, w).type_as(depth) # [1, H, W] 14 | ones = torch.ones(1, h, w).type_as(depth) 15 | 16 | pixel_coords = torch.stack((j_range, i_range, ones), dim=1) # [1, 3, H, W] 17 | 18 | 19 | def check_sizes(input, input_name, expected): 20 | condition = [input.ndimension() == len(expected)] 21 | for i, size in enumerate(expected): 22 | if size.isdigit(): 23 | condition.append(input.size(i) == int(size)) 24 | assert(all(condition)), "wrong size for {}, expected {}, got {}".format( 25 | input_name, 'x'.join(expected), list(input.size())) 26 | 27 | 28 | def pixel2cam(depth, intrinsics_inv): 29 | global pixel_coords 30 | """Transform coordinates in the pixel frame to the camera frame. 31 | Args: 32 | depth: depth maps -- [B, H, W] 33 | intrinsics_inv: intrinsics_inv matrix for each element of batch -- [B, 3, 3] 34 | Returns: 35 | array of (u,v,1) cam coordinates -- [B, 3, H, W] 36 | """ 37 | b, h, w = depth.size() 38 | if (pixel_coords is None) or pixel_coords.size(2) < h: 39 | set_id_grid(depth) 40 | current_pixel_coords = pixel_coords[:, :, :h, :w].expand( 41 | b, 3, h, w).reshape(b, 3, -1) # [B, 3, H*W] 42 | cam_coords = (intrinsics_inv @ current_pixel_coords).reshape(b, 3, h, w) 43 | return cam_coords * depth.unsqueeze(1) 44 | 45 | 46 | def cam2pixel(cam_coords, proj_c2p_rot, proj_c2p_tr, padding_mode): 47 | """Transform coordinates in the camera frame to the pixel frame. 
48 | Args: 49 | cam_coords: pixel coordinates defined in the first camera coordinates system -- [B, 4, H, W] 50 | proj_c2p_rot: rotation matrix of cameras -- [B, 3, 4] 51 | proj_c2p_tr: translation vectors of cameras -- [B, 3, 1] 52 | Returns: 53 | array of [-1,1] coordinates -- [B, 2, H, W] 54 | """ 55 | b, _, h, w = cam_coords.size() 56 | cam_coords_flat = cam_coords.reshape(b, 3, -1) # [B, 3, H*W] 57 | if proj_c2p_rot is not None: 58 | pcoords = proj_c2p_rot @ cam_coords_flat 59 | else: 60 | pcoords = cam_coords_flat 61 | 62 | if proj_c2p_tr is not None: 63 | pcoords = pcoords + proj_c2p_tr # [B, 3, H*W] 64 | X = pcoords[:, 0] 65 | Y = pcoords[:, 1] 66 | Z = pcoords[:, 2].clamp(min=1e-3) 67 | 68 | # Normalized, -1 if on extreme left, 1 if on extreme right (x = w-1) [B, H*W] 69 | X_norm = 2*(X / Z)/(w-1) - 1 70 | Y_norm = 2*(Y / Z)/(h-1) - 1 # Idem [B, H*W] 71 | if padding_mode == 'zeros': 72 | X_mask = ((X_norm > 1)+(X_norm < -1)).detach() 73 | # make sure that no point in warped image is a combinaison of im and gray 74 | X_norm[X_mask] = 2 75 | Y_mask = ((Y_norm > 1)+(Y_norm < -1)).detach() 76 | Y_norm[Y_mask] = 2 77 | 78 | pixel_coords = torch.stack([X_norm, Y_norm], dim=2) # [B, H*W, 2] 79 | return pixel_coords.reshape(b, h, w, 2), Z.reshape(b, 1, h, w) 80 | 81 | 82 | def euler2mat(angle): 83 | """Convert euler angles to rotation matrix. 84 | Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174 85 | Args: 86 | angle: rotation angle along 3 axis (in radians) -- size = [B, 3] 87 | Returns: 88 | Rotation matrix corresponding to the euler angles -- size = [B, 3, 3] 89 | """ 90 | B = angle.size(0) 91 | x, y, z = angle[:, 0], angle[:, 1], angle[:, 2] 92 | 93 | cosz = torch.cos(z) 94 | sinz = torch.sin(z) 95 | 96 | zeros = z.detach()*0 97 | ones = zeros.detach()+1 98 | zmat = torch.stack([cosz, -sinz, zeros, 99 | sinz, cosz, zeros, 100 | zeros, zeros, ones], dim=1).reshape(B, 3, 3) 101 | 102 | cosy = torch.cos(y) 103 | siny = torch.sin(y) 104 | 105 | ymat = torch.stack([cosy, zeros, siny, 106 | zeros, ones, zeros, 107 | -siny, zeros, cosy], dim=1).reshape(B, 3, 3) 108 | 109 | cosx = torch.cos(x) 110 | sinx = torch.sin(x) 111 | 112 | xmat = torch.stack([ones, zeros, zeros, 113 | zeros, cosx, -sinx, 114 | zeros, sinx, cosx], dim=1).reshape(B, 3, 3) 115 | 116 | rotMat = xmat @ ymat @ zmat 117 | return rotMat 118 | 119 | 120 | def quat2mat(quat): 121 | """Convert quaternion coefficients to rotation matrix. 122 | Args: 123 | quat: first three coeff of quaternion of rotation. fourht is then computed to have a norm of 1 -- size = [B, 3] 124 | Returns: 125 | Rotation matrix corresponding to the quaternion -- size = [B, 3, 3] 126 | """ 127 | norm_quat = torch.cat([quat[:, :1].detach()*0 + 1, quat], dim=1) 128 | norm_quat = norm_quat/norm_quat.norm(p=2, dim=1, keepdim=True) 129 | w, x, y, z = norm_quat[:, 0], norm_quat[:, 130 | 1], norm_quat[:, 2], norm_quat[:, 3] 131 | 132 | B = quat.size(0) 133 | 134 | w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) 135 | wx, wy, wz = w*x, w*y, w*z 136 | xy, xz, yz = x*y, x*z, y*z 137 | 138 | rotMat = torch.stack([w2 + x2 - y2 - z2, 2*xy - 2*wz, 2*wy + 2*xz, 139 | 2*wz + 2*xy, w2 - x2 + y2 - z2, 2*yz - 2*wx, 140 | 2*xz - 2*wy, 2*wx + 2*yz, w2 - x2 - y2 + z2], dim=1).reshape(B, 3, 3) 141 | return rotMat 142 | 143 | 144 | def pose_vec2mat(vec, rotation_mode='euler'): 145 | """ 146 | Convert 6DoF parameters to transformation matrix. 
144 | def pose_vec2mat(vec, rotation_mode='euler'):
145 |     """
146 |     Convert 6DoF parameters to transformation matrix.
147 |     Args:
148 |         vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
149 |     Returns:
150 |         A transformation matrix -- [B, 3, 4]
151 |     """
152 |     translation = vec[:, :3].unsqueeze(-1)  # [B, 3, 1]
153 |     rot = vec[:, 3:]
154 |     if rotation_mode == 'euler':
155 |         rot_mat = euler2mat(rot)  # [B, 3, 3]
156 |     elif rotation_mode == 'quat':
157 |         rot_mat = quat2mat(rot)  # [B, 3, 3]
158 | 
159 |     transform_mat = torch.cat([rot_mat, translation], dim=2)  # [B, 3, 4]
160 |     return transform_mat
161 | 
162 | 
163 | # inverse_warp from the SfMLearner paper
164 | def inverse_warp(img, depth, pose, intrinsics, rotation_mode='euler', padding_mode='zeros'):
165 |     """
166 |     Inverse warp a source image to the target image plane.
167 |     Args:
168 |         img: the source image (where to sample pixels) -- [B, 1, H, W]
169 |         depth: depth map of the target image -- [B, H, W]
170 |         pose: 6DoF pose parameters from target to source -- [B, 6]
171 |         intrinsics: camera intrinsic matrix -- [B, 3, 3]
172 |     Returns:
173 |         projected_img: Source image warped to the target image plane
174 |         valid_points: Boolean array indicating point validity
175 |     """
176 |     check_sizes(img, 'img', 'B1HW')  # thermal images have a single channel
177 |     check_sizes(depth, 'depth', 'BHW')
178 |     check_sizes(pose, 'pose', 'B6')
179 |     check_sizes(intrinsics, 'intrinsics', 'B33')
180 | 
181 |     batch_size, _, img_height, img_width = img.size()
182 | 
183 |     cam_coords = pixel2cam(depth, intrinsics.inverse())  # [B,3,H,W]
184 | 
185 |     pose_mat = pose_vec2mat(pose, rotation_mode)  # [B,3,4]
186 | 
187 |     # Get projection matrix for tgt camera frame to source pixel frame
188 |     proj_cam_to_src_pixel = intrinsics @ pose_mat  # [B, 3, 4]
189 | 
190 |     rot, tr = proj_cam_to_src_pixel[:, :, :3], proj_cam_to_src_pixel[:, :, -1:]
191 |     src_pixel_coords = cam2pixel(
192 |         cam_coords, rot, tr, padding_mode)[0]  # [B,H,W,2]
193 |     projected_img = F.grid_sample(
194 |         img, src_pixel_coords, padding_mode=padding_mode)
195 | 
196 |     valid_points = src_pixel_coords.abs().max(dim=-1)[0] <= 1
197 | 
198 |     return projected_img, valid_points
199 | 
200 | # inverse_warp from the SC-SfMLearner paper
201 | def inverse_warp2(img, depth, ref_depth, pose, intrinsics, padding_mode='zeros'):
202 |     """
203 |     Inverse warp a source image to the target image plane.
204 |     Args:
205 |         img: the source image (where to sample pixels) -- [B, 1, H, W]
206 |         depth: depth map of the target image -- [B, 1, H, W]
207 |         ref_depth: the source depth map (where to sample depth) -- [B, 1, H, W]
208 |         pose: 6DoF pose parameters from target to source -- [B, 6]
209 |         intrinsics: camera intrinsic matrix -- [B, 3, 3]
210 |     Returns:
211 |         projected_img: Source image warped to the target image plane
212 |         valid_mask: Float array indicating point validity
213 |         projected_depth: sampled depth from the source image
214 |         computed_depth: computed depth of the source image using the target depth
215 |     """
216 |     check_sizes(img, 'img', 'B1HW')  # thermal images have a single channel
217 |     check_sizes(depth, 'depth', 'B1HW')
218 |     check_sizes(ref_depth, 'ref_depth', 'B1HW')
219 |     check_sizes(pose, 'pose', 'B6')
220 |     check_sizes(intrinsics, 'intrinsics', 'B33')
221 | 
222 |     batch_size, _, img_height, img_width = img.size()
223 | 
224 |     cam_coords = pixel2cam(depth.squeeze(1), intrinsics.inverse())  # [B,3,H,W]
225 | 
226 |     pose_mat = pose_vec2mat(pose)  # [B,3,4]
227 | 
228 |     # Get projection matrix for tgt camera frame to source pixel frame
229 |     proj_cam_to_src_pixel = intrinsics @ pose_mat  # [B, 3, 4]
230 | 
231 |     rot, tr = proj_cam_to_src_pixel[:, :, :3], proj_cam_to_src_pixel[:, :, -1:]
232 |     src_pixel_coords, computed_depth = cam2pixel(cam_coords, rot, tr, padding_mode)  # [B,H,W,2], [B,1,H,W]
233 |     projected_img = F.grid_sample(img, src_pixel_coords, padding_mode=padding_mode, align_corners=False)
234 | 
235 |     valid_points = src_pixel_coords.abs().max(dim=-1)[0] <= 1
236 |     valid_mask = valid_points.unsqueeze(1).float()
237 | 
238 |     projected_depth = F.grid_sample(ref_depth, src_pixel_coords, padding_mode=padding_mode, align_corners=False)
239 | 
240 |     return projected_img, valid_mask, projected_depth, computed_depth
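
# Minimal usage sketch (illustration only; shapes follow the docstring above,
# and all tensors are synthetic placeholders):
#
#   b, h, w = 1, 256, 320
#   img = torch.rand(b, 1, h, w)                 # source thermal frame
#   tgt_depth = torch.ones(b, 1, h, w)           # target depth
#   src_depth = torch.ones(b, 1, h, w)           # source depth
#   pose = torch.zeros(b, 6)                     # identity motion (tx..rz)
#   K = torch.eye(3).unsqueeze(0)                # dummy intrinsics
#   warped, valid, proj_d, comp_d = inverse_warp2(img, tgt_depth, src_depth, pose, K)
#   # with an identity pose, the warp is (up to resampling) the identity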
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.backends.cudnn as cudnn
4 | import torch.optim
5 | import torch.utils.data
6 | 
7 | import argparse
8 | import time
9 | import datetime
10 | from path import Path
11 | 
12 | from dataloader.VIVID_sequence_folders import SequenceFolder
13 | 
14 | import sys
15 | sys.path.append('./common/')
16 | 
17 | import models
18 | import utils.custom_transforms as custom_transforms
19 | from utils.utils import tensor2array, save_checkpoint, tensor2array_thermal
20 | 
21 | from loss.inverse_warp import pose_vec2mat
22 | from loss.loss_functions import compute_smooth_loss, compute_photo_and_geometry_loss, compute_errors
23 | 
24 | # logging
25 | import csv
26 | from utils.logger import TermLogger, AverageMeter
27 | from tensorboardX import SummaryWriter
28 | 
29 | parser = argparse.ArgumentParser(description='Self-supervised depth and pose learner training on the KAIST VIVID dataset',
30 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
31 | 
32 | parser.add_argument('data', metavar='DIR', help='path to dataset')
33 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers')
34 | parser.add_argument('--epochs', default=200, type=int, metavar='N', help='number of total epochs to run')
35 | parser.add_argument('--epoch-size', default=0, type=int, metavar='N', help='manual epoch size (will match dataset size if not set)')
36 | 
37 | # optimizer param
38 | parser.add_argument('-b', '--batch-size', default=4, type=int, metavar='N', help='mini-batch size')
39 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, metavar='LR', help='initial learning rate')
40 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum for sgd, alpha parameter for adam')
41 | parser.add_argument('--beta', default=0.999, type=float, metavar='M', help='beta parameter for adam')
42 | parser.add_argument('--weight-decay', '--wd', default=0, type=float, metavar='W', help='weight decay')
43 | parser.add_argument('--seed', default=0, type=int, help='seed for random functions and network initialization')
44 | parser.add_argument('--padding-mode', type=str, choices=['zeros', 'border'], default='zeros',
45 |                     help='padding mode for image warping: this is important for photometric differentiation when going outside the target image.'
46 |                          ' "zeros" nulls gradients outside the target image.'
47 |                          ' "border" only nulls gradients of the coordinate that falls outside (x or y)')
48 | # logger
49 | parser.add_argument('--name', dest='name', type=str, required=True, help='name of the experiment; checkpoints are stored in checkpoints/name')
50 | parser.add_argument('--print-freq', default=10, type=int, metavar='N', help='print frequency')
51 | parser.add_argument('--log-output', action='store_true', help='will log dispnet outputs at validation step')
52 | 
53 | # data loader
54 | parser.add_argument('--scene_type', type=str, choices=['indoor', 'outdoor'], default='indoor', required=True)
55 | parser.add_argument('--sequence-length', type=int, help='Length of sequence', metavar='N', default=3)
56 | parser.add_argument('--interval', type=int, help='Interval of sequence', metavar='N', default=5)
57 | 
58 | parser.add_argument('--with-pretrain', type=int, default=1, help='with or without ImageNet pretraining for resnet')
59 | parser.add_argument('--pretrained-disp', dest='pretrained_disp', default=None, metavar='PATH', help='path to pre-trained dispnet model')
60 | parser.add_argument('--pretrained-pose', dest='pretrained_pose', default=None, metavar='PATH', help='path to pre-trained posenet model')
61 | 
62 | # loss param
63 | parser.add_argument('--resnet-layers', type=int, default=18, choices=[18, 50], help='number of ResNet layers for depth estimation')
64 | parser.add_argument('--num-scales', '--number-of-scales', type=int, help='the number of scales', metavar='W', default=1)
65 | parser.add_argument('-p', '--photo-loss-weight', type=float, help='weight for photometric loss', metavar='W', default=1)
66 | parser.add_argument('-s', '--smooth-loss-weight', type=float, help='weight for disparity smoothness loss', metavar='W', default=0.01)
67 | parser.add_argument('-c', '--geometry-consistency-weight', type=float, help='weight for depth consistency loss', metavar='W', default=0.5)
68 | parser.add_argument('--ssim_weight', type=float, default=0.85, help='weight for the ssim scaling factor')
69 | 
70 | parser.add_argument('--with-ssim', type=int, default=1, help='with ssim or not')
71 | parser.add_argument('--with-mask', type=int, default=1, help='with the image mask for moving objects and occlusions or not')
72 | parser.add_argument('--with-auto-mask', type=int, default=1, help='with the mask for stationary points')
73 | 
74 | # thermal image mapping param
75 | parser.add_argument('--rearrange-bin', type=int, default=30, help='number of bins for thermal image rearrangement')
76 | parser.add_argument('--clahe-clip', type=float, default=2.0, help='clip limit for CLAHE')
77 | 
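# Illustration only: the actual thermal remapping used at train time lives in
# common/utils/custom_transforms.py (TensorThermalRearrange). The sketch below
# only shows the kind of CLAHE-based contrast enhancement that --clahe-clip
# controls, assuming OpenCV and a 14-bit raw thermal frame; it is not called
# anywhere in this script.
#
#   import cv2
#
#   def clahe_enhance(thermal_raw, clip_limit=2.0):
#       img8 = (thermal_raw.astype(np.float32) / 2**14 * 255.0).astype(np.uint8)
#       clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=(8, 8))
#       return clahe.apply(img8)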
78 | best_depth_error = -1
79 | best_pose_error = -1
80 | n_iter = 0
81 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
82 | torch.autograd.set_detect_anomaly(True)  # helps debug NaNs, at some training-speed cost
83 | 
84 | def main():
85 |     global best_depth_error, best_pose_error, n_iter, device
86 |     args = parser.parse_args()
87 | 
88 |     timestamp = datetime.datetime.now().strftime("%m-%d-%H:%M")
89 |     save_path = Path(args.name)
90 |     args.save_path = 'checkpoints'/save_path/timestamp
91 |     print('=> will save everything to {}'.format(args.save_path))
92 |     args.save_path.makedirs_p()
93 | 
94 |     torch.manual_seed(args.seed)
95 |     np.random.seed(args.seed)
96 |     cudnn.deterministic = True
97 |     cudnn.benchmark = True  # note: benchmark=True trades exact reproducibility for speed
98 | 
99 |     training_writer = SummaryWriter(args.save_path)
100 |     output_writers = []
101 |     if args.log_output:
102 |         for i in range(3):
103 |             output_writers.append(SummaryWriter(args.save_path/'valid'/str(i)))
104 | 
105 |     # set dataloader
106 |     if args.scene_type == 'indoor':
107 |         args.vis_depth_max = 4
108 |     elif args.scene_type == 'outdoor':
109 |         args.vis_depth_max = 10
110 | 
111 |     ArrToTen = custom_transforms.ArrayToTensor(max_value=2**14)  # raw thermal frames span a 14-bit range
112 |     ArrToTen4Loss = custom_transforms.ArrayToTensorWithLocalWindow()
113 |     TenThrRearran = custom_transforms.TensorThermalRearrange(bin_num=args.rearrange_bin, CLAHE_clip=args.clahe_clip)
114 |     normalize = custom_transforms.Normalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225])
115 | 
116 |     transform4input = custom_transforms.Compose([ArrToTen, normalize])  # network input stream
117 |     transform4loss = custom_transforms.Compose([ArrToTen4Loss, TenThrRearran, normalize])  # contrast-enhanced stream for the loss
118 | 
119 |     train_transform_share = custom_transforms.Compose([
120 |         custom_transforms.RandomHorizontalFlip(),
121 |         custom_transforms.RandomScaleCrop(),
122 |     ])
123 | 
124 |     print("=> fetching scenes in '{}'".format(args.data))
125 |     train_set = SequenceFolder(
126 |         args.data,
127 |         tf_share=train_transform_share,
128 |         tf_input=transform4input,
129 |         tf_loss=transform4loss,
130 |         seed=args.seed,
131 |         scene_type=args.scene_type,
132 |         sequence_length=args.sequence_length,
133 |         interval=args.interval,
134 |         train=True,
135 |     )
136 | 
137 |     from dataloader.VIVID_validation_folders import ValidationSet, ValidationSetPose
138 |     val_set = ValidationSet(
139 |         args.data,
140 |         tf_input=transform4input,
141 |         tf_loss=transform4loss,
142 |         sequence_length=args.sequence_length,
143 |         interval=args.interval,
144 |         scene_type=args.scene_type,
145 |     )
146 |     val_pose_set = ValidationSetPose(
147 |         args.data,
148 |         tf_input=transform4input,
149 |         scene_type=args.scene_type,
150 |         sequence_length=args.sequence_length,
151 |     )
152 | 
153 |     print('{} samples found in {} train scenes'.format(len(train_set), len(train_set.folders)))
154 |     print('{} samples found in {} valid scenes'.format(len(val_set), len(val_set.folders)))
155 | 
156 |     train_loader = torch.utils.data.DataLoader(
157 |         train_set, batch_size=args.batch_size, shuffle=True,
158 |         num_workers=args.workers, pin_memory=True)
159 | 
160 |     val_loader = torch.utils.data.DataLoader(
161 |         val_set, batch_size=args.batch_size, shuffle=False,
162 |         num_workers=args.workers, pin_memory=True)
163 | 
164 |     val_pose_loader = torch.utils.data.DataLoader(
165 |         val_pose_set, batch_size=args.batch_size, shuffle=False,
166 |         num_workers=args.workers, pin_memory=True)
167 | 
168 |     if args.epoch_size == 0:
169 |         args.epoch_size = len(train_loader)
170 | 
171 |     # create model
172 |     print("=> creating model")
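    # DispPoseResNet (common/models/ResDispPoseNet.py) bundles the depth and
    # pose networks behind one module; num_channel=1 below because the thermal
    # input is single-channel, unlike 3-channel RGB.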
173 |     disp_pose_net = models.DispPoseResNet(args.resnet_layers, args.with_pretrain, num_channel=1).to(device)
174 |     disp_net = disp_pose_net.DispResNet
175 |     pose_net = disp_pose_net.PoseResNet
176 | 
177 |     # load parameters
178 |     if args.pretrained_disp:
179 |         print("=> using pre-trained weights for DispResNet")
180 |         weights = torch.load(args.pretrained_disp)
181 |         disp_net.load_state_dict(weights['state_dict'], strict=False)
182 | 
183 |     if args.pretrained_pose:
184 |         print("=> using pre-trained weights for PoseResNet")
185 |         weights = torch.load(args.pretrained_pose)
186 |         pose_net.load_state_dict(weights['state_dict'], strict=False)
187 | 
188 |     disp_net = torch.nn.DataParallel(disp_net)
189 |     pose_net = torch.nn.DataParallel(pose_net)
190 | 
191 |     # set optimizer
192 |     print('=> setting adam solver')
193 |     optim_params = [
194 |         {'params': disp_pose_net.parameters(), 'lr': args.lr},
195 |     ]
196 | 
197 |     optimizer = torch.optim.Adam(optim_params,
198 |                                  betas=(args.momentum, args.beta),
199 |                                  weight_decay=args.weight_decay)
200 |     # set logger
201 |     logger = TermLogger(n_epochs=args.epochs, train_size=min(len(train_loader), args.epoch_size), valid_size=len(val_pose_loader))
202 |     logger.epoch_bar.start()
203 | 
204 |     for epoch in range(args.epochs):
205 |         logger.epoch_bar.update(epoch)
206 | 
207 |         # train for one epoch
208 |         logger.reset_train_bar()
209 |         train_loss = train(args, train_loader, disp_net, pose_net, optimizer, args.epoch_size, logger, training_writer)
210 |         logger.train_writer.write(' * Avg Loss : {:.3f}'.format(train_loss))
211 | 
212 |         # evaluate on validation set
213 |         logger.reset_valid_bar()
214 |         pose_errors, pose_error_names = validate_pose_with_gt(args, val_pose_loader, pose_net, epoch, logger, output_writers)
215 |         depth_errors, depth_error_names = validate_depth_with_gt(args, val_loader, disp_net, epoch, logger, output_writers)
216 |         errors = depth_errors + pose_errors
217 |         error_names = depth_error_names + pose_error_names
218 | 
219 |         error_string = ', '.join('{} : {:.3f}'.format(name, error) for name, error in zip(error_names, errors))
220 |         logger.valid_writer.write(' * Avg {}'.format(error_string))
221 | 
222 |         for error, name in zip(errors, error_names):
223 |             training_writer.add_scalar(name, error, epoch)
224 | 
225 |         # It is up to you to choose the most relevant error to measure model performance; be careful, some measures (a1, a2, a3) are to be maximized
226 |         decisive_depth_error = errors[1]  # abs_rel from ['abs_diff', 'abs_rel', 'sq_rel', 'a1', 'a2', 'a3', 'ATE', 'RE']
227 |         decisive_pose_error = errors[6]  # ATE
228 | 
229 |         if best_depth_error < 0:
230 |             best_depth_error = decisive_depth_error
231 | 
232 |         if best_pose_error < 0:
233 |             best_pose_error = decisive_pose_error
234 | 
235 |         # remember lowest error and save checkpoint
236 |         is_depth_best = decisive_depth_error < best_depth_error
237 |         best_depth_error = min(best_depth_error, decisive_depth_error)
238 | 
239 |         is_pose_best = decisive_pose_error < best_pose_error
240 |         best_pose_error = min(best_pose_error, decisive_pose_error)
241 | 
242 |         save_checkpoint(
243 |             args.save_path, {
244 |                 'epoch': epoch + 1,
245 |                 'state_dict': disp_net.module.state_dict()
246 |             }, {
247 |                 'epoch': epoch + 1,
248 |                 'state_dict': pose_net.module.state_dict()
249 |             },
250 |             is_depth_best, is_pose_best)
251 | 
252 |     logger.epoch_bar.finish()
253 | 
254 | def train(args, train_loader, disp_net, pose_net, optimizer, epoch_size, logger, train_writer):
255 |     global n_iter, device
256 |     batch_time = AverageMeter()
257 |     data_time = AverageMeter()
258 |     losses = AverageMeter(precision=4)
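    # Total objective assembled below with the CLI defaults from above:
    #   loss = 1.0 * photometric + 0.01 * smoothness + 0.5 * geometry-consistency
    # (the -p / -s / -c weights, respectively).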
259 |     w1, w2, w3 = args.photo_loss_weight, args.smooth_loss_weight, args.geometry_consistency_weight
260 | 
261 |     # switch to train mode
262 |     disp_net.train()
263 |     pose_net.train()
264 | 
265 |     end = time.time()
266 |     logger.train_bar.update(0)
267 | 
268 |     for i, (tgt_img, ref_imgs, tgt_img_loss, ref_imgs_loss, intrinsics) in enumerate(train_loader):
269 |         log_losses = i > 0 and n_iter % args.print_freq == 0
270 | 
271 |         # measure data loading time
272 |         data_time.update(time.time() - end)
273 | 
274 |         tgt_img = tgt_img.to(device)
275 |         ref_imgs = [img.to(device) for img in ref_imgs]
276 | 
277 |         tgt_img_loss = tgt_img_loss.to(device)
278 |         ref_imgs_loss = [img.to(device) for img in ref_imgs_loss]
279 | 
280 |         intrinsics = intrinsics.to(device)
281 | 
282 |         # compute output
283 |         tgt_depth, ref_depths = compute_depth(disp_net, tgt_img, ref_imgs)
284 |         poses, poses_inv = compute_pose_with_inv(pose_net, tgt_img, ref_imgs)
285 | 
286 |         loss_1, loss_3 = compute_photo_and_geometry_loss(tgt_img_loss, ref_imgs_loss, intrinsics, tgt_depth, ref_depths,
287 |                                                          poses, poses_inv, args.num_scales, args.ssim_weight, args.with_ssim,
288 |                                                          args.with_mask, args.with_auto_mask, args.padding_mode)
289 | 
290 |         # smoothness loss; not strictly necessary for thermal images
291 |         loss_2 = compute_smooth_loss(tgt_depth, tgt_img_loss, ref_depths, ref_imgs_loss)
292 | 
293 |         loss = w1*loss_1 + w2*loss_2 + w3*loss_3
294 | 
295 |         if log_losses:
296 |             train_writer.add_scalar('image_recon_loss_thermal', loss_1.item(), n_iter)
297 |             train_writer.add_scalar('disparity_smoothness_loss', loss_2.item(), n_iter)
298 |             train_writer.add_scalar('geometry_consistency_loss_thermal', loss_3.item(), n_iter)
299 |             train_writer.add_scalar('total_loss', loss.item(), n_iter)
300 | 
301 |         # record loss
302 |         losses.update(loss.item(), args.batch_size)
303 | 
304 |         # compute gradient and do Adam step
305 |         optimizer.zero_grad()
306 |         loss.backward()
307 |         optimizer.step()
308 | 
309 |         # measure elapsed time
310 |         batch_time.update(time.time() - end)
311 |         end = time.time()
312 | 
313 |         logger.train_bar.update(i+1)
314 | 
315 |         if i % args.print_freq == 0:
316 |             logger.train_writer.write('Train: Time {} Data {} Loss {}'.format(batch_time, data_time, losses))
317 |         if i >= epoch_size - 1:
318 |             break
319 | 
320 |         n_iter += 1
321 | 
322 |     return losses.avg[0]
323 | 
324 | @torch.no_grad()
325 | def validate_depth_with_gt(args, val_loader, disp_net, epoch, logger, output_writers=[]):
326 |     global device
327 |     batch_time = AverageMeter()
328 |     error_names = ['abs_diff', 'abs_rel', 'sq_rel', 'a1', 'a2', 'a3']
329 |     errors = AverageMeter(i=len(error_names))
330 |     log_outputs = len(output_writers) > 0
331 | 
332 |     # switch to evaluate mode
333 |     disp_net.eval()
334 | 
335 |     end = time.time()
336 |     logger.valid_bar.update(0)
337 | 
338 |     for i, (tgt_img, tgt_img_loss, depth) in enumerate(val_loader):
339 | 
340 |         tgt_img = tgt_img.to(device)
341 |         tgt_img_loss = tgt_img_loss.to(device)
342 |         depth = depth.to(device)
343 | 
344 |         # check gt
345 |         if depth.nelement() == 0:
346 |             continue
347 | 
348 |         # compute output
349 |         output_disp = disp_net(tgt_img)
350 | 
351 |         output_depth = 1/output_disp
352 |         output_depth = output_depth[:, 0]
353 | 
354 |         if log_outputs and i < len(output_writers):
355 |             if epoch == 0:
356 |                 output_writers[i].add_image('val Input', tensor2array_thermal(tgt_img[0]))
357 |                 output_writers[i].add_image('val Input_vis', tensor2array_thermal(tgt_img_loss[0]))
358 | 
359 |             depth_to_show = depth[0]
360 |             output_writers[i].add_image('val GT Depth', tensor2array(depth_to_show, max_value=args.vis_depth_max),
epoch) 361 | depth_to_show[depth_to_show == 0] = 1000 362 | disp_to_show = (1/depth_to_show).clamp(0, 10) 363 | output_writers[i].add_image('val GT Disparity', tensor2array(disp_to_show, max_value=None, colormap='magma'), epoch) 364 | 365 | output_writers[i].add_image('val Dispnet Pred', tensor2array(output_disp[0], max_value=None, colormap='magma'), epoch) 366 | output_writers[i].add_image('val Depth Pred', tensor2array(output_depth[0], max_value=args.vis_depth_max), epoch) 367 | 368 | if depth.nelement() != output_depth.nelement(): 369 | b, h, w = depth.size() 370 | output_depth = torch.nn.functional.interpolate(output_depth.unsqueeze(1), [h, w]).squeeze(1) 371 | 372 | errors.update(compute_errors(depth, output_depth, args.scene_type)) 373 | 374 | # measure elapsed time 375 | batch_time.update(time.time() - end) 376 | end = time.time() 377 | logger.valid_bar.update(i+1) 378 | if i % args.print_freq == 0: 379 | logger.valid_writer.write('valid: Time {} Abs Error {:.4f} ({:.4f})'.format(batch_time, errors.val[0], errors.avg[0])) 380 | logger.valid_bar.update(len(val_loader)) 381 | return errors.avg, error_names 382 | 383 | 384 | @torch.no_grad() 385 | def validate_pose_with_gt(args, val_pose_loader, pose_net, epoch, logger, output_writers=[]): 386 | global device 387 | batch_time = AverageMeter() 388 | error_names = ['ATE', 'RE'] 389 | errors = AverageMeter(i=len(error_names)) 390 | log_outputs = len(output_writers) > 0 391 | 392 | # switch to evaluate mode 393 | pose_net.eval() 394 | 395 | end = time.time() 396 | logger.valid_bar.update(0) 397 | 398 | for i, (thr_imgs, poses_gt) in enumerate(val_pose_loader): 399 | thr_imgs = [img.to(device) for img in thr_imgs] 400 | 401 | # compute output 402 | global_pose = np.eye(4) 403 | poses = [] 404 | poses.append(global_pose[0:3, :]) 405 | for j in range(len(thr_imgs)-1): 406 | pose = pose_net(thr_imgs[j], thr_imgs[j + 1]) 407 | if pose.shape[0] == 1: 408 | pose_mat = pose_vec2mat(pose).squeeze(0).cpu().numpy() 409 | else: 410 | pose_mat = pose_vec2mat(pose[[0]]).squeeze(0).cpu().numpy() 411 | 412 | pose_mat = np.vstack([pose_mat, np.array([0, 0, 0, 1])]) 413 | global_pose = global_pose @ np.linalg.inv(pose_mat) 414 | poses.append(global_pose[0:3, :]) 415 | 416 | final_poses = np.stack(poses, axis=0) 417 | 418 | errors.update(compute_pose_error(poses_gt[0].numpy(), final_poses)) 419 | 420 | # measure elapsed time 421 | batch_time.update(time.time() - end) 422 | end = time.time() 423 | logger.valid_bar.update(i) 424 | if i % args.print_freq == 0: 425 | logger.valid_writer.write('valid: Time {} ATE Error {:.4f} ({:.4f})'.format(batch_time, errors.val[0], errors.avg[0])) 426 | logger.valid_bar.update(len(val_pose_loader)) 427 | return errors.avg, error_names 428 | 429 | def compute_depth(disp_net, tgt_img, ref_imgs): 430 | disps = disp_net(tgt_img) 431 | tgt_depth = [1/disp for disp in disps] 432 | 433 | ref_depths = [] 434 | for ref_img in ref_imgs: 435 | disps = disp_net(ref_img) 436 | ref_depth = [1/disp for disp in disps] 437 | ref_depths.append(ref_depth) 438 | 439 | return tgt_depth, ref_depths 440 | 441 | def compute_pose_with_inv(pose_net, tgt_img, ref_imgs): 442 | poses = [] 443 | poses_inv = [] 444 | for ref_img in ref_imgs: 445 | poses.append(pose_net(tgt_img, ref_img)) 446 | poses_inv.append(pose_net(ref_img, tgt_img)) 447 | 448 | return poses, poses_inv 449 | 450 | @torch.no_grad() 451 | def compute_pose_error(gt, pred): 452 | RE = 0 453 | snippet_length = gt.shape[0] 454 | scale_factor = np.sum(gt[:,:,-1] * 
pred[:,:,-1])/np.sum(pred[:,:,-1] ** 2)  # least-squares scale: argmin_s sum ||g - s*p||^2
455 |     ATE = np.linalg.norm((gt[:,:,-1] - scale_factor * pred[:,:,-1]).reshape(-1))
456 |     for gt_pose, pred_pose in zip(gt, pred):
457 |         # residual rotation, from which we compute the sine and cosine of the angle
458 |         R = gt_pose[:,:3] @ np.linalg.inv(pred_pose[:,:3])
459 |         s = np.linalg.norm([R[0,1]-R[1,0],
460 |                             R[1,2]-R[2,1],
461 |                             R[0,2]-R[2,0]])
462 |         c = np.trace(R) - 1
463 |         # note: s and c are actually twice the sine and cosine of the angle, but arctan2 is invariant to that common scale
464 |         RE += np.arctan2(s,c)
465 | 
466 |     return [metric.item() for metric in [ATE/snippet_length, RE/snippet_length]]
467 | 
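# Worked example (illustration only): with a perfect prediction the snippet
# errors vanish. For gt == pred with any nonzero translation, scale_factor = 1,
# so ATE = 0, and each residual R is the identity, giving s = 0, c = 2,
# arctan2(0, 2) = 0, hence RE = 0.
#
#   pose = np.hstack([np.eye(3), np.array([[1.0], [0.0], [0.0]])])  # [R|t]
#   gt = np.tile(pose, (5, 1, 1))
#   assert compute_pose_error(gt, gt.copy()) == [0.0, 0.0]

468 | if __name__ == '__main__':
469 |     main()
470 | 
--------------------------------------------------------------------------------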