├── .gitignore
├── LICENCE
├── README.md
├── README_prepare_data_mp3d_layout.md
├── README_prepare_data_s2d3d.md
├── README_reproduction.md
├── assets
├── label13_weight.pth
├── pano_asmasuxybohhcj.depth.png
├── pano_asmasuxybohhcj.layout.txt
├── pano_asmasuxybohhcj.png
├── repo_teaser.jpg
├── snapshot_depth.jpg
└── snapshot_layout.jpg
├── config
├── mp3d_depth
│ ├── HOHO_depth_dct_efficienthc_TransEn1.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml
│ └── ablation
│ │ ├── tuning___HOHO_depth_dct128_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct256_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct32_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct512_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct_LSTM.yaml
│ │ ├── tuning___HOHO_depth_dct_Linear.yaml
│ │ ├── tuning___HOHO_depth_dct_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct_efficienthc_LSTM.yaml
│ │ ├── tuning___HOHO_depth_dct_efficienthc_Linear.yaml
│ │ ├── tuning___HOHO_depth_dct_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_dct_efficienthc_TransEn1_resnet34.yaml
│ │ ├── tuning___HOHO_depth_lin128_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_lin256_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_lin32_efficienthc_TransEn1.yaml
│ │ ├── tuning___HOHO_depth_lin512_efficienthc_TransEn1.yaml
│ │ └── tuning___HOHO_depth_lin64_efficienthc_TransEn1.yaml
├── mp3d_layout
│ └── HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml
├── s2d3d_depth
│ ├── HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml
│ ├── HOHO_depthS_dct_efficienthc_TransEn1.yaml
│ └── HOHO_depth_dct_efficienthc_TransEn1.yaml
└── s2d3d_sem
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml
│ ├── HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml
│ └── HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml
├── count_params_flops.py
├── eval_layout.py
├── infer_depth.ipynb
├── infer_depth.py
├── infer_layout.ipynb
├── infer_layout.py
├── infer_sem.ipynb
├── lib
├── config.py
├── dataset
│ ├── __init__.py
│ ├── dataset_depth.py
│ ├── dataset_layout.py
│ └── dataset_s2d3d_sem.py
├── misc
│ ├── __init__.py
│ ├── gen_txt_structured3d.py
│ ├── pano_lsd_align.py
│ ├── panostretch.py
│ ├── post_proc.py
│ ├── structured3d_extract_zip.py
│ ├── structured3d_prepare_dataset.py
│ └── utils.py
└── model
│ ├── backbone
│ ├── __init__.py
│ ├── hardnet.py
│ ├── resnet.py
│ └── simple.py
│ ├── hohonet.py
│ ├── horizon_compression
│ ├── __init__.py
│ ├── ehc.py
│ ├── hc.py
│ └── simple.py
│ ├── horizon_refinement
│ ├── __init__.py
│ ├── attention.py
│ ├── identity.py
│ ├── linear.py
│ └── rnn.py
│ ├── horizon_upsample
│ ├── __init__.py
│ └── upsample1d.py
│ ├── modality
│ ├── __init__.py
│ ├── bases.py
│ ├── depth.py
│ ├── layout.py
│ └── semantic.py
│ └── utils.py
├── test_depth.py
├── test_layout.py
├── test_sem.py
├── train.py
├── vis_depth.py
└── vis_layout.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | output
3 | ckpt
4 | data
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 sunset
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HoHoNet
2 |
3 | Code for our paper in CVPR 2021: **HoHoNet: 360 Indoor Holistic Understanding with Latent Horizontal Features** ([paper](https://arxiv.org/abs/2011.11498), [video](https://www.youtube.com/watch?v=xXtRaRKmMpA)).
4 |
5 | 
6 |
7 | #### News
8 | - **April 3, 2021**: Release inference code, Jupyter notebooks, and visualization tools. The guide for reproduction is also finished.
9 | - **March 4, 2021**: A new backbone **[HarDNet](https://github.com/PingoLH/Pytorch-HarDNet)** is included, which shows better speed and depth accuracy.
10 |
11 |
12 | ## Pretrained weight
13 | Links to trained weights `ckpt/`: [download on Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [download on Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0).
14 |
15 |
16 | ## Inference
17 | Below, we use an out-of-training-distribution 360 image from PanoContext as an example.
18 |
19 | ### Jupyter notebook
20 | See [infer_depth.ipynb](infer_depth.ipynb), [infer_layout.ipynb](infer_layout.ipynb), and [infer_sem.ipynb](infer_sem.ipynb) for interactive demo and visualization.
21 |
22 | ### Batch inference
23 | Run `infer_depth.py`/`infer_layout.py` to infer depth/layout.
24 | Use `--cfg` and `--pth` to specify the paths to the config file and the pretrained weights.
25 | Specify the input path with `--inp`; a glob pattern is available for a batch of files.
26 | The results are stored in the `--out` directory under the same filename, with the extension set to `.depth.png` or `.layout.txt`.
27 |
28 | Example for depth:
29 | ```
30 | python infer_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml --pth ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1_hardnet/ep60.pth --out assets/ --inp assets/pano_asmasuxybohhcj.png
31 | ```
32 |
33 | Example for layout:
34 | ```
35 | python infer_layout.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml --pth ckpt/mp3d_layout_HOHO_layout_aug_efficienthc_Transen1_resnet34/ep300.pth --out assets/ --inp assets/pano_asmasuxybohhcj.png
36 | ```
37 |
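The saved results can be loaded back for further processing. Below is a minimal sketch (not part of this repo) for reading the two output files: the `.layout.txt` stores one `x y` corner per line in the 1024x512 equirectangular frame (alternating ceiling/floor corners), while the `.depth.png` is a 16-bit image; treat `DEPTH_SCALE` as an assumption and check `infer_depth.py` for the exact value convention.

```python
import numpy as np
from imageio import imread

DEPTH_PATH = 'assets/pano_asmasuxybohhcj.depth.png'
LAYOUT_PATH = 'assets/pano_asmasuxybohhcj.layout.txt'

# Assumed scale between the stored uint16 values and metric depth;
# verify against infer_depth.py before relying on it.
DEPTH_SCALE = 512.

depth_raw = imread(DEPTH_PATH)                        # uint16, shape (512, 1024)
depth_m = depth_raw.astype(np.float32) / DEPTH_SCALE

# One "x y" pixel coordinate per line, alternating ceiling/floor corners.
corners = np.loadtxt(LAYOUT_PATH, dtype=np.float32).reshape(-1, 2)

print(depth_m.shape, float(depth_m.min()), float(depth_m.max()))
print(corners.shape)
```
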
38 | ### Visualization tools
39 | To visualize layout as 3D mesh, run:
40 | ```
41 | python vis_layout.py --img assets/pano_asmasuxybohhcj.png --layout assets/pano_asmasuxybohhcj.layout.txt
42 | ```
43 | Rendering options: `--show_ceiling`, `--ignore_floor`, `--ignore_wall`, `--ignore_wireframe` are available.
44 | Set `--out` to export the mesh to a `ply` file.
45 | Set `--no_vis` to disable the visualization.
46 |
47 |
48 |
49 |
50 |
51 | To visualize depth as point cloud, run:
52 | ```
53 | python vis_depth.py --img assets/pano_asmasuxybohhcj.png --depth assets/pano_asmasuxybohhcj.depth.png
54 | ```
55 | Rendering options: `--crop_ratio`, `--crop_z_above`.
56 |
57 |
58 |
59 |
60 |
61 |
62 | ## Reproduction
63 | Please see [README_reproduction.md](README_reproduction.md) for the guide to:
64 | 1. prepare the datasets for each task in our paper
65 | 2. reproduce the training for each task
66 | 3. reproduce the numerical results in our paper with the provided pretrained weights
67 |
68 |
69 | ## Citation
70 | ```
71 | @inproceedings{SunSC21,
72 | author = {Cheng Sun and
73 | Min Sun and
74 | Hwann{-}Tzong Chen},
75 | title = {HoHoNet: 360 Indoor Holistic Understanding With Latent Horizontal
76 | Features},
77 | booktitle = {CVPR},
78 | year = {2021},
79 | }
80 | ```
81 |
--------------------------------------------------------------------------------
/README_prepare_data_mp3d_layout.md:
--------------------------------------------------------------------------------
1 | # Prepare MatterportLayout dataset
2 |
3 | References:
4 | - [3D Manhattan Room Layout Reconstruction from a Single 360 Image](https://arxiv.org/abs/1910.04099)
5 | - [PanoAnnotator](https://github.com/SunDaDenny/PanoAnnotator)
6 | - [LayoutMP3D: Layout Annotation of Matterport3D](https://arxiv.org/abs/2003.13516)
7 | - [Matterport3DLayoutAnnotation github](https://github.com/ericsujw/Matterport3DLayoutAnnotation) (we use the annotation provided by LayoutNetv2)
8 |
9 | ## Dataset preparation
10 | ### Step 1: download source
11 | Please refer to [Matterport3DLayoutAnnotation](https://github.com/ericsujw/Matterport3DLayoutAnnotation) to download the source data.
12 | - Put all the rgb images under `{ROOT}/image_up/`.
13 | - Download the annotation to `{ROOT}/label_data/` (originally json format).
14 | - Download the data split into `{ROOT}/mp3d_[train|val|test].txt`.
15 |
16 | ### Step 2: convert json annotation to corners in txt format
17 | Use the code below to convert the original ground-truth json into txt. **(Remember to update the uppercase variables)**
18 | ```python
19 | import os
20 | import glob
21 | import json
22 | import numpy as np
23 |
24 | IN_GLOB = 'label_data/*json'
25 | OUT_DIR = 'label_cor'
26 | os.makedirs(OUT_DIR, exist_ok=True)
27 |
28 | for p in glob.glob(IN_GLOB):
29 | gt = json.load(open(p))
30 | assert gt['cameraHeight'] == 1.6
31 | us = np.array([pts['coords'][0] for pts in gt['layoutPoints']['points']])
32 | us = us * 1024
33 | cs = np.array([pts['xyz'] for pts in gt['layoutPoints']['points']])
34 | cs = np.sqrt((cs**2)[:, [0, 2]].sum(1))
35 |
36 | vf = np.arctan2(-1.6, cs)
37 | vc = np.arctan2(-1.6 + gt['layoutHeight'], cs)
38 | vf = (-vf / np.pi + 0.5) * 512
39 | vc = (-vc / np.pi + 0.5) * 512
40 |
41 | cor_x = np.repeat(us, 2)
42 | cor_y = np.stack([vc, vf], -1).reshape(-1)
43 | cor_xy = np.stack([cor_x, cor_y], -1)
44 |
45 | out_path = os.path.join(OUT_DIR, os.path.split(p)[-1][:-4] + 'txt')
46 | with open(out_path, 'w') as f:
47 | for x, y in cor_xy:
48 | f.write('%.2f %.2f\n' % (x, y))
49 | ```
50 |
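To sanity-check the conversion, you can overlay the converted corners on the corresponding pano. The snippet below is an optional helper (not part of the repo); it assumes the panos in `image_up/` are 1024x512 png files and that the converted txt files keep the original `*_label.txt` naming (as also expected by Step 3).

```python
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from imageio import imread

IMG_ROOT = 'image_up'    # rgb panos from Step 1 (assumed 1024x512)
TXT_ROOT = 'label_cor'   # output directory of the conversion above

txt_path = sorted(glob.glob(os.path.join(TXT_ROOT, '*txt')))[0]
cor = np.loadtxt(txt_path).reshape(-1, 2)              # (x, y) corners in pixels
key = os.path.split(txt_path)[-1][:-len('_label.txt')]
img = imread(os.path.join(IMG_ROOT, key + '.png'))

plt.imshow(img)
plt.scatter(cor[:, 0], cor[:, 1], c='r', s=12)         # converted corners
plt.title(key)
plt.show()
```
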
51 | ### Step 3: data split
52 | Use the code below to organize the data split for training and evaluation. **(Remember to update the uppercase variables)**
53 | ```python
54 | import os
55 | from shutil import copy2
56 |
57 | IMG_ROOT = 'image_up'
58 | TXT_ROOT = 'label_cor'
59 | OUT_ROOT = 'mp3d_layout'
60 | TRAIN_TXT = 'mp3d_train.txt'
61 | VALID_TXT = 'mp3d_val.txt'
62 | TEST_TXT = 'mp3d_test.txt'
63 |
64 | def go(txt, split):
65 | out_img_root = os.path.join(OUT_ROOT, split, 'img')
66 | out_txt_root = os.path.join(OUT_ROOT, split, 'label_cor')
67 | os.makedirs(out_img_root, exist_ok=True)
68 | os.makedirs(out_txt_root, exist_ok=True)
69 |
70 | with open(txt) as f:
71 | ks = ['_'.join(l.strip().split()) for l in f]
72 |
73 | for k in ks:
74 | copy2(os.path.join(IMG_ROOT, k + '.png'), out_img_root)
75 | copy2(os.path.join(TXT_ROOT, k + '_label.txt'), out_txt_root)
76 | os.rename(os.path.join(out_txt_root, k + '_label.txt'), os.path.join(out_txt_root, k + '.txt'))
77 |
78 |
79 | go(TRAIN_TXT, 'train')
80 | go(VALID_TXT, 'valid')
81 | go(TEST_TXT, 'test')
82 | ```
83 |
84 | ### Step 4: clamp occlusion
85 | We assume the txt annotations contain only visible corners (the same format as the [Holistic 3D Vision Challenge, ECCV2020](https://competitions.codalab.org/competitions/24183#learn_the_details-evaluation)).
86 | For the MatterportLayout dataset, please copy the script below into `clamp_occ_corners.py` and run:
87 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/train/label_cor/*txt" --output_dir data/mp3d_layout/train_no_occ/label_cor`
88 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/valid/label_cor/*txt" --output_dir data/mp3d_layout/valid_no_occ/label_cor`
89 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/test/label_cor/*txt" --output_dir data/mp3d_layout/test_no_occ/label_cor`
90 | ```python
91 | import os
92 | import json
93 | import glob
94 | import numpy as np
95 | from shapely.geometry import LineString
96 |
97 | from misc import panostretch
98 |
99 | def cor_2_1d(cor, H=512, W=1024):
100 | bon_ceil_x, bon_ceil_y = [], []
101 | bon_floor_x, bon_floor_y = [], []
102 | n_cor = len(cor)
103 | for i in range(n_cor // 2):
104 | xys = panostretch.pano_connect_points(cor[i*2],
105 | cor[(i*2+2) % n_cor],
106 | z=-50, w=W, h=H)
107 | bon_ceil_x.extend(xys[:, 0])
108 | bon_ceil_y.extend(xys[:, 1])
109 | for i in range(n_cor // 2):
110 | xys = panostretch.pano_connect_points(cor[i*2+1],
111 | cor[(i*2+3) % n_cor],
112 | z=50, w=W, h=H)
113 | bon_floor_x.extend(xys[:, 0])
114 | bon_floor_y.extend(xys[:, 1])
115 | bon_ceil_x, bon_ceil_y = sort_xy_filter_unique(bon_ceil_x, bon_ceil_y, y_small_first=True)
116 | bon_floor_x, bon_floor_y = sort_xy_filter_unique(bon_floor_x, bon_floor_y, y_small_first=False)
117 | bon = np.zeros((2, W))
118 | bon[0] = np.interp(np.arange(W), bon_ceil_x, bon_ceil_y, period=W)
119 | bon[1] = np.interp(np.arange(W), bon_floor_x, bon_floor_y, period=W)
120 | #bon = ((bon + 0.5) / H - 0.5) * np.pi
121 | return bon
122 |
123 | def sort_xy_filter_unique(xs, ys, y_small_first=True):
124 | xs, ys = np.array(xs), np.array(ys)
125 | idx_sort = np.argsort(xs + ys / ys.max() * (int(y_small_first)*2-1))
126 | xs, ys = xs[idx_sort], ys[idx_sort]
127 | _, idx_unique = np.unique(xs, return_index=True)
128 | xs, ys = xs[idx_unique], ys[idx_unique]
129 | assert np.all(np.diff(xs) > 0)
130 | return xs, ys
131 |
132 | def find_occlusion(coor):
133 | u = panostretch.coorx2u(coor[:, 0])
134 | v = panostretch.coory2v(coor[:, 1])
135 | x, y = panostretch.uv2xy(u, v, z=-50)
136 | occlusion = []
137 | for i in range(len(x)):
138 | raycast = LineString([(0, 0), (x[i], y[i])])
139 | other_layout = []
140 | for j in range(i+1, len(x)):
141 | other_layout.append((x[j], y[j]))
142 | for j in range(0, i):
143 | other_layout.append((x[j], y[j]))
144 | other_layout = LineString(other_layout)
145 | occlusion.append(raycast.intersects(other_layout))
146 | return np.array(occlusion)
147 |
148 |
149 |
150 | if __name__ == '__main__':
151 |
152 | import argparse
153 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
154 | parser.add_argument('--ori_glob', required=True)
155 | parser.add_argument('--output_dir', required=True)
156 | args = parser.parse_args()
157 |
158 | os.makedirs(args.output_dir, exist_ok=True)
159 |
160 | paths = glob.glob(args.ori_glob)
161 | for path in paths:
162 | if path.endswith('json'):
163 | with open(path) as f:
164 | dt = json.load(f)
165 | cor = np.array(dt['uv'], np.float32)
166 | cor[:, 0] *= 1024
167 | cor[:, 1] *= 512
168 | else:
169 | with open(path) as f:
170 | cor = np.array([l.strip().split() for l in f]).astype(np.float32)
171 | cor = cor.reshape(-1, 4)
172 | duplicated = [False] * len(cor)
173 | for i in range(len(duplicated)):
174 | for j in range(i+1, len(duplicated)):
175 | if (cor[j] == cor[i]).sum() == 4:
176 | duplicated[j] = True
177 | cor = cor[~np.array(duplicated)].reshape(-1, 2)
178 | cor[:, 0] = cor[:, 0] % 1024
179 | cor = np.roll(cor[:, :2], -2 * np.argmin(cor[::2, 0]), 0)
180 | occlusion = find_occlusion(cor[::2].copy()).repeat(2)
181 |
182 | bon = cor_2_1d(cor)
183 |
184 | cor_v1 = []
185 | for i in range(0, len(cor), 2):
186 | if occlusion[i] & ~occlusion[(i+2) % len(cor)]:
187 | cur_x = cor[i, 0]
188 | next_x = cor[(i+2) % len(cor), 0]
189 | prev_x, j = None, i-2
190 | while prev_x is None:
191 | if j < 0:
192 | j += len(cor)
193 | if ~occlusion[j]:
194 | prev_x = cor[j, 0]
195 | break
196 | j -= 2
197 | dist2next = min(abs(next_x-cur_x), abs(next_x+1024-cur_x), abs(next_x-1024-cur_x))
198 | dist2prev = min(abs(prev_x-cur_x), abs(prev_x+1024-cur_x), abs(prev_x-1024-cur_x))
199 | # print(cor[i], prev_x, next_x, dist2next, dist2prev)
200 | if dist2prev < dist2next:
201 | cor_v1.append([prev_x, bon[0, (int(prev_x)+1) % 1024]])
202 | cor_v1.append([prev_x, bon[1, (int(prev_x)+1) % 1024]])
203 | else:
204 | cor_v1.append([next_x, bon[0, (int(next_x)-1) % 1024]])
205 | cor_v1.append([next_x, bon[1, (int(next_x)-1) % 1024]])
206 | elif ~occlusion[i]:
207 | cor_v1.extend(cor[i:i+2])
208 |
209 | cor_v1 = np.stack(cor_v1, 0)
210 | for _ in range(len(cor_v1)):
211 | if np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0):
212 | break
213 | cor_v1 = np.roll(cor_v1, 2, axis=0)
214 | if not np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0):
215 | cor_v1[2::2] = np.flip(cor_v1[2::2], 0)
216 | cor_v1[3::2] = np.flip(cor_v1[3::2], 0)
217 | for _ in range(len(cor_v1)):
218 | if np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0):
219 | break
220 | cor_v1 = np.roll(cor_v1, 2, axis=0)
221 | with open(os.path.join(args.output_dir, f'{os.path.split(path)[1].replace("json", "txt")}'), 'w') as f:
222 | for u, v in cor_v1:
223 | f.write(f'{u:.0f} {v:.0f}\n')
224 | ```
225 |
226 |
227 |
228 | ### Final file structure
229 | You should now have a `mp3d_layout` directory with the structure below for HoHoNet to train on.
230 |
231 | data
232 | └── mp3d_layout
233 | ├── train
234 | │ ├── img/*png
235 | │ └── label_cor/*txt
236 | ├── train_no_occ
237 | │ ├── img/*png
238 | │ └── label_cor/*txt
239 | ├── valid
240 | │ ├── img/*png
241 | │ └── label_cor/*txt
242 | ├── valid_no_occ
243 | │ ├── img/*png
244 | │ └── label_cor/*txt
245 | ├── test
246 | │ ├── img/*png
247 | │ └── label_cor/*txt
248 | └── test_no_occ
249 | ├── img/*png
250 | └── label_cor/*txt
251 |
--------------------------------------------------------------------------------
/README_prepare_data_s2d3d.md:
--------------------------------------------------------------------------------
1 | # Prepare Stanford2d3d dataset
2 |
3 | ## Dataset preparation
4 | ### Step 1: download source
5 | Please refer to [2D-3D-Semantics](https://github.com/alexsax/2D-3D-Semantics) to download the source data.
6 | Make sure `"$S2D3D_ROOT"/area_[1|2|3|4|5a|5b|6]/pano/[depth|rgb|semantic]` exists.
7 |
8 |
9 | ### Step 2: resize and copy into `data/stanford2D3D/` for depth modality
10 | The source data are in high resolution (`2048x4096`).
11 | To reduce data loading time during training, we resize them to `512x1024` and copy them into HoHoNet's `data/`.
12 | Copy the code below into `prepare_S2D3D_d.py`.
13 | Run `python prepare_S2D3D_d.py --ori_root "$S2D3D_ROOT" --new_root "$HOHO_ROOT/data/stanford2D3D/"`.
14 | ```python
15 | import os
16 | import glob
17 | import argparse
18 | from tqdm import tqdm
19 |
20 | import numpy as np
21 | from imageio import imread, imwrite
22 | from skimage.transform import rescale
23 |
24 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
25 | parser.add_argument('--ori_root', required=True)
26 | parser.add_argument('--new_root', required=True)
27 | args = parser.parse_args()
28 |
29 | areas = ['area_1', 'area_2', 'area_3', 'area_4', 'area_5a', 'area_5b', 'area_6']
30 |
31 | for area in areas:
32 | print('Processing:', area)
33 | os.makedirs(os.path.join(args.new_root, area, 'rgb'), exist_ok=True)
34 | os.makedirs(os.path.join(args.new_root, area, 'depth'), exist_ok=True)
35 | for fname in tqdm(os.listdir(os.path.join(args.ori_root, area, 'pano', 'rgb'))):
36 | if fname[0] == '.' or not fname.endswith('png'):
37 | continue
38 | rgb_path = os.path.join(args.ori_root, area, 'pano', 'rgb', fname)
39 | d_path = os.path.join(args.ori_root, area, 'pano', 'depth', fname[:-7] + 'depth.png')
40 | assert os.path.isfile(d_path)
41 |
42 | rgb = imread(rgb_path)[..., :3]
43 | depth = imread(d_path)
44 | rgb = rescale(rgb, 0.25, order=0, mode='wrap', anti_aliasing=False, preserve_range=True)
45 | depth = rescale(depth, 0.25, order=0, mode='wrap', anti_aliasing=False, preserve_range=True)
46 |
47 | imwrite(os.path.join(args.new_root, area, 'rgb', fname), rgb.astype(np.uint8))
48 | imwrite(os.path.join(args.new_root, area, 'depth', fname[:-7] + 'depth.png'), depth.astype(np.uint16))
49 | ```
50 |
51 | ### Step 3: resize and copy into `data/s2d3d_sem` for semantic modality
52 | Please download `semantic_labels.json`, `name2label.json`, and `colors.npy` on [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0).
53 | Put these files under your `$S2D3D_ROOT/`.
54 | Copy the code below into `prepare_S2D3D_sem.py`.
55 | Run `python prepare_S2D3D_sem.py --ori_root "$S2D3D_ROOT" --new_root "$HOHO_ROOT/data/s2d3d_sem/"`.
56 | ```python
57 | import os
58 | import json
59 | import glob
60 | from PIL import Image
61 | from tqdm import trange
62 | import numpy as np
63 | from shutil import copyfile
64 | import argparse
64 |
65 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
66 | parser.add_argument('--ori_root', required=True)
67 | parser.add_argument('--new_root', required=True)
68 | args = parser.parse_args()
69 |
70 | areas = ['area_1', 'area_2', 'area_3', 'area_4', 'area_5a', 'area_5b', 'area_6']
71 |
72 | with open(os.path.join(args.ori_root, 'semantic_labels.json')) as f:
73 | id2name = [name.split('_')[0] for name in json.load(f)] + ['']
74 |
75 | with open(os.path.join(args.ori_root, 'name2label.json')) as f:
76 | name2id = json.load(f)
77 |
78 | colors = np.load(os.path.join(args.ori_root, 'colors.npy'))
79 |
80 | id2label = np.array([name2id[name] for name in id2name], np.uint8)
81 |
82 | for area in areas:
83 | rgb_paths = sorted(glob.glob(os.path.join(args.ori_root, area, 'pano', 'rgb', '*png')))
84 | sem_paths = sorted(glob.glob(os.path.join(args.ori_root, area, 'pano', 'semantic', '*png')))
85 | os.makedirs(os.path.join(args.new_root, area, 'rgb'), exist_ok=True)
86 | os.makedirs(os.path.join(args.new_root, area, 'semantic'), exist_ok=True)
87 | os.makedirs(os.path.join(args.new_root, area, 'semantic_visualize'), exist_ok=True)
88 | for i in trange(len(rgb_paths)):
89 | rgb_k = os.path.split(rgb_paths[i])[-1]
90 | sem_k = os.path.split(sem_paths[i])[-1]
91 |
92 | # RGB
93 | rgb = Image.open(rgb_paths[i]).convert('RGB').resize((1024, 512), Image.LANCZOS)
94 | rgb.save(os.path.join(args.new_root, area, 'rgb', rgb_k))
95 | vis = np.array(rgb)
96 | # Semantic
97 | sem = np.array(Image.open(sem_paths[i]).resize((1024, 512), Image.NEAREST), np.int32)
98 | unk = (sem[..., 0] != 0)
99 | sem = id2label[sem[..., 1] * 256 + sem[..., 2]]
100 | sem[unk] = 0
101 | Image.fromarray(sem).save(os.path.join(args.new_root, area, 'semantic', rgb_k))
102 | # Visualization
103 | vis = vis // 2 + colors[sem] // 2
104 | Image.fromarray(vis).save(os.path.join(args.new_root, area, 'semantic_visualize', rgb_k))
105 | ```
106 |
107 | ### Step 4: prepare data split
108 | Download data split `fold[1|2|3]_[train|valid].txt` and `small_[train|valid|test].txt` on [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0).
109 | Put these `txt` files under `data/stanford2D3D`.
110 |
111 |
112 |
113 | ### Final file structure
114 | You should now have `stanford2D3D` and `s2d3d_sem` directories with the structure below for HoHoNet to train on.
115 |
116 | data
117 | ├── stanford2D3D
118 | │ ├── area_[1|2|3|4|5a|5b|6]
119 | │ │ ├── img/*png
120 | │ │ └── depth/*png
121 | │ ├── small_[train|valid|test].txt
122 | │ └── fold[1|2|3]_[train|valid].txt
123 | │
124 | └── s2d3d_sem
125 | └── area_[1|2|3|4|5a|5b|6]
126 | ├── rgb/*png
127 | └── semantic/*png
128 |
--------------------------------------------------------------------------------
/README_reproduction.md:
--------------------------------------------------------------------------------
1 | # Reproduction
2 |
3 | This document provides:
4 | 1. a guide to prepare the datasets for each task in our paper
5 | 2. instructions to reproduce the training and the numerical results in our paper
6 |
7 | ## Dataset
8 | Detailed instructions for preparing the data for each dataset and task:
9 | - `Matterport3d` x `Layout`
10 | - see [Prepare MatterportLayout dataset](README_prepare_data_mp3d_layout.md)
11 | - `Matterport3d` x `Depth (BiFuse's stitching)`
12 | - We use the rgb-d stitching provided by [BiFuse](https://github.com/Yeh-yu-hsuan/BiFuse)
13 | - Put their `mp3d_align/` under `data/`
14 | - Download data split via [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or via [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0) and put them under `data/matterport3d/`.
15 | - `Matterport3d` x `Depth (our new stitching)`
16 | - We remove the depth noise in BiFuse's stitching
17 | - This is not the version we use in our paper
18 |   - **TODO:** release new stitching code with experiment results on it
19 | - `Stanford2d3d` x `Depth`:
20 | - see [Prepare Stanford2d3d dataset](README_prepare_data_s2d3d.md)
21 | - `Stanford2d3d` x `Semantic segmentation`:
22 | - see [Prepare Stanford2d3d dataset](README_prepare_data_s2d3d.md)
23 |
24 | The overall file structure of the datasets is depicted as follows:
25 |
26 | data
27 | ├── mp3d_align # Stitching provided by BiFuse (https://github.com/Yeh-yu-hsuan/BiFuse)
28 | │ ├── 17DRP5sb8fy
29 | │ │ ├── 00ebbf3782c64d74aaf7dd39cd561175
30 | │ │ │ ├── color.jpg
31 | │ │ │ └── depth.npy
32 | │ │ └── ...
33 | │ └── ...
34 | │
35 | ├── matterport3d
36 | │ ├── scenes_abla_train.txt # 41 house id for ablation training
37 | │ ├── scenes_abla_valid.txt # 20 house id for ablation evaluation
38 | │ ├── scenes_train.txt # 61 house id for training following BiFuse
39 | │ ├── mp3d_scenes_test.txt # 28 house id for testing following BiFuse
40 |     │   └── mp3d_rgbd/             # Our new stitching which fixes the depth noise in BiFuse's version
41 | │ # Release new stitching code with new experiments later.
42 | │
43 | ├── mp3d_layout # Please follow README_prepare_data_mp3d_layout.md
44 | │ ├── train_no_occ
45 | │ │ ├── img/*png
46 | │ │ └── label_cor/*txt
47 | │ ├── valid_no_occ
48 | │ │ ├── img/*png
49 | │ │ └── label_cor/*txt
50 | │ └── test_no_occ
51 | │ ├── img/*png
52 | │ └── label_cor/*txt
53 | │
54 | ├── stanford2D3D # Please follow README_prepare_data_s2d3d.md
55 | │ ├── area_[1|2|3|4|5a|5b|6]
56 | │ │ ├── img/*png
57 | │ │ └── depth/*png
58 | │ ├── small_[train|valid|test].txt
59 | │ └── fold[1|2|3]_[train|valid].txt
60 | │
61 | └── s2d3d_sem # Please follow README_prepare_data_s2d3d.md
62 | └── area_[1|2|3|4|5a|5b|6]
63 | ├── rgb/*png
64 | └── semantic/*png
65 |
66 |
67 | ## Reproduction: training
68 | The configs for reproducing the experiments are all in `config/`.
69 |
70 | Just run:
71 | ```
72 | python train.py --cfg {PATH_TO_CONFIG}
73 | ```
74 | to train with the same settings as the experiments in our paper.
75 | Note that results from different runs with the same config can differ, as the random seed is not fixed.
76 |
77 | Some examples:
78 | ```
79 | python train.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml
80 | python train.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml
81 | python train.py --cfg config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml
82 | python train.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml
83 | ```
84 |
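The random seed is not fixed (see the note above); if you want more repeatable runs, a common workaround is to fix the seeds before launching training. This is not something `train.py` does by itself; the sketch below only shows the standard Python/NumPy/PyTorch calls one could add.

```python
import random
import numpy as np
import torch

def fix_seed(seed=0):
    # Fix the Python, NumPy and PyTorch RNGs. Disabling cuDNN benchmarking
    # trades some speed for repeatability.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
```
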
85 | ## Reproduction: measuring FPS
86 | Just run:
87 | ```
88 | python count_params_flops.py --cfg {PATH_TO_CONFIG}
89 | ```
90 | It measures the model's average feed-forward time.
91 | The results reported in our paper are obtained on a GeForce RTX 2080.
92 |
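For reference, this kind of timing is typically done by averaging many forward passes after a warm-up, with CUDA synchronization around the timed region. The sketch below only illustrates the general procedure; the exact protocol used by `count_params_flops.py` may differ.

```python
import time
import torch

@torch.no_grad()
def measure_fps(model, h=512, w=1024, n_warmup=10, n_run=50):
    model = model.cuda().eval()
    x = torch.rand(1, 3, h, w, device='cuda')
    for _ in range(n_warmup):      # warm-up passes: cudnn autotune, allocator, etc.
        model(x)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(n_run):
        model(x)
    torch.cuda.synchronize()       # wait for all queued kernels to finish
    return n_run / (time.time() - t0)
```
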
93 | ## Reproduction: quantitative evaluation
94 | Please make sure the datasets and the trained weights are organized as instructed above.
95 | If not, update the config accordingly, or directly pass the path to the trained weights to the testing script via `--pth`.
96 |
97 |
98 |
99 |
100 | ### `Matterport3D` x `depth` (BiFuse's stitching and setting)
101 | Assume the pretrained weights are located at:
102 | - `ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1_hardnet/ep60.pth`
103 | - `ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1/ep60.pth`
104 |
105 | Run:
106 | ```
107 | python test_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml
108 | python test_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml
109 | ```
110 |
111 | Results:
112 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 |
113 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ |
114 | | HOHO_depth_dct_efficienthc_TransEn1 | 52 | 0.1488 | 0.2862 | 0.5138 | 0.0871 | 0.0505 | 0.8786 | 0.9519 | 0.9771 |
115 | | HOHO_depth_dct_efficienthc_TransEn1_hardnet | 67 | 0.1482 | 0.2761 | 0.4968 | 0.0857 | 0.0494 | 0.8830 | 0.9547 | 0.9797 |
116 |
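The metric columns follow the standard monocular depth evaluation protocol. As a rough reference, they can be computed as in the sketch below over pixels with valid ground truth; the exact masking and depth clipping used by `test_depth.py` is authoritative.

```python
import numpy as np

def depth_metrics(pred, gt):
    # pred/gt: float depth maps of the same shape; evaluate valid pixels only.
    mask = gt > 0
    pred, gt = pred[mask], gt[mask]
    thresh = np.maximum(pred / gt, gt / pred)
    return {
        'mre': np.mean(np.abs(pred - gt) / gt),
        'mae': np.mean(np.abs(pred - gt)),
        'rmse': np.sqrt(np.mean((pred - gt) ** 2)),
        'rmse_log': np.sqrt(np.mean((np.log(pred) - np.log(gt)) ** 2)),
        'log10': np.mean(np.abs(np.log10(pred) - np.log10(gt))),
        'delta_1': np.mean(thresh < 1.25),
        'delta_2': np.mean(thresh < 1.25 ** 2),
        'delta_3': np.mean(thresh < 1.25 ** 3),
    }
```
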
117 |
118 |
119 |
120 | ### `Matterport3D` x `depth` (our new stitching and setting)
121 | **TODO**
122 |
123 |
124 |
125 |
126 | ### `Matterport3D` x `layout` (LayoutNetv2's setting)
127 | Assume the pretrained weights are located at:
128 | - `ckpt/mp3d_layout_HOHO_layout_aug_efficienthc_Transen1_resnet34/ep300.pth`
129 |
130 | Run the following to predict layouts and store the results in txt files:
131 | ```
132 | python test_layout.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml --img_glob "data/mp3d_layout/test/img/*" --output_dir output/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34/
133 | ```
134 |
135 | Run the following to evaluate the predictions:
136 | ```
137 | python eval_layout.py --gt_glob "data/mp3d_layout/test/label_cor/*" --dt_glob "output/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34/*"
138 | ```
139 |
140 | Results:
141 | | Exp | fps | 2DIoU | 3DIoU | RMSE | delta_1 |
142 | | :-- | :-- | :---- | :---- | :--- | :------ |
143 | | HOHO_layout_aug_efficienthc_Transen1_resnet34 | 111 | 82.32 | 79.88 | 0.22 | 0.95 |
144 |
145 | **[Note]** Our implementation of the depth-based evaluation (i.e., RMSE, delta_1) is very different from LayoutNetv2's, so the results from the two repos are not directly comparable.
146 |
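For intuition about the 2DIoU column: a plan-view IoU between two corner txt files can be approximated as below, using the floor corners, the 1.6 m camera height assumed during data preparation, and shapely (already used by the data-preparation script). This is only a sketch under those assumptions; `eval_layout.py` is the authoritative implementation.

```python
import numpy as np
from shapely.geometry import Polygon

CAMERA_HEIGHT = 1.6  # metres, as assumed in the MatterportLayout preparation

def corners_to_floor_xy(txt_path, W=1024, H=512):
    cor = np.loadtxt(txt_path).reshape(-1, 2)
    floor = cor[1::2]                      # rows alternate ceiling/floor corners
    lon = (floor[:, 0] / W - 0.5) * 2 * np.pi
    lat = (0.5 - floor[:, 1] / H) * np.pi  # negative below the horizon
    dist = CAMERA_HEIGHT / np.tan(-lat)    # horizontal distance to each corner
    return np.stack([dist * np.cos(lon), dist * np.sin(lon)], 1)

def iou_2d(gt_txt, dt_txt):
    # Assumes both corner lists trace out simple (non self-intersecting) polygons.
    gt = Polygon(corners_to_floor_xy(gt_txt))
    dt = Polygon(corners_to_floor_xy(dt_txt))
    inter = gt.intersection(dt).area
    return inter / (gt.area + dt.area - inter)
```
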
147 |
148 |
149 |
150 | ### `Stanford2d3d` x `depth` (BiFuse's setting)
151 | Assume the pretrained weights are located at:
152 | - `ckpt/s2d3d_depth_HOHO_depth_dct_efficienthc_TransEn1/ep60.pth`
153 |
154 | Run:
155 | ```
156 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml
157 | ```
158 |
159 | Results:
160 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 |
161 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ |
162 | | HOHO_depth_dct_efficienthc_TransEn1 | 52 | 0.1014 | 0.2027 | 0.3834 | 0.0668 | 0.0438 | 0.9054 | 0.9693 | 0.9886 |
163 |
164 |
165 |
166 |
167 | ### `Stanford2d3d` x `depth` (GeoReg360's setting)
168 | Assume the pretrained weights are located at:
169 | - `ckpt/s2d3d_depth_HOHO_depthS_dct_efficienthc_TransEn1/ep60.pth`
170 | - `ckpt/s2d3d_depth_HOHO_depthS_SGD_dct_efficienthc_TransEn1/ep60.pth`
171 |
172 | Run:
173 | ```
174 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml --clip 100
175 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depthS_dct_efficienthc_TransEn1.yaml --clip 100
176 | ```
177 |
178 | **[Note]** Remember to add `--clip 100` to disable depth clipping for a fair comparison with GeoReg360's setting.
179 |
180 | Results:
181 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 |
182 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ |
183 | | HOHO_depthS_SGD_dct_efficienthc_TransEn1 | 106 | 0.1114 | 0.2197 | 0.4083 | 0.0737 | 0.0502 | 0.8671 | 0.9694 | 0.9916 |
184 | | HOHO_depthS_dct_efficienthc_TransEn1 | 104 | 0.1040 | 0.2134 | 0.3940 | 0.0678 | 0.0475 | 0.8955 | 0.9749 | 0.9933 |
185 |
186 |
187 |
188 |
189 | ### `Stanford2d3d` x `semantic segmentation`
190 | Run:
191 | ```
192 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml
193 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml
194 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml
195 |
196 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml
197 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml
198 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml
199 |
200 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml
201 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml
202 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml
203 |
204 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml
205 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml
206 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml
207 | ```
208 |
209 | Results:
210 | | Exp | fps | iou | acc |
211 | | :-- | :-- | :-- | :-- |
212 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple | 202 | 43.04 | 53.06 |
213 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple | 204 | 36.27 | 48.45 |
214 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple | 202 | 43.14 | 54.81 |
215 |
216 | | Exp | fps | iou | acc |
217 | | :-- | :-- | :-- | :-- |
218 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple | 135 | 46.49 | 56.33 |
219 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple | 135 | 37.18 | 48.60 |
220 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple | 135 | 46.09 | 56.81 |
221 |
222 | | Exp | fps | iou | acc |
223 | | :-- | :-- | :-- | :-- |
224 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb | 10 | 53.94 | 64.30 |
225 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb | 10 | 45.03 | 61.70 |
226 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb | 10 | 56.87 | 68.94 |
227 |
228 | | Exp | fps | iou | acc |
229 | | :-- | :-- | :-- | :-- |
230 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101 | 10 | 59.05 | 68.91 |
231 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101 | 10 | 49.70 | 65.86 |
232 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101 | 10 | 60.28 | 71.85 |
233 |
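The `iou` and `acc` columns are the mean per-class IoU and the overall pixel accuracy. A generic confusion-matrix sketch is shown below, assuming 13 semantic classes plus an unknown label 0 (as produced by the data-preparation script) that is ignored during evaluation; `test_sem.py` defines the exact protocol.

```python
import numpy as np

def sem_metrics(pred, gt, num_classes=14, ignore_label=0):
    # pred/gt: integer per-pixel label maps of the same shape.
    pred, gt = pred.astype(np.int64), gt.astype(np.int64)
    valid = gt != ignore_label
    cm = np.bincount(num_classes * gt[valid] + pred[valid],
                     minlength=num_classes ** 2).reshape(num_classes, num_classes)
    inter = np.diag(cm)
    union = cm.sum(0) + cm.sum(1) - inter
    iou = inter[1:] / np.maximum(union[1:], 1)  # skip the ignored class
    acc = inter.sum() / max(cm.sum(), 1)
    return iou.mean(), acc
```
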
--------------------------------------------------------------------------------
/assets/label13_weight.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/label13_weight.pth
--------------------------------------------------------------------------------
/assets/pano_asmasuxybohhcj.depth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/pano_asmasuxybohhcj.depth.png
--------------------------------------------------------------------------------
/assets/pano_asmasuxybohhcj.layout.txt:
--------------------------------------------------------------------------------
1 | 83.7 161.1
2 | 83.7 332.7
3 | 126.6 133.3
4 | 126.6 358.2
5 | 181.2 170.3
6 | 181.2 324.7
7 | 354.4 176.4
8 | 354.4 319.4
9 | 609.0 149.1
10 | 609.0 343.6
11 | 941.1 160.6
12 | 941.1 333.1
13 |
--------------------------------------------------------------------------------
/assets/pano_asmasuxybohhcj.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/pano_asmasuxybohhcj.png
--------------------------------------------------------------------------------
/assets/repo_teaser.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/repo_teaser.jpg
--------------------------------------------------------------------------------
/assets/snapshot_depth.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/snapshot_depth.jpg
--------------------------------------------------------------------------------
/assets/snapshot_layout.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/snapshot_layout.jpg
--------------------------------------------------------------------------------
/config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/mp3d_scenes_test.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 60
23 | batch_size: 4
24 | save_every: 60
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 64
49 | loss: l1
50 |
--------------------------------------------------------------------------------
/config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/mp3d_scenes_test.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 60
23 | batch_size: 4
24 | save_every: 60
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: HarDNet
36 | kwargs:
37 | depth_wise: False
38 | arch: 68
39 | pretrained: True
40 | decode_config:
41 | module: EfficientHeightReduction
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 256
46 | num_layers: 1
47 | modalities_config:
48 | DepthEstimator:
49 | basis: dct
50 | n_components: 64
51 | loss: l1
52 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct128_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 128
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct256_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 256
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct32_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 32
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct512_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 512
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_LSTM.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: GlobalHeightStage
40 | refine_config:
41 | module: LSTM
42 | modalities_config:
43 | DepthEstimator:
44 | basis: dct
45 | n_components: 64
46 | loss: l1
47 |
48 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_Linear.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: GlobalHeightStage
40 | refine_config:
41 | module: Linear
42 | modalities_config:
43 | DepthEstimator:
44 | basis: dct
45 | n_components: 64
46 | loss: l1
47 |
48 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: GlobalHeightStage
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 64
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_LSTM.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: LSTM
42 | modalities_config:
43 | DepthEstimator:
44 | basis: dct
45 | n_components: 64
46 | loss: l1
47 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_Linear.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: Linear
42 | modalities_config:
43 | DepthEstimator:
44 | basis: dct
45 | n_components: 64
46 | loss: l1
47 |
48 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 64
49 | loss: l1
50 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_TransEn1_resnet34.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet34
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 64
49 | loss: l1
50 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_lin128_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: linear
48 | n_components: 128
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_lin256_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: linear
48 | n_components: 256
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_lin32_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: linear
48 | n_components: 32
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_lin512_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: linear
48 | n_components: 512
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_depth/ablation/tuning___HOHO_depth_lin64_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: CorruptMP3dDepthDataset
7 | common_kwargs:
8 | root: data/mp3d_align
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/matterport3d/scenes_abla_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/matterport3d/scenes_abla_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 40
23 | batch_size: 4
24 | save_every: 40
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: linear
48 | n_components: 64
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: PanoCorBonDataset
7 | train_kwargs:
8 | root_dir: data/mp3d_layout/train_no_occ
9 | flip: True
10 | rotate: True
11 | gamma: True
12 | stretch: True
13 | valid_kwargs:
14 | root_dir: data/mp3d_layout/valid_no_occ
15 |
16 | training:
17 | epoch: 300
18 | batch_size: 4
19 | save_every: 300
20 | optim_lr: 0.0001
21 | optim_poly_gamma: 0.9
22 |
23 | model:
24 | file: lib.model.hohonet
25 | modelclass: HoHoNet
26 | kwargs:
27 | emb_dim: 256
28 | backbone_config:
29 | module: Resnet
30 | kwargs:
31 | backbone: resnet34
32 | decode_config:
33 | module: EfficientHeightReduction
34 | refine_config:
35 | module: TransEn
36 | kwargs:
37 | position_encode: 256
38 | nhead: 8
39 | num_layers: 1
40 | dim_feedforward: 2048
41 | modalities_config:
42 | LayoutEstimator:
43 | cor_weight: 1.
44 | bon_weight: 1.
45 | last_bias: False
46 | last_ks: 1
47 |
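Any of these YAML files is selected with --cfg, and individual keys can be overridden from the command line: every entry script collects the trailing tokens into args.opts, and update_config() (lib/config.py below) merges them after the file. A minimal sketch of that parsing, with a hypothetical batch-size override:

    import argparse
    from lib.config import config, update_config

    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', required=True)
    parser.add_argument('opts', nargs=argparse.REMAINDER, default=None)
    # e.g.: --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml training.batch_size 2
    args = parser.parse_args()
    update_config(config, args)          # YAML first, then the key/value pairs in opts
    print(config.training.batch_size)    # 2 with the override above, 4 otherwise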
--------------------------------------------------------------------------------
/config/s2d3d_depth/HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dDepthDataset
7 | common_kwargs:
8 | root: data/stanford2D3D
9 | hw: (256, 512)
10 | dmax: 100.
11 | train_kwargs:
12 | scene_txt: data/stanford2D3D/small_train.txt
13 | rand_rotate: True
14 | rand_flip: True
15 | rand_gamma: True
16 | valid_kwargs:
17 | scene_txt: data/stanford2D3D/small_test.txt
18 | rand_rotate: False
19 | rand_flip: False
20 | rand_gamma: False
21 |
22 | training:
23 | optim: SGD
24 | epoch: 60
25 | batch_size: 8
26 | save_every: 60
27 | optim_lr: 0.01
28 | weight_decay: 0.0005
29 | optim_poly_gamma: 0.9
30 | optim_betas: (0.9, 0.999)
31 |
32 | model:
33 | file: lib.model.hohonet
34 | modelclass: HoHoNet
35 | kwargs:
36 | emb_dim: 256
37 | backbone_config:
38 | module: Resnet
39 | kwargs:
40 | backbone: resnet50
41 | input_height: 256
42 | decode_config:
43 | module: EfficientHeightReduction
44 | refine_config:
45 | module: TransEn
46 | kwargs:
47 | position_encode: 128
48 | num_layers: 1
49 | modalities_config:
50 | DepthEstimator:
51 | basis: dct
52 | n_components: 64
53 | loss: l1
54 | output_height: 256
55 |
56 |
--------------------------------------------------------------------------------
/config/s2d3d_depth/HOHO_depthS_dct_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dDepthDataset
7 | common_kwargs:
8 | root: data/stanford2D3D
9 | hw: (256, 512)
10 | dmax: 100.
11 | train_kwargs:
12 | scene_txt: data/stanford2D3D/small_train.txt
13 | rand_rotate: True
14 | rand_flip: True
15 | rand_gamma: True
16 | valid_kwargs:
17 | scene_txt: data/stanford2D3D/small_test.txt
18 | rand_rotate: False
19 | rand_flip: False
20 | rand_gamma: False
21 |
22 | training:
23 | epoch: 60
24 | batch_size: 4
25 | save_every: 60
26 | optim_lr: 0.0001
27 | optim_poly_gamma: 0.9
28 | optim_betas: (0.9, 0.999)
29 |
30 | model:
31 | file: lib.model.hohonet
32 | modelclass: HoHoNet
33 | kwargs:
34 | emb_dim: 256
35 | backbone_config:
36 | module: Resnet
37 | kwargs:
38 | backbone: resnet50
39 | input_height: 256
40 | decode_config:
41 | module: EfficientHeightReduction
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 128
46 | num_layers: 1
47 | modalities_config:
48 | DepthEstimator:
49 | basis: dct
50 | n_components: 64
51 | loss: l1
52 | output_height: 256
53 |
54 |
--------------------------------------------------------------------------------
/config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dDepthDataset
7 | common_kwargs:
8 | root: data/stanford2D3D
9 | hw: (512, 1024)
10 | train_kwargs:
11 | scene_txt: data/stanford2D3D/fold1_train.txt
12 | rand_rotate: True
13 | rand_flip: True
14 | rand_gamma: True
15 | valid_kwargs:
16 | scene_txt: data/stanford2D3D/fold1_valid.txt
17 | rand_rotate: False
18 | rand_flip: False
19 | rand_gamma: False
20 |
21 | training:
22 | epoch: 60
23 | batch_size: 4
24 | save_every: 60
25 | optim_lr: 0.0001
26 | optim_poly_gamma: 0.9
27 | optim_betas: (0.9, 0.999)
28 |
29 | model:
30 | file: lib.model.hohonet
31 | modelclass: HoHoNet
32 | kwargs:
33 | emb_dim: 256
34 | backbone_config:
35 | module: Resnet
36 | kwargs:
37 | backbone: resnet50
38 | decode_config:
39 | module: EfficientHeightReduction
40 | refine_config:
41 | module: TransEn
42 | kwargs:
43 | position_encode: 256
44 | num_layers: 1
45 | modalities_config:
46 | DepthEstimator:
47 | basis: dct
48 | n_components: 64
49 | loss: l1
50 |
51 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | train_kwargs:
11 | fold: 1_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 1_valid
16 |
17 | training:
18 | epoch: 60
19 | batch_size: 4
20 | save_every: 60
21 | optim_lr: 0.0001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | input_extra: 1
35 | backbone: resnet101
36 | input_height: 1024
37 | decode_config:
38 | module: EfficientHeightReduction
39 | refine_config:
40 | module: TransEn
41 | kwargs:
42 | position_encode: 512
43 | num_layers: 1
44 | modalities_config:
45 | SemanticSegmenter:
46 | num_classes: 13
47 | label_weight: data/s2d3d_sem/label13_weight.pth
48 | basis: dct
49 | loss: ce
50 | n_components: 64
51 | output_height: 1024
52 |
53 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | depth: False
11 | train_kwargs:
12 | fold: 1_train
13 | flip: True
14 | rotate: True
15 | valid_kwargs:
16 | fold: 1_valid
17 |
18 | training:
19 | epoch: 60
20 | batch_size: 4
21 | save_every: 60
22 | optim_lr: 0.0001
23 | optim_poly_gamma: 0.9
24 | optim_betas: (0.9, 0.999)
25 |
26 | model:
27 | file: lib.model.hohonet
28 | modelclass: HoHoNet
29 | kwargs:
30 | emb_dim: 256
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | backbone: resnet101
35 | input_height: 1024
36 | decode_config:
37 | module: EfficientHeightReduction
38 | refine_config:
39 | module: TransEn
40 | kwargs:
41 | position_encode: 512
42 | num_layers: 1
43 | modalities_config:
44 | SemanticSegmenter:
45 | num_classes: 13
46 | label_weight: data/s2d3d_sem/label13_weight.pth
47 | basis: dct
48 | loss: ce
49 | n_components: 64
50 | output_height: 1024
51 |
52 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | train_kwargs:
11 | fold: 2_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 2_valid
16 |
17 | training:
18 | epoch: 60
19 | batch_size: 4
20 | save_every: 60
21 | optim_lr: 0.0001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | input_extra: 1
35 | backbone: resnet101
36 | input_height: 1024
37 | decode_config:
38 | module: EfficientHeightReduction
39 | refine_config:
40 | module: TransEn
41 | kwargs:
42 | position_encode: 512
43 | num_layers: 1
44 | modalities_config:
45 | SemanticSegmenter:
46 | num_classes: 13
47 | label_weight: data/s2d3d_sem/label13_weight.pth
48 | basis: dct
49 | loss: ce
50 | n_components: 64
51 | output_height: 1024
52 |
53 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | depth: False
11 | train_kwargs:
12 | fold: 2_train
13 | flip: True
14 | rotate: True
15 | valid_kwargs:
16 | fold: 2_valid
17 |
18 | training:
19 | epoch: 60
20 | batch_size: 4
21 | save_every: 60
22 | optim_lr: 0.0001
23 | optim_poly_gamma: 0.9
24 | optim_betas: (0.9, 0.999)
25 |
26 | model:
27 | file: lib.model.hohonet
28 | modelclass: HoHoNet
29 | kwargs:
30 | emb_dim: 256
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | backbone: resnet101
35 | input_height: 1024
36 | decode_config:
37 | module: EfficientHeightReduction
38 | refine_config:
39 | module: TransEn
40 | kwargs:
41 | position_encode: 512
42 | num_layers: 1
43 | modalities_config:
44 | SemanticSegmenter:
45 | num_classes: 13
46 | label_weight: data/s2d3d_sem/label13_weight.pth
47 | basis: dct
48 | loss: ce
49 | n_components: 64
50 | output_height: 1024
51 |
52 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | train_kwargs:
11 | fold: 3_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 3_valid
16 |
17 | training:
18 | epoch: 60
19 | batch_size: 4
20 | save_every: 60
21 | optim_lr: 0.0001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | input_extra: 1
35 | backbone: resnet101
36 | input_height: 1024
37 | decode_config:
38 | module: EfficientHeightReduction
39 | refine_config:
40 | module: TransEn
41 | kwargs:
42 | position_encode: 512
43 | num_layers: 1
44 | modalities_config:
45 | SemanticSegmenter:
46 | num_classes: 13
47 | label_weight: data/s2d3d_sem/label13_weight.pth
48 | basis: dct
49 | loss: ce
50 | n_components: 64
51 | output_height: 1024
52 |
53 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (1024, 2048)
10 | depth: False
11 | train_kwargs:
12 | fold: 3_train
13 | flip: True
14 | rotate: True
15 | valid_kwargs:
16 | fold: 3_valid
17 |
18 | training:
19 | epoch: 60
20 | batch_size: 4
21 | save_every: 60
22 | optim_lr: 0.0001
23 | optim_poly_gamma: 0.9
24 | optim_betas: (0.9, 0.999)
25 |
26 | model:
27 | file: lib.model.hohonet
28 | modelclass: HoHoNet
29 | kwargs:
30 | emb_dim: 256
31 | backbone_config:
32 | module: Resnet
33 | kwargs:
34 | backbone: resnet101
35 | input_height: 1024
36 | decode_config:
37 | module: EfficientHeightReduction
38 | refine_config:
39 | module: TransEn
40 | kwargs:
41 | position_encode: 512
42 | num_layers: 1
43 | modalities_config:
44 | SemanticSegmenter:
45 | num_classes: 13
46 | label_weight: data/s2d3d_sem/label13_weight.pth
47 | basis: dct
48 | loss: ce
49 | n_components: 64
50 | output_height: 1024
51 |
52 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (256, 512)
10 | train_kwargs:
11 | fold: 1_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 1_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 256
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 128
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 256
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (256, 512)
10 | train_kwargs:
11 | fold: 2_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 2_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 256
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 128
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 256
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (256, 512)
10 | train_kwargs:
11 | fold: 3_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 3_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 256
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 128
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 256
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (64, 128)
10 | train_kwargs:
11 | fold: 1_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 1_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 64
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 32
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 64
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (64, 128)
10 | train_kwargs:
11 | fold: 2_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 2_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 64
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 32
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 64
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml:
--------------------------------------------------------------------------------
1 | ckpt_root: ckpt
2 | cuda: True
3 | num_workers: 8
4 |
5 | dataset:
6 | name: S2d3dSemDataset
7 | common_kwargs:
8 | root: data/s2d3d_sem/
9 | hw: (64, 128)
10 | train_kwargs:
11 | fold: 3_train
12 | flip: True
13 | rotate: True
14 | valid_kwargs:
15 | fold: 3_valid
16 |
17 | training:
18 | epoch: 300
19 | batch_size: 16
20 | save_every: 300
21 | optim_lr: 0.001
22 | optim_poly_gamma: 0.9
23 | optim_betas: (0.9, 0.999)
24 |
25 | model:
26 | file: lib.model.hohonet
27 | modelclass: HoHoNet
28 | kwargs:
29 | emb_dim: 256
30 | input_norm: ugscnn
31 | backbone_config:
32 | module: SimpleEncoder
33 | kwargs:
34 | input_extra: 1
35 | input_height: 64
36 | block: conv3x3max
37 | expand: 2
38 | decode_config:
39 | module: EfficientHeightReduction
40 | kwargs:
41 | out_ch: 256
42 | refine_config:
43 | module: TransEn
44 | kwargs:
45 | position_encode: 32
46 | num_layers: 1
47 | modalities_config:
48 | SemanticSegmenter:
49 | num_classes: 13
50 | label_weight: data/s2d3d_sem/label13_weight.pth
51 | basis: dct
52 | loss: ce
53 | n_components: 64
54 | output_height: 64
55 | dropout: 0.5
56 |
57 |
--------------------------------------------------------------------------------
/count_params_flops.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import importlib
4 | from tqdm import tqdm, trange
5 | from collections import Counter
6 |
7 | import numpy as np
8 |
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.functional as F
12 |
13 | from thop import profile, clever_format
14 |
15 | from lib.config import config, update_config
16 |
17 |
18 | if __name__ == '__main__':
19 |
20 | # Parse args & config
21 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
22 | parser.add_argument('--cfg', required=True)
23 | parser.add_argument('opts',
24 | help='Modify config options using the command-line',
25 | default=None, nargs=argparse.REMAINDER)
26 | args = parser.parse_args()
27 | update_config(config, args)
28 |
29 | # Init global variable
30 | device = 'cuda' if config.cuda else 'cpu'
31 | if config.cuda and config.cuda_benchmark:
32 | torch.backends.cudnn.benchmark = True
33 |
34 | # Init network
35 | model_file = importlib.import_module(config.model.file)
36 | model_class = getattr(model_file, config.model.modelclass)
37 | net = model_class(**config.model.kwargs).to(device)
38 | net.eval()
39 |
40 | # testing
41 | layers = net
42 | inputs = [torch.randn(1, 3, 512, 1024).to(device)]
43 | with torch.no_grad():
44 | flops, params = profile(layers, inputs)
45 |     print('input :', [v.shape for v in inputs])
46 | print(f'flops : {flops/(10**9):.2f} G')
47 | print(f'params: {params/(10**6):.2f} M')
48 |
49 | import time
50 | fps = []
51 | with torch.no_grad():
52 | layers(inputs[0])
53 | for _ in range(50):
54 | eps_time = time.time()
55 | layers(inputs[0])
56 | torch.cuda.synchronize()
57 | eps_time = time.time() - eps_time
58 | fps.append(eps_time)
59 | print(f'fps : {1 / (sum(fps) / len(fps)):.2f}')
60 |
61 |
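Note that the profiled input is hard-coded to a 1x3x512x1024 RGB panorama, matching the Matterport3D depth configs; to measure a config trained at another resolution (e.g. the 256x512 Stanford2D3D ones, or the semantic configs that take an extra depth channel), the inputs line has to be adapted accordingly, for instance:

    inputs = [torch.randn(1, 3, 256, 512).to(device)]  # match dataset.common_kwargs.hw of the chosen config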
--------------------------------------------------------------------------------
/eval_layout.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import glob
4 | import argparse
5 | import numpy as np
6 | from tqdm import tqdm
7 | from shapely.geometry import Polygon
8 |
9 | from lib.dataset.dataset_layout import cor_2_1d
10 | from lib.misc import post_proc
11 |
12 |
13 | def prepare_gtdt_pairs(gt_glob, dt_glob):
14 | gt_paths = sorted(glob.glob(gt_glob))
15 | dt_paths_json = dict([(os.path.split(v)[-1].split('.')[0], v)
16 | for v in glob.glob(dt_glob) if v.endswith('json')])
17 | dt_paths_txt = dict([(os.path.split(v)[-1].split('.')[0], v)
18 | for v in glob.glob(dt_glob) if v.endswith('txt')])
19 |
20 | gtdt_pairs = []
21 | for gt_path in gt_paths:
22 | k = os.path.split(gt_path)[-1].split('.')[0]
23 | if k in dt_paths_json:
24 | gtdt_pairs.append((gt_path, dt_paths_json[k]))
25 | else:
26 | gtdt_pairs.append((gt_path, dt_paths_txt[k]))
27 | return gtdt_pairs
28 |
29 |
30 | def layout_2_depth(cor_id, h, w, return_mask=False):
31 | # Convert corners to per-column boundary first
32 | # Up -pi/2, Down pi/2
33 | vc, vf = cor_2_1d(cor_id, h, w)
34 | vc = vc[None, :] # [1, w]
35 | vf = vf[None, :] # [1, w]
36 | assert (vc > 0).sum() == 0
37 | assert (vf < 0).sum() == 0
38 |
39 | # Per-pixel v coordinate (vertical angle)
40 | vs = ((np.arange(h) + 0.5) / h - 0.5) * np.pi
41 | vs = np.repeat(vs[:, None], w, axis=1) # [h, w]
42 |
43 | # Floor-plane to depth
44 | floor_h = 1.6
45 | floor_d = np.abs(floor_h / np.sin(vs))
46 |
47 |     # Wall-to-camera distance on the horizontal plane crossing the camera center
48 | cs = floor_h / np.tan(vf)
49 |
50 | # Ceiling-plane to depth
51 | ceil_h = np.abs(cs * np.tan(vc)) # [1, w]
52 | ceil_d = np.abs(ceil_h / np.sin(vs)) # [h, w]
53 |
54 | # Wall to depth
55 | wall_d = np.abs(cs / np.cos(vs)) # [h, w]
56 |
57 | # Recover layout depth
58 | floor_mask = (vs > vf)
59 | ceil_mask = (vs < vc)
60 | wall_mask = (~floor_mask) & (~ceil_mask)
61 | depth = np.zeros([h, w], np.float32) # [h, w]
62 | depth[floor_mask] = floor_d[floor_mask]
63 | depth[ceil_mask] = ceil_d[ceil_mask]
64 | depth[wall_mask] = wall_d[wall_mask]
65 |
66 | assert (depth == 0).sum() == 0
67 | if return_mask:
68 | return depth, floor_mask, ceil_mask, wall_mask
69 | return depth
70 |
71 |
72 | def test_general(dt_cor_id, gt_cor_id, w, h, losses):
73 | dt_floor_coor = dt_cor_id[1::2]
74 | dt_ceil_coor = dt_cor_id[0::2]
75 | gt_floor_coor = gt_cor_id[1::2]
76 | gt_ceil_coor = gt_cor_id[0::2]
77 | assert (dt_floor_coor[:, 0] != dt_ceil_coor[:, 0]).sum() == 0
78 | assert (gt_floor_coor[:, 0] != gt_ceil_coor[:, 0]).sum() == 0
79 |
80 | # Eval 3d IoU and height error(in meter)
81 | N = len(dt_floor_coor)
82 | ch = -1.6
83 | dt_floor_xy = post_proc.np_coor2xy(dt_floor_coor, ch, 1024, 512, floorW=1, floorH=1)
84 | gt_floor_xy = post_proc.np_coor2xy(gt_floor_coor, ch, 1024, 512, floorW=1, floorH=1)
85 | dt_poly = Polygon(dt_floor_xy)
86 | gt_poly = Polygon(gt_floor_xy)
87 | if not gt_poly.is_valid:
88 |         print('Skip invalid ground truth polygon')
89 | return
90 |
91 | # 2D IoU
92 | try:
93 | area_dt = dt_poly.area
94 | area_gt = gt_poly.area
95 | area_inter = dt_poly.intersection(gt_poly).area
96 | iou2d = area_inter / (area_gt + area_dt - area_inter)
97 | except:
98 | iou2d = 0
99 |
100 | # 3D IoU
101 | try:
102 | cch_dt = post_proc.get_z1(dt_floor_coor[:, 1], dt_ceil_coor[:, 1], ch, 512)
103 | cch_gt = post_proc.get_z1(gt_floor_coor[:, 1], gt_ceil_coor[:, 1], ch, 512)
104 | h_dt = abs(cch_dt.mean() - ch)
105 | h_gt = abs(cch_gt.mean() - ch)
106 | area3d_inter = area_inter * min(h_dt, h_gt)
107 | area3d_pred = area_dt * h_dt
108 | area3d_gt = area_gt * h_gt
109 | iou3d = area3d_inter / (area3d_pred + area3d_gt - area3d_inter)
110 | except:
111 | iou3d = 0
112 |
113 | # rmse & delta_1
114 | gt_layout_depth = layout_2_depth(gt_cor_id, h, w)
115 | try:
116 | dt_layout_depth = layout_2_depth(dt_cor_id, h, w)
117 | except:
118 | dt_layout_depth = np.zeros_like(gt_layout_depth)
119 | rmse = ((gt_layout_depth - dt_layout_depth)**2).mean() ** 0.5
120 | thres = np.maximum(gt_layout_depth/dt_layout_depth, dt_layout_depth/gt_layout_depth)
121 | delta_1 = (thres < 1.25).mean()
122 |
123 | # Add a result
124 | n_corners = len(gt_floor_coor)
125 | if n_corners % 2 == 1:
126 | n_corners = 'odd'
127 | elif n_corners < 10:
128 | n_corners = str(n_corners)
129 | else:
130 | n_corners = '10+'
131 | losses[n_corners]['2DIoU'].append(iou2d)
132 | losses[n_corners]['3DIoU'].append(iou3d)
133 | losses[n_corners]['rmse'].append(rmse)
134 | losses[n_corners]['delta_1'].append(delta_1)
135 | losses['overall']['2DIoU'].append(iou2d)
136 | losses['overall']['3DIoU'].append(iou3d)
137 | losses['overall']['rmse'].append(rmse)
138 | losses['overall']['delta_1'].append(delta_1)
139 |
140 |
141 | if __name__ == '__main__':
142 |
143 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
144 | parser.add_argument('--dt_glob',
145 |                         help='NOTE: Remember to quote your glob path.'
146 |                              ' Files assumed to be json from inference.py')
147 |     parser.add_argument('--gt_glob',
148 |                         help='NOTE: Remember to quote your glob path.'
149 |                              ' Files assumed to be txt')
150 | parser.add_argument('--w', default=1024, type=int,
151 | help='GT images width')
152 | parser.add_argument('--h', default=512, type=int,
153 | help='GT images height')
154 | args = parser.parse_args()
155 |
156 | # Prepare (gt, dt) pairs
157 | gtdt_pairs = prepare_gtdt_pairs(args.gt_glob, args.dt_glob)
158 |
159 | # Testing
160 | losses = dict([
161 | (n_corner, {'2DIoU': [], '3DIoU': [], 'rmse': [], 'delta_1': []})
162 | for n_corner in ['4', '6', '8', '10+', 'odd', 'overall']
163 | ])
164 | for gt_path, dt_path in tqdm(gtdt_pairs, desc='Testing'):
165 | # Parse ground truth
166 | with open(gt_path) as f:
167 | gt_cor_id = np.array([l.split() for l in f], np.float32)
168 |
169 |         # Parse inferred result
170 | if dt_path.endswith('json'):
171 | with open(dt_path) as f:
172 | dt = json.load(f)
173 | dt_cor_id = np.array(dt['uv'], np.float32)
174 | dt_cor_id[:, 0] *= args.w
175 | dt_cor_id[:, 1] *= args.h
176 | else:
177 | dt_cor_id = np.loadtxt(dt_path, np.float32)
178 |
179 | test_general(dt_cor_id, gt_cor_id, args.w, args.h, losses)
180 |
181 | for k, result in losses.items():
182 | iou2d = np.array(result['2DIoU'])
183 | iou3d = np.array(result['3DIoU'])
184 | rmse = np.array(result['rmse'])
185 | delta_1 = np.array(result['delta_1'])
186 | if len(iou2d) == 0:
187 | continue
188 | print('GT #Corners: %s (%d instances)' % (k, len(iou2d)))
189 | print(' 2DIoU : %.2f' % (iou2d.mean() * 100))
190 | print(' 3DIoU : %.2f' % (iou3d.mean() * 100))
191 | print(' RMSE : %.2f' % (rmse.mean()))
192 | print(' delta^1: %.2f' % (delta_1.mean()))
193 |
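The same helpers can be driven programmatically; a minimal sketch with hypothetical glob patterns, taking the txt branch for both ground truth and predictions (JSON predictions additionally need their normalized uv rescaled by the image size, as in the __main__ block above):

    import numpy as np
    from eval_layout import prepare_gtdt_pairs, test_general

    losses = {k: {'2DIoU': [], '3DIoU': [], 'rmse': [], 'delta_1': []}
              for k in ['4', '6', '8', '10+', 'odd', 'overall']}
    for gt_path, dt_path in prepare_gtdt_pairs('data/mp3d_layout/valid_no_occ/*.txt', 'output/*.layout.txt'):
        gt_cor_id = np.loadtxt(gt_path, np.float32)
        dt_cor_id = np.loadtxt(dt_path, np.float32)
        test_general(dt_cor_id, gt_cor_id, 1024, 512, losses)
    print('overall 3DIoU: %.2f' % (100 * np.mean(losses['overall']['3DIoU'])))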
--------------------------------------------------------------------------------
/infer_depth.py:
--------------------------------------------------------------------------------
1 | import os, sys, time, glob
2 | import argparse
3 | import importlib
4 | from tqdm import tqdm
5 | from imageio import imread, imwrite
6 | import torch
7 | import numpy as np
8 |
9 | from lib.config import config, update_config
10 |
11 |
12 | if __name__ == '__main__':
13 |
14 | # Parse args & config
15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument('--cfg', required=True)
17 | parser.add_argument('--pth', required=True)
18 | parser.add_argument('--out', required=True)
19 | parser.add_argument('--inp', required=True)
20 | parser.add_argument('opts',
21 | help='Modify config options using the command-line',
22 | default=None, nargs=argparse.REMAINDER)
23 | args = parser.parse_args()
24 | update_config(config, args)
25 | device = 'cuda' if config.cuda else 'cpu'
26 |
27 | # Parse input paths
28 | rgb_lst = glob.glob(args.inp)
29 | if len(rgb_lst) == 0:
30 | print('No images found')
31 |         sys.exit()
32 |
33 | # Init model
34 | model_file = importlib.import_module(config.model.file)
35 | model_class = getattr(model_file, config.model.modelclass)
36 | net = model_class(**config.model.kwargs)
37 | net.load_state_dict(torch.load(args.pth, map_location=device))
38 | net = net.eval().to(device)
39 |
40 | # Run inference
41 | with torch.no_grad():
42 | for path in tqdm(rgb_lst):
43 | rgb = imread(path)
44 | x = torch.from_numpy(rgb).permute(2,0,1)[None].float() / 255.
45 | if x.shape[2:] != config.dataset.common_kwargs.hw:
46 | x = torch.nn.functional.interpolate(x, config.dataset.common_kwargs.hw, mode='area')
47 | x = x.to(device)
48 | pred_depth = net.infer(x)
49 | if not torch.is_tensor(pred_depth):
50 | pred_depth = pred_depth.pop('depth')
51 |
52 | fname = os.path.splitext(os.path.split(path)[1])[0]
53 | imwrite(
54 | os.path.join(args.out, f'{fname}.depth.png'),
55 | pred_depth.mul(1000).squeeze().cpu().numpy().astype(np.uint16)
56 | )
57 |
58 |
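The prediction is written as a 16-bit PNG in millimetres (the mul(1000) above), e.g. via a call like python infer_depth.py --cfg <config> --pth <checkpoint> --inp 'path/to/*.png' --out output/ (paths hypothetical). Reading a result back to metres:

    import numpy as np
    from imageio import imread

    depth_mm = imread('output/pano_asmasuxybohhcj.depth.png')  # uint16, millimetres (hypothetical path)
    depth_m = depth_mm.astype(np.float32) / 1000.0             # metres, as predicted by net.infer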
--------------------------------------------------------------------------------
/infer_layout.py:
--------------------------------------------------------------------------------
1 | import os, sys, time, glob
2 | import argparse
3 | import importlib
4 | from tqdm import tqdm
5 | from imageio import imread, imwrite
6 | import torch
7 | import numpy as np
8 |
9 | from lib.config import config, update_config
10 |
11 |
12 | if __name__ == '__main__':
13 |
14 | # Parse args & config
15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument('--cfg', required=True)
17 | parser.add_argument('--pth', required=True)
18 | parser.add_argument('--out', required=True)
19 | parser.add_argument('--inp', required=True)
20 | parser.add_argument('opts',
21 | help='Modify config options using the command-line',
22 | default=None, nargs=argparse.REMAINDER)
23 | args = parser.parse_args()
24 | update_config(config, args)
25 | device = 'cuda' if config.cuda else 'cpu'
26 |
27 | # Parse input paths
28 | rgb_lst = glob.glob(args.inp)
29 | if len(rgb_lst) == 0:
30 | print('No images found')
31 |         sys.exit()
32 |
33 | # Init model
34 | model_file = importlib.import_module(config.model.file)
35 | model_class = getattr(model_file, config.model.modelclass)
36 | net = model_class(**config.model.kwargs)
37 | net.load_state_dict(torch.load(args.pth, map_location=device))
38 | net = net.eval().to(device)
39 |
40 | # Run inference
41 | with torch.no_grad():
42 | for path in tqdm(rgb_lst):
43 | rgb = imread(path)
44 | x = torch.from_numpy(rgb).permute(2,0,1)[None].float() / 255.
45 | x = x.to(device)
46 | cor_id = net.infer(x)['cor_id']
47 |
48 | fname = os.path.splitext(os.path.split(path)[1])[0]
49 | with open(os.path.join(args.out, f'{fname}.layout.txt'), 'w') as f:
50 | for u, v in cor_id:
51 | f.write(f'{u:.1f} {v:.1f}\n')
52 |
53 |
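Unlike infer_depth.py, the panorama is fed at its native resolution (there is no resize step), and each output is a plain text file with one 'u v' corner coordinate per line, which is exactly what eval_layout.py consumes on its txt branch. Loading one back (hypothetical path):

    import numpy as np

    cor_id = np.loadtxt('output/pano_asmasuxybohhcj.layout.txt', np.float32)  # (N, 2) image coordinates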
--------------------------------------------------------------------------------
/lib/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from yacs.config import CfgNode as CN
3 |
4 | config = CN()
5 |
6 | config.ckpt_root = 'ckpt'
7 | config.cuda = True
8 | config.cuda_benchmark = True
9 | config.num_workers = 8
10 |
11 | config.dataset = CN()
12 | config.dataset.name = 'PanoCorBonDataset'
13 | config.dataset.common_kwargs = CN(new_allowed=True)
14 | config.dataset.train_kwargs = CN(new_allowed=True)
15 | config.dataset.valid_kwargs = CN(new_allowed=True)
16 |
17 | config.training = CN()
18 | config.training.epoch = 300
19 | config.training.batch_size = 4
20 | config.training.save_every = 100
21 | config.training.optim = 'Adam'
22 | config.training.optim_lr = 0.0001
23 | config.training.optim_betas = (0.9, 0.999)
24 | config.training.weight_decay = 0.0
25 | config.training.wd_group_mode = 'bn and bias'
26 | config.training.optim_milestons = [0.5, 0.9]
27 | config.training.optim_gamma = 0.2
28 | config.training.optim_poly_gamma = -1.0
29 | config.training.fix_encoder_bn = False
30 |
31 | config.model = CN()
32 | config.model.file = 'lib.model.HorizonNet'
33 | config.model.modelclass = 'HorizonNet'
34 | config.model.kwargs = CN(new_allowed=True)
35 |
36 |
37 | def update_config(cfg, args):
38 | cfg.defrost()
39 |
40 | cfg.merge_from_file(args.cfg)
41 | cfg.merge_from_list(args.opts)
42 |
43 | cfg.freeze()
44 |
45 | def infer_exp_id(cfg_path):
46 | cfg_path = cfg_path.split('config/')[-1]
47 | if cfg_path.endswith('.yaml'):
48 | cfg_path = cfg_path[:-len('.yaml')]
49 | return '_'.join(cfg_path.split('/'))
50 |
51 |
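infer_exp_id() maps a config path to a flat experiment identifier by dropping the config/ prefix and the .yaml suffix and joining the remaining path parts with underscores, for example:

    from lib.config import infer_exp_id

    print(infer_exp_id('config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml'))
    # -> s2d3d_depth_HOHO_depth_dct_efficienthc_TransEn1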
--------------------------------------------------------------------------------
/lib/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataset_layout import PanoCorBonDataset
2 | from .dataset_s2d3d_sem import S2d3dSemDataset
3 | from .dataset_depth import CorruptMP3dDepthDataset, MP3dDepthDataset, S2d3dDepthDataset
4 |
5 |
--------------------------------------------------------------------------------
/lib/dataset/dataset_depth.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import numpy as np
4 |
5 | from imageio import imread
6 | from scipy.spatial.transform import Rotation
7 | from lib.misc.pano_lsd_align import rotatePanorama
8 |
9 | import torch
10 | import torch.utils.data as data
11 |
12 |
13 | class BaseDataset(data.Dataset):
14 | def __init__(self, dmin=0.01, dmax=10, hw=(512, 1024),
15 | rand_rotate=False, rand_flip=False, rand_gamma=False,
16 | rand_pitch=0, rand_roll=0,
17 | fix_pitch=0, fix_roll=0):
18 | self.fname = []
19 | self.rgb_paths, self.d_paths = [], []
20 | self.dmin = dmin
21 | self.dmax = dmax
22 | self.hw = hw
23 | self.rand_rotate = rand_rotate
24 | self.rand_flip = rand_flip
25 | self.rand_gamma = rand_gamma
26 | self.rand_pitch = rand_pitch
27 | self.rand_roll = rand_roll
28 | self.fix_pitch = fix_pitch
29 | self.fix_roll = fix_roll
30 |
31 | def __len__(self):
32 | return len(self.rgb_paths)
33 |
34 | def read_rgb(self, path):
35 | return imread(path)
36 |
37 | def read_depth(self, path):
38 | raise NotImplementedError
39 |
40 | def __getitem__(self, idx):
41 | # Read data
42 | fname = self.fname[idx]
43 | color = self.read_rgb(self.rgb_paths[idx])
44 | depth = self.read_depth(self.d_paths[idx])
45 |
46 | # To tensor and reshape to [C, H, W]
47 | color = torch.from_numpy(color).permute(2,0,1).float() / 255
48 | depth = torch.from_numpy(depth)[None].float()
49 | depth = torch.clamp(depth, max=self.dmax)
50 |
51 | # Resize
52 | if color.shape[1:] != self.hw:
53 | color = torch.nn.functional.interpolate(color[None], self.hw, mode='area')[0]
54 | if depth.shape[1:] != self.hw:
55 | depth = torch.nn.functional.interpolate(depth[None], self.hw, mode='nearest')[0]
56 |
57 | # Data augmentation
58 | if self.rand_rotate:
59 | shift = np.random.randint(self.hw[1])
60 | color = torch.roll(color, shift, dims=-1)
61 | depth = torch.roll(depth, shift, dims=-1)
62 |
63 | if self.rand_flip and np.random.randint(2):
64 | color = torch.flip(color, dims=[-1])
65 | depth = torch.flip(depth, dims=[-1])
66 |
67 | if self.rand_gamma:
68 | p = np.random.uniform(1, 1.2)
69 | if np.random.randint(2) == 0:
70 | p = 1 / p
71 | color = color ** p
72 |
73 | # Rotation augmentation
74 |         if self.rand_pitch > 0 or self.rand_roll > 0 or self.fix_pitch != 0 or self.fix_roll != 0:
75 | color = color.permute(1,2,0).numpy()
76 | depth = depth.permute(1,2,0).numpy()
77 | if self.fix_pitch:
78 | rot = self.fix_pitch
79 | vp = Rotation.from_rotvec([rot * np.pi / 180, 0, 0]).as_matrix()
80 | color = rotatePanorama(color, vp, order=0)
81 | elif self.rand_pitch > 0:
82 | rot = np.random.randint(0, self.rand_pitch)
83 | vp = Rotation.from_rotvec([rot * np.pi / 180, 0, 0]).as_matrix()
84 | color = rotatePanorama(color, vp, order=0)
85 | depth = rotatePanorama(depth, vp, order=0)
86 | if self.fix_roll:
87 | rot = self.fix_roll
88 | vp = Rotation.from_rotvec([0, rot * np.pi / 180, 0]).as_matrix()
89 | color = rotatePanorama(color, vp, order=0)
90 | elif self.rand_roll > 0:
91 | rot = np.random.randint(0, self.rand_roll)
92 | vp = Rotation.from_rotvec([0, rot * np.pi / 180, 0]).as_matrix()
93 | color = rotatePanorama(color, vp, order=0)
94 | depth = rotatePanorama(depth, vp, order=0)
95 | color = torch.from_numpy(color).permute(2,0,1).float()
96 | depth = torch.from_numpy(depth).permute(2,0,1).float()
97 |
98 | return {'x': color, 'depth': depth, 'fname': fname.ljust(200)}
99 |
100 |
101 | class CorruptMP3dDepthDataset(BaseDataset):
102 | def __init__(self, root, scene_txt, **kwargs):
103 | super(CorruptMP3dDepthDataset, self).__init__(**kwargs)
104 |
105 | # List all rgbd paths
106 | with open(scene_txt) as f:
107 | scene_split_ids = set(f.read().split())
108 | for scene in os.listdir(root):
109 | scene_root = os.path.join(root, scene)
110 | if not os.path.isdir(scene_root) or scene not in scene_split_ids:
111 | continue
112 | for cam in os.listdir(scene_root):
113 | cam_root = os.path.join(scene_root, cam)
114 | if not os.path.isdir(cam_root):
115 | continue
116 | self.rgb_paths.append(os.path.join(cam_root, 'color.jpg'))
117 | self.d_paths.append(os.path.join(cam_root, 'depth.npy'))
118 | assert len(self.rgb_paths) == len(self.d_paths)
119 | for path in self.rgb_paths:
120 | self.fname.append('_'.join(path.split('/')))
121 |
122 | def read_depth(self, path):
123 | depth = np.load(path)
124 | depth[depth == 0.01] = 0
125 | return depth
126 |
127 |
128 | class MP3dDepthDataset(BaseDataset):
129 | def __init__(self, root, scene_txt, **kwargs):
130 | super(MP3dDepthDataset, self).__init__(**kwargs)
131 |
132 | # List all rgbd paths
133 | with open(scene_txt) as f:
134 | scene_split_ids = set(f.read().split())
135 | for scene in os.listdir(root):
136 | scene_root = os.path.join(root, scene)
137 | if not os.path.isdir(scene_root) or scene not in scene_split_ids:
138 | continue
139 | self.rgb_paths.extend(sorted(glob.glob(os.path.join(scene_root, '*rgb.png'))))
140 | self.d_paths.extend(sorted(glob.glob(os.path.join(scene_root, '*depth.exr'))))
141 | assert len(self.rgb_paths) == len(self.d_paths)
142 | for path in self.rgb_paths:
143 | self.fname.append('_'.join(path.split('/')))
144 |
145 | def read_depth(self, path):
146 | import Imath
147 | import OpenEXR
148 | f = OpenEXR.InputFile(path)
149 | dw = f.header()['dataWindow']
150 | size = (dw.max.x - dw.min.x + 1, dw.max.y - dw.min.y + 1)
151 | depth = np.frombuffer(f.channel('Y', Imath.PixelType(Imath.PixelType.FLOAT)), np.float32)
152 | depth = depth.reshape(size[1], size[0])
153 | f.close()
154 | return depth.astype(np.float32)
155 |
156 |
157 | class S2d3dDepthDataset(BaseDataset):
158 | def __init__(self, root, scene_txt, **kwargs):
159 | super(S2d3dDepthDataset, self).__init__(**kwargs)
160 |
161 | # List all rgbd paths
162 | with open(scene_txt) as f:
163 | path_pair = [l.strip().split() for l in f]
164 | for rgb_path, dep_path in path_pair:
165 | self.rgb_paths.append(os.path.join(root, rgb_path))
166 | self.d_paths.append(os.path.join(root, dep_path))
167 | self.fname.append(os.path.split(rgb_path)[1])
168 |
169 | def read_depth(self, path):
170 | depth = imread(path)
171 | return np.where(depth==65535, 0, depth/512)
172 |
173 |
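All three dataset classes share the BaseDataset pipeline above and return dicts with 'x' (3xHxW RGB in [0, 1]), 'depth' (1xHxW, clamped to dmax) and a space-padded 'fname'. A minimal loading sketch, assuming the Matterport3D data and split files referenced by the mp3d_depth configs are in place:

    import torch.utils.data as data
    from lib.dataset import CorruptMP3dDepthDataset

    dataset = CorruptMP3dDepthDataset(
        root='data/mp3d_align',
        scene_txt='data/matterport3d/scenes_abla_train.txt',
        hw=(512, 1024), rand_rotate=True, rand_flip=True, rand_gamma=True)
    loader = data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=8)
    batch = next(iter(loader))
    print(batch['x'].shape, batch['depth'].shape)  # (4, 3, 512, 1024), (4, 1, 512, 1024)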
--------------------------------------------------------------------------------
/lib/dataset/dataset_s2d3d_sem.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import numpy as np
4 | from imageio import imread
5 | from shapely.geometry import LineString
6 |
7 | import torch
8 | import torch.utils.data as data
9 | import torch.nn.functional as F
10 |
11 | from lib.misc import panostretch
12 |
13 | __FOLD__ = {
14 | '1_train': ['area_1', 'area_2', 'area_3', 'area_4', 'area_6'],
15 | '1_valid': ['area_5a', 'area_5b'],
16 | '2_train': ['area_1', 'area_3', 'area_5a', 'area_5b', 'area_6'],
17 | '2_valid': ['area_2', 'area_4'],
18 | '3_train': ['area_2', 'area_4', 'area_5a', 'area_5b'],
19 | '3_valid': ['area_1', 'area_3', 'area_6'],
20 | }
21 |
22 | class S2d3dSemDataset(data.Dataset):
23 | NUM_CLASSES = 13
24 | ID2CLASS = ['beam', 'board', 'bookcase', 'ceiling', 'chair', 'clutter', 'column', 'door', 'floor', 'sofa', 'table', 'wall', 'window']
25 | def __init__(self, root, fold, depth=True, hw=(512, 1024), mask_black=True, flip=False, rotate=False):
26 | assert fold in __FOLD__, 'Unknown fold'
27 | self.depth = depth
28 | self.hw = hw
29 | self.mask_black = mask_black
30 | self.rgb_paths = []
31 | self.sem_paths = []
32 | self.dep_paths = []
33 | for dname in __FOLD__[fold]:
34 | self.rgb_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'rgb', '*png'))))
35 | self.sem_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'semantic', '*png'))))
36 | self.dep_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'depth', '*png'))))
37 | assert len(self.rgb_paths)
38 | assert len(self.rgb_paths) == len(self.sem_paths)
39 | assert len(self.rgb_paths) == len(self.dep_paths)
40 | self.flip = flip
41 | self.rotate = rotate
42 |
43 | def __len__(self):
44 | return len(self.rgb_paths)
45 |
46 | def __getitem__(self, idx):
47 | rgb = torch.FloatTensor(imread(self.rgb_paths[idx]) / 255.).permute(2,0,1)
48 | sem = torch.LongTensor(imread(self.sem_paths[idx])) - 1
49 | if self.depth:
50 | dep = imread(self.dep_paths[idx])
51 | dep = np.where(dep==65535, 0, dep/512)
52 | dep = np.clip(dep, 0, 4)
53 | dep = torch.FloatTensor(dep[None])
54 | rgb = torch.cat([rgb, dep], 0)
55 | H, W = rgb.shape[1:]
56 | if (H, W) != self.hw:
57 | rgb = F.interpolate(rgb[None], size=self.hw, mode='bilinear', align_corners=False)[0]
58 | sem = F.interpolate(sem[None,None].float(), size=self.hw, mode='nearest')[0,0].long()
59 |
60 | # Random flip
61 | if self.flip and np.random.randint(2) == 0:
62 | rgb = torch.flip(rgb, (-1,))
63 | sem = torch.flip(sem, (-1,))
64 |
65 | # Random horizontal rotate
66 | if self.rotate:
67 | dx = np.random.randint(W)
68 | rgb = torch.roll(rgb, dx, dims=-1)
69 | sem = torch.roll(sem, dx, dims=-1)
70 |
71 | # Mask out top-down black
72 | if self.mask_black:
73 | sem[rgb.sum(0) == 0] = -1
74 |
75 | # Convert all data to tensor
76 | out_dict = {
77 | 'x': rgb,
78 | 'sem': sem,
79 | 'fname': os.path.split(self.rgb_paths[idx])[1].ljust(200),
80 | }
81 | return out_dict
82 |
83 |
84 | if __name__ == '__main__':
85 |
86 | import argparse
87 | from tqdm import tqdm
88 |     from PIL import Image
89 | parser = argparse.ArgumentParser()
90 | parser.add_argument('--root_dir', default='data/valid/')
91 | parser.add_argument('--ith', default=0, type=int,
92 |                         help='Pick a data id to visualize.'
93 |                              ' -1 to visualize all data')
94 |     parser.add_argument('--flip', action='store_true',
95 |                         help='whether to randomly flip')
96 |     parser.add_argument('--rotate', action='store_true',
97 |                         help='whether to apply random horizontal rotation')
98 |     parser.add_argument('--gamma', action='store_true',
99 |                         help='whether to apply random luminance change')
100 |     parser.add_argument('--stretch', action='store_true',
101 |                         help='whether to apply random pano stretch')
102 | parser.add_argument('--dist_clip', default=20)
103 | parser.add_argument('--out_dir', default='data/vis_dataset')
104 | args = parser.parse_args()
105 |
106 | os.makedirs(args.out_dir, exist_ok=True)
107 |
108 | print('args:')
109 | for key, val in vars(args).items():
110 | print(' {:16} {}'.format(key, val))
111 |     from lib.dataset.dataset_layout import PanoCorBonDataset  # the demo below visualizes the layout dataset
112 | dataset = PanoCorBonDataset(
113 | root_dir=args.root_dir,
114 | flip=args.flip, rotate=args.rotate, gamma=args.gamma, stretch=args.stretch)
115 |
116 | # Showing some information about dataset
117 | print('len(dataset): {}'.format(len(dataset)))
118 | batch = dataset[args.ith]
119 | for k, v in batch.items():
120 | if torch.is_tensor(v):
121 | print(k, v.shape)
122 | else:
123 | print(k, v)
124 | print('=' * 20)
125 |
126 | if args.ith >= 0:
127 | to_visualize = [dataset[args.ith]]
128 | else:
129 | to_visualize = dataset
130 |
131 | import matplotlib.pyplot as plt
132 | cmap = plt.get_cmap('bwr')
133 | for batch in tqdm(to_visualize):
134 | fname = os.path.split(batch['img_path'])[-1]
135 | img = batch['x'].permute(1,2,0).numpy()
136 | y_bon = batch['bon'].numpy()
137 | y_bon = ((y_bon / np.pi + 0.5) * img.shape[0]).round().astype(int)
138 | img[y_bon[0], np.arange(len(y_bon[0])), 1] = 1
139 | img[y_bon[1], np.arange(len(y_bon[1])), 1] = 1
140 | img = (img * 255).astype(np.uint8)
141 | img_pad = np.full((3, 1024, 3), 255, np.uint8)
142 | img_vot = batch['vot'].repeat(30, 1).numpy()
143 | img_vot = (img_vot / args.dist_clip + 1) / 2
144 | vot_mask = (img_vot >= 0) & (img_vot <= 1)
145 | img_vot = (cmap(img_vot)[...,:3] * 255).astype(np.uint8)
146 | img_vot[~vot_mask] = 0
147 | out = np.concatenate([img_vot, img_pad, img], 0)
148 | Image.fromarray(out).save(os.path.join(args.out_dir, fname))
149 |
150 |
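fold selects one of the area splits in __FOLD__ above, and with depth=True (the default) the depth panorama is stacked as a fourth input channel, which is why the s2d3d_sem configs that keep depth set input_extra: 1 on the backbone. A minimal sketch:

    from lib.dataset import S2d3dSemDataset

    dataset = S2d3dSemDataset(root='data/s2d3d_sem/', fold='1_valid', hw=(256, 512))
    sample = dataset[0]
    print(sample['x'].shape, sample['sem'].shape)  # torch.Size([4, 256, 512]) torch.Size([256, 512])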
--------------------------------------------------------------------------------
/lib/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/lib/misc/__init__.py
--------------------------------------------------------------------------------
/lib/misc/gen_txt_structured3d.py:
--------------------------------------------------------------------------------
1 | '''
2 | Help generate txt for train.py
3 | Please contact https://github.com/bertjiazheng/Structured3D for dataset.
4 | '''
5 |
6 | import os
7 | import glob
8 | import argparse
9 |
10 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
11 | parser.add_argument('--root', required=True,
12 | help='path to the dataset directory')
13 | parser.add_argument('--train_txt', required=True,
14 | help='path to save txt for train')
15 | parser.add_argument('--valid_txt', required=True,
16 | help='path to save txt for valid')
17 | parser.add_argument('--test_txt', required=True,
18 | help='path to save txt for test')
19 | args = parser.parse_args()
20 |
21 | train_scene = ['scene_%05d' % i for i in range(0, 3000)]
22 | valid_scene = ['scene_%05d' % i for i in range(3000, 3250)]
23 | test_scene = ['scene_%05d' % i for i in range(3250, 3500)]
24 |
25 | # Simple check: all directories exist
26 | for path in train_scene + valid_scene + test_scene:
27 | assert os.path.isdir(os.path.join(args.root, path)), '%s not found' % path
28 |
29 | def gen_pairs(scene_id_lst):
30 | pairs = []
31 | for scene_id in scene_id_lst:
32 | for fname in os.listdir(os.path.join(args.root, scene_id, 'rgb')):
33 | room_id = os.path.split(fname)[1].split('_')[0]
34 |
35 |             img_k = os.path.join(scene_id, 'rgb', fname)
36 |             layout_k = os.path.join(scene_id, 'layout', room_id + '_layout.txt')
37 | assert os.path.isfile(os.path.join(args.root, img_k))
38 | assert os.path.isfile(os.path.join(args.root, layout_k))
39 | pairs.append((img_k, layout_k))
40 | return pairs
41 |
42 | with open(args.train_txt, 'w') as f:
43 | pairs = gen_pairs(train_scene)
44 | f.write('\n'.join([' '.join(p) for p in pairs]))
45 |
46 | with open(args.valid_txt, 'w') as f:
47 | pairs = gen_pairs(valid_scene)
48 | f.write('\n'.join([' '.join(p) for p in pairs]))
49 |
50 | with open(args.test_txt, 'w') as f:
51 | pairs = gen_pairs(test_scene)
52 | f.write('\n'.join([' '.join(p) for p in pairs]))
53 |
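Each line of the generated txt files pairs a panorama with its layout annotation, both relative to --root, in the format train.py expects; schematically (file names are hypothetical):

    scene_00000/rgb/<room_id>_rgb.png scene_00000/layout/<room_id>_layout.txt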
--------------------------------------------------------------------------------
/lib/misc/panostretch.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import numpy as np
3 | from scipy.ndimage import map_coordinates
4 |
5 |
6 | def uv_meshgrid(w, h):
7 | uv = np.stack(np.meshgrid(range(w), range(h)), axis=-1)
8 | uv = uv.astype(np.float64)
9 | uv[..., 0] = ((uv[..., 0] + 0.5) / w - 0.5) * 2 * np.pi
10 | uv[..., 1] = ((uv[..., 1] + 0.5) / h - 0.5) * np.pi
11 | return uv
12 |
13 |
14 | @functools.lru_cache()
15 | def _uv_tri(w, h):
16 | uv = uv_meshgrid(w, h)
17 | sin_u = np.sin(uv[..., 0])
18 | cos_u = np.cos(uv[..., 0])
19 | tan_v = np.tan(uv[..., 1])
20 | return sin_u, cos_u, tan_v
21 |
22 |
23 | def uv_tri(w, h):
24 | sin_u, cos_u, tan_v = _uv_tri(w, h)
25 | return sin_u.copy(), cos_u.copy(), tan_v.copy()
26 |
27 |
28 | def coorx2u(x, w=1024):
29 | return ((x + 0.5) / w - 0.5) * 2 * np.pi
30 |
31 |
32 | def coory2v(y, h=512):
33 | return ((y + 0.5) / h - 0.5) * np.pi
34 |
35 |
36 | def u2coorx(u, w=1024):
37 | return (u / (2 * np.pi) + 0.5) * w - 0.5
38 |
39 |
40 | def v2coory(v, h=512):
41 | return (v / np.pi + 0.5) * h - 0.5
42 |
43 |
44 | def uv2xy(u, v, z=-50):
45 | c = z / np.tan(v)
46 | x = c * np.cos(u)
47 | y = c * np.sin(u)
48 | return x, y
49 |
50 |
51 | def pano_connect_points(p1, p2, z=-50, w=1024, h=512):
52 | if p1[0] == p2[0]:
53 | return np.array([p1, p2], np.float32)
54 |
55 | u1 = coorx2u(p1[0], w)
56 | v1 = coory2v(p1[1], h)
57 | u2 = coorx2u(p2[0], w)
58 | v2 = coory2v(p2[1], h)
59 |
60 | x1, y1 = uv2xy(u1, v1, z)
61 | x2, y2 = uv2xy(u2, v2, z)
62 |
63 | if abs(p1[0] - p2[0]) < w / 2:
64 | pstart = np.ceil(min(p1[0], p2[0]))
65 | pend = np.floor(max(p1[0], p2[0]))
66 | else:
67 | pstart = np.ceil(max(p1[0], p2[0]))
68 | pend = np.floor(min(p1[0], p2[0]) + w)
69 | coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64)
70 | vx = x2 - x1
71 | vy = y2 - y1
72 | us = coorx2u(coorxs, w)
73 | ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx)
74 | cs = np.sqrt((x1 + ps * vx) ** 2 + (y1 + ps * vy) ** 2)
75 | vs = np.arctan2(z, cs)
76 | coorys = v2coory(vs, h)
77 |
78 | return np.stack([coorxs, coorys], axis=-1)
79 |
80 |
81 | def pano_stretch(img, corners, kx, ky, order=1):
82 | '''
83 | img: [H, W, C]
84 | corners: [N, 2] in image coordinate (x, y) format
85 | kx: Stretching along front-back direction
86 | ky: Stretching along left-right direction
87 | order: Interpolation order. 0 for nearest-neighbor. 1 for bilinear.
88 | '''
89 |
90 | # Process image
91 | sin_u, cos_u, tan_v = uv_tri(img.shape[1], img.shape[0])
92 | u0 = np.arctan2(sin_u * kx / ky, cos_u)
93 | v0 = np.arctan(tan_v * np.sin(u0) / sin_u * ky)
94 |
95 | refx = (u0 / (2 * np.pi) + 0.5) * img.shape[1] - 0.5
96 | refy = (v0 / np.pi + 0.5) * img.shape[0] - 0.5
97 |
98 |     # [TODO]: using opencv remap could probably speed up the process a little
99 | stretched_img = np.stack([
100 | map_coordinates(img[..., i], [refy, refx], order=order, mode='wrap')
101 | for i in range(img.shape[-1])
102 | ], axis=-1)
103 |
104 | # Process corners
105 | corners_u0 = coorx2u(corners[:, 0], img.shape[1])
106 | corners_v0 = coory2v(corners[:, 1], img.shape[0])
107 | corners_u = np.arctan2(np.sin(corners_u0) * ky / kx, np.cos(corners_u0))
108 | corners_v = np.arctan(np.tan(corners_v0) * np.sin(corners_u) / np.sin(corners_u0) / ky)
109 | cornersX = u2coorx(corners_u, img.shape[1])
110 | cornersY = v2coory(corners_v, img.shape[0])
111 | stretched_corners = np.stack([cornersX, cornersY], axis=-1)
112 |
113 | return stretched_img, stretched_corners
114 |
115 |
116 | def visualize_pano_stretch(stretched_img, stretched_cor, title):
117 | '''
118 | Helper function for visualizing the effect of pano_stretch
119 | '''
120 |     thickness = 2
121 | color = (0, 255, 0)
122 | for i in range(4):
123 | xys = pano_connect_points(stretched_cor[i*2], stretched_cor[(i*2+2) % 8], z=-50)
124 | xys = xys.astype(int)
125 | blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0]
126 | if len(blue_split) == 0:
127 | cv2.polylines(stretched_img, [xys], False, color, 2)
128 | else:
129 | t = blue_split[0] + 1
130 |             cv2.polylines(stretched_img, [xys[:t]], False, color, thickness)
131 |             cv2.polylines(stretched_img, [xys[t:]], False, color, thickness)
132 |
133 | for i in range(4):
134 | xys = pano_connect_points(stretched_cor[i*2+1], stretched_cor[(i*2+3) % 8], z=50)
135 | xys = xys.astype(int)
136 | blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0]
137 | if len(blue_split) == 0:
138 | cv2.polylines(stretched_img, [xys], False, color, 2)
139 | else:
140 | t = blue_split[0] + 1
141 |             cv2.polylines(stretched_img, [xys[:t]], False, color, thickness)
142 |             cv2.polylines(stretched_img, [xys[t:]], False, color, thickness)
143 |
144 | cv2.putText(stretched_img, title, (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1,
145 | (0, 0, 0), 2, cv2.LINE_AA)
146 |
147 | return stretched_img.astype(np.uint8)
148 |
149 |
150 | if __name__ == '__main__':
151 |
152 | import argparse
153 | import time
154 | from PIL import Image
155 | import cv2
156 |
157 | parser = argparse.ArgumentParser()
158 | parser.add_argument('--i', default='data/valid/img/pano_abpohapclcyuuz.png')
159 | parser.add_argument('--i_gt', default='data/valid/label_cor/pano_abpohapclcyuuz.txt')
160 | parser.add_argument('--o', default='sample_stretched_pano.png')
161 | parser.add_argument('--kx', default=2, type=float,
162 | help='Stretching along front-back direction')
163 | parser.add_argument('--ky', default=1, type=float,
164 | help='Stretching along left-right direction')
165 | args = parser.parse_args()
166 |
167 | img = np.array(Image.open(args.i), np.float64)
168 | with open(args.i_gt) as f:
169 | cor = np.array([line.strip().split() for line in f], np.int32)
170 | stretched_img, stretched_cor = pano_stretch(img, cor, args.kx, args.ky)
171 |
172 | title = 'kx=%3.2f, ky=%3.2f' % (args.kx, args.ky)
173 | visual_stretched_img = visualize_pano_stretch(stretched_img, stretched_cor, title)
174 | Image.fromarray(visual_stretched_img).save(args.o)
175 |
--------------------------------------------------------------------------------
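
A quick sanity-check sketch, assuming the repository root is on PYTHONPATH: coorx2u/u2coorx (and likewise coory2v/v2coory) are exact inverses, mapping pixel columns of a 1024-wide panorama to longitudes in (-pi, pi) and back.

    import numpy as np
    from lib.misc.panostretch import coorx2u, u2coorx

    x = np.array([0., 256., 512., 768., 1023.])
    u = coorx2u(x, w=1024)                 # longitudes in (-pi, pi)
    assert np.allclose(u2coorx(u, w=1024), x)
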
/lib/misc/structured3d_extract_zip.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from zipfile import ZipFile
4 | from tqdm import tqdm
5 | import imageio
6 |
7 | '''
8 | Zipfile format assumption:
9 | Structured3D
10 |   -- [scene_xxxxx]
11 |     -- (other files)
12 |     -- 2D_rendering
13 |       -- [image_id]
14 |         -- panorama
15 |           -- camera_xyz.txt
16 |           -- layout.txt
17 |           -- [empty|simple|full]
18 |             -- depth.png
19 |             -- rgb_rawlight.png
20 |             -- rgb_coldlight.png
21 |             -- rgb_warmlight.png
22 |   -- (other files)
23 |
24 | Output format:
25 | outdir
26 |   -- [scene_xxxxx]
27 |     -- rgb
28 |     -- layout
29 | '''
30 |
31 | parser = argparse.ArgumentParser()
32 | parser.add_argument('--zippath', required=True)
33 | parser.add_argument('--style', default='full')
34 | parser.add_argument('--outdir', default='structured3d')
35 | args = parser.parse_args()
36 |
37 | path_format = 'Structured3D/%s/2D_rendering/%s/panorama/%s'
38 |
39 | with ZipFile(args.zippath) as zipf:
40 | id_set = set()
41 | for path in zipf.namelist():
42 | assert path.startswith('Structured3D')
43 | if path.endswith('camera_xyz.txt'):
44 | path_lst = path.split('/')
45 | scene_id = path_lst[1]
46 | image_id = path_lst[3]
47 | id_set.add((scene_id, image_id))
48 |
49 | for scene_id, image_id in tqdm(id_set):
50 | path_img = path_format % (scene_id, image_id, '%s/rgb_rawlight.png' % args.style)
51 | path_layout = path_format % (scene_id, image_id, 'layout.txt')
52 |
53 | os.makedirs(os.path.join(args.outdir, scene_id, 'rgb'), exist_ok=True)
54 | os.makedirs(os.path.join(args.outdir, scene_id, 'layout'), exist_ok=True)
55 |
56 | with zipf.open(path_img) as f:
57 | rgb = imageio.imread(f)[..., :3]
58 | imageio.imwrite(os.path.join(args.outdir, scene_id, 'rgb', image_id + '_rgb_rawlight.png'), rgb)
59 | with zipf.open(path_layout) as f:
60 | with open(os.path.join(args.outdir, scene_id, 'layout', image_id + '_layout.txt'), 'w') as fo:
61 | fo.write(f.read().decode())
62 |
--------------------------------------------------------------------------------
/lib/misc/structured3d_prepare_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | from zipfile import ZipFile
4 | from tqdm import tqdm
5 | import imageio
6 |
7 | '''
8 | Assume the data is extracted by `misc/structured3d_extract_zip.py`,
9 | i.e. it follows this structure:
10 | - {in_root}/scene_xxxxx
11 |   - rgb/
12 |     - *png
13 |   - layout/
14 |     - *txt
15 |
16 | The reorganized structure is as follows:
17 | - {out_train_root}
18 |   - img/
19 |     - scene_xxxxx_*png (symlink)
20 |   - label_cor/
21 |     - scene_xxxxx_*txt (symlink)
22 | - {out_valid_root} ...
23 | - {out_test_root} ...
24 | '''
25 | TRAIN_SCENE = ['scene_%05d' % i for i in range(0, 3000)]
26 | VALID_SCENE = ['scene_%05d' % i for i in range(3000, 3250)]
27 | TEST_SCENE = ['scene_%05d' % i for i in range(3250, 3500)]
28 |
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument('--in_root', required=True)
31 | parser.add_argument('--out_train_root', default='data/st3d_train_full_raw_light')
32 | parser.add_argument('--out_valid_root', default='data/st3d_valid_full_raw_light')
33 | parser.add_argument('--out_test_root', default='data/st3d_test_full_raw_light')
34 | args = parser.parse_args()
35 |
36 | def prepare_dataset(scene_ids, out_dir):
37 | root_img = os.path.join(out_dir, 'img')
38 | root_cor = os.path.join(out_dir, 'label_cor')
39 | os.makedirs(root_img, exist_ok=True)
40 | os.makedirs(root_cor, exist_ok=True)
41 | for scene_id in tqdm(scene_ids):
42 | source_img_root = os.path.join(args.in_root, scene_id, 'rgb')
43 | source_cor_root = os.path.join(args.in_root, scene_id, 'layout')
44 | for fname in os.listdir(source_cor_root):
45 | room_id = fname.split('_')[0]
46 | source_img_path = os.path.join(args.in_root, scene_id, 'rgb', room_id + '_rgb_rawlight.png')
47 | source_cor_path = os.path.join(args.in_root, scene_id, 'layout', room_id + '_layout.txt')
48 | target_img_path = os.path.join(root_img, '%s_%s.png' % (scene_id, room_id))
49 | target_cor_path = os.path.join(root_cor, '%s_%s.txt' % (scene_id, room_id))
50 | assert os.path.isfile(source_img_path)
51 | assert os.path.isfile(source_cor_path)
52 | os.symlink(source_img_path, target_img_path)
53 | os.symlink(source_cor_path, target_cor_path)
54 |
55 | prepare_dataset(TRAIN_SCENE, args.out_train_root)
56 | prepare_dataset(VALID_SCENE, args.out_valid_root)
57 | prepare_dataset(TEST_SCENE, args.out_test_root)
58 |
--------------------------------------------------------------------------------
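
One caveat worth noting: os.symlink stores the source string as-is, so if --in_root is a relative path the created links point to a path interpreted relative to the link's own directory and will usually be dangling. A hedged variant of the two symlink lines above that avoids this:

    # Store absolute targets so the symlinks resolve regardless of where they live.
    os.symlink(os.path.abspath(source_img_path), target_img_path)
    os.symlink(os.path.abspath(source_cor_path), target_cor_path)
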
/lib/misc/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from collections import OrderedDict
4 |
5 |
6 | def group_weight(module):
7 |     # Group module parameters into two groups:
8 |     # one that needs weight_decay and one that doesn't
9 | group_decay = []
10 | group_no_decay = []
11 | for m in module.modules():
12 | if isinstance(m, nn.Linear):
13 | group_decay.append(m.weight)
14 | if m.bias is not None:
15 | group_no_decay.append(m.bias)
16 | elif isinstance(m, nn.modules.conv._ConvNd):
17 | group_decay.append(m.weight)
18 | if m.bias is not None:
19 | group_no_decay.append(m.bias)
20 | elif isinstance(m, nn.modules.batchnorm._BatchNorm):
21 | if m.weight is not None:
22 | group_no_decay.append(m.weight)
23 | if m.bias is not None:
24 | group_no_decay.append(m.bias)
25 | elif isinstance(m, nn.GroupNorm):
26 | if m.weight is not None:
27 | group_no_decay.append(m.weight)
28 | if m.bias is not None:
29 | group_no_decay.append(m.bias)
30 |
31 | assert len(list(module.parameters())) == len(group_decay) + len(group_no_decay)
32 | return [dict(params=group_decay), dict(params=group_no_decay, weight_decay=.0)]
33 |
34 |
35 | def adjust_learning_rate(optimizer, args):
36 | if args.cur_iter < args.warmup_iters:
37 | frac = args.cur_iter / args.warmup_iters
38 | step = args.lr - args.warmup_lr
39 | args.running_lr = args.warmup_lr + step * frac
40 | else:
41 | frac = (float(args.cur_iter) - args.warmup_iters) / (args.max_iters - args.warmup_iters)
42 | scale_running_lr = max((1. - frac), 0.) ** args.lr_pow
43 | args.running_lr = args.lr * scale_running_lr
44 |
45 | for param_group in optimizer.param_groups:
46 | param_group['lr'] = args.running_lr
47 |
48 |
49 | def save_model(net, path, args):
50 | state_dict = OrderedDict({
51 | 'args': args.__dict__,
52 | 'kwargs': {
53 | 'backbone': net.backbone,
54 | 'use_rnn': net.use_rnn,
55 | },
56 | 'state_dict': net.state_dict(),
57 | })
58 | torch.save(state_dict, path)
59 |
60 |
61 | def load_trained_model(Net, path):
62 | state_dict = torch.load(path, map_location='cpu')
63 | net = Net(**state_dict['kwargs'])
64 | net.load_state_dict(state_dict['state_dict'])
65 | return net
66 |
--------------------------------------------------------------------------------
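
A minimal sketch of feeding group_weight into an optimizer, assuming the model is built only from Linear/Conv/Norm layers (the assert requires every parameter to be covered): weight decay applies to the first group (conv/linear weights) while the second group overrides it to zero.

    import torch
    import torch.nn as nn
    from lib.misc.utils import group_weight

    net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8))
    param_groups = group_weight(net)   # [weights, biases + norm parameters]
    optimizer = torch.optim.Adam(param_groups, lr=1e-4, weight_decay=1e-5)
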
/lib/model/backbone/__init__.py:
--------------------------------------------------------------------------------
1 | from .resnet import Resnet
2 | from .simple import SimpleEncoder
3 | from .hardnet import HarDNet
4 |
--------------------------------------------------------------------------------
/lib/model/backbone/hardnet.py:
--------------------------------------------------------------------------------
1 | ''' Copy-paste from
2 | https://github.com/PingoLH/Pytorch-HarDNet
3 | '''
4 | import os
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 | class Flatten(nn.Module):
10 | def __init__(self):
11 | super().__init__()
12 | def forward(self, x):
13 | return x.view(x.data.size(0),-1)
14 |
15 |
16 |
17 | class CombConvLayer(nn.Sequential):
18 | def __init__(self, in_channels, out_channels, kernel=1, stride=1, dropout=0.1, bias=False):
19 | super().__init__()
20 | self.add_module('layer1',ConvLayer(in_channels, out_channels, kernel))
21 | self.add_module('layer2',DWConvLayer(out_channels, out_channels, stride=stride))
22 |
23 | def forward(self, x):
24 | return super().forward(x)
25 |
26 | class DWConvLayer(nn.Sequential):
27 | def __init__(self, in_channels, out_channels, stride=1, bias=False):
28 | super().__init__()
29 | out_ch = out_channels
30 |
31 | groups = in_channels
32 | kernel = 3
33 | #print(kernel, 'x', kernel, 'x', out_channels, 'x', out_channels, 'DepthWise')
34 |
35 | self.add_module('dwconv', nn.Conv2d(groups, groups, kernel_size=3,
36 | stride=stride, padding=1, groups=groups, bias=bias))
37 | self.add_module('norm', nn.BatchNorm2d(groups))
38 | def forward(self, x):
39 | return super().forward(x)
40 |
41 | class ConvLayer(nn.Sequential):
42 | def __init__(self, in_channels, out_channels, kernel=3, stride=1, dropout=0.1, bias=False):
43 | super().__init__()
44 | out_ch = out_channels
45 | groups = 1
46 | #print(kernel, 'x', kernel, 'x', in_channels, 'x', out_channels)
47 | self.add_module('conv', nn.Conv2d(in_channels, out_ch, kernel_size=kernel,
48 | stride=stride, padding=kernel//2, groups=groups, bias=bias))
49 | self.add_module('norm', nn.BatchNorm2d(out_ch))
50 | self.add_module('relu', nn.ReLU6(True))
51 | def forward(self, x):
52 | return super().forward(x)
53 |
54 |
55 | class HarDBlock(nn.Module):
56 | def get_link(self, layer, base_ch, growth_rate, grmul):
57 | if layer == 0:
58 | return base_ch, 0, []
59 | out_channels = growth_rate
60 | link = []
61 | for i in range(10):
62 | dv = 2 ** i
63 | if layer % dv == 0:
64 | k = layer - dv
65 | link.append(k)
66 | if i > 0:
67 | out_channels *= grmul
68 | out_channels = int(int(out_channels + 1) / 2) * 2
69 | in_channels = 0
70 | for i in link:
71 | ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul)
72 | in_channels += ch
73 | return out_channels, in_channels, link
74 |
75 | def get_out_ch(self):
76 | return self.out_channels
77 |
78 | def __init__(self, in_channels, growth_rate, grmul, n_layers, keepBase=False, residual_out=False, dwconv=False):
79 | super().__init__()
80 | self.keepBase = keepBase
81 | self.links = []
82 | layers_ = []
83 | self.out_channels = 0 # if upsample else in_channels
84 | for i in range(n_layers):
85 | outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul)
86 | self.links.append(link)
87 | use_relu = residual_out
88 | if dwconv:
89 | layers_.append(CombConvLayer(inch, outch))
90 | else:
91 | layers_.append(ConvLayer(inch, outch))
92 |
93 | if (i % 2 == 0) or (i == n_layers - 1):
94 | self.out_channels += outch
95 | #print("Blk out =",self.out_channels)
96 | self.layers = nn.ModuleList(layers_)
97 |
98 | def forward(self, x):
99 | layers_ = [x]
100 |
101 | for layer in range(len(self.layers)):
102 | link = self.links[layer]
103 | tin = []
104 | for i in link:
105 | tin.append(layers_[i])
106 | if len(tin) > 1:
107 | x = torch.cat(tin, 1)
108 | else:
109 | x = tin[0]
110 | out = self.layers[layer](x)
111 | layers_.append(out)
112 |
113 | t = len(layers_)
114 | out_ = []
115 | for i in range(t):
116 | if (i == 0 and self.keepBase) or \
117 | (i == t-1) or (i%2 == 1):
118 | out_.append(layers_[i])
119 | out = torch.cat(out_, 1)
120 | return out
121 |
122 |
123 |
124 |
125 | class HarDNet(nn.Module):
126 | def __init__(self, depth_wise=False, arch=68, pretrained=True, weight_path='', input_height=512):
127 | super().__init__()
128 | first_ch = [32, 64]
129 | second_kernel = 3
130 | max_pool = True
131 | grmul = 1.7
132 | drop_rate = 0.1
133 |
134 | #HarDNet68
135 | ch_list = [ 128, 256, 320, 640, 1024]
136 | gr = [ 14, 16, 20, 40,160]
137 | n_layers = [ 8, 16, 16, 16, 4]
138 | downSamp = [ 1, 0, 1, 1, 0]
139 |
140 | if arch==85:
141 | #HarDNet85
142 | first_ch = [48, 96]
143 | ch_list = [ 192, 256, 320, 480, 720, 1280]
144 | gr = [ 24, 24, 28, 36, 48, 256]
145 | n_layers = [ 8, 16, 16, 16, 16, 4]
146 | downSamp = [ 1, 0, 1, 0, 1, 0]
147 | drop_rate = 0.2
148 | elif arch==39:
149 | #HarDNet39
150 | first_ch = [24, 48]
151 | ch_list = [ 96, 320, 640, 1024]
152 | grmul = 1.6
153 | gr = [ 16, 20, 64, 160]
154 | n_layers = [ 4, 16, 8, 4]
155 | downSamp = [ 1, 1, 1, 0]
156 |
157 | if depth_wise:
158 | second_kernel = 1
159 | max_pool = False
160 | drop_rate = 0.05
161 |
162 | blks = len(n_layers)
163 | self.base = nn.ModuleList([])
164 |
165 | # First Layer: Standard Conv3x3, Stride=2
166 | self.base.append (
167 | ConvLayer(in_channels=3, out_channels=first_ch[0], kernel=3,
168 | stride=2, bias=False) )
169 |
170 | # Second Layer
171 | self.base.append ( ConvLayer(first_ch[0], first_ch[1], kernel=second_kernel) )
172 |
173 | # Maxpooling or DWConv3x3 downsampling
174 | if max_pool:
175 | self.base.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
176 | else:
177 | self.base.append ( DWConvLayer(first_ch[1], first_ch[1], stride=2) )
178 |
179 | # Build all HarDNet blocks
180 | ch = first_ch[1]
181 | for i in range(blks):
182 | blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise)
183 | ch = blk.get_out_ch()
184 | self.base.append ( blk )
185 |
186 | if i == blks-1 and arch == 85:
187 | self.base.append ( nn.Dropout(0.1))
188 |
189 | self.base.append ( ConvLayer(ch, ch_list[i], kernel=1) )
190 | ch = ch_list[i]
191 | if downSamp[i] == 1:
192 | if max_pool:
193 | self.base.append(nn.MaxPool2d(kernel_size=2, stride=2))
194 | else:
195 | self.base.append ( DWConvLayer(ch, ch, stride=2) )
196 |
197 | ch = ch_list[blks-1]
198 | self.base.append (
199 | nn.Sequential(
200 | nn.AdaptiveAvgPool2d((1,1)),
201 | Flatten(),
202 | nn.Dropout(drop_rate),
203 | nn.Linear(ch, 1000) ))
204 |
205 | if pretrained:
206 | if hasattr(torch, 'hub'):
207 |
208 | if arch == 68 and not depth_wise:
209 | checkpoint = 'https://ping-chao.com/hardnet/hardnet68-5d684880.pth'
210 | elif arch == 85 and not depth_wise:
211 | checkpoint = 'https://ping-chao.com/hardnet/hardnet85-a28faa00.pth'
212 | elif arch == 68 and depth_wise:
213 | checkpoint = 'https://ping-chao.com/hardnet/hardnet68ds-632474d2.pth'
214 | else:
215 | checkpoint = 'https://ping-chao.com/hardnet/hardnet39ds-0e6c6fa9.pth'
216 |
217 | self.load_state_dict(torch.hub.load_state_dict_from_url(checkpoint, progress=False))
218 | else:
219 | postfix = 'ds' if depth_wise else ''
220 | weight_file = '%shardnet%d%s.pth'%(weight_path, arch, postfix)
221 | if not os.path.isfile(weight_file):
222 | print(weight_file,'is not found')
223 | exit(0)
224 | weights = torch.load(weight_file)
225 | self.load_state_dict(weights)
226 |
227 | postfix = 'DS' if depth_wise else ''
228 | print('ImageNet pretrained weights for HarDNet%d%s is loaded'%(arch, postfix))
229 |
230 | # Patch for HoHoNet
231 | self.base = self.base[:-1]
232 | if arch == 68:
233 | self.out_channels = [128, 320, 640, 1024]
234 | self.checkpoint = [4, 9, 12, 15]
235 | elif arch == 85:
236 | self.out_channels = [192, 320, 720, 1280]
237 | self.checkpoint = [4, 9, 14, 18]
238 | else:
239 | raise NotImplementedError
240 | self.feat_heights = [input_height//4//(2**i) for i in range(4)]
241 |
242 | def forward(self, x):
243 | x_lst = []
244 | for i, layer in enumerate(self.base):
245 | x = layer(x)
246 | if i in self.checkpoint:
247 | x_lst.append(x)
248 | return x_lst
249 |
250 |
--------------------------------------------------------------------------------
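
A quick shape check (repo root on PYTHONPATH assumed; pretrained=False avoids the weight download): for a 512x1024 panorama the four checkpointed features come out at strides 4/8/16/32 with the channel widths recorded in out_channels.

    import torch
    from lib.model.backbone.hardnet import HarDNet

    net = HarDNet(arch=68, pretrained=False, input_height=512).eval()
    with torch.no_grad():
        feats = net(torch.zeros(1, 3, 512, 1024))
    # [tuple(f.shape) for f in feats] ->
    #   [(1, 128, 128, 256), (1, 320, 64, 128), (1, 640, 32, 64), (1, 1024, 16, 32)]
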
/lib/model/backbone/resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torchvision.models as models
5 |
6 | class Resnet(nn.Module):
7 | def __init__(self, backbone='resnet50', coco='', input_extra=0, input_height=512):
8 | super(Resnet, self).__init__()
9 | self.encoder = getattr(models, backbone)(pretrained=True)
10 | del self.encoder.fc, self.encoder.avgpool
11 | if coco:
12 | coco_pretrain = getattr(models.segmentation, coco)(pretrained=True).backbone
13 | self.encoder.load_state_dict(coco_pretrain.state_dict())
14 | self.out_channels = [256, 512, 1024, 2048]
15 | self.feat_heights = [input_height//4//(2**i) for i in range(4)]
16 | if int(backbone[6:]) < 50:
17 | self.out_channels = [_//4 for _ in self.out_channels]
18 |
19 | # Patch for extra input channel
20 | if input_extra > 0:
21 | ori_conv1 = self.encoder.conv1
22 | new_conv1 = nn.Conv2d(
23 | 3+input_extra, ori_conv1.out_channels,
24 | kernel_size=ori_conv1.kernel_size,
25 | stride=ori_conv1.stride,
26 | padding=ori_conv1.padding,
27 | bias=ori_conv1.bias)
28 | with torch.no_grad():
29 | for i in range(0, 3+input_extra, 3):
30 | n = new_conv1.weight[:, i:i+3].shape[1]
31 | new_conv1.weight[:, i:i+n] = ori_conv1.weight[:, :n]
32 | self.encoder.conv1 = new_conv1
33 |
34 |         # Prepare for pre/post down height filtering
35 | self.pre_down = None
36 | self.post_down = None
37 |
38 | def forward(self, x):
39 | features = []
40 | x = self.encoder.conv1(x)
41 | x = self.encoder.bn1(x)
42 | x = self.encoder.relu(x)
43 | x = self.encoder.maxpool(x)
44 |
45 | if self.pre_down is not None:
46 | x = self.pre_down(x)
47 | x = self.encoder.layer1(x);
48 | if self.post_down is not None:
49 | x = self.post_down(x)
50 | features.append(x) # 1/4
51 | x = self.encoder.layer2(x); features.append(x) # 1/8
52 | x = self.encoder.layer3(x); features.append(x) # 1/16
53 | x = self.encoder.layer4(x); features.append(x) # 1/32
54 | return features
55 |
--------------------------------------------------------------------------------
/lib/model/backbone/simple.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torchvision.models as models
5 |
6 | class SimpleResBlock(nn.Module):
7 | def __init__(self, a, b, c, s):
8 | super(SimpleResBlock, self).__init__()
9 | self.layer = nn.Sequential(
10 | nn.Conv2d(a, b, 1, bias=False),
11 | nn.BatchNorm2d(b),
12 | nn.ReLU(inplace=True),
13 | nn.Conv2d(b, b, 3, padding=1, stride=s, bias=False),
14 | nn.BatchNorm2d(b),
15 | nn.ReLU(inplace=True),
16 | nn.Conv2d(b, c, 1, bias=False),
17 | nn.BatchNorm2d(c),
18 | )
19 | self.skip = nn.Sequential(
20 | nn.Conv2d(a, c, 1, stride=s, bias=False),
21 | nn.BatchNorm2d(c),
22 | )
23 | self.relu = nn.ReLU(inplace=True)
24 | nn.init.constant_(self.layer[-1].weight, 0)
25 | nn.init.constant_(self.layer[-1].bias, 0)
26 |
27 | def forward(self, x):
28 | return self.relu(self.layer(x) + self.skip(x))
29 |
30 | class SimpleConv3x3Block(nn.Module):
31 | def __init__(self, a, b, c, s):
32 | super(SimpleConv3x3Block, self).__init__()
33 | self.layer = nn.Sequential(
34 | nn.Conv2d(a, c, 3, padding=1, stride=s, bias=False),
35 | nn.BatchNorm2d(c),
36 | nn.ReLU(inplace=True),
37 | nn.Conv2d(c, c, 3, padding=1, bias=False),
38 | nn.BatchNorm2d(c),
39 | nn.ReLU(inplace=True),
40 | )
41 |
42 | def forward(self, x):
43 | return self.layer(x)
44 |
45 | def SimpleConv3x3MaxBlock(a, b, c, s):
46 | return nn.Sequential(
47 | nn.Conv2d(a, c, 3, padding=1, bias=False),
48 | nn.BatchNorm2d(c),
49 | nn.ReLU(inplace=True),
50 | nn.Conv2d(c, c, 3, padding=1, bias=False),
51 | nn.BatchNorm2d(c),
52 | nn.ReLU(inplace=True),
53 | nn.MaxPool2d(s, stride=s),
54 | )
55 |
56 | def SimpleConv3x3lBlock(a, b, c, s):
57 | return nn.Sequential(
58 | nn.Conv2d(a, c, 3, padding=1, bias=False),
59 | nn.BatchNorm2d(c),
60 | nn.ReLU(inplace=True),
61 | nn.Conv2d(c, c, 3, padding=1, stride=s, bias=False),
62 | nn.BatchNorm2d(c),
63 | nn.ReLU(inplace=True),
64 | )
65 |
66 |
67 | class SimpleEncoder(nn.Module):
68 | def __init__(self, input_extra=0, input_height=512, block='res', expand=1):
69 | super(SimpleEncoder, self).__init__()
70 | self.conv_pre = nn.Sequential(
71 | nn.Conv2d(3+input_extra, 16*expand, kernel_size=3, padding=1, bias=False),
72 | nn.BatchNorm2d(16*expand),
73 | nn.ReLU(inplace=True),
74 | )
75 |
76 | if block == 'res':
77 | Block = SimpleResBlock
78 | elif block == 'conv3x3':
79 | Block = SimpleConv3x3Block
80 | elif block == 'conv3x3l':
81 | Block = SimpleConv3x3lBlock
82 | elif block == 'conv3x3max':
83 | Block = SimpleConv3x3MaxBlock
84 | else:
85 | raise NotImplementedError
86 | self.block0 = Block(16*expand, 16*expand, 32*expand, 2)
87 | self.block1 = Block(32*expand, 32*expand, 64*expand, 2)
88 | self.block2 = Block(64*expand, 64*expand, 128*expand, 2)
89 | self.block3 = Block(128*expand, 128*expand, 256*expand, 2)
90 | self.block4 = Block(256*expand, 256*expand, 256*expand, 2)
91 |
92 | self.out_channels = [64*expand, 128*expand, 256*expand, 256*expand]
93 | self.feat_heights = [input_height//4//(2**i) for i in range(4)]
94 |
95 | def forward(self, x):
96 | features = []
97 | x = self.conv_pre(x)
98 | x = self.block0(x)
99 | x = self.block1(x); features.append(x) # 1/4
100 | x = self.block2(x); features.append(x) # 1/8
101 | x = self.block3(x); features.append(x) # 1/16
102 | x = self.block4(x); features.append(x) # 1/32
103 | return features
104 |
--------------------------------------------------------------------------------
/lib/model/hohonet.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from . import backbone
8 | from . import horizon_compression
9 | from . import horizon_refinement
10 | from . import horizon_upsample
11 | from . import modality
12 | from .utils import wrap_lr_pad
13 |
14 |
15 | '''
16 | HoHoNet
17 | '''
18 | class HoHoNet(nn.Module):
19 | def __init__(self, emb_dim=256, input_hw=None, input_norm='imagenet', pretrain='',
20 | backbone_config={'module': 'Resnet'},
21 | decode_config={'module': 'EfficientHeightReduction'},
22 | refine_config={'module': 'TransEn'},
23 | upsample_config={'module': 'Upsample1D'},
24 | modalities_config={}):
25 | super(HoHoNet, self).__init__()
26 | self.input_hw = input_hw
27 | if input_norm == 'imagenet':
28 | self.register_buffer('x_mean', torch.FloatTensor(np.array([0.485, 0.456, 0.406])[None, :, None, None]))
29 | self.register_buffer('x_std', torch.FloatTensor(np.array([0.229, 0.224, 0.225])[None, :, None, None]))
30 | elif input_norm == 'ugscnn':
31 | self.register_buffer('x_mean', torch.FloatTensor(np.array([0.4974898, 0.47918808, 0.42809588, 1.0961773])[None, :, None, None]))
32 | self.register_buffer('x_std', torch.FloatTensor(np.array([0.23762763, 0.23354423, 0.23272438, 0.75536704])[None, :, None, None]))
33 | else:
34 | raise NotImplementedError
35 |
36 | # Encoder
37 | Encoder = getattr(backbone, backbone_config['module'])
38 | Encoder_kwargs = backbone_config.get('kwargs', {})
39 | self.encoder = Encoder(**Encoder_kwargs)
40 |
41 |         # Horizon compression converts backbone features to a horizontal feature
42 |         # (I named the variable `decoder` during development and forgot to fix it :P)
43 | Decoder = getattr(horizon_compression, decode_config['module'])
44 | Decoder_kwargs = decode_config.get('kwargs', {})
45 | self.decoder = Decoder(self.encoder.out_channels, self.encoder.feat_heights, **Decoder_kwargs)
46 |
47 | # Horizontal feature refinement module
48 | Refinement = getattr(horizon_refinement, refine_config['module'])
49 | Refinement_kwargs = refine_config.get('kwargs', {})
50 | self.horizon_refine = Refinement(self.decoder.out_channels, **Refinement_kwargs)
51 |
52 | # Channel reduction to the shared latent
53 | Upsampler = getattr(horizon_upsample, upsample_config['module'])
54 | Upsampler_kwargs = upsample_config.get('kwargs', {})
55 | self.emb_shared_latent = Upsampler(self.horizon_refine.out_channels, emb_dim)
56 |
57 | # Instantiate desired modalities
58 | self.modalities = nn.ModuleList([
59 | getattr(modality, key)(emb_dim, **config)
60 | for key, config in modalities_config.items()
61 | ])
62 |
63 | # Patch for all conv1d/2d layer's left-right padding
64 | wrap_lr_pad(self)
65 |
66 | # Load pretrained
67 | if pretrain:
68 | print(f'Load pretrained {pretrain}')
69 | st = torch.load(pretrain)
70 | missing_key = self.state_dict().keys() - st.keys()
71 | unknown_key = st.keys() - self.state_dict().keys()
72 | print('Missing key:', missing_key)
73 | print('Unknown key:', unknown_key)
74 | self.load_state_dict(st, strict=False)
75 |
76 | def extract_feat(self, x):
77 |         ''' Map the input RGB to the latent shared by all modalities '''
78 |
79 | if self.input_hw:
80 | x = F.interpolate(x, size=self.input_hw, mode='bilinear', align_corners=False)
81 | x = (x - self.x_mean) / self.x_std
82 | # encoder
83 | conv_list = self.encoder(x)
84 | # decoder to get horizontal feature
85 | feat = self.decoder(conv_list)
86 | # refine feat
87 | feat = self.horizon_refine(feat)
88 | # embed the shared latent
89 | feat = self.emb_shared_latent(feat)
90 | return feat
91 |
92 | def call_modality(self, method, *feed_args, **feed_kwargs):
93 |         ''' Call the method implemented in each modality and merge the results '''
94 | output_dict = {}
95 | for m in self.modalities:
96 | curr_dict = getattr(m, method)(*feed_args, **feed_kwargs)
97 | assert len(output_dict.keys() & curr_dict.keys()) == 0, 'Key collision for different modalities'
98 | output_dict.update(curr_dict)
99 | return output_dict
100 |
101 | def forward(self, x):
102 | feat = self.extract_feat(x)
103 | return self.call_modality('forward', feat)
104 |
105 | def infer(self, x):
106 | feat = self.extract_feat(x)
107 | return self.call_modality('infer', feat)
108 |
109 | def compute_losses(self, batch):
110 | feat = self.extract_feat(batch['x'])
111 | losses = self.call_modality('compute_losses', feat, batch=batch)
112 | losses['total'] = sum(v for k, v in losses.items() if k.startswith('total'))
113 | return losses
114 |
115 |
--------------------------------------------------------------------------------
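
A hedged end-to-end sketch of wiring the modules above. The kwargs are chosen to be self-consistent rather than copied from the shipped yaml configs; in particular TransEn's position_encode has to equal the horizontal feature width (input width / 4), and building the Resnet backbone downloads torchvision's ImageNet weights.

    import torch
    from lib.model.hohonet import HoHoNet

    # Illustrative settings; the real ones live in config/*.yaml.
    net = HoHoNet(
        emb_dim=256,
        backbone_config={'module': 'Resnet', 'kwargs': {'backbone': 'resnet50'}},
        decode_config={'module': 'EfficientHeightReduction'},
        refine_config={'module': 'TransEn', 'kwargs': {'position_encode': 256, 'num_layers': 1}},
        upsample_config={'module': 'Upsample1D'},
        modalities_config={'DepthEstimator': {'basis': 'dct', 'n_components': 64, 'loss': 'l1'}},
    ).eval()
    with torch.no_grad():
        pred = net.infer(torch.rand(1, 3, 512, 1024))
    # pred['depth'].shape -> torch.Size([1, 1, 512, 1024])
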
/lib/model/horizon_compression/__init__.py:
--------------------------------------------------------------------------------
1 | from .hc import GlobalHeightStage
2 | from .ehc import EfficientHeightReduction
3 | from .simple import SimpleReduction
4 |
--------------------------------------------------------------------------------
/lib/model/horizon_compression/ehc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from ..utils import pano_upsample_w, PanoUpsampleW
6 |
7 |
8 | '''
9 | EHC
10 | '''
11 | class EfficientHeightReduction(nn.Module):
12 | def __init__(self, cs, heights, out_ch=1024, fuse_ks=1):
13 |         ''' Process the 4 encoder blocks into a single multiscale feature '''
14 | super(EfficientHeightReduction, self).__init__()
15 | c1, c2, c3, c4 = cs
16 | h1, h2, h3, h4 = heights
17 |
18 | def EfficientConvCompressH(in_c, out_c, scale, down_h):
19 | return nn.Sequential(
20 | nn.Conv2d(in_c, out_c, 3, padding=1, bias=False),
21 | nn.BatchNorm2d(out_c),
22 | nn.ReLU(inplace=True),
23 | PanoUpsampleW(scale),
24 | nn.Conv2d(out_c, out_c, 3, padding=1, bias=False),
25 | nn.BatchNorm2d(out_c),
26 | nn.ReLU(inplace=True),
27 | nn.Conv2d(out_c, out_c, (down_h, 1), groups=out_c, bias=False),
28 | )
29 |
30 | self.ghc_lst = nn.ModuleList([
31 | EfficientConvCompressH(c1, c1//4, scale=1, down_h=h1),
32 | EfficientConvCompressH(c2, c2//4, scale=2, down_h=h2),
33 | EfficientConvCompressH(c3, c3//4, scale=4, down_h=h3),
34 | EfficientConvCompressH(c4, c4//4, scale=8, down_h=h4),
35 | ])
36 | self.fuse = nn.Sequential(
37 | nn.Conv2d((c1+c2+c3+c4)//4, out_ch, fuse_ks, padding=fuse_ks//2, bias=False),
38 | nn.BatchNorm2d(out_ch),
39 | nn.ReLU(inplace=True),
40 | )
41 | self.out_channels = out_ch
42 |
43 | def forward(self, conv_list):
44 | assert len(conv_list) == 4
45 | feature = torch.cat([
46 | f(x) for f, x in zip(self.ghc_lst, conv_list)
47 | ], dim=1)
48 | feature = self.fuse(feature).squeeze(2)
49 | return {'1D': feature, 'conv_list': conv_list}
50 |
--------------------------------------------------------------------------------
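
A toy shape sketch for the block above (repo root on PYTHONPATH assumed): each branch upsamples its feature to the finest width, collapses the height to 1 with a depthwise (h, 1) convolution, and the concatenated result is fused into one out_ch-dim vector per image column.

    import torch
    from lib.model.horizon_compression.ehc import EfficientHeightReduction

    cs, heights = [16, 32, 64, 64], [32, 16, 8, 4]               # toy channels / feature heights
    m = EfficientHeightReduction(cs, heights, out_ch=128).eval()
    conv_list = [torch.rand(2, c, h, 64 // 2**i) for i, (c, h) in enumerate(zip(cs, heights))]
    out = m(conv_list)
    # out['1D'].shape -> torch.Size([2, 128, 64])
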
/lib/model/horizon_compression/hc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from ..utils import pano_upsample_w, PanoUpsampleW
6 |
7 |
8 | '''
9 | Original HC
10 | '''
11 | class GlobalHeightConv(nn.Module):
12 | def __init__(self, in_c, out_c):
13 | super(GlobalHeightConv, self).__init__()
14 |
15 | def ConvCompressH(in_c, out_c, ks=3):
16 | return nn.Sequential(
17 | nn.Conv2d(in_c, out_c, kernel_size=ks, stride=(2, 1), padding=ks//2),
18 | nn.BatchNorm2d(out_c),
19 | nn.ReLU(inplace=True),
20 | )
21 |
22 | self.layer = nn.Sequential(
23 | ConvCompressH(in_c, in_c//2),
24 | ConvCompressH(in_c//2, in_c//2),
25 | ConvCompressH(in_c//2, in_c//4),
26 | ConvCompressH(in_c//4, out_c),
27 | )
28 |
29 | def forward(self, x, out_w):
30 | x = self.layer(x)
31 | assert out_w % x.shape[3] == 0
32 | return pano_upsample_w(x, out_w//x.shape[-1])
33 |
34 |
35 | class GlobalHeightStage(nn.Module):
36 | def __init__(self, cs, heights, down_h=8):
37 |         ''' Process the 4 encoder blocks into a single multiscale feature '''
38 | super(GlobalHeightStage, self).__init__()
39 | c1, c2, c3, c4 = cs
40 | h1, h2, h3, h4 = heights
41 | self.ghc_lst = nn.ModuleList([
42 | GlobalHeightConv(c1, c1//down_h),
43 | GlobalHeightConv(c2, c2//down_h),
44 | GlobalHeightConv(c3, c3//down_h),
45 | GlobalHeightConv(c4, c4//down_h),
46 | ])
47 | self.out_channels = (c1*h1 + c2*h2 + c3*h3 + c4*h4) // 16 // down_h
48 |
49 | def forward(self, conv_list):
50 | assert len(conv_list) == 4
51 | bs, _, _, out_w = conv_list[0].shape
52 | feature = torch.cat([
53 | f(x, out_w).reshape(bs, -1, out_w)
54 | for f, x in zip(self.ghc_lst, conv_list)
55 | ], dim=1)
56 | return {'1D': feature}
57 |
--------------------------------------------------------------------------------
/lib/model/horizon_compression/simple.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from ..utils import pano_upsample_w, PanoUpsampleW
6 |
7 |
8 | '''
9 | Simple decoder (for s2d3d sem small input size)
10 | '''
11 | class SimpleReduction(nn.Module):
12 | def __init__(self, cs, heights, out_ch=64):
13 |         ''' Process the 4 encoder blocks into a single multiscale feature '''
14 | super(SimpleReduction, self).__init__()
15 | c1, c2, c3, c4 = cs
16 | h1, h2, h3, h4 = heights
17 |
18 | def EfficientConvCompressH(in_c, out_c, scale, down_h):
19 | return nn.Sequential(
20 | PanoUpsampleW(scale),
21 | nn.Conv2d(in_c, out_c, (down_h, 1), bias=False),
22 | nn.BatchNorm2d(out_c),
23 | nn.ReLU(inplace=True),
24 | )
25 |
26 | self.ghc_lst = nn.ModuleList([
27 | EfficientConvCompressH(c1, c1//4, scale=1, down_h=h1),
28 | EfficientConvCompressH(c2, c2//4, scale=2, down_h=h2),
29 | EfficientConvCompressH(c3, c3//4, scale=4, down_h=h3),
30 | EfficientConvCompressH(c4, c4//4, scale=8, down_h=h4),
31 | ])
32 | self.fuse = nn.Sequential(
33 | nn.Conv2d((c1+c2+c3+c4)//4, out_ch, (1, 9), padding=(0, 4), bias=False),
34 | nn.BatchNorm2d(out_ch),
35 | nn.ReLU(inplace=True),
36 | )
37 | self.out_channels = out_ch
38 |
39 | def forward(self, conv_list):
40 | assert len(conv_list) == 4
41 | feature = torch.cat([
42 | f(x) for f, x in zip(self.ghc_lst, conv_list)
43 | ], dim=1)
44 | feature = self.fuse(feature).squeeze(2)
45 | return {'1D': feature}
46 |
--------------------------------------------------------------------------------
/lib/model/horizon_refinement/__init__.py:
--------------------------------------------------------------------------------
1 | from .identity import Identity
2 | from .linear import Linear
3 | from .rnn import LSTM, GRU
4 | from .attention import TransEn
5 |
--------------------------------------------------------------------------------
/lib/model/horizon_refinement/attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | import copy
6 |
7 |
8 | ''' Transformer encoder '''
9 | class TransformerEncoder(nn.Module):
10 |     ''' Adapted from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py '''
11 | def __init__(self, encoder_layer, num_layers):
12 | super(TransformerEncoder, self).__init__()
13 | self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for i in range(num_layers)])
14 | self.num_layers = num_layers
15 |
16 | def forward(self, x):
17 | for mod in self.layers:
18 | x = mod(x)
19 | return x
20 |
21 | class TransformerEncoderLayer(nn.Module):
22 |     ''' Adapted from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py '''
23 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, mode='pre'):
24 | super(TransformerEncoderLayer, self).__init__()
25 | self.mode = mode
26 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
27 | # Implementation of Feedforward model
28 | self.linear1 = nn.Linear(d_model, dim_feedforward)
29 | self.dropout = nn.Dropout(dropout)
30 | self.linear2 = nn.Linear(dim_feedforward, d_model)
31 |
32 | self.norm1 = nn.LayerNorm(d_model)
33 | self.norm2 = nn.LayerNorm(d_model)
34 | self.dropout1 = nn.Dropout(dropout)
35 | self.dropout2 = nn.Dropout(dropout)
36 |
37 | self.activation = nn.ReLU(inplace=True)
38 |
39 | def forward(self, x):
40 | if self.mode == 'post':
41 | x2 = self.self_attn(x, x, x)[0]
42 | x = x + self.dropout1(x2)
43 | x = self.norm1(x)
44 | x2 = self.linear2(self.dropout(self.activation(self.linear1(x))))
45 | x = x + self.dropout2(x2)
46 | x = self.norm2(x)
47 | return x
48 | elif self.mode == 'pre':
49 | x2 = self.norm1(x)
50 | x2 = self.self_attn(x2, x2, x2)[0]
51 | x = x + self.dropout1(x2)
52 | x2 = self.norm2(x)
53 | x2 = self.linear2(self.dropout(self.activation(self.linear1(x2))))
54 | x = x + self.dropout2(x2)
55 | return x
56 | raise NotImplementedError
57 |
58 | class TransEn(nn.Module):
59 | def __init__(self, c_mid, position_encode, nhead=8, num_layers=2, dim_feedforward=2048, mode='pre'):
60 | super(TransEn, self).__init__()
61 | if isinstance(c_mid, (tuple, list)):
62 | c_mid = c_mid[0]
63 | encoder_layer = TransformerEncoderLayer(c_mid, nhead, dim_feedforward, mode=mode)
64 | self.transen = TransformerEncoder(encoder_layer, num_layers)
65 |
66 | import math
67 | max_len, d_model = position_encode, c_mid
68 | pe = torch.zeros(max_len, d_model)
69 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
70 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
71 | pe[:, 0::2] = torch.sin(position * div_term)
72 | pe[:, 1::2] = torch.cos(position * div_term)
73 | self.register_buffer('pos', pe.T[None].contiguous())
74 |
75 | self.out_channels = c_mid
76 |
77 | def forward(self, feat):
78 | feat1d = feat['1D']
79 | feat1d = (feat1d + self.pos).permute(2,0,1)
80 | feat1d = self.transen(feat1d).permute(1,2,0)
81 | feat['1D'] = feat1d
82 | return feat
83 |
--------------------------------------------------------------------------------
/lib/model/horizon_refinement/identity.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Identity(nn.Module):
7 | def __init__(self, c_mid, *args, **kwargs):
8 | super(Identity, self).__init__()
9 | self.out_channels = c_mid
10 |
11 | def forward(self, x):
12 | return x
13 |
--------------------------------------------------------------------------------
/lib/model/horizon_refinement/linear.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | def conv1dbnrelu(in_channels, out_channels, **kwargs):
7 | return nn.Sequential(
8 | nn.Conv1d(in_channels, out_channels, **kwargs),
9 | nn.BatchNorm1d(out_channels),
10 | nn.ReLU(inplace=True),
11 | )
12 |
13 | class Linear(nn.Module):
14 | def __init__(self, c_mid, base_ch=256):
15 | super(Linear, self).__init__()
16 | self.conv_1x1 = conv1dbnrelu(c_mid, base_ch*4, kernel_size=1, bias=False)
17 | self.out_channels = base_ch*4
18 |
19 | def forward(self, feat):
20 | feat = feat['1D']
21 | feat = self.conv_1x1(feat)
22 | return {'1D': feat}
23 |
--------------------------------------------------------------------------------
/lib/model/horizon_refinement/rnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | ''' RNN '''
7 | class LSTM(nn.Module):
8 | def __init__(self, c_mid, base_ch=256, num_layers=2, bidirectional=True):
9 | super(LSTM, self).__init__()
10 | self.rnn = nn.LSTM(
11 | c_mid, hidden_size=base_ch,
12 | num_layers=num_layers, bidirectional=bidirectional)
13 | self.out_channels = base_ch * (1+int(bidirectional))
14 |
15 | def forward(self, feat):
16 |         feat = self.rnn(feat['1D'].permute(2,0,1))[0].permute(1,2,0).contiguous()  # unwrap the '1D' feature (as GRU below does)
17 | return {'1D': feat}
18 |
19 | class GRU(nn.Module):
20 | def __init__(self, c_mid, base_ch=256, num_layers=2, bidirectional=True):
21 | super(GRU, self).__init__()
22 | self.rnn = nn.GRU(
23 | c_mid, hidden_size=base_ch,
24 | num_layers=num_layers, bidirectional=bidirectional)
25 | self.out_channels = base_ch * (1+int(bidirectional))
26 |
27 | def forward(self, feat):
28 | feat = feat['1D']
29 | feat = self.rnn(feat.permute(2,0,1))[0].permute(1,2,0).contiguous()
30 | return {'1D': feat}
31 |
--------------------------------------------------------------------------------
/lib/model/horizon_upsample/__init__.py:
--------------------------------------------------------------------------------
1 | from torch.nn import Identity
2 | from .upsample1d import Upsample1D
3 |
--------------------------------------------------------------------------------
/lib/model/horizon_upsample/upsample1d.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from ..utils import PanoUpsampleW
6 |
7 |
8 | class Upsample1D(nn.Sequential):
9 | def __init__(self, ic, oc):
10 | super(Upsample1D, self).__init__(
11 | PanoUpsampleW(4),
12 | nn.Conv1d(ic, oc, 3, padding=1, bias=False),
13 | nn.BatchNorm1d(oc),
14 | nn.ReLU(inplace=True),
15 | )
16 |
17 | def forward(self, feat):
18 | feat1d = feat['1D']
19 | for module in self:
20 | feat1d = module(feat1d)
21 | feat['1D'] = feat1d
22 | return feat
23 |
--------------------------------------------------------------------------------
/lib/model/modality/__init__.py:
--------------------------------------------------------------------------------
1 | from .depth import DepthEstimator
2 | from .semantic import SemanticSegmenter
3 | from .layout import LayoutEstimator
4 |
--------------------------------------------------------------------------------
/lib/model/modality/bases.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 |
5 | def dct(n_components, output_height):
6 | basis = (torch.arange(output_height)[None].float() + 0.5) / output_height * np.pi
7 | basis = torch.arange(0, n_components)[:,None].float() * basis
8 | basis = torch.cos(basis)
9 | return basis
10 |
11 |
12 | def linear(*args, **kwargs):
13 | return None
14 |
--------------------------------------------------------------------------------
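
A short sketch of how the DCT basis is consumed: dct(K, H) returns a (K, H) cosine basis, and a dense H-by-W map is synthesized from per-column coefficients with the same einsum used by the depth and semantic heads below.

    import torch
    from lib.model.modality.bases import dct

    basis = dct(64, 512)                              # (64, 512) cosine basis
    ws = torch.randn(1, 64, 1024)                     # per-column coefficients (B, K, W)
    dense = torch.einsum('bkw,kh->bhw', ws, basis)    # (1, 512, 1024) reconstructed map
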
/lib/model/modality/depth.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from . import bases
8 | from ..utils import PanoUpsampleW
9 |
10 |
11 | ''' Dense (per-pixel) depth estimation '''
12 | class DepthBase(nn.Module):
13 | def __init__(self):
14 | super(DepthBase, self).__init__()
15 |
16 | def infer(self, x_emb):
17 | depth = self(x_emb)['depth']
18 | return {'depth': depth}
19 |
20 | def compute_losses(self, x_emb, batch):
21 | gt = batch['depth']
22 | mask = (gt > 0)
23 |
24 | # Forward
25 | pred_dict = self(x_emb)
26 | pred = pred_dict['depth']
27 |
28 | # Compute losses
29 | losses = {}
30 | l1 = (pred[mask] - gt[mask]).abs()
31 | l2 = (pred[mask] - gt[mask]).pow(2)
32 | losses['mae'] = l1.mean()
33 | losses['rmse'] = l2.mean().sqrt()
34 | losses['delta1'] = (torch.max(pred[mask]/gt[mask], gt[mask]/pred[mask]) < 1.25).float().mean()
35 |
36 | losses['total.depth'] = loss_for_backward(pred_dict['depth1d'], gt, mask, self.loss)
37 | if 'residual' in pred_dict:
38 | with torch.no_grad():
39 | gt_residual = gt - pred_dict['depth1d'].detach()
40 | losses['total.residual'] = loss_for_backward(pred_dict['residual'], gt_residual, mask, 'l1')
41 | return losses
42 |
43 |
44 | def loss_for_backward(pred, gt, mask, loss):
45 | if loss == 'l1':
46 | return F.l1_loss(pred[mask], gt[mask])
47 | elif loss == 'l2':
48 | return F.mse_loss(pred[mask], gt[mask])
49 | elif loss == 'huber':
50 | return F.smooth_l1_loss(pred[mask], gt[mask])
51 | elif loss == 'berhu':
52 | l1 = (pred[mask] - gt[mask]).abs().mean()
53 | l2 = (pred[mask] - gt[mask]).pow(2).mean()
54 | with torch.no_grad():
55 | c = max(l1.detach().max() * 0.2, 0.01)
56 | l2c = (l2 + c**2) / (2 * c)
57 | return torch.where(l1<=c, l1, l2c).mean()
58 | else:
59 | raise NotImplementedError
60 |
61 |
62 | class DepthEstimator(DepthBase):
63 | def __init__(self, emb_dim, basis='dct', loss='l1', n_components=64,
64 | init_weight=0.1, init_bias=2.5, output_height=512,
65 | resisual=False, basis_tuning=False):
66 | super(DepthEstimator, self).__init__()
67 | self.loss = loss
68 |
69 | self.output_height = output_height
70 | basis = getattr(bases, basis)(n_components, output_height)
71 | if basis_tuning:
72 | self.basis = nn.Parameter(basis)
73 | else:
74 | self.register_buffer('basis', basis)
75 |
76 | self.estimator = nn.Sequential(
77 | nn.Conv1d(emb_dim, emb_dim, 1),
78 | nn.BatchNorm1d(emb_dim),
79 | nn.ReLU(inplace=True),
80 | nn.Conv1d(emb_dim, n_components, 1, bias=False),
81 | )
82 | self.bias = nn.Parameter(torch.full([1], init_bias))
83 | nn.init.normal_(self.estimator[-1].weight, std=init_weight/np.sqrt(emb_dim/2))
84 |
85 | self.residual = None
86 | if resisual:
87 | self.residual = nn.Sequential(
88 | nn.Conv2d(256, 64, 3, padding=1, bias=False),
89 | nn.BatchNorm2d(64),
90 | nn.ReLU(inplace=True),
91 | nn.Conv2d(64, 1, 1, bias=False),
92 | PanoUpsampleW(4),
93 | nn.UpsamplingBilinear2d(scale_factor=(4,1)),
94 | )
95 |
96 | def forward(self, x_emb):
97 | ws = self.estimator(x_emb['1D'])
98 | if self.basis is None:
99 | h, w = self.output_height, ws.shape[-1]
100 | depth = self.bias + F.interpolate(ws.unsqueeze(1), size=(h,w), mode='bilinear', align_corners=False)
101 | else:
102 | depth = self.bias + torch.einsum('bkw,kh->bhw', ws, self.basis).unsqueeze(1)
103 | ret_dict = {'depth': depth, 'depth1d': depth}
104 | if self.residual is not None:
105 | residual = 0.1 * self.residual(x_emb['conv_list'][0].detach())
106 | ret_dict['residual'] = residual
107 | ret_dict['depth'] = depth + residual
108 | return ret_dict
109 |
--------------------------------------------------------------------------------
/lib/model/modality/layout.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from . import bases
8 |
9 | from lib.misc import panostretch, post_proc
10 | from ..utils import peaks_finding
11 | from scipy.ndimage.filters import maximum_filter
12 | from shapely.geometry import Polygon
13 |
14 |
15 | ''' Layout (per-column) estimation '''
16 | class LayoutEstimator(nn.Module):
17 | def __init__(self, emb_dim, bon_weight=1., cor_weight=1., bon_loss='l1', cor_loss='bce', bon_scale=1.,
18 | init_weight=0.1, dropout=0., oneconv=True, last_ks=1, last_bias=True,
19 | H=512, W=1024, post_force_cuboid=False):
20 | super(LayoutEstimator, self).__init__()
21 | self.bon_loss = bon_loss
22 | self.cor_loss = cor_loss
23 | self.bon_scale = bon_scale
24 | self.bon_weight = bon_weight
25 | self.cor_weight = cor_weight
26 | self.H = H
27 | self.W = W
28 | self.post_force_cuboid = post_force_cuboid
29 |
30 | if oneconv:
31 | self.pred_bon = nn.Conv1d(emb_dim, 2, last_ks, padding=last_ks//2, bias=last_bias)
32 | self.pred_cor = nn.Conv1d(emb_dim, 1, last_ks, padding=last_ks//2, bias=last_bias)
33 | if last_bias:
34 | nn.init.constant_(self.pred_bon.bias[0], -0.478)
35 | nn.init.constant_(self.pred_bon.bias[1], 0.425)
36 | nn.init.constant_(self.pred_cor.bias, -1.)
37 | else:
38 | self.pred_bon = nn.Sequential(
39 | nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False),
40 | nn.BatchNorm1d(emb_dim),
41 | nn.ReLU(inplace=True),
42 | nn.Conv1d(emb_dim, 2, 1),
43 | )
44 | self.pred_cor = nn.Sequential(
45 | nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False),
46 | nn.BatchNorm1d(emb_dim),
47 | nn.ReLU(inplace=True),
48 | nn.Conv1d(emb_dim, 1, 1),
49 | )
50 | nn.init.constant_(self.pred_bon[-1].bias[0], -0.478)
51 | nn.init.constant_(self.pred_bon[-1].bias[1], 0.425)
52 | nn.init.constant_(self.pred_cor[-1].bias, -1.)
53 | self.dropout = None
54 | if dropout > 0:
55 | self.dropout = nn.Dropout(dropout)
56 |
57 | def forward(self, x_emb):
58 | x_emb = x_emb['1D']
59 | if self.dropout is not None:
60 | x_emb = self.dropout(x_emb)
61 | pred_bon = self.pred_bon(x_emb)
62 | pred_cor = self.pred_cor(x_emb)
63 | return {'bon': pred_bon, 'cor': pred_cor}
64 |
65 | def infer(self, x_emb):
66 | pred = self(x_emb)
67 | pred_bon = pred['bon'] / self.bon_scale
68 | pred_cor = pred['cor']
69 | H, W = self.H, self.W
70 |
71 | y_bon_ = (pred_bon[0].cpu().numpy() / np.pi + 0.5) * H - 0.5
72 | y_cor_ = pred_cor[0,0].sigmoid().cpu().numpy()
73 | # Init floor/ceil plane
74 | z0 = 50
75 | _, z1 = post_proc.np_refine_by_fix_z(*y_bon_, z0)
76 |
77 |         # Detect wall-wall peaks
78 | def find_N_peaks(signal, r, min_v, N):
79 | max_v = maximum_filter(signal, size=r, mode='wrap')
80 | pk_loc = np.where(max_v == signal)[0]
81 | pk_loc = pk_loc[signal[pk_loc] > min_v]
82 | if N is not None:
83 | order = np.argsort(-signal[pk_loc])
84 | pk_loc = pk_loc[order[:N]]
85 | pk_loc = pk_loc[np.argsort(pk_loc)]
86 | return pk_loc, signal[pk_loc]
87 | min_v = 0 if self.post_force_cuboid else 0.05
88 | r = int(round(W * 0.05 / 2))
89 | N = 4 if self.post_force_cuboid else None
90 | xs_ = find_N_peaks(y_cor_, r=r, min_v=min_v, N=N)[0]
91 |
92 | # Generate wall-walls
93 | cor, xy_cor = post_proc.gen_ww(xs_, y_bon_[0], z0, tol=abs(0.16 * z1 / 1.6), force_cuboid=self.post_force_cuboid)
94 | if not self.post_force_cuboid:
95 |             # Check validity (guard against self-intersection)
96 | xy2d = np.zeros((len(xy_cor), 2), np.float32)
97 | for i in range(len(xy_cor)):
98 | xy2d[i, xy_cor[i]['type']] = xy_cor[i]['val']
99 | xy2d[i, xy_cor[i-1]['type']] = xy_cor[i-1]['val']
100 | if not Polygon(xy2d).is_valid:
101 | import sys
102 | print(
103 | 'Fail to generate valid general layout!! '
104 | 'Generate cuboid as fallback.',
105 | file=sys.stderr)
106 | xs_ = find_N_peaks(y_cor_, r=r, min_v=0, N=4)[0]
107 | cor, xy_cor = post_proc.gen_ww(xs_, y_bon_[0], z0, tol=abs(0.16 * z1 / 1.6), force_cuboid=True)
108 |
109 |         # Expand with bottom (floor) corner y
110 | cor = np.hstack([cor, post_proc.infer_coory(cor[:, 1], z1 - z0, z0)[:, None]])
111 | # Collect corner position in equirectangular
112 | cor_id = np.zeros((len(cor)*2, 2), np.float32)
113 | for j in range(len(cor)):
114 | cor_id[j*2] = cor[j, 0], cor[j, 1]
115 | cor_id[j*2 + 1] = cor[j, 0], cor[j, 2]
116 | return {'cor_id': cor_id, 'y_bon_': y_bon_, 'y_cor_': y_cor_}
117 |
118 | def compute_losses(self, x_emb, batch):
119 | gt_bon = batch['bon'] * self.bon_scale
120 | gt_vot = batch['vot']
121 | gt_cor = 0.96 ** gt_vot.abs()
122 |
123 | # Forward
124 | pred = self(x_emb)
125 |
126 | # Compute losses
127 | losses = {}
128 | if self.bon_loss == 'l1':
129 | losses['bon'] = F.l1_loss(pred['bon'], gt_bon)
130 | elif self.bon_loss == 'l2':
131 | losses['bon'] = F.mse_loss(pred['bon'], gt_bon)
132 | else:
133 | raise NotImplementedError
134 |
135 | if self.cor_loss == 'bce':
136 | losses['cor'] = F.binary_cross_entropy_with_logits(pred['cor'], gt_cor)
137 | elif self.cor_loss == 'prfocal':
138 | g, p = gt_cor, pred['cor']
139 | pos_mask = (g >= 1-1e-6)
140 | B, alpha, beta = len(g), 2, 4
141 | L_pos = -F.logsigmoid(p) * F.sigmoid(-p).pow(alpha)
142 | L_neg = -F.logsigmoid(-p) * F.sigmoid(p).pow(alpha) * (1-g).pow(beta)
143 | L = torch.where(pos_mask, L_pos, L_neg).view(B,-1).sum(-1) / pos_mask.float().view(B,-1).sum(-1)
144 | losses['cor'] = L.mean()
145 | else:
146 | raise NotImplementedError
147 |
148 | losses['total.layout'] = self.bon_weight * losses['bon'] + self.cor_weight * losses['cor']
149 | with torch.no_grad():
150 | losses['bon.mae'] = F.l1_loss(pred['bon'], gt_bon) / self.bon_scale
151 | losses['cor.mae'] = F.l1_loss(pred['cor'].sigmoid(), gt_cor)
152 | return losses
153 |
--------------------------------------------------------------------------------
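
For reference, the cor_id array returned by LayoutEstimator.infer interleaves ceiling and floor corners per wall-wall boundary, as (x, y) pixels in the HxW equirectangular view, mirroring the label_cor txt convention used elsewhere in the repo. A toy slicing sketch (values made up):

    import numpy as np

    cor_id = np.array([[10, 100], [10, 400], [500, 90], [500, 410]], np.float32)  # toy values
    ceil_xy = cor_id[0::2]     # even rows: ceiling corners
    floor_xy = cor_id[1::2]    # odd rows: floor corners
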
/lib/model/modality/semantic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from . import bases
8 |
9 |
10 | ''' Dense (per-pixel) semantic segmentation '''
11 | class SemanticSegmenter(nn.Module):
12 | def __init__(self, emb_dim, num_classes, basis='dct', loss='bce', label_weight='', invalid_ids=[], n_components=64,
13 | last_ks=1, dropout=0, init_weight=0.1, init_bias=None, output_height=512, pre1d=False):
14 | super(SemanticSegmenter, self).__init__()
15 | self.num_classes = num_classes
16 | self.loss = loss
17 | self.n_components = n_components
18 | self.invalid_ids = invalid_ids
19 | if init_bias is None:
20 | if self.loss == 'bce':
21 | init_bias = -np.log(num_classes-1)
22 | else:
23 | init_bias = 0.0
24 |
25 | self.output_height = output_height
26 | self.register_buffer('basis', getattr(bases, basis)(n_components, output_height))
27 |
28 | self.estimator = nn.Sequential(
29 | nn.Conv1d(emb_dim, emb_dim, last_ks, padding=last_ks//2),
30 | nn.BatchNorm1d(emb_dim),
31 | nn.ReLU(inplace=True),
32 | nn.Conv1d(emb_dim, n_components * num_classes, 1, bias=False),
33 | )
34 | if dropout > 0:
35 | self.estimator = nn.Sequential(*self.estimator[:-1], nn.Dropout(dropout), self.estimator[-1])
36 | self.bias = nn.Parameter(torch.full([1, num_classes, 1, 1], init_bias))
37 | nn.init.normal_(self.estimator[-1].weight, std=init_weight/np.sqrt(emb_dim/2))
38 |
39 | self.estimator1d = None
40 | if pre1d:
41 | self.estimator1d = nn.Sequential(
42 | nn.Conv1d(emb_dim, emb_dim, last_ks, padding=last_ks//2),
43 | nn.BatchNorm1d(emb_dim),
44 | nn.ReLU(inplace=True),
45 | nn.Conv1d(emb_dim, num_classes, 1),
46 | )
47 | nn.init.constant_(self.estimator1d[-1].bias, -np.log(10-1))
48 |
49 | if label_weight:
50 | self.register_buffer('label_weight', torch.load(label_weight).float())
51 | else:
52 | self.register_buffer('label_weight', torch.ones(num_classes))
53 | self.label_weight[self.invalid_ids] = 0
54 | self.label_weight *= (num_classes - len(self.invalid_ids)) / self.label_weight.sum()
55 |
56 | def forward(self, x_emb):
57 | x_emb = x_emb['1D']
58 | B, _, W = x_emb.shape
59 | ws = self.estimator(x_emb).view(B, self.num_classes, self.n_components, W)
60 | if self.basis is None:
61 | h, w = self.output_height, ws.shape[-1]
62 | sem = self.bias + F.interpolate(ws, size=(h,w), mode='bilinear', align_corners=False)
63 | else:
64 | sem = self.bias + torch.einsum('bckw,kh->bchw', ws, self.basis)
65 | sem[:, self.invalid_ids] = -100
66 |
67 | if self.estimator1d is not None:
68 | sem1d = self.estimator1d(x_emb).view(B, self.num_classes, 1, W)
69 | sem1d[:, self.invalid_ids] = -100
70 | sem.permute(0,1,3,2)[sem1d.sigmoid().squeeze(2) < 0.1] = float("-Inf")
71 | return {'sem': sem, 'sem1d': sem1d}
72 | else:
73 | return {'sem': sem}
74 |
75 | def infer(self, x_emb):
76 | return self(x_emb)
77 |
78 | def compute_losses(self, x_emb, batch):
79 | gt = batch['sem']
80 | mask = (gt >= 0)
81 | B, H, W = gt.shape
82 | if mask.sum() == 0:
83 | return {}
84 |
85 | # Forward
86 | pred = self(x_emb)
87 | pred_sem = pred['sem']
88 |
89 | # Compute losses
90 | losses = {}
91 |
92 | if 'sem1d' in pred:
93 | pred_sem1d = pred['sem1d']
94 | gt1d = torch.zeros_like(pred_sem1d)
95 | brcid = torch.stack(torch.meshgrid(torch.arange(gt.shape[0]), torch.arange(gt.shape[1]), torch.arange(gt.shape[2])), -1)
96 | bid, rid, cid = brcid[mask].T
97 | gt1d[bid, gt[mask], 0, cid] = 1
98 | losses['acc.sem1d.fn'] = ((pred_sem1d.sigmoid() < 0.1) & (gt1d == 1)).float().mean()
99 | losses['acc.sem1d.tn'] = ((pred_sem1d.sigmoid() < 0.1) & (gt1d == 0)).float().mean()
100 | losses['total.sem1d'] = F.binary_cross_entropy_with_logits(pred_sem1d, gt1d)
101 |
102 | pred_sem = pred_sem.permute(0,2,3,1)[mask]
103 | gt = gt[mask]
104 | if 'sem1d' in pred:
105 | activate = (pred_sem1d.detach().sigmoid() >= 0.1).float().repeat(1,1,H,1)
106 | activate = activate.permute(0,2,3,1)[mask]
107 | else:
108 | activate = torch.ones_like(pred_sem)
109 | losses['acc'] = (pred_sem.argmax(1) == gt).float().mean()
110 | if self.loss == 'bce':
111 | gt_onehot = torch.zeros_like(pred_sem).scatter_(dim=1, index=gt[:,None], src=torch.ones_like(pred_sem))
112 | bce = F.binary_cross_entropy_with_logits(pred_sem, gt_onehot, reduction='none')
113 | bce = (bce * self.label_weight)[activate.bool()]
114 | losses['total.sem'] = bce.mean()
115 | elif self.loss == 'ce':
116 | ce = F.cross_entropy(pred_sem, gt, weight=self.label_weight, reduction='none')
117 | ce = ce[~torch.isinf(ce) & ~torch.isnan(ce)]
118 | losses['total.sem'] = ce.mean()
119 | elif self.loss.startswith('mse'):
120 | R = float(self.loss[3:])
121 | gt_R = torch.full_like(pred_sem, -R).scatter_(dim=1, index=gt[:,None], src=torch.full_like(pred_sem, R))
122 | mse = (pred_sem - gt_R).pow(2)
123 | losses['total.sem'] = (mse * self.label_weight).mean()
124 | else:
125 | raise NotImplementedError
126 | return losses
127 |
--------------------------------------------------------------------------------
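Note: SemanticSegmenter above compresses every image column into n_components coefficients per class and expands them to a full-height column of logits through a fixed basis registered from the bases helper (e.g. DCT). A minimal sketch of that expansion with made-up shapes and a random stand-in basis:

import torch

B, C, K, H, W = 2, 13, 64, 512, 256              # batch, classes, components, output height, columns
ws = torch.randn(B, C * K, W).view(B, C, K, W)   # per-column coefficients from the 1D estimator
basis = torch.randn(K, H)                        # random stand-in for the registered [K, H] basis
bias = torch.zeros(1, C, 1, 1)

# Each column's K coefficients become H per-pixel logits via the shared basis.
sem = bias + torch.einsum('bckw,kh->bchw', ws, basis)
print(sem.shape)                                 # torch.Size([2, 13, 512, 256])
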
/lib/model/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import functools
5 |
6 | import scipy
7 | import numpy as np
8 | from scipy.ndimage import maximum_filter
9 | from sklearn.linear_model import HuberRegressor
10 |
11 |
12 | ''' Panorama patch for layers '''
13 | def lr_pad(x, padding=1):
14 | ''' Pad left/right-most to each other instead of zero padding '''
15 | return torch.cat([x[..., -padding:], x, x[..., :padding]], dim=-1)
16 |
17 | class LR_PAD(nn.Module):
18 | ''' Pad left/right-most to each other instead of zero padding '''
19 | def __init__(self, padding=1):
20 | super(LR_PAD, self).__init__()
21 | self.padding = padding
22 |
23 | def forward(self, x):
24 | return lr_pad(x, self.padding)
25 |
26 | def wrap_lr_pad(net):
27 | for name, m in net.named_modules():
28 | names = name.split('.')
29 | root = functools.reduce(lambda o, i: getattr(o, i), [net] + names[:-1])
30 | if isinstance(m, nn.Conv2d):
31 | if m.padding[1] == 0:
32 | continue
33 | w_pad = int(m.padding[1])
34 | m.padding = (m.padding[0], 0)
35 | setattr(
36 | root, names[-1],
37 | nn.Sequential(LR_PAD(w_pad), m)
38 | )
39 | elif isinstance(m, nn.Conv1d):
40 | if m.padding == (0, ):
41 | continue
42 | w_pad = int(m.padding[0])
43 | m.padding = (0,)
44 | setattr(
45 | root, names[-1],
46 | nn.Sequential(LR_PAD(w_pad), m)
47 | )
48 |
49 | def pano_upsample_w(x, s):
50 | if len(x.shape) == 3:
51 | mode = 'linear'
52 | scale_factor = s
53 | elif len(x.shape) == 4:
54 | mode = 'bilinear'
55 | scale_factor = (1, s)
56 | else:
57 | raise NotImplementedError
58 | x = torch.cat([x[...,-1:], x, x[...,:1]], dim=-1)
59 | x = F.interpolate(x, scale_factor=scale_factor, mode=mode, align_corners=False)
60 | x = x[...,s:-s]
61 | return x
62 |
63 | class PanoUpsampleW(nn.Module):
64 | def __init__(self, s):
65 | super(PanoUpsampleW, self).__init__()
66 | self.s = s
67 |
68 | def forward(self, x):
69 | return pano_upsample_w(x, self.s)
70 |
71 |
72 | ''' Testing augmentation helper '''
73 | def augment(x, flip, rotate, rotate_flip):
74 | aug_type = ['']
75 | x_augmented = [x]
76 | if flip:
77 | aug_type.append('flip')
78 | x_augmented.append(x.flip(dims=(-1,)))
79 | for shift in rotate:
80 | aug_type.append('rotate %d' % shift)
81 | x_augmented.append(x.roll(shifts=shift, dims=-1))
82 | if rotate_flip:
83 | aug_type.append('rotate_flip %d' % shift)
84 | x_augmented.append(x_augmented[-1].flip(dims=(-1,)))
85 | return torch.cat(x_augmented, 0), aug_type
86 |
87 | def augment_undo(pred_augmented, aug_type):
88 | pred_augmented = pred_augmented.cpu().numpy()
89 | assert len(pred_augmented) == len(aug_type), 'Unable to recover testing aug'
90 | pred_final = 0
91 | for pred, aug in zip(pred_augmented, aug_type):
92 | if aug == 'flip':
93 | pred_final += np.flip(pred, axis=-1)
94 | elif aug.startswith('rotate'):
95 | if 'flip' in aug:
96 | pred = np.flip(pred, axis=-1)
97 | shift = int(aug.split()[-1])
98 | pred_final += np.roll(pred, -shift, axis=-1)
99 | elif aug == '':
100 | pred_final += pred
101 | else:
102 | raise NotImplementedError
103 |
104 | return pred_final / len(aug_type)
105 |
106 |
107 | ''' Post-processing '''
108 | def peaks_mask_torch(x1d, winsz=7, min_v=0.5):
109 | pad = winsz // 2
110 | x1d_max = F.max_pool1d(torch.cat([x1d[...,-pad:], x1d, x1d[...,:pad]], -1), winsz, stride=1)
111 | return (x1d == x1d_max) & (x1d >= min_v)
112 |
113 | def peaks_finding_torch(x1d, winsz=7, min_v=0.5):
114 | ''' x1d: [B, 1, W] '''
115 | bid, _, cid = torch.where(peaks_mask_torch(x1d, winsz, min_v))
116 | return bid, cid
117 |
118 | def peaks_finding(signal, winsz=7, min_v=0.5):
119 | max_v = maximum_filter(signal, size=winsz, mode='wrap')
120 | pk_loc = np.where(max_v == signal)[0]
121 | pk_loc = pk_loc[signal[pk_loc] > min_v]
122 | return pk_loc
123 |
--------------------------------------------------------------------------------
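Note: wrap_lr_pad above rewrites the horizontal zero padding of every Conv1d/Conv2d into LR_PAD, so convolutions wrap around the panorama seam instead of seeing zeros at the left/right border. A minimal usage sketch on a hypothetical toy network (only the helper comes from this repo):

import torch
import torch.nn as nn
from lib.model.utils import wrap_lr_pad

# Hypothetical toy network with ordinary zero padding.
net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(inplace=True))
wrap_lr_pad(net)                 # the Conv2d now has padding=(1, 0) and is preceded by LR_PAD(1)

x = torch.rand(1, 3, 16, 32)     # a small 'panorama': 16 rows, 32 columns
y = net(x)
print(net[0])                    # Sequential(LR_PAD, Conv2d)
print(y.shape)                   # torch.Size([1, 8, 16, 32]) -- width is preserved
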
/test_depth.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import importlib
4 | from natsort import natsorted
5 | from tqdm import tqdm, trange
6 | from collections import Counter
7 |
8 | import numpy as np
9 | from imageio import imwrite
10 | from scipy.spatial.transform import Rotation
11 | from lib.misc.pano_lsd_align import rotatePanorama, panoEdgeDetection
12 |
13 | import torch
14 | import torch.nn as nn
15 | import torch.nn.functional as F
16 | from torch.utils.data import DataLoader
17 |
18 | from lib.config import config, update_config, infer_exp_id
19 | from lib import dataset
20 |
21 |
22 | def eval_metric(pred, gt, dmax):
23 | gt = gt.clamp(0.01, dmax)
24 | pred = pred.clamp(0.01, dmax)
25 | mre = ((gt - pred).abs() / gt).mean().item()
26 | mae = (gt - pred).abs().mean().item()
27 | rmse = ((gt - pred)**2).mean().sqrt().item()
28 | rmse_log = ((gt.log10() - pred.log10())**2).mean().sqrt().item()
29 | log10 = (gt.log10() - pred.log10()).abs().mean().item()
30 |
31 | delta = torch.max(pred/gt, gt/pred)
32 | delta_1 = (delta < 1.25).float().mean().item()
33 | delta_2 = (delta < 1.25**2).float().mean().item()
34 | delta_3 = (delta < 1.25**3).float().mean().item()
35 | return {
36 | 'mre': mre, 'mae': mae, 'rmse': rmse, 'rmse_log': rmse_log, 'log10': log10,
37 | 'delta_1': delta_1, 'delta_2': delta_2, 'delta_3': delta_3,
38 | }
39 |
40 |
41 | if __name__ == '__main__':
42 |
43 | # Parse args & config
44 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
45 | parser.add_argument('--cfg', required=True)
46 | parser.add_argument('--pth')
47 | parser.add_argument('--out')
48 | parser.add_argument('--vis_dir')
49 | parser.add_argument('--clip', default=10, type=float)
50 | parser.add_argument('--y', action='store_true')
51 | parser.add_argument('--pitch', default=0, type=float)
52 | parser.add_argument('--roll', default=0, type=float)
53 | parser.add_argument('opts',
54 | help='Modify config options using the command-line',
55 | default=None, nargs=argparse.REMAINDER)
56 | args = parser.parse_args()
57 | update_config(config, args)
58 | device = 'cuda' if config.cuda else 'cpu'
59 |
60 | if not args.pth:
61 | from glob import glob
62 | exp_id = infer_exp_id(args.cfg)
63 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id)
64 | args.pth = natsorted(glob(os.path.join(exp_ckpt_root, 'ep*pth')))[-1]
65 | print(f'No pth given, inferring the trained pth: {args.pth}')
66 |
67 | if not args.out:
68 | out = [os.path.splitext(args.pth)[0]]
69 | if args.pitch > 0:
70 | out.append(f'.pitch{args.pitch:.0f}')
71 | if args.roll > 0:
72 | out.append(f'.roll{args.roll:.0f}')
73 | args.out = ''.join(out + ['.npz'])
74 | print(f'No out given, inferring the output path: {args.out}')
75 | if os.path.isfile(args.out) and not args.y:
76 |         print(f'{args.out} already exists:')
77 | print(dict(np.load(args.out)))
78 |         print('Overwrite these results?', end=' ')
79 | input()
80 |
81 | # Init dataset
82 | DatasetClass = getattr(dataset, config.dataset.name)
83 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs)
84 | config.dataset.valid_kwargs['fix_pitch'] = args.pitch
85 | config.dataset.valid_kwargs['fix_roll'] = args.roll
86 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs)
87 |
88 | # Init network
89 | model_file = importlib.import_module(config.model.file)
90 | model_class = getattr(model_file, config.model.modelclass)
91 | net = model_class(**config.model.kwargs).to(device)
92 | net.load_state_dict(torch.load(args.pth))
93 | net.eval()
94 |
95 | # Run evaluation
96 | evaluation_metric = Counter()
97 | for batch in tqdm(valid_dataset):
98 | # Add batch dim and move to gpu
99 | color = batch['x'][None].to(device)
100 | depth = batch['depth'][None].to(device)
101 | mask = (depth > 0)
102 |
103 | # feed forward
104 | with torch.no_grad():
105 | pred_depth = net.infer(color)
106 |             viz_dict = {} if torch.is_tensor(pred_depth) else pred_depth  # extra visualization maps, if any
107 |             if not torch.is_tensor(pred_depth):
108 |                 pred_depth = viz_dict.pop('depth')
109 | pred_depth = pred_depth.clamp(0.01)
110 |
111 | if args.pitch:
112 | vp = Rotation.from_rotvec([-args.pitch * np.pi / 180, 0, 0]).as_matrix()
113 | pred_depth = pred_depth.squeeze()[...,None].cpu().numpy()
114 | pred_depth = rotatePanorama(pred_depth, vp, order=0)[...,0]
115 | pred_depth = torch.from_numpy(pred_depth[None,None]).to(depth.device)
116 | if args.roll:
117 | vp = Rotation.from_rotvec([0, -args.roll * np.pi / 180, 0]).as_matrix()
118 | pred_depth = pred_depth.squeeze()[...,None].cpu().numpy()
119 | pred_depth = rotatePanorama(pred_depth, vp, order=0)[...,0]
120 | pred_depth = torch.from_numpy(pred_depth[None,None]).to(depth.device)
121 |
122 | if args.vis_dir:
123 | fname = batch['fname'].strip()
124 | os.makedirs(args.vis_dir, exist_ok=True)
125 | rgb = (batch['x'].permute(1,2,0) * 255).cpu().numpy().astype(np.uint8)
126 | dep = pred_depth.squeeze().mul(512).cpu().numpy().astype(np.uint16)
127 | dep[~mask.squeeze().cpu().numpy()] = 0
128 | gtdep = depth.squeeze().mul(512).cpu().numpy().astype(np.uint16)
129 | imwrite(os.path.join(args.vis_dir, fname + '.rgb' + '.jpg'), rgb)
130 |             imwrite(os.path.join(args.vis_dir, fname + '.gt' + '.png'), gtdep)
131 | imwrite(os.path.join(args.vis_dir, fname + '.depth' + '.png'), dep)
132 | for k, v in viz_dict.items():
133 | if v.dtype == np.uint8 or v.dtype == np.uint16:
134 | imwrite(os.path.join(args.vis_dir, fname + '.' + k + '.png'), v)
135 | else:
136 | raise NotImplementedError
137 |
138 | evaluation_metric['N'] += 1
139 | for metric, v in eval_metric(pred_depth[mask], depth[mask], args.clip).items():
140 | evaluation_metric[metric] += v
141 |
142 | N = evaluation_metric.pop('N')
143 | for metric, v in evaluation_metric.items():
144 | evaluation_metric[metric] = v / N
145 | for metric, v in evaluation_metric.items():
146 | print(f'{metric:20s} {v:.4f}')
147 |
148 | np.savez(args.out, **evaluation_metric)
149 |
150 |
--------------------------------------------------------------------------------
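Note: eval_metric above clamps prediction and ground truth to [0.01, dmax] before computing the relative/absolute errors and the delta < 1.25^k accuracies. A tiny worked example with made-up depths:

import torch

pred = torch.tensor([1.00, 2.00, 4.00, 8.00])    # predicted depth (metres)
gt   = torch.tensor([1.10, 2.00, 3.00, 9.00])    # ground-truth depth (metres)
gt, pred = gt.clamp(0.01, 10.0), pred.clamp(0.01, 10.0)

mre   = ((gt - pred).abs() / gt).mean()          # mean relative error
rmse  = ((gt - pred) ** 2).mean().sqrt()
delta = torch.max(pred / gt, gt / pred)          # per-pixel ratio, always >= 1
print(mre.item(), rmse.item())
print((delta < 1.25).float().mean().item())      # 0.75 here: 3 of the 4 pixels pass delta_1
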
/test_layout.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import json
4 | import argparse
5 | import importlib
6 | import numpy as np
7 | from PIL import Image
8 | from tqdm import tqdm
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 |
14 | from lib.config import config, update_config, infer_exp_id
15 |
16 |
17 | if __name__ == '__main__':
18 |
19 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
20 | parser.add_argument('--cfg', required=True)
21 | parser.add_argument('--pth', help='path to load saved checkpoint.')
22 | parser.add_argument('--img_glob', required=True)
23 | parser.add_argument('--output_dir', required=True)
24 | # Augmentation related
25 | parser.add_argument('--flip', action='store_true',
26 |                         help='whether to perform left-right flip. '
27 |                              'Doubles the number of inputs.')
28 | parser.add_argument('--rotate', nargs='*', default=[], type=int,
29 |                         help='whether to perform horizontal rotation. '
30 |                              'each element indicates a fraction of the image width. '
31 |                              'Multiplies the number of inputs by len(rotate).')
32 | # Misc arguments
33 | parser.add_argument('--no_cuda', action='store_true',
34 | help='disable cuda')
35 | parser.add_argument('opts',
36 | help='Modify config options using the command-line',
37 | default=None, nargs=argparse.REMAINDER)
38 | args = parser.parse_args()
39 |
40 | # Init setting
41 | update_config(config, args)
42 | if not args.pth:
43 | exp_id = infer_exp_id(args.cfg)
44 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id)
45 | args.pth = sorted(glob.glob(os.path.join(exp_ckpt_root, '*pth')))[-1]
46 |         print(f'--pth is not given. Automatically inferred pth: {args.pth}')
47 | device = torch.device('cpu' if args.no_cuda else 'cuda')
48 |
49 |     # Prepare images to be processed
50 | paths = sorted(glob.glob(args.img_glob))
51 | if len(paths) == 0:
52 | print('no images found')
53 | for path in paths:
54 | assert os.path.isfile(path), '%s not found' % path
55 |
56 | # Prepare the trained model
57 | model_file = importlib.import_module(config.model.file)
58 | model_class = getattr(model_file, config.model.modelclass)
59 | net = model_class(**config.model.kwargs)
60 | net.load_state_dict(torch.load(args.pth))
61 | net = net.to(device).eval()
62 |
63 | # Check target directory
64 | if not os.path.isdir(args.output_dir):
65 |         print('Output directory %s does not exist. Creating it.' % args.output_dir)
66 | os.makedirs(args.output_dir)
67 |
68 | # Inferencing
69 | with torch.no_grad():
70 | for i_path in tqdm(paths, desc='Inferencing'):
71 | k = os.path.split(i_path)[-1][:-4]
72 |
73 | # Load image
74 | img_pil = Image.open(i_path)
75 | if img_pil.size != (1024, 512):
76 | img_pil = img_pil.resize((1024, 512), Image.BICUBIC)
77 | img_ori = np.array(img_pil)[..., :3].transpose([2, 0, 1]).copy()
78 | x = torch.FloatTensor([img_ori / 255]).to(device)
79 |
80 |             # Infer corners
81 | net.fname = k
82 | cor_id = net.infer(x)['cor_id']
83 |
84 | # Output result
85 | with open(os.path.join(args.output_dir, k + '.txt'), 'w') as f:
86 | for x, y in cor_id:
87 | f.write('%d %d\n' % (x, y))
88 |
89 |
--------------------------------------------------------------------------------
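Note: the inference loop above expects a 1024x512 equirectangular RGB image scaled to [0, 1] in NCHW order. A minimal sketch of preparing a single image the same way (the input path is hypothetical):

import numpy as np
import torch
from PIL import Image

img_pil = Image.open('path/to/pano.png')                  # hypothetical input panorama
if img_pil.size != (1024, 512):
    img_pil = img_pil.resize((1024, 512), Image.BICUBIC)
img = np.array(img_pil)[..., :3].transpose([2, 0, 1])     # HWC uint8 -> CHW, drop alpha
x = torch.from_numpy(img / 255.0).float()[None]           # [1, 3, 512, 1024] in [0, 1]
print(x.shape)
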
/test_sem.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import importlib
4 | from natsort import natsorted
5 | from tqdm import tqdm, trange
6 | from collections import Counter
7 |
8 | import numpy as np
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | from torch.utils.data import DataLoader
14 |
15 | from lib.config import config, update_config, infer_exp_id
16 | from lib import dataset
17 |
18 |
19 | if __name__ == '__main__':
20 |
21 | # Parse args & config
22 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
23 | parser.add_argument('--cfg', required=True)
24 | parser.add_argument('--pth')
25 | parser.add_argument('--out')
26 | parser.add_argument('--vis_dir')
27 | parser.add_argument('--y', action='store_true')
28 | parser.add_argument('--test_hw', type=int, nargs='*')
29 | parser.add_argument('opts',
30 | help='Modify config options using the command-line',
31 | default=None, nargs=argparse.REMAINDER)
32 | args = parser.parse_args()
33 | update_config(config, args)
34 | device = 'cuda' if config.cuda else 'cpu'
35 |
36 | if config.cuda and config.cuda_benchmark:
37 | torch.backends.cudnn.benchmark = False
38 |
39 | # Init global variable
40 | if not args.pth:
41 | from glob import glob
42 | exp_id = infer_exp_id(args.cfg)
43 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id)
44 | args.pth = natsorted(glob(os.path.join(exp_ckpt_root, 'ep*pth')))[-1]
45 | print(f'No pth given, inferring the trained pth: {args.pth}')
46 |
47 | if not args.out:
48 | args.out = os.path.splitext(args.pth)[0]
49 | print(f'No out given, inferring the output dir: {args.out}')
50 | os.makedirs(args.out, exist_ok=True)
51 | if os.path.isfile(os.path.join(args.out, 'cm.npz')) and not args.y:
52 |         print(f'{os.path.join(args.out, "cm.npz")} already exists:')
53 | cm = np.load(os.path.join(args.out, 'cm.npz'))['cm']
54 | inter = np.diag(cm)
55 | union = cm.sum(0) + cm.sum(1) - inter
56 | ious = inter / union
57 | accs = inter / cm.sum(1)
58 | DatasetClass = getattr(dataset, config.dataset.name)
59 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs)
60 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs)
61 | id2class = np.array(valid_dataset.ID2CLASS)
62 | for name, iou, acc in zip(id2class, ious, accs):
63 | print(f'{name:20s}: iou {iou*100:5.2f} / acc {acc*100:5.2f}')
64 | print(f'{"Overall":20s}: iou {ious.mean()*100:5.2f} / acc {accs.mean()*100:5.2f}')
65 |         print('Overwrite these results?', end=' ')
66 | input()
67 |
68 | # Init dataset
69 | DatasetClass = getattr(dataset, config.dataset.name)
70 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs)
71 | if args.test_hw:
72 | input_hw = config.dataset.common_kwargs['hw']
73 | config.dataset.valid_kwargs['hw'] = args.test_hw
74 | else:
75 | input_hw = None
76 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs)
77 | valid_loader = DataLoader(valid_dataset, 1,
78 | num_workers=config.num_workers,
79 | pin_memory=config.cuda)
80 |
81 | # Init network
82 | model_file = importlib.import_module(config.model.file)
83 | model_class = getattr(model_file, config.model.modelclass)
84 | net = model_class(**config.model.kwargs).to(device)
85 | net.load_state_dict(torch.load(args.pth))
86 | net = net.to(device).eval()
87 |
88 | # Start eval
89 | cm = 0
90 | num_classes = config.model.kwargs.modalities_config.SemanticSegmenter.num_classes
91 | with torch.no_grad():
92 | for batch in tqdm(valid_loader, position=1, total=len(valid_loader)):
93 | color = batch['x'].to(device)
94 | sem = batch['sem'].to(device)
95 | mask = (sem >= 0)
96 | if mask.sum() == 0:
97 | continue
98 |
99 | # feed forward & compute losses
100 | if input_hw is not None:
101 | color = F.interpolate(color, size=input_hw, mode='bilinear', align_corners=False)
102 | pred_sem = net.infer(color)['sem']
103 | if input_hw is not None:
104 | pred_sem = F.interpolate(pred_sem, size=args.test_hw, mode='bilinear', align_corners=False)
105 |
106 | # Visualization
107 | if args.vis_dir:
108 | import matplotlib.pyplot as plt
109 | from imageio import imwrite
110 | cmap = (plt.get_cmap('gist_rainbow')(np.arange(num_classes) / num_classes)[...,:3] * 255).astype(np.uint8)
111 | rgb = (batch['x'][0, :3].permute(1,2,0) * 255).cpu().numpy().astype(np.uint8)
112 | vis_sem = cmap[pred_sem[0].argmax(0).cpu().numpy()]
113 | vis_sem = (rgb * 0.2 + vis_sem * 0.8).astype(np.uint8)
114 | imwrite(os.path.join(args.vis_dir, batch['fname'][0].strip()), vis_sem)
115 | vis_sem = cmap[sem[0].cpu().numpy()]
116 | vis_sem = (rgb * 0.2 + vis_sem * 0.8).astype(np.uint8)
117 | imwrite(os.path.join(args.vis_dir, batch['fname'][0].strip() + '.gt.png'), vis_sem)
118 |
119 | # Log
120 | gt = sem[mask]
121 | pred = pred_sem.argmax(1)[mask]
122 | assert gt.min() >= 0 and gt.max() < num_classes and pred_sem.shape[1] == num_classes
123 | cm += np.bincount((gt * num_classes + pred).cpu().numpy(), minlength=num_classes**2)
124 |
125 | # Summarize
126 | print(' Summarize '.center(50, '='))
127 | cm = cm.reshape(num_classes, num_classes)
128 | id2class = np.array(valid_dataset.ID2CLASS)
129 | valid_mask = (cm.sum(1) != 0)
130 | cm = cm[valid_mask][:, valid_mask]
131 | id2class = id2class[valid_mask]
132 | inter = np.diag(cm)
133 | union = cm.sum(0) + cm.sum(1) - inter
134 | ious = inter / union
135 | accs = inter / cm.sum(1)
136 | for name, iou, acc in zip(id2class, ious, accs):
137 | print(f'{name:20s}: iou {iou*100:5.2f} / acc {acc*100:5.2f}')
138 | print(f'{"Overall":20s}: iou {ious.mean()*100:5.2f} / acc {accs.mean()*100:5.2f}')
139 | np.savez(os.path.join(args.out, 'cm.npz'), cm=cm)
140 |
141 |
--------------------------------------------------------------------------------
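Note: the summary above derives per-class IoU and accuracy from the confusion matrix stored in cm.npz, where rows index the ground-truth class and columns the predicted class. A tiny worked example with a made-up 3-class matrix:

import numpy as np

cm = np.array([[50.,  5.,  5.],      # rows: ground truth, columns: prediction
               [10., 80., 10.],
               [ 0., 10., 30.]])

inter = np.diag(cm)                  # correctly classified pixels per class
union = cm.sum(0) + cm.sum(1) - inter
ious = inter / union
accs = inter / cm.sum(1)             # per-class recall, as reported above
print(ious, accs)
print(ious.mean(), accs.mean())      # mIoU and mean class accuracy
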
/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import importlib
4 | from tqdm import tqdm, trange
5 | from collections import Counter
6 |
7 | import numpy as np
8 |
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.functional as F
12 | from torch.utils.data import DataLoader
13 |
14 | from lib.config import config, update_config, infer_exp_id
15 | from lib import dataset
16 |
17 |
18 | def train_loop(net, loader, optimizer):
19 | net.train()
20 | if config.training.fix_encoder_bn:
21 | apply_fn_based_on_key(net.encoder, ['bn'], lambda m: m.eval())
22 | epoch_losses = Counter()
23 | for iit, batch in tqdm(enumerate(loader, 1), position=1, total=len(loader)):
24 | # Move data to the given computation device
25 | for k, v in batch.items():
26 | if torch.is_tensor(v):
27 | batch[k] = v.to(device)
28 |
29 | # feed forward & compute losses
30 | losses = net.compute_losses(batch)
31 | if len(losses) == 0:
32 | continue
33 |
34 | # backprop
35 | optimizer.zero_grad()
36 | losses['total'].backward()
37 | optimizer.step()
38 |
39 | # Log
40 | BS = len(batch['x'])
41 | epoch_losses['N'] += BS
42 | for k, v in losses.items():
43 | if torch.is_tensor(v):
44 | epoch_losses[k] += BS * v.item()
45 | else:
46 | epoch_losses[k] += BS * v
47 |
48 | # Statistic over the epoch
49 | N = epoch_losses.pop('N')
50 | for k, v in epoch_losses.items():
51 | epoch_losses[k] = v / N
52 |
53 | return epoch_losses
54 |
55 |
56 | def valid_loop(net, loader):
57 | net.eval()
58 | epoch_losses = Counter()
59 | with torch.no_grad():
60 | for iit, batch in tqdm(enumerate(loader, 1), position=1, total=len(loader)):
61 | for k, v in batch.items():
62 | if torch.is_tensor(v):
63 | batch[k] = v.to(device)
64 |
65 | # feed forward & compute losses
66 | losses = net.compute_losses(batch)
67 |
68 | # Log
69 | for k, v in losses.items():
70 | if torch.is_tensor(v):
71 | epoch_losses[k] += float(v.item()) / len(loader)
72 | else:
73 | epoch_losses[k] += v / len(loader)
74 |
75 | return epoch_losses
76 |
77 |
78 | def apply_fn_based_on_key(net, key_lst, fn):
79 | for name, m in net.named_modules():
80 | if any(k in name for k in key_lst):
81 | fn(m)
82 |
83 |
84 | def group_parameters(net, wd_group_mode):
85 | wd = []
86 | nowd = []
87 | for name, p in net.named_parameters():
88 | if not p.requires_grad:
89 | continue
90 | if wd_group_mode == 'bn and bias':
91 | if 'bn' in name or 'bias' in name:
92 | nowd.append(p)
93 | else:
94 | wd.append(p)
95 | elif wd_group_mode == 'encoder decoder':
96 | if 'feature_extractor' in name:
97 | nowd.append(p)
98 | else:
99 | wd.append(p)
100 | return [{'params': wd}, {'params': nowd, 'weight_decay': 0}]
101 |
102 |
103 | if __name__ == '__main__':
104 |
105 | # Parse args & config
106 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
107 | parser.add_argument('--cfg', required=True)
108 | parser.add_argument('opts',
109 | help='Modify config options using the command-line',
110 | default=None, nargs=argparse.REMAINDER)
111 | args = parser.parse_args()
112 | update_config(config, args)
113 |
114 | # Init global variable
115 | exp_id = infer_exp_id(args.cfg)
116 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id)
117 | os.makedirs(exp_ckpt_root, exist_ok=True)
118 | device = 'cuda' if config.cuda else 'cpu'
119 | if config.cuda and config.cuda_benchmark:
120 | torch.backends.cudnn.benchmark = True
121 |
122 | # Init dataset
123 | DatasetClass = getattr(dataset, config.dataset.name)
124 | config.dataset.train_kwargs.update(config.dataset.common_kwargs)
125 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs)
126 | train_dataset = DatasetClass(**config.dataset.train_kwargs)
127 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs)
128 | train_loader = DataLoader(train_dataset, config.training.batch_size,
129 | shuffle=True, drop_last=True,
130 | num_workers=config.num_workers,
131 | pin_memory=config.cuda,
132 | worker_init_fn=lambda x: np.random.seed())
133 | valid_loader = DataLoader(valid_dataset, 1,
134 | num_workers=config.num_workers,
135 | pin_memory=config.cuda)
136 |
137 | # Init network
138 | model_file = importlib.import_module(config.model.file)
139 | model_class = getattr(model_file, config.model.modelclass)
140 | net = model_class(**config.model.kwargs).to(device)
141 | if config.training.fix_encoder_bn:
142 | apply_fn_based_on_key(net.encoder, ['bn'], lambda m: m.requires_grad_(False))
143 |
144 | # Init optimizer
145 | if config.training.optim == 'Adam':
146 | optimizer = torch.optim.Adam(
147 | group_parameters(net, config.training.wd_group_mode),
148 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay)
149 | elif config.training.optim == 'AdamW':
150 | optimizer = torch.optim.AdamW(
151 | group_parameters(net, config.training.wd_group_mode),
152 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay)
153 | elif config.training.optim == 'SGD':
154 | optimizer = torch.optim.SGD(
155 | group_parameters(net, config.training.wd_group_mode), momentum=0.9,
156 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay)
157 |
158 | if config.training.optim_poly_gamma > 0:
159 | def lr_poly_rate(epoch):
160 | return (1 - epoch / config.training.epoch) ** config.training.optim_poly_gamma
161 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_poly_rate)
162 | else:
163 | scheduler = torch.optim.lr_scheduler.MultiStepLR(
164 | optimizer, milestones=[int(p * config.training.epoch) for p in config.training.optim_milestons],
165 | gamma=config.training.optim_gamma)
166 |
167 | # Start training
168 | for iep in trange(1, config.training.epoch + 1, position=0):
169 |
170 | # Train phase
171 | epoch_losses = train_loop(net, train_loader, optimizer)
172 | scheduler.step()
173 | print(f'EP[{iep}/{config.training.epoch}] train: ' +
174 |               ' \\ '.join([f'{k} {v:.3f}' for k, v in epoch_losses.items()]))
175 |
176 | # Periodically save model
177 | if iep % config.training.save_every == 0:
178 | torch.save(net.state_dict(), os.path.join(exp_ckpt_root, f'ep{iep}.pth'))
179 | print('Model saved')
180 |
181 | # Valid phase
182 | epoch_losses = valid_loop(net, valid_loader)
183 | print(f'EP[{iep}/{config.training.epoch}] valid: ' +
184 |               ' \\ '.join([f'{k} {v:.3f}' for k, v in epoch_losses.items()]))
185 |
186 |
--------------------------------------------------------------------------------
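Note: when optim_poly_gamma > 0 the script above decays the learning rate polynomially toward zero over config.training.epoch epochs. A minimal sketch of that schedule with made-up values (10 epochs, gamma 0.9, base lr 1e-4):

import torch

epochs, gamma, base_lr = 10, 0.9, 1e-4
opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=base_lr)
sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda ep: (1 - ep / epochs) ** gamma)

for ep in range(1, epochs + 1):
    # ... one training epoch would run here ...
    sched.step()
    print(ep, opt.param_groups[0]['lr'])   # shrinks every epoch and reaches 0 after the last one
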
/vis_depth.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import open3d as o3d
4 | from imageio import imread
5 |
6 |
7 | def get_uni_sphere_xyz(H, W):
8 | j, i = np.meshgrid(np.arange(H), np.arange(W), indexing='ij')
9 | u = (i+0.5) / W * 2 * np.pi
10 | v = ((j+0.5) / H - 0.5) * np.pi
11 | z = -np.sin(v)
12 | c = np.cos(v)
13 | y = c * np.sin(u)
14 | x = c * np.cos(u)
15 | sphere_xyz = np.stack([x, y, z], -1)
16 | return sphere_xyz
17 |
18 |
19 | if __name__ == '__main__':
20 |
21 | import argparse
22 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
23 | parser.add_argument('--img', required=True,
24 | help='Image texture in equirectangular format')
25 | parser.add_argument('--depth', required=True,
26 | help='Depth map')
27 | parser.add_argument('--scale', default=0.001, type=float,
28 | help='Rescale the depth map')
29 | parser.add_argument('--crop_ratio', default=80/512, type=float,
30 | help='Crop ratio for upper and lower part of the image')
31 | parser.add_argument('--crop_z_above', default=1.2, type=float,
32 | help='Filter 3D point with z coordinate above')
33 | args = parser.parse_args()
34 |
35 | # Reading rgb-d
36 | rgb = imread(args.img)
37 | depth = imread(args.depth)[...,None].astype(np.float32) * args.scale
38 |
39 | # Project to 3d
40 | H, W = rgb.shape[:2]
41 | xyz = depth * get_uni_sphere_xyz(H, W)
42 | xyzrgb = np.concatenate([xyz, rgb/255.], 2)
43 |
44 | # Crop the image and flatten
45 | if args.crop_ratio > 0:
46 | assert args.crop_ratio < 1
47 | crop = int(H * args.crop_ratio)
48 | xyzrgb = xyzrgb[crop:-crop]
49 | xyzrgb = xyzrgb.reshape(-1, 6)
50 |
51 | # Crop in 3d
52 | xyzrgb = xyzrgb[xyzrgb[:,2] <= args.crop_z_above]
53 |
54 | # Visualize
55 | pcd = o3d.geometry.PointCloud()
56 | pcd.points = o3d.utility.Vector3dVector(xyzrgb[:, :3])
57 | pcd.colors = o3d.utility.Vector3dVector(xyzrgb[:, 3:])
58 |
59 | o3d.visualization.draw_geometries([
60 | pcd,
61 | o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.3, origin=[0, 0, 0])
62 | ])
63 |
--------------------------------------------------------------------------------
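Note: get_uni_sphere_xyz above returns a unit-length viewing ray for every equirectangular pixel, so multiplying by the depth map directly yields the 3D point cloud. A small check (importing from vis_depth assumes open3d is installed, since that module imports it at load time):

import numpy as np
from vis_depth import get_uni_sphere_xyz

H, W = 4, 8
dirs = get_uni_sphere_xyz(H, W)                           # [H, W, 3] ray directions
print(np.allclose(np.linalg.norm(dirs, axis=-1), 1.0))    # True: every ray has unit length

depth = np.full((H, W, 1), 2.0)                           # toy constant 2 m depth
xyz = depth * dirs                                        # all points lie on a radius-2 sphere
print(np.allclose(np.linalg.norm(xyz, axis=-1), 2.0))     # True
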
/vis_layout.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import open3d as o3d
4 | from PIL import Image
5 | from scipy.signal import correlate2d
6 | from scipy.ndimage import shift
7 |
8 | from lib.misc.post_proc import np_coor2xy, np_coorx2u, np_coory2v
9 | from eval_layout import layout_2_depth
10 |
11 |
12 | if __name__ == '__main__':
13 |
14 | import argparse
15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument('--img', required=True,
17 | help='Image texture in equirectangular format')
18 | parser.add_argument('--layout', required=True,
19 | help='Txt or json file containing layout corners (cor_id)')
20 | parser.add_argument('--out')
21 | parser.add_argument('--no_vis', action='store_true')
22 | parser.add_argument('--show_ceiling', action='store_true',
23 | help='Rendering ceiling (skip by default)')
24 | parser.add_argument('--ignore_floor', action='store_true',
25 | help='Skip rendering floor')
26 | parser.add_argument('--ignore_wall', action='store_true',
27 | help='Skip rendering wall')
28 | parser.add_argument('--ignore_wireframe', action='store_true',
29 | help='Skip rendering wireframe')
30 | args = parser.parse_args()
31 |
32 | if not args.out and args.no_vis:
33 |         print('You may want to export (via --out) or visualize (without --no_vis)')
34 | import sys; sys.exit()
35 |
36 | # Reading source (texture img, cor_id txt)
37 | equirect_texture = np.array(Image.open(args.img))
38 | H, W = equirect_texture.shape[:2]
39 | if args.layout.endswith('json'):
40 | with open(args.layout) as f:
41 | inferenced_result = json.load(f)
42 | cor_id = np.array(inferenced_result['uv'], np.float32)
43 | cor_id[:, 0] *= W
44 | cor_id[:, 1] *= H
45 | else:
46 | cor_id = np.loadtxt(args.layout).astype(np.float32)
47 |
48 | # Convert corners to layout
49 | depth, floor_mask, ceil_mask, wall_mask = layout_2_depth(cor_id, H, W, return_mask=True)
50 | coorx, coory = np.meshgrid(np.arange(W), np.arange(H))
51 | us = np_coorx2u(coorx, W)
52 | vs = np_coory2v(coory, H)
53 | zs = depth * np.sin(vs)
54 | cs = depth * np.cos(vs)
55 | xs = cs * np.sin(us)
56 | ys = -cs * np.cos(us)
57 |
58 | # Aggregate mask
59 | mask = np.ones_like(floor_mask)
60 | if args.ignore_floor:
61 | mask &= ~floor_mask
62 | if not args.show_ceiling:
63 | mask &= ~ceil_mask
64 | if args.ignore_wall:
65 | mask &= ~wall_mask
66 |
67 | # Prepare ply's points and faces
68 | xyzrgb = np.concatenate([
69 | xs[...,None], ys[...,None], zs[...,None],
70 | equirect_texture], -1)
71 | xyzrgb = np.concatenate([xyzrgb, xyzrgb[:,[0]]], 1)
72 | mask = np.concatenate([mask, mask[:,[0]]], 1)
73 | lo_tri_template = np.array([
74 | [0, 0, 0],
75 | [0, 1, 0],
76 | [0, 1, 1]])
77 | up_tri_template = np.array([
78 | [0, 0, 0],
79 | [0, 1, 1],
80 | [0, 0, 1]])
81 | ma_tri_template = np.array([
82 | [0, 0, 0],
83 | [0, 1, 1],
84 | [0, 1, 0]])
85 | lo_mask = (correlate2d(mask, lo_tri_template, mode='same') == 3)
86 | up_mask = (correlate2d(mask, up_tri_template, mode='same') == 3)
87 | ma_mask = (correlate2d(mask, ma_tri_template, mode='same') == 3) & (~lo_mask) & (~up_mask)
88 | ref_mask = (
89 | lo_mask | (correlate2d(lo_mask, np.flip(lo_tri_template, (0,1)), mode='same') > 0) |\
90 | up_mask | (correlate2d(up_mask, np.flip(up_tri_template, (0,1)), mode='same') > 0) |\
91 | ma_mask | (correlate2d(ma_mask, np.flip(ma_tri_template, (0,1)), mode='same') > 0)
92 | )
93 | points = xyzrgb[ref_mask]
94 |
95 | ref_id = np.full(ref_mask.shape, -1, np.int32)
96 | ref_id[ref_mask] = np.arange(ref_mask.sum())
97 | faces_lo_tri = np.stack([
98 | ref_id[lo_mask],
99 | ref_id[shift(lo_mask, [1, 0], cval=False, order=0)],
100 | ref_id[shift(lo_mask, [1, 1], cval=False, order=0)],
101 | ], 1)
102 | faces_up_tri = np.stack([
103 | ref_id[up_mask],
104 | ref_id[shift(up_mask, [1, 1], cval=False, order=0)],
105 | ref_id[shift(up_mask, [0, 1], cval=False, order=0)],
106 | ], 1)
107 | faces_ma_tri = np.stack([
108 | ref_id[ma_mask],
109 | ref_id[shift(ma_mask, [1, 0], cval=False, order=0)],
110 | ref_id[shift(ma_mask, [0, 1], cval=False, order=0)],
111 | ], 1)
112 | faces = np.concatenate([faces_lo_tri, faces_up_tri, faces_ma_tri])
113 |
114 | # Dump results ply
115 | if args.out:
116 | ply_header = '\n'.join([
117 | 'ply',
118 | 'format ascii 1.0',
119 | f'element vertex {len(points):d}',
120 | 'property float x',
121 | 'property float y',
122 | 'property float z',
123 | 'property uchar red',
124 | 'property uchar green',
125 | 'property uchar blue',
126 | f'element face {len(faces):d}',
127 | 'property list uchar int vertex_indices',
128 | 'end_header',
129 | ])
130 | with open(args.out, 'w') as f:
131 | f.write(ply_header)
132 | f.write('\n')
133 | for x, y, z, r, g, b in points:
134 | f.write(f'{x:.2f} {y:.2f} {z:.2f} {r:.0f} {g:.0f} {b:.0f}\n')
135 | for i, j, k in faces:
136 | f.write(f'3 {i:d} {j:d} {k:d}\n')
137 |
138 | if not args.no_vis:
139 | mesh = o3d.geometry.TriangleMesh()
140 | mesh.vertices = o3d.utility.Vector3dVector(points[:, :3])
141 | mesh.vertex_colors = o3d.utility.Vector3dVector(points[:, 3:] / 255.)
142 | mesh.triangles = o3d.utility.Vector3iVector(faces)
143 | draw_geometries = [mesh]
144 |
145 | # Show wireframe
146 | if not args.ignore_wireframe:
147 | # Convert cor_id to 3d xyz
148 | N = len(cor_id) // 2
149 | floor_z = -1.6
150 | floor_xy = np_coor2xy(cor_id[1::2], floor_z, W, H, floorW=1, floorH=1)
151 | c = np.sqrt((floor_xy**2).sum(1))
152 | v = np_coory2v(cor_id[0::2, 1], H)
153 | ceil_z = (c * np.tan(v)).mean()
154 |
155 | # Prepare wireframe in open3d
156 | assert N == len(floor_xy)
157 | wf_points = [[x, y, floor_z] for x, y in floor_xy] +\
158 | [[x, y, ceil_z] for x, y in floor_xy]
159 | wf_lines = [[i, (i+1)%N] for i in range(N)] +\
160 | [[i+N, (i+1)%N+N] for i in range(N)] +\
161 | [[i, i+N] for i in range(N)]
162 | wf_colors = [[1, 0, 0] for i in range(len(wf_lines))]
163 | wf_line_set = o3d.geometry.LineSet()
164 | wf_line_set.points = o3d.utility.Vector3dVector(wf_points)
165 | wf_line_set.lines = o3d.utility.Vector2iVector(wf_lines)
166 | wf_line_set.colors = o3d.utility.Vector3dVector(wf_colors)
167 | draw_geometries.append(wf_line_set)
168 |
169 | o3d.visualization.draw_geometries(draw_geometries, mesh_show_back_face=True)
170 |
--------------------------------------------------------------------------------
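Note: vis_layout.py reads the corner file as a [2N, 2] array of (x, y) pixel coordinates in the panorama (1024x512 as written by test_layout.py), with the ceiling corner of each wall on the even rows and the floor corner on the odd rows. A minimal sketch of loading and splitting such a file (the path is hypothetical):

import numpy as np

cor_id = np.loadtxt('path/to/pano.layout.txt').astype(np.float32)   # hypothetical corner file
assert cor_id.ndim == 2 and cor_id.shape[1] == 2 and len(cor_id) % 2 == 0

ceil_cor = cor_id[0::2]    # even rows: ceiling corners, used above to estimate the ceiling height
floor_cor = cor_id[1::2]   # odd rows: floor corners, projected to the z = -1.6 floor plane
print(len(ceil_cor), 'wall corners')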