├── .gitignore ├── LICENCE ├── README.md ├── README_prepare_data_mp3d_layout.md ├── README_prepare_data_s2d3d.md ├── README_reproduction.md ├── assets ├── label13_weight.pth ├── pano_asmasuxybohhcj.depth.png ├── pano_asmasuxybohhcj.layout.txt ├── pano_asmasuxybohhcj.png ├── repo_teaser.jpg ├── snapshot_depth.jpg └── snapshot_layout.jpg ├── config ├── mp3d_depth │ ├── HOHO_depth_dct_efficienthc_TransEn1.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml │ └── ablation │ │ ├── tuning___HOHO_depth_dct128_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct256_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct32_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct512_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct_LSTM.yaml │ │ ├── tuning___HOHO_depth_dct_Linear.yaml │ │ ├── tuning___HOHO_depth_dct_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct_efficienthc_LSTM.yaml │ │ ├── tuning___HOHO_depth_dct_efficienthc_Linear.yaml │ │ ├── tuning___HOHO_depth_dct_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_dct_efficienthc_TransEn1_resnet34.yaml │ │ ├── tuning___HOHO_depth_lin128_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_lin256_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_lin32_efficienthc_TransEn1.yaml │ │ ├── tuning___HOHO_depth_lin512_efficienthc_TransEn1.yaml │ │ └── tuning___HOHO_depth_lin64_efficienthc_TransEn1.yaml ├── mp3d_layout │ └── HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml ├── s2d3d_depth │ ├── HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml │ ├── HOHO_depthS_dct_efficienthc_TransEn1.yaml │ └── HOHO_depth_dct_efficienthc_TransEn1.yaml └── s2d3d_sem │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml │ ├── HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml │ └── HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml ├── count_params_flops.py ├── eval_layout.py ├── infer_depth.ipynb ├── infer_depth.py ├── infer_layout.ipynb ├── infer_layout.py ├── infer_sem.ipynb ├── lib ├── config.py ├── dataset │ ├── __init__.py │ ├── dataset_depth.py │ ├── dataset_layout.py │ └── dataset_s2d3d_sem.py ├── misc │ ├── __init__.py │ ├── gen_txt_structured3d.py │ ├── pano_lsd_align.py │ ├── panostretch.py │ ├── post_proc.py │ ├── structured3d_extract_zip.py │ ├── structured3d_prepare_dataset.py │ └── utils.py └── model │ ├── backbone │ ├── __init__.py │ ├── hardnet.py │ ├── resnet.py │ └── simple.py │ ├── hohonet.py │ ├── horizon_compression │ ├── __init__.py │ ├── ehc.py │ ├── hc.py │ └── simple.py │ ├── horizon_refinement │ ├── __init__.py │ ├── attention.py │ ├── identity.py │ ├── linear.py │ └── rnn.py │ ├── horizon_upsample │ ├── __init__.py │ └── upsample1d.py │ ├── modality │ ├── __init__.py │ ├── bases.py │ ├── depth.py │ ├── layout.py │ └── semantic.py │ └── utils.py ├── test_depth.py ├── test_layout.py ├── test_sem.py ├── train.py ├── vis_depth.py └── vis_layout.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | output 3 | ckpt 4 | data 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 sunset 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HoHoNet 2 | 3 | Code for our paper in CVPR 2021: **HoHoNet: 360 Indoor Holistic Understanding with Latent Horizontal Features** ([paper](https://arxiv.org/abs/2011.11498), [video](https://www.youtube.com/watch?v=xXtRaRKmMpA)). 4 | 5 | ![teaser](./assets/repo_teaser.jpg) 6 | 7 | #### News 8 | - **April 3, 2021**: Release inference code, jupyter notebook and visualization tools. Guide for reproduction is also finished. 9 | - **March 4, 2021**: A new backbone **[HarDNet](https://github.com/PingoLH/Pytorch-HarDNet)** is included, which shows better speed and depth accuracy. 10 | 11 | 12 | ## Pretrained weight 13 | Links to trained weights `ckpt/`: [download on Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [download on Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0). 14 | 15 | 16 | ## Inference 17 | In below, we use an out-of-training-distribution 360 image from PanoContext as an example. 18 | 19 | ### Jupyter notebook 20 | See [infer_depth.ipynb](infer_depth.ipynb), [infer_layout.ipynb](infer_layout.ipynb), and [infer_sem.ipynb](infer_sem.ipynb) for interactive demo and visualization. 21 | 22 | ### Batch inference 23 | Run `infer_depth.py`/`infer_layout.py` to inference depth/layout. 24 | Use `--cfg` and `--pth` to specify the path to config file and pretrained weight. 25 | Specify input path with `--inp`. Glob pattern for a batch of files is avaiable. 26 | The results are stored into `--out` directory with the same filename with extention set ot `.depth.png` and `.layout.txt`. 27 | 28 | Example for depth: 29 | ``` 30 | python infer_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml --pth ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1_hardnet/ep60.pth --out assets/ --inp assets/pano_asmasuxybohhcj.png 31 | ``` 32 | 33 | Example for layout: 34 | ``` 35 | python infer_layout.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml --pth ckpt/mp3d_layout_HOHO_layout_aug_efficienthc_Transen1_resnet34/ep300.pth --out assets/ --inp assets/pano_asmasuxybohhcj.png 36 | ``` 37 | 38 | ### Visualization tools 39 | To visualize layout as 3D mesh, run: 40 | ``` 41 | python vis_layout.py --img assets/pano_asmasuxybohhcj.png --layout assets/pano_asmasuxybohhcj.layout.txt 42 | ``` 43 | Rendering options: `--show_ceiling`, `--ignore_floor`, `--ignore_wall`, `--ignore_wireframe` are available. 44 | Set `--out` to export the mesh to `ply` file. 45 | Set `--no_vis` to disable the visualization. 46 |

47 | ![layout visualization snapshot](assets/snapshot_layout.jpg) 48 |

49 | 50 | 51 | To visualize depth as a point cloud, run: 52 | ``` 53 | python vis_depth.py --img assets/pano_asmasuxybohhcj.png --depth assets/pano_asmasuxybohhcj.depth.png 54 | ``` 55 | Rendering options: `--crop_ratio` and `--crop_z_above` are available. 56 |
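For intuition, the point cloud is simply the equirectangular depth map unprojected along each pixel's viewing ray. Below is a minimal sketch of that unprojection, not the actual `vis_depth.py` implementation; the 16-bit depth PNG is assumed to store metric depth under a fixed scale, so `DEPTH_SCALE` and the axis convention are placeholders to be matched against `vis_depth.py`.
```python
import numpy as np
from imageio import imread

DEPTH_SCALE = 512.0  # assumption: PNG units per meter; check vis_depth.py for the real factor

def pano_depth_to_points(depth_path):
    depth = imread(depth_path).astype(np.float32) / DEPTH_SCALE   # HxW distances in meters
    H, W = depth.shape
    # Longitude/latitude of each pixel center on the equirectangular grid
    lon = ((np.arange(W) + 0.5) / W - 0.5) * 2 * np.pi
    lat = -((np.arange(H) + 0.5) / H - 0.5) * np.pi
    lon, lat = np.meshgrid(lon, lat)
    # Unit viewing ray per pixel, scaled by its distance
    xyz = np.stack([depth * np.cos(lat) * np.sin(lon),
                    depth * np.cos(lat) * np.cos(lon),
                    depth * np.sin(lat)], -1).reshape(-1, 3)
    return xyz[depth.reshape(-1) > 0]

points = pano_depth_to_points('assets/pano_asmasuxybohhcj.depth.png')
```
Per-point colors can then be taken from the aligned RGB panorama at the same resolution.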

57 | ![depth visualization snapshot](assets/snapshot_depth.jpg) 58 |

59 | 60 | 61 | 62 | ## Reproduction 63 | Please see [README_reproduction.md](README_reproduction.md) for the guide to: 64 | 1. prepare the datasets for each task in our paper 65 | 2. reproduce the training for each task 66 | 3. reproduce the numerical results in our paper with the provided pretrained weights 67 | 68 | 69 | ## Citation 70 | ``` 71 | @inproceedings{SunSC21, 72 | author = {Cheng Sun and 73 | Min Sun and 74 | Hwann{-}Tzong Chen}, 75 | title = {HoHoNet: 360 Indoor Holistic Understanding With Latent Horizontal 76 | Features}, 77 | booktitle = {CVPR}, 78 | year = {2021}, 79 | } 80 | ``` 81 | -------------------------------------------------------------------------------- /README_prepare_data_mp3d_layout.md: -------------------------------------------------------------------------------- 1 | # Prepare MatterportLayout dataset 2 | 3 | References: 4 | - [3D Manhattan Room Layout Reconstruction from a Single 360 Image](https://arxiv.org/abs/1910.04099) 5 | - [PanoAnnotator](https://github.com/SunDaDenny/PanoAnnotator) 6 | - [LayoutMP3D: Layout Annotation of Matterport3D](https://arxiv.org/abs/2003.13516) 7 | - [Matterport3DLayoutAnnotation github](https://github.com/ericsujw/Matterport3DLayoutAnnotation) (we use the annotation provided by LayoutNetv2) 8 | 9 | ## Dataset preparation 10 | ### Step 1: download source 11 | Please refer to [Matterport3DLayoutAnnotation](https://github.com/ericsujw/Matterport3DLayoutAnnotation) to download the source datas. 12 | - Put all the rgb under `{ROOT}/image_up/`. 13 | - Download the annotation to `{ROOT}/label_data/` (originally json format). 14 | - Download the data split into `{ROOT}/mp3d_[train|val|test].txt`. 15 | 16 | ### Step 2: convert json annotation to corners in txt format 17 | Use below code to convert original ground-truth json into txt. **(Remember to update the uppercase variables)** 18 | ```python 19 | import os 20 | import glob 21 | import json 22 | import numpy as np 23 | 24 | IN_GLOB = 'label_data/*json' 25 | OUT_DIR = 'label_cor' 26 | os.makedirs(OUT_DIR, exist_ok=True) 27 | 28 | for p in glob.glob(IN_GLOB): 29 | gt = json.load(open(p)) 30 | assert gt['cameraHeight'] == 1.6 31 | us = np.array([pts['coords'][0] for pts in gt['layoutPoints']['points']]) 32 | us = us * 1024 33 | cs = np.array([pts['xyz'] for pts in gt['layoutPoints']['points']]) 34 | cs = np.sqrt((cs**2)[:, [0, 2]].sum(1)) 35 | 36 | vf = np.arctan2(-1.6, cs) 37 | vc = np.arctan2(-1.6 + gt['layoutHeight'], cs) 38 | vf = (-vf / np.pi + 0.5) * 512 39 | vc = (-vc / np.pi + 0.5) * 512 40 | 41 | cor_x = np.repeat(us, 2) 42 | cor_y = np.stack([vc, vf], -1).reshape(-1) 43 | cor_xy = np.stack([cor_x, cor_y], -1) 44 | 45 | out_path = os.path.join(OUT_DIR, os.path.split(p)[-1][:-4] + 'txt') 46 | with open(out_path, 'w') as f: 47 | for x, y in cor_xy: 48 | f.write('%.2f %.2f\n' % (x, y)) 49 | ``` 50 | 51 | ### Step 3: data split 52 | Use below code to organize the data split for training and evaluation. 
**(Remember to update the uppercase variables)** 53 | ```python 54 | import os 55 | from shutil import copy2 56 | 57 | IMG_ROOT = 'image_up' 58 | TXT_ROOT = 'label_cor' 59 | OUT_ROOT = 'mp3d_layout' 60 | TRAIN_TXT = 'mp3d_train.txt' 61 | VALID_TXT = 'mp3d_val.txt' 62 | TEST_TXT = 'mp3d_test.txt' 63 | 64 | def go(txt, split): 65 | out_img_root = os.path.join(OUT_ROOT, split, 'img') 66 | out_txt_root = os.path.join(OUT_ROOT, split, 'label_cor') 67 | os.makedirs(out_img_root, exist_ok=True) 68 | os.makedirs(out_txt_root, exist_ok=True) 69 | 70 | with open(txt) as f: 71 | ks = ['_'.join(l.strip().split()) for l in f] 72 | 73 | for k in ks: 74 | copy2(os.path.join(IMG_ROOT, k + '.png'), out_img_root) 75 | copy2(os.path.join(TXT_ROOT, k + '_label.txt'), out_txt_root) 76 | os.rename(os.path.join(out_txt_root, k + '_label.txt'), os.path.join(out_txt_root, k + '.txt')) 77 | 78 | 79 | go(TRAIN_TXT, 'train') 80 | go(VALID_TXT, 'valid') 81 | go(TEST_TXT, 'test') 82 | ``` 83 | 84 | ### Step 4: clamp occlusion 85 | We assume only visible corners in txt annotation (which is the same as [Holistic 3D Vision Challenge, ECCV2020](https://competitions.codalab.org/competitions/24183#learn_the_details-evaluation)'s format). 86 | For MatterportLayout dataset, please copy&paste below script to `clamp_occ_corners.py` and run: 87 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/train/label_cor/*txt" --output_dir data/mp3d_layout/train_no_occ/label_cor/*txt` 88 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/valid/label_cor/*txt" --output_dir data/mp3d_layout/valid_no_occ/label_cor/*txt` 89 | - `python clamp_occ_corners.py --ori_glob "data/mp3d_layout/test/label_cor/*txt" --output_dir data/mp3d_layout/test_no_occ/label_cor/*txt` 90 | ```python 91 | import os 92 | import json 93 | import glob 94 | import numpy as np 95 | from shapely.geometry import LineString 96 | 97 | from misc import panostretch 98 | 99 | def cor_2_1d(cor, H=512, W=1024): 100 | bon_ceil_x, bon_ceil_y = [], [] 101 | bon_floor_x, bon_floor_y = [], [] 102 | n_cor = len(cor) 103 | for i in range(n_cor // 2): 104 | xys = panostretch.pano_connect_points(cor[i*2], 105 | cor[(i*2+2) % n_cor], 106 | z=-50, w=W, h=H) 107 | bon_ceil_x.extend(xys[:, 0]) 108 | bon_ceil_y.extend(xys[:, 1]) 109 | for i in range(n_cor // 2): 110 | xys = panostretch.pano_connect_points(cor[i*2+1], 111 | cor[(i*2+3) % n_cor], 112 | z=50, w=W, h=H) 113 | bon_floor_x.extend(xys[:, 0]) 114 | bon_floor_y.extend(xys[:, 1]) 115 | bon_ceil_x, bon_ceil_y = sort_xy_filter_unique(bon_ceil_x, bon_ceil_y, y_small_first=True) 116 | bon_floor_x, bon_floor_y = sort_xy_filter_unique(bon_floor_x, bon_floor_y, y_small_first=False) 117 | bon = np.zeros((2, W)) 118 | bon[0] = np.interp(np.arange(W), bon_ceil_x, bon_ceil_y, period=W) 119 | bon[1] = np.interp(np.arange(W), bon_floor_x, bon_floor_y, period=W) 120 | #bon = ((bon + 0.5) / H - 0.5) * np.pi 121 | return bon 122 | 123 | def sort_xy_filter_unique(xs, ys, y_small_first=True): 124 | xs, ys = np.array(xs), np.array(ys) 125 | idx_sort = np.argsort(xs + ys / ys.max() * (int(y_small_first)*2-1)) 126 | xs, ys = xs[idx_sort], ys[idx_sort] 127 | _, idx_unique = np.unique(xs, return_index=True) 128 | xs, ys = xs[idx_unique], ys[idx_unique] 129 | assert np.all(np.diff(xs) > 0) 130 | return xs, ys 131 | 132 | def find_occlusion(coor): 133 | u = panostretch.coorx2u(coor[:, 0]) 134 | v = panostretch.coory2v(coor[:, 1]) 135 | x, y = panostretch.uv2xy(u, v, z=-50) 136 | occlusion = [] 137 | for i in range(len(x)): 138 | raycast = 
LineString([(0, 0), (x[i], y[i])]) 139 | other_layout = [] 140 | for j in range(i+1, len(x)): 141 | other_layout.append((x[j], y[j])) 142 | for j in range(0, i): 143 | other_layout.append((x[j], y[j])) 144 | other_layout = LineString(other_layout) 145 | occlusion.append(raycast.intersects(other_layout)) 146 | return np.array(occlusion) 147 | 148 | 149 | 150 | if __name__ == '__main__': 151 | 152 | import argparse 153 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 154 | parser.add_argument('--ori_glob', required=True) 155 | parser.add_argument('--output_dir', required=True) 156 | args = parser.parse_args() 157 | 158 | os.makedirs(args.output_dir, exist_ok=True) 159 | 160 | paths = glob.glob(args.ori_glob) 161 | for path in paths: 162 | if path.endswith('json'): 163 | with open(path) as f: 164 | dt = json.load(f) 165 | cor = np.array(dt['uv'], np.float32) 166 | cor[:, 0] *= 1024 167 | cor[:, 1] *= 512 168 | else: 169 | with open(path) as f: 170 | cor = np.array([l.strip().split() for l in f]).astype(np.float32) 171 | cor = cor.reshape(-1, 4) 172 | duplicated = [False] * len(cor) 173 | for i in range(len(duplicated)): 174 | for j in range(i+1, len(duplicated)): 175 | if (cor[j] == cor[i]).sum() == 4: 176 | duplicated[j] = True 177 | cor = cor[~np.array(duplicated)].reshape(-1, 2) 178 | cor[:, 0] = cor[:, 0] % 1024 179 | cor = np.roll(cor[:, :2], -2 * np.argmin(cor[::2, 0]), 0) 180 | occlusion = find_occlusion(cor[::2].copy()).repeat(2) 181 | 182 | bon = cor_2_1d(cor) 183 | 184 | cor_v1 = [] 185 | for i in range(0, len(cor), 2): 186 | if occlusion[i] & ~occlusion[(i+2) % len(cor)]: 187 | cur_x = cor[i, 0] 188 | next_x = cor[(i+2) % len(cor), 0] 189 | prev_x, j = None, i-2 190 | while prev_x is None: 191 | if j < 0: 192 | j += len(cor) 193 | if ~occlusion[j]: 194 | prev_x = cor[j, 0] 195 | break 196 | j -= 2 197 | dist2next = min(abs(next_x-cur_x), abs(next_x+1024-cur_x), abs(next_x-1024-cur_x)) 198 | dist2prev = min(abs(prev_x-cur_x), abs(prev_x+1024-cur_x), abs(prev_x-1024-cur_x)) 199 | # print(cor[i], prev_x, next_x, dist2next, dist2prev) 200 | if dist2prev < dist2next: 201 | cor_v1.append([prev_x, bon[0, (int(prev_x)+1) % 1024]]) 202 | cor_v1.append([prev_x, bon[1, (int(prev_x)+1) % 1024]]) 203 | else: 204 | cor_v1.append([next_x, bon[0, (int(next_x)-1) % 1024]]) 205 | cor_v1.append([next_x, bon[1, (int(next_x)-1) % 1024]]) 206 | elif ~occlusion[i]: 207 | cor_v1.extend(cor[i:i+2]) 208 | 209 | cor_v1 = np.stack(cor_v1, 0) 210 | for _ in range(len(cor_v1)): 211 | if np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0): 212 | break 213 | cor_v1 = np.roll(cor_v1, 2, axis=0) 214 | if not np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0): 215 | cor_v1[2::2] = np.flip(cor_v1[2::2], 0) 216 | cor_v1[3::2] = np.flip(cor_v1[3::2], 0) 217 | for _ in range(len(cor_v1)): 218 | if np.alltrue(cor_v1[::2, 0][1:] - cor_v1[::2, 0][:-1] >= 0): 219 | break 220 | cor_v1 = np.roll(cor_v1, 2, axis=0) 221 | with open(os.path.join(args.output_dir, f'{os.path.split(path)[1].replace("json", "txt")}'), 'w') as f: 222 | for u, v in cor_v1: 223 | f.write(f'{u:.0f} {v:.0f}\n') 224 | ``` 225 | 226 | 227 | 228 | ### Final file structure 229 | So now, you should have a `mp3d_layout` directory with below structure for HoHoNet to train. 
230 | 231 | data 232 | └── mp3d_layout 233 | ├── train 234 | │ ├── img/*png 235 | │ └── label_cor/*txt 236 | ├── train_no_occ 237 | │ ├── img/*png 238 | │ └── label_cor/*txt 239 | ├── valid 240 | │ ├── img/*png 241 | │ └── label_cor/*txt 242 | ├── valid_no_occ 243 | │ ├── img/*png 244 | │ └── label_cor/*txt 245 | ├── test 246 | │ ├── img/*png 247 | │ └── label_cor/*txt 248 | └── test_no_occ 249 | ├── img/*png 250 | └── label_cor/*txt 251 | -------------------------------------------------------------------------------- /README_prepare_data_s2d3d.md: -------------------------------------------------------------------------------- 1 | # Prepare Stanford2d3d dataset 2 | 3 | ## Dataset preparation 4 | ### Step 1: download source 5 | Please refer to [2D-3D-Semantics](https://github.com/alexsax/2D-3D-Semantics) to download the source datas. 6 | Make sure `"$S2D3D_ROOT"/area_[1|2|3|4|5a|5b|6]/pano/[depth|rgb|semantic]` existed. 7 | 8 | 9 | ### Step 2: resize and copy into `data/stanford2D3D/` for depth modality 10 | The source data are in high resolution (`2048x4096`). 11 | To reduce data loading time during training, we resize them to `512x1024` and copy into HoHoNet's `data/`. 12 | Copy below code and paste into `prepare_S2D3D_d.py`. 13 | Run `python prepare_S2D3D_d.py --ori_root "$S2D3D_ROOT" --new_root "$HOHO_ROOT/data/stanford2D3D/"`. 14 | ```python 15 | import os 16 | import glob 17 | import argparse 18 | from tqdm import tqdm 19 | 20 | import numpy as np 21 | from imageio import imread, imwrite 22 | from skimage.transform import rescale 23 | 24 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 25 | parser.add_argument('--ori_root', required=True) 26 | parser.add_argument('--new_root', required=True) 27 | args = parser.parse_args() 28 | 29 | areas = ['area_1', 'area_2', 'area_3', 'area_4', 'area_5a', 'area_5b', 'area_6'] 30 | 31 | for area in areas: 32 | print('Processing:', area) 33 | os.makedirs(os.path.join(args.new_root, area, 'rgb'), exist_ok=True) 34 | os.makedirs(os.path.join(args.new_root, area, 'depth'), exist_ok=True) 35 | for fname in tqdm(os.listdir(os.path.join(args.ori_root, area, 'pano', 'rgb'))): 36 | if fname[0] == '.' or not fname.endswith('png'): 37 | continue 38 | rgb_path = os.path.join(args.ori_root, area, 'pano', 'rgb', fname) 39 | d_path = os.path.join(args.ori_root, area, 'pano', 'depth', fname[:-7] + 'depth.png') 40 | assert os.path.isfile(d_path) 41 | 42 | rgb = imread(rgb_path)[..., :3] 43 | depth = imread(d_path) 44 | rgb = rescale(rgb, 0.25, order=0, mode='wrap', anti_aliasing=False, preserve_range=True) 45 | depth = rescale(depth, 0.25, order=0, mode='wrap', anti_aliasing=False, preserve_range=True) 46 | 47 | imwrite(os.path.join(args.new_root, area, 'rgb', fname), rgb.astype(np.uint8)) 48 | imwrite(os.path.join(args.new_root, area, 'depth', fname[:-7] + 'depth.png'), depth.astype(np.uint16)) 49 | ``` 50 | 51 | ### Step 3: resize and copy into `data/s2d3d_sem` for semantic modality 52 | Please download `semantic_labels.json`, `name2label.json`, and `colors.npy` on [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0). 53 | Put these files under your `$S2D3D_ROOT/`. 54 | Copy below code and paste into `prepare_S2D3D_sem.py`. 55 | Run `python prepare_S2D3D_sem.py --ori_root "$S2D3D_ROOT" --new_root "$HOHO_ROOT/data/s2d3d_sem/"`. 
56 | ```python 57 | import os 58 | import json 59 | import glob 60 | from PIL import Image 61 | from tqdm import trange 62 | import numpy as np 63 | from shutil import copyfile 64 | 65 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 66 | parser.add_argument('--ori_root', required=True) 67 | parser.add_argument('--new_root', required=True) 68 | args = parser.parse_args() 69 | 70 | areas = ['area_1', 'area_2', 'area_3', 'area_4', 'area_5a', 'area_5b', 'area_6'] 71 | 72 | with open(os.path.join(args.ori_root, 'semantic_labels.json')) as f: 73 | id2name = [name.split('_')[0] for name in json.load(f)] + [''] 74 | 75 | with open(os.path.join(args.ori_root, 'name2label.json')) as f: 76 | name2id = json.load(f) 77 | 78 | colors = np.load(os.path.join(args.ori_root, 'colors.npy')) 79 | 80 | id2label = np.array([name2id[name] for name in id2name], np.uint8) 81 | 82 | for area in areas: 83 | rgb_paths = sorted(glob.glob(os.path.join(args.ori_root, area, 'pano', 'rgb', '*png'))) 84 | sem_paths = sorted(glob.glob(os.path.join(args.ori_root, area, 'pano', 'semantic', '*png'))) 85 | os.makedirs(os.path.join(args.new_root, area, 'rgb'), exist_ok=True) 86 | os.makedirs(os.path.join(args.new_root, area, 'semantic'), exist_ok=True) 87 | os.makedirs(os.path.join(args.new_root, area, 'semantic_visualize'), exist_ok=True) 88 | for i in trange(len(rgb_paths)): 89 | rgb_k = os.path.split(rgb_paths[i])[-1] 90 | sem_k = os.path.split(sem_paths[i])[-1] 91 | 92 | # RGB 93 | rgb = Image.open(rgb_paths[i]).convert('RGB').resize((1024, 512), Image.LANCZOS) 94 | rgb.save(os.path.join(args.new_root, area, 'rgb', rgb_k)) 95 | vis = np.array(rgb) 96 | # Semantic 97 | sem = np.array(Image.open(sem_paths[i]).resize((1024, 512), Image.NEAREST), np.int32) 98 | unk = (sem[..., 0] != 0) 99 | sem = id2label[sem[..., 1] * 256 + sem[..., 2]] 100 | sem[unk] = 0 101 | Image.fromarray(sem).save(os.path.join(args.new_root, area, 'semantic', rgb_k)) 102 | # Visualization 103 | vis = vis // 2 + colors[sem] // 2 104 | Image.fromarray(vis).save(os.path.join(args.new_root, area, 'semantic_visualize', rgb_k)) 105 | ``` 106 | 107 | ### Step 4: prepare data split 108 | Download data split `fold[1|2|3]_[train|valid].txt` and `small_[train|valid|test].txt` on [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0). 109 | Put these `txt` files under `data/stanford2D3D`. 110 | 111 | 112 | 113 | ### Final file structure 114 | So now, you should have a `stanford2D3D` and `s2d3d_sem` directories with below structure for HoHoNet to train. 115 | 116 | data 117 | ├── stanford2D3D 118 | │ ├── area_[1|2|3|4|5a|5b|6] 119 | │ │ ├── img/*png 120 | │ │ └── depth/*png 121 | │ ├── small_[train|valid|test].txt 122 | │ └── fold[1|2|3]_[train|valid].txt 123 | │ 124 | └── s2d3d_sem 125 | └── area_[1|2|3|4|5a|5b|6] 126 | ├── rgb/*png 127 | └── semantic/*png 128 | -------------------------------------------------------------------------------- /README_reproduction.md: -------------------------------------------------------------------------------- 1 | # Reproduction 2 | 3 | Below provides: 4 | 1. guide to prepare the datasets for each task in our paper 5 | 2. 
reproduce the training and numerical results in our paper 6 | 7 | ## Dataset 8 | Detail instruction for preparing the datas for each dataset and task: 9 | - `Matterport3d` x `Layout` 10 | - see [Prepare MatterportLayout dataset](README_prepare_data_mp3d_layout.md) 11 | - `Matterport3d` x `Depth (BiFuse's stitching)` 12 | - We use the rgb-d stitching provided by [BiFuse](https://github.com/Yeh-yu-hsuan/BiFuse) 13 | - Put their `mp3d_align/` under `data/` 14 | - Download data split via [Google drive](https://drive.google.com/drive/folders/1raT3vRXnQXRAQuYq36dE-93xFc_hgkTQ?usp=sharing) or via [Dropbox](https://www.dropbox.com/sh/b014nop5jrehpoq/AACWNTMMHEAbaKOO1drqGio4a?dl=0) and put them under `data/matterport3d/`. 15 | - `Matterport3d` x `Depth (our new stitching)` 16 | - We remove the depth noise in BiFuse's stitching 17 | - This is not the version we use in our paper 18 | - **TODO:** release new stiching code with experiment results on it 19 | - `Stanford2d3d` x `Depth`: 20 | - see [Prepare Stanford2d3d dataset](README_prepare_data_s2d3d.md) 21 | - `Stanford2d3d` x `Semantic segmentation`: 22 | - see [Prepare Stanford2d3d dataset](README_prepare_data_s2d3d.md) 23 | 24 | The overall file strucure of the datasets is depicted as follow: 25 | 26 | data 27 | ├── mp3d_align # Stitching provided by BiFuse (https://github.com/Yeh-yu-hsuan/BiFuse) 28 | │ ├── 17DRP5sb8fy 29 | │ │ ├── 00ebbf3782c64d74aaf7dd39cd561175 30 | │ │ │ ├── color.jpg 31 | │ │ │ └── depth.npy 32 | │ │ └── ... 33 | │ └── ... 34 | │ 35 | ├── matterport3d 36 | │ ├── scenes_abla_train.txt # 41 house id for ablation training 37 | │ ├── scenes_abla_valid.txt # 20 house id for ablation evaluation 38 | │ ├── scenes_train.txt # 61 house id for training following BiFuse 39 | │ ├── mp3d_scenes_test.txt # 28 house id for testing following BiFuse 40 | │ └── mp3d_rgbd/ # Our new stitching which fixs the depth noise in BiFuse's version 41 | │ # Release new stitching code with new experiments later. 42 | │ 43 | ├── mp3d_layout # Please follow README_prepare_data_mp3d_layout.md 44 | │ ├── train_no_occ 45 | │ │ ├── img/*png 46 | │ │ └── label_cor/*txt 47 | │ ├── valid_no_occ 48 | │ │ ├── img/*png 49 | │ │ └── label_cor/*txt 50 | │ └── test_no_occ 51 | │ ├── img/*png 52 | │ └── label_cor/*txt 53 | │ 54 | ├── stanford2D3D # Please follow README_prepare_data_s2d3d.md 55 | │ ├── area_[1|2|3|4|5a|5b|6] 56 | │ │ ├── img/*png 57 | │ │ └── depth/*png 58 | │ ├── small_[train|valid|test].txt 59 | │ └── fold[1|2|3]_[train|valid].txt 60 | │ 61 | └── s2d3d_sem # Please follow README_prepare_data_s2d3d.md 62 | └── area_[1|2|3|4|5a|5b|6] 63 | ├── rgb/*png 64 | └── semantic/*png 65 | 66 | 67 | ## Reproduction: training 68 | The configs for reproducing the experiments are all in `config/`. 69 | 70 | Just run: 71 | ``` 72 | python train.py --cfg {PATH_TO_CONFIG} 73 | ``` 74 | to train the same setting as experiments in our paper. 75 | Note that the results with same config but different runs could be different as the random seed is not fixed. 
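If you need repeatable runs, you can fix the seeds yourself. The sketch below is not part of this repo and `seed_everything` is an illustrative name; fully deterministic cuDNN kernels usually cost some training speed.
```python
import random
import numpy as np
import torch

def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade speed for determinism in cuDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
```
Dataloader worker processes would also need their own seeding for full determinism.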
76 | 77 | Some examples: 78 | ``` 79 | python train.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml 80 | python train.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml 81 | python train.py --cfg config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml 82 | python train.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml 83 | ``` 84 | 85 | ## Reproduction: measuring FPS 86 | Just run: 87 | ``` 88 | python count_params_flops.py --cfg {PATH_TO_CONFIG} 89 | ``` 90 | It measures the model's averaged feed-forward time. 91 | The results reported in our paper are obtained on a GeForce RTX 2080. 92 | 93 | ## Reproduction: quantitative evaluation 94 | Please make sure the dataset and the trained weights are organized as instructed above. 95 | If not, update the config accordingly, or pass the path to the trained weight directly to the testing script via `--pth`. 96 | 97 | 98 |
99 | 100 | ### `Matterport3D` x `depth` (BiFuse's stitching and setting) 101 | Assume the pretrained weights are located at: 102 | - `ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1_hardnet/ep60.pth` 103 | - `ckpt/mp3d_depth_HOHO_depth_dct_efficienthc_TransEn1/ep60.pth` 104 | 105 | Run: 106 | ``` 107 | python test_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml 108 | python test_depth.py --cfg config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml 109 | ``` 110 | 111 | Results: 112 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 | 113 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ | 114 | | HOHO_depth_dct_efficienthc_TransEn1 | 52 | 0.1488 | 0.2862 | 0.5138 | 0.0871 | 0.0505 | 0.8786 | 0.9519 | 0.9771 | 115 | | HOHO_depth_dct_efficienthc_TransEn1_hardnet | 67 | 0.1482 | 0.2761 | 0.4968 | 0.0857 | 0.0494 | 0.8830 | 0.9547 | 0.9797 | 116 | 117 | 118 |
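For reference, the columns follow the conventional depth-estimation metrics, where delta_k is the fraction of pixels with max(pred/gt, gt/pred) below 1.25^k. A sketch of these standard definitions is given below; the exact valid-pixel masking and clipping behind the numbers above are defined in `test_depth.py`.
```python
import numpy as np

def depth_metrics(pred, gt, eps=1e-8):
    # Conventional definitions; test_depth.py defines the exact valid-pixel mask and clipping.
    mask = gt > eps
    pred, gt = np.clip(pred[mask], eps, None), gt[mask]
    ratio = np.maximum(pred / gt, gt / pred)
    return {
        'mre': np.mean(np.abs(pred - gt) / gt),
        'mae': np.mean(np.abs(pred - gt)),
        'rmse': np.sqrt(np.mean((pred - gt) ** 2)),
        'rmse_log': np.sqrt(np.mean((np.log(pred) - np.log(gt)) ** 2)),
        'log10': np.mean(np.abs(np.log10(pred) - np.log10(gt))),
        'delta_1': np.mean(ratio < 1.25),
        'delta_2': np.mean(ratio < 1.25 ** 2),
        'delta_3': np.mean(ratio < 1.25 ** 3),
    }
```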
119 | 120 | ### `Matterport3D` x `depth` (our new stitching and setting) 121 | **TODO** 122 | 123 | 124 |
125 | 126 | ### `Matterport3D` x `layout` (LayoutNetv2's setting) 127 | Assume the pretrained weights are located at: 128 | - `ckpt/mp3d_layout_HOHO_layout_aug_efficienthc_Transen1_resnet34/ep300.pth` 129 | 130 | Run the following to predict layouts and store the results in txt files: 131 | ``` 132 | python test_layout.py --cfg config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml --img_glob "data/mp3d_layout/test/img/*" --output_dir output/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34/ 133 | ``` 134 | 135 | Run the following to evaluate the predictions: 136 | ``` 137 | python eval_layout.py --gt_glob "data/mp3d_layout/test/label_cor/*" --dt_glob "output/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34/*" 138 | ``` 139 | 140 | Results: 141 | | Exp | fps | 2DIoU | 3DIoU | RMSE | delta_1 | 142 | | :-- | :-- | :---- | :---- | :--- | :------ | 143 | | HOHO_layout_aug_efficienthc_Transen1_resnet34 | 111 | 82.32 | 79.88 | 0.22 | 0.95 | 144 | 145 | **[Note]** Our implementation of the depth-based evaluation (i.e., RMSE, delta_1) differs substantially from LayoutNetv2's, so the results from the two repos are not directly comparable. 146 | 147 | 148 |
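The predicted txt files follow the same corner format as the ground truth prepared in [README_prepare_data_mp3d_layout.md](README_prepare_data_mp3d_layout.md) and as `assets/pano_asmasuxybohhcj.layout.txt`: one `x y` pixel coordinate per line on the 1024x512 equirectangular image, listed left to right as (ceiling, floor) pairs that share the same column. A minimal loader sketch (not taken from `eval_layout.py`):
```python
import numpy as np

def load_layout_corners(path):
    # One 'x y' pixel coordinate per line; even rows are ceiling corners, odd rows floor corners.
    cor = np.loadtxt(path, dtype=np.float32).reshape(-1, 2)
    ceiling, floor = cor[0::2], cor[1::2]
    assert np.allclose(ceiling[:, 0], floor[:, 0]), 'each ceiling/floor pair shares a column'
    return ceiling, floor

ceiling, floor = load_layout_corners('assets/pano_asmasuxybohhcj.layout.txt')
```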
149 | 150 | ### `Stanford2d3d` x `depth` (BiFuse's setting) 151 | Assume the pretrained weights are located at: 152 | - `ckpt/s2d3d_depth_HOHO_depth_dct_efficienthc_TransEn1/ep60.pth` 153 | 154 | Run: 155 | ``` 156 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml 157 | ``` 158 | 159 | Results: 160 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 | 161 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ | 162 | | HOHO_depth_dct_efficienthc_TransEn1 | 52 | 0.1014 | 0.2027 | 0.3834 | 0.0668 | 0.0438 | 0.9054 | 0.9693 | 0.9886 | 163 | 164 | 165 |
166 | 167 | ### `Stanford2d3d` x `depth` (GeoReg360's setting) 168 | Assume the pretrained weights are located at: 169 | - `ckpt/s2d3d_depth_HOHO_depthS_dct_efficienthc_TransEn1/ep60.pth` 170 | - `ckpt/s2d3d_depth_HOHO_depthS_SGD_dct_efficienthc_TransEn1/ep60.pth` 171 | 172 | Run: 173 | ``` 174 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml --clip 100 175 | python test_depth.py --cfg config/s2d3d_depth/HOHO_depthS_dct_efficienthc_TransEn1.yaml --clip 100 176 | ``` 177 | 178 | **[Note]** Remember to add `--clip 100` to disable depth clipping for a fair comparison with GeoReg360's setting. 179 | 180 | Results: 181 | | Exp | fps | mre | mae | rmse | rmse_log | log10 | delta_1 | delta_2 | delta_3 | 182 | | :-- | :-- | :-- | :-- | :--- | :------- | :---- | :------ | :------ | :------ | 183 | | HOHO_depthS_SGD_dct_efficienthc_TransEn1 | 106 | 0.1114 | 0.2197 | 0.4083 | 0.0737 | 0.0502 | 0.8671 | 0.9694 | 0.9916 | 184 | | HOHO_depthS_dct_efficienthc_TransEn1 | 104 | 0.1040 | 0.2134 | 0.3940 | 0.0678 | 0.0475 | 0.8955 | 0.9749 | 0.9933 | 185 | 186 | 187 |
188 | 189 | ### `Stanford2d3d` x `semantic segmentation` 190 | Run: 191 | ``` 192 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml 193 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml 194 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml 195 | 196 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml 197 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml 198 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml 199 | 200 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml 201 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml 202 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml 203 | 204 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml 205 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml 206 | python test_sem.py --cfg config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml 207 | ``` 208 | 209 | Results: 210 | | Exp | fps | iou | acc | 211 | | :-- | :-- | :-- | :-- | 212 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple | 202 | 43.04 | 53.06 | 213 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple | 204 | 36.27 | 48.45 | 214 | | HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple | 202 | 43.14 | 54.81 | 215 | 216 | | Exp | fps | iou | acc | 217 | | :-- | :-- | :-- | :-- | 218 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple | 135 | 46.49 | 56.33 | 219 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple | 135 | 37.18 | 48.60 | 220 | | HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple | 135 | 46.09 | 56.81 | 221 | 222 | | Exp | fps | iou | acc | 223 | | :-- | :-- | :-- | :-- | 224 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb | 10 | 53.94 | 64.30 | 225 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb | 10 | 45.03 | 61.70 | 226 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb | 10 | 56.87 | 68.94 | 227 | 228 | | Exp | fps | iou | acc | 229 | | :-- | :-- | :-- | :-- | 230 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101 | 10 | 59.05 | 68.91 | 231 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101 | 10 | 49.70 | 65.86 | 232 | | HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101 | 10 | 60.28 | 71.85 | 233 | -------------------------------------------------------------------------------- /assets/label13_weight.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/label13_weight.pth -------------------------------------------------------------------------------- /assets/pano_asmasuxybohhcj.depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/pano_asmasuxybohhcj.depth.png -------------------------------------------------------------------------------- /assets/pano_asmasuxybohhcj.layout.txt: 
-------------------------------------------------------------------------------- 1 | 83.7 161.1 2 | 83.7 332.7 3 | 126.6 133.3 4 | 126.6 358.2 5 | 181.2 170.3 6 | 181.2 324.7 7 | 354.4 176.4 8 | 354.4 319.4 9 | 609.0 149.1 10 | 609.0 343.6 11 | 941.1 160.6 12 | 941.1 333.1 13 | -------------------------------------------------------------------------------- /assets/pano_asmasuxybohhcj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/pano_asmasuxybohhcj.png -------------------------------------------------------------------------------- /assets/repo_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/repo_teaser.jpg -------------------------------------------------------------------------------- /assets/snapshot_depth.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/snapshot_depth.jpg -------------------------------------------------------------------------------- /assets/snapshot_layout.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/assets/snapshot_layout.jpg -------------------------------------------------------------------------------- /config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/mp3d_scenes_test.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 60 23 | batch_size: 4 24 | save_every: 60 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 64 49 | loss: l1 50 | -------------------------------------------------------------------------------- /config/mp3d_depth/HOHO_depth_dct_efficienthc_TransEn1_hardnet.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/mp3d_scenes_test.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 60 23 | batch_size: 4 24 | save_every: 60 25 | 
optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: HarDNet 36 | kwargs: 37 | depth_wise: False 38 | arch: 68 39 | pretrained: True 40 | decode_config: 41 | module: EfficientHeightReduction 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 256 46 | num_layers: 1 47 | modalities_config: 48 | DepthEstimator: 49 | basis: dct 50 | n_components: 64 51 | loss: l1 52 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct128_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 128 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct256_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 256 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct32_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: 
(512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 32 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct512_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 512 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_LSTM.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: GlobalHeightStage 40 | refine_config: 41 | module: LSTM 42 | modalities_config: 43 | DepthEstimator: 44 | basis: dct 45 | n_components: 64 46 | loss: l1 47 | 48 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_Linear.yaml: 
-------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: GlobalHeightStage 40 | refine_config: 41 | module: Linear 42 | modalities_config: 43 | DepthEstimator: 44 | basis: dct 45 | n_components: 64 46 | loss: l1 47 | 48 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: GlobalHeightStage 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 64 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_LSTM.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: LSTM 42 | modalities_config: 43 | DepthEstimator: 44 | basis: dct 45 | n_components: 64 46 
| loss: l1 47 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_Linear.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: Linear 42 | modalities_config: 43 | DepthEstimator: 44 | basis: dct 45 | n_components: 64 46 | loss: l1 47 | 48 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 64 49 | loss: l1 50 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_dct_efficienthc_TransEn1_resnet34.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | 
kwargs: 37 | backbone: resnet34 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 64 49 | loss: l1 50 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_lin128_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: linear 48 | n_components: 128 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_lin256_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: linear 48 | n_components: 256 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_lin32_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | 
rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: linear 48 | n_components: 32 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_lin512_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: linear 48 | n_components: 512 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_depth/ablation/tuning___HOHO_depth_lin64_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: CorruptMP3dDepthDataset 7 | common_kwargs: 8 | root: data/mp3d_align 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/matterport3d/scenes_abla_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/matterport3d/scenes_abla_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 40 23 | batch_size: 4 24 | save_every: 40 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: linear 48 | n_components: 64 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/mp3d_layout/HOHO_layout_aug_efficienthc_Transen1_resnet34.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | 
name: PanoCorBonDataset 7 | train_kwargs: 8 | root_dir: data/mp3d_layout/train_no_occ 9 | flip: True 10 | rotate: True 11 | gamma: True 12 | stretch: True 13 | valid_kwargs: 14 | root_dir: data/mp3d_layout/valid_no_occ 15 | 16 | training: 17 | epoch: 300 18 | batch_size: 4 19 | save_every: 300 20 | optim_lr: 0.0001 21 | optim_poly_gamma: 0.9 22 | 23 | model: 24 | file: lib.model.hohonet 25 | modelclass: HoHoNet 26 | kwargs: 27 | emb_dim: 256 28 | backbone_config: 29 | module: Resnet 30 | kwargs: 31 | backbone: resnet34 32 | decode_config: 33 | module: EfficientHeightReduction 34 | refine_config: 35 | module: TransEn 36 | kwargs: 37 | position_encode: 256 38 | nhead: 8 39 | num_layers: 1 40 | dim_feedforward: 2048 41 | modalities_config: 42 | LayoutEstimator: 43 | cor_weight: 1. 44 | bon_weight: 1. 45 | last_bias: False 46 | last_ks: 1 47 | -------------------------------------------------------------------------------- /config/s2d3d_depth/HOHO_depthS_SGD_dct_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dDepthDataset 7 | common_kwargs: 8 | root: data/stanford2D3D 9 | hw: (256, 512) 10 | dmax: 100. 11 | train_kwargs: 12 | scene_txt: data/stanford2D3D/small_train.txt 13 | rand_rotate: True 14 | rand_flip: True 15 | rand_gamma: True 16 | valid_kwargs: 17 | scene_txt: data/stanford2D3D/small_test.txt 18 | rand_rotate: False 19 | rand_flip: False 20 | rand_gamma: False 21 | 22 | training: 23 | optim: SGD 24 | epoch: 60 25 | batch_size: 8 26 | save_every: 60 27 | optim_lr: 0.01 28 | weight_decay: 0.0005 29 | optim_poly_gamma: 0.9 30 | optim_betas: (0.9, 0.999) 31 | 32 | model: 33 | file: lib.model.hohonet 34 | modelclass: HoHoNet 35 | kwargs: 36 | emb_dim: 256 37 | backbone_config: 38 | module: Resnet 39 | kwargs: 40 | backbone: resnet50 41 | input_height: 256 42 | decode_config: 43 | module: EfficientHeightReduction 44 | refine_config: 45 | module: TransEn 46 | kwargs: 47 | position_encode: 128 48 | num_layers: 1 49 | modalities_config: 50 | DepthEstimator: 51 | basis: dct 52 | n_components: 64 53 | loss: l1 54 | output_height: 256 55 | 56 | -------------------------------------------------------------------------------- /config/s2d3d_depth/HOHO_depthS_dct_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dDepthDataset 7 | common_kwargs: 8 | root: data/stanford2D3D 9 | hw: (256, 512) 10 | dmax: 100. 
11 | train_kwargs: 12 | scene_txt: data/stanford2D3D/small_train.txt 13 | rand_rotate: True 14 | rand_flip: True 15 | rand_gamma: True 16 | valid_kwargs: 17 | scene_txt: data/stanford2D3D/small_test.txt 18 | rand_rotate: False 19 | rand_flip: False 20 | rand_gamma: False 21 | 22 | training: 23 | epoch: 60 24 | batch_size: 4 25 | save_every: 60 26 | optim_lr: 0.0001 27 | optim_poly_gamma: 0.9 28 | optim_betas: (0.9, 0.999) 29 | 30 | model: 31 | file: lib.model.hohonet 32 | modelclass: HoHoNet 33 | kwargs: 34 | emb_dim: 256 35 | backbone_config: 36 | module: Resnet 37 | kwargs: 38 | backbone: resnet50 39 | input_height: 256 40 | decode_config: 41 | module: EfficientHeightReduction 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 128 46 | num_layers: 1 47 | modalities_config: 48 | DepthEstimator: 49 | basis: dct 50 | n_components: 64 51 | loss: l1 52 | output_height: 256 53 | 54 | -------------------------------------------------------------------------------- /config/s2d3d_depth/HOHO_depth_dct_efficienthc_TransEn1.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dDepthDataset 7 | common_kwargs: 8 | root: data/stanford2D3D 9 | hw: (512, 1024) 10 | train_kwargs: 11 | scene_txt: data/stanford2D3D/fold1_train.txt 12 | rand_rotate: True 13 | rand_flip: True 14 | rand_gamma: True 15 | valid_kwargs: 16 | scene_txt: data/stanford2D3D/fold1_valid.txt 17 | rand_rotate: False 18 | rand_flip: False 19 | rand_gamma: False 20 | 21 | training: 22 | epoch: 60 23 | batch_size: 4 24 | save_every: 60 25 | optim_lr: 0.0001 26 | optim_poly_gamma: 0.9 27 | optim_betas: (0.9, 0.999) 28 | 29 | model: 30 | file: lib.model.hohonet 31 | modelclass: HoHoNet 32 | kwargs: 33 | emb_dim: 256 34 | backbone_config: 35 | module: Resnet 36 | kwargs: 37 | backbone: resnet50 38 | decode_config: 39 | module: EfficientHeightReduction 40 | refine_config: 41 | module: TransEn 42 | kwargs: 43 | position_encode: 256 44 | num_layers: 1 45 | modalities_config: 46 | DepthEstimator: 47 | basis: dct 48 | n_components: 64 49 | loss: l1 50 | 51 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | train_kwargs: 11 | fold: 1_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 1_valid 16 | 17 | training: 18 | epoch: 60 19 | batch_size: 4 20 | save_every: 60 21 | optim_lr: 0.0001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | input_extra: 1 35 | backbone: resnet101 36 | input_height: 1024 37 | decode_config: 38 | module: EfficientHeightReduction 39 | refine_config: 40 | module: TransEn 41 | kwargs: 42 | position_encode: 512 43 | num_layers: 1 44 | modalities_config: 45 | SemanticSegmenter: 46 | num_classes: 13 47 | label_weight: data/s2d3d_sem/label13_weight.pth 48 | basis: dct 49 | loss: ce 50 | n_components: 64 51 | output_height: 1024 52 | 53 | -------------------------------------------------------------------------------- 
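Note on how the YAML files above are consumed: `lib/config.py` (shown further below) merges a chosen YAML into a default yacs config, and the training/inference scripts then build the network reflectively from `model.file`, `model.modelclass`, and `model.kwargs`. The following is only a minimal sketch of that flow, not part of the repository; the `SimpleNamespace` stand-in for the parsed argparse arguments and the empty `opts` list are illustrative assumptions.

import importlib
from types import SimpleNamespace
from lib.config import config, update_config

# Merge one of the YAML files above into the default yacs config.
# The config path below is the fold-1 semantic-segmentation config shown above;
# the empty `opts` list means no command-line overrides (illustrative assumption).
args = SimpleNamespace(
    cfg='config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101.yaml',
    opts=[])
update_config(config, args)

# Build the model the same way train.py / infer_*.py do:
# `model.file` names the module, `model.modelclass` the class, `model.kwargs` its constructor arguments.
model_file = importlib.import_module(config.model.file)     # lib.model.hohonet
model_class = getattr(model_file, config.model.modelclass)  # HoHoNet
net = model_class(**config.model.kwargs)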
/config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold1_resnet101rgb.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | depth: False 11 | train_kwargs: 12 | fold: 1_train 13 | flip: True 14 | rotate: True 15 | valid_kwargs: 16 | fold: 1_valid 17 | 18 | training: 19 | epoch: 60 20 | batch_size: 4 21 | save_every: 60 22 | optim_lr: 0.0001 23 | optim_poly_gamma: 0.9 24 | optim_betas: (0.9, 0.999) 25 | 26 | model: 27 | file: lib.model.hohonet 28 | modelclass: HoHoNet 29 | kwargs: 30 | emb_dim: 256 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | backbone: resnet101 35 | input_height: 1024 36 | decode_config: 37 | module: EfficientHeightReduction 38 | refine_config: 39 | module: TransEn 40 | kwargs: 41 | position_encode: 512 42 | num_layers: 1 43 | modalities_config: 44 | SemanticSegmenter: 45 | num_classes: 13 46 | label_weight: data/s2d3d_sem/label13_weight.pth 47 | basis: dct 48 | loss: ce 49 | n_components: 64 50 | output_height: 1024 51 | 52 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | train_kwargs: 11 | fold: 2_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 2_valid 16 | 17 | training: 18 | epoch: 60 19 | batch_size: 4 20 | save_every: 60 21 | optim_lr: 0.0001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | input_extra: 1 35 | backbone: resnet101 36 | input_height: 1024 37 | decode_config: 38 | module: EfficientHeightReduction 39 | refine_config: 40 | module: TransEn 41 | kwargs: 42 | position_encode: 512 43 | num_layers: 1 44 | modalities_config: 45 | SemanticSegmenter: 46 | num_classes: 13 47 | label_weight: data/s2d3d_sem/label13_weight.pth 48 | basis: dct 49 | loss: ce 50 | n_components: 64 51 | output_height: 1024 52 | 53 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold2_resnet101rgb.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | depth: False 11 | train_kwargs: 12 | fold: 2_train 13 | flip: True 14 | rotate: True 15 | valid_kwargs: 16 | fold: 2_valid 17 | 18 | training: 19 | epoch: 60 20 | batch_size: 4 21 | save_every: 60 22 | optim_lr: 0.0001 23 | optim_poly_gamma: 0.9 24 | optim_betas: (0.9, 0.999) 25 | 26 | model: 27 | file: lib.model.hohonet 28 | modelclass: HoHoNet 29 | kwargs: 30 | emb_dim: 256 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | backbone: resnet101 35 | input_height: 1024 36 | decode_config: 37 | module: EfficientHeightReduction 38 | refine_config: 39 | module: TransEn 40 | kwargs: 41 | position_encode: 512 42 | num_layers: 1 43 | modalities_config: 44 | 
SemanticSegmenter: 45 | num_classes: 13 46 | label_weight: data/s2d3d_sem/label13_weight.pth 47 | basis: dct 48 | loss: ce 49 | n_components: 64 50 | output_height: 1024 51 | 52 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | train_kwargs: 11 | fold: 3_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 3_valid 16 | 17 | training: 18 | epoch: 60 19 | batch_size: 4 20 | save_every: 60 21 | optim_lr: 0.0001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | input_extra: 1 35 | backbone: resnet101 36 | input_height: 1024 37 | decode_config: 38 | module: EfficientHeightReduction 39 | refine_config: 40 | module: TransEn 41 | kwargs: 42 | position_encode: 512 43 | num_layers: 1 44 | modalities_config: 45 | SemanticSegmenter: 46 | num_classes: 13 47 | label_weight: data/s2d3d_sem/label13_weight.pth 48 | basis: dct 49 | loss: ce 50 | n_components: 64 51 | output_height: 1024 52 | 53 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h1024_fold3_resnet101rgb.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (1024, 2048) 10 | depth: False 11 | train_kwargs: 12 | fold: 3_train 13 | flip: True 14 | rotate: True 15 | valid_kwargs: 16 | fold: 3_valid 17 | 18 | training: 19 | epoch: 60 20 | batch_size: 4 21 | save_every: 60 22 | optim_lr: 0.0001 23 | optim_poly_gamma: 0.9 24 | optim_betas: (0.9, 0.999) 25 | 26 | model: 27 | file: lib.model.hohonet 28 | modelclass: HoHoNet 29 | kwargs: 30 | emb_dim: 256 31 | backbone_config: 32 | module: Resnet 33 | kwargs: 34 | backbone: resnet101 35 | input_height: 1024 36 | decode_config: 37 | module: EfficientHeightReduction 38 | refine_config: 39 | module: TransEn 40 | kwargs: 41 | position_encode: 512 42 | num_layers: 1 43 | modalities_config: 44 | SemanticSegmenter: 45 | num_classes: 13 46 | label_weight: data/s2d3d_sem/label13_weight.pth 47 | basis: dct 48 | loss: ce 49 | n_components: 64 50 | output_height: 1024 51 | 52 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold1_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (256, 512) 10 | train_kwargs: 11 | fold: 1_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 1_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | 
module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 256 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 128 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 256 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold2_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (256, 512) 10 | train_kwargs: 11 | fold: 2_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 2_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 256 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 128 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 256 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h256_fold3_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (256, 512) 10 | train_kwargs: 11 | fold: 3_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 3_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 256 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 128 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 256 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold1_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: 
S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (64, 128) 10 | train_kwargs: 11 | fold: 1_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 1_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 64 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 32 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 64 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold2_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (64, 128) 10 | train_kwargs: 11 | fold: 2_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 2_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 64 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 32 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 64 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /config/s2d3d_sem/HOHO_depth_dct_efficienthc_TransEn1_h64_fold3_simple.yaml: -------------------------------------------------------------------------------- 1 | ckpt_root: ckpt 2 | cuda: True 3 | num_workers: 8 4 | 5 | dataset: 6 | name: S2d3dSemDataset 7 | common_kwargs: 8 | root: data/s2d3d_sem/ 9 | hw: (64, 128) 10 | train_kwargs: 11 | fold: 3_train 12 | flip: True 13 | rotate: True 14 | valid_kwargs: 15 | fold: 3_valid 16 | 17 | training: 18 | epoch: 300 19 | batch_size: 16 20 | save_every: 300 21 | optim_lr: 0.001 22 | optim_poly_gamma: 0.9 23 | optim_betas: (0.9, 0.999) 24 | 25 | model: 26 | file: lib.model.hohonet 27 | modelclass: HoHoNet 28 | kwargs: 29 | emb_dim: 256 30 | input_norm: ugscnn 31 | backbone_config: 32 | module: SimpleEncoder 33 | kwargs: 34 | input_extra: 1 35 | input_height: 64 36 | block: conv3x3max 37 | expand: 2 38 | decode_config: 39 | module: EfficientHeightReduction 40 | kwargs: 41 | out_ch: 256 42 | refine_config: 43 | module: TransEn 44 | kwargs: 45 | position_encode: 32 46 | num_layers: 1 47 | modalities_config: 48 | SemanticSegmenter: 49 | num_classes: 
13 50 | label_weight: data/s2d3d_sem/label13_weight.pth 51 | basis: dct 52 | loss: ce 53 | n_components: 64 54 | output_height: 64 55 | dropout: 0.5 56 | 57 | -------------------------------------------------------------------------------- /count_params_flops.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import importlib 4 | from tqdm import tqdm, trange 5 | from collections import Counter 6 | 7 | import numpy as np 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | from thop import profile, clever_format 14 | 15 | from lib.config import config, update_config 16 | 17 | 18 | if __name__ == '__main__': 19 | 20 | # Parse args & config 21 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | parser.add_argument('--cfg', required=True) 23 | parser.add_argument('opts', 24 | help='Modify config options using the command-line', 25 | default=None, nargs=argparse.REMAINDER) 26 | args = parser.parse_args() 27 | update_config(config, args) 28 | 29 | # Init global variable 30 | device = 'cuda' if config.cuda else 'cpu' 31 | if config.cuda and config.cuda_benchmark: 32 | torch.backends.cudnn.benchmark = True 33 | 34 | # Init network 35 | model_file = importlib.import_module(config.model.file) 36 | model_class = getattr(model_file, config.model.modelclass) 37 | net = model_class(**config.model.kwargs).to(device) 38 | net.eval() 39 | 40 | # testing 41 | layers = net 42 | inputs = [torch.randn(1, 3, 512, 1024).to(device)] 43 | with torch.no_grad(): 44 | flops, params = profile(layers, inputs) 45 | print(f'input :', [v.shape for v in inputs]) 46 | print(f'flops : {flops/(10**9):.2f} G') 47 | print(f'params: {params/(10**6):.2f} M') 48 | 49 | import time 50 | fps = [] 51 | with torch.no_grad(): 52 | layers(inputs[0]) 53 | for _ in range(50): 54 | eps_time = time.time() 55 | layers(inputs[0]) 56 | torch.cuda.synchronize() 57 | eps_time = time.time() - eps_time 58 | fps.append(eps_time) 59 | print(f'fps : {1 / (sum(fps) / len(fps)):.2f}') 60 | 61 | -------------------------------------------------------------------------------- /eval_layout.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import argparse 5 | import numpy as np 6 | from tqdm import tqdm 7 | from shapely.geometry import Polygon 8 | 9 | from lib.dataset.dataset_layout import cor_2_1d 10 | from lib.misc import post_proc 11 | 12 | 13 | def prepare_gtdt_pairs(gt_glob, dt_glob): 14 | gt_paths = sorted(glob.glob(gt_glob)) 15 | dt_paths_json = dict([(os.path.split(v)[-1].split('.')[0], v) 16 | for v in glob.glob(dt_glob) if v.endswith('json')]) 17 | dt_paths_txt = dict([(os.path.split(v)[-1].split('.')[0], v) 18 | for v in glob.glob(dt_glob) if v.endswith('txt')]) 19 | 20 | gtdt_pairs = [] 21 | for gt_path in gt_paths: 22 | k = os.path.split(gt_path)[-1].split('.')[0] 23 | if k in dt_paths_json: 24 | gtdt_pairs.append((gt_path, dt_paths_json[k])) 25 | else: 26 | gtdt_pairs.append((gt_path, dt_paths_txt[k])) 27 | return gtdt_pairs 28 | 29 | 30 | def layout_2_depth(cor_id, h, w, return_mask=False): 31 | # Convert corners to per-column boundary first 32 | # Up -pi/2, Down pi/2 33 | vc, vf = cor_2_1d(cor_id, h, w) 34 | vc = vc[None, :] # [1, w] 35 | vf = vf[None, :] # [1, w] 36 | assert (vc > 0).sum() == 0 37 | assert (vf < 0).sum() == 0 38 | 39 | # Per-pixel v coordinate (vertical angle) 40 | vs = 
((np.arange(h) + 0.5) / h - 0.5) * np.pi 41 | vs = np.repeat(vs[:, None], w, axis=1) # [h, w] 42 | 43 | # Floor-plane to depth 44 | floor_h = 1.6 45 | floor_d = np.abs(floor_h / np.sin(vs)) 46 | 47 | # Wall-to-camera distance on the horizontal plane through the camera center 48 | cs = floor_h / np.tan(vf) 49 | 50 | # Ceiling-plane to depth 51 | ceil_h = np.abs(cs * np.tan(vc)) # [1, w] 52 | ceil_d = np.abs(ceil_h / np.sin(vs)) # [h, w] 53 | 54 | # Wall to depth 55 | wall_d = np.abs(cs / np.cos(vs)) # [h, w] 56 | 57 | # Recover layout depth 58 | floor_mask = (vs > vf) 59 | ceil_mask = (vs < vc) 60 | wall_mask = (~floor_mask) & (~ceil_mask) 61 | depth = np.zeros([h, w], np.float32) # [h, w] 62 | depth[floor_mask] = floor_d[floor_mask] 63 | depth[ceil_mask] = ceil_d[ceil_mask] 64 | depth[wall_mask] = wall_d[wall_mask] 65 | 66 | assert (depth == 0).sum() == 0 67 | if return_mask: 68 | return depth, floor_mask, ceil_mask, wall_mask 69 | return depth 70 | 71 | 72 | def test_general(dt_cor_id, gt_cor_id, w, h, losses): 73 | dt_floor_coor = dt_cor_id[1::2] 74 | dt_ceil_coor = dt_cor_id[0::2] 75 | gt_floor_coor = gt_cor_id[1::2] 76 | gt_ceil_coor = gt_cor_id[0::2] 77 | assert (dt_floor_coor[:, 0] != dt_ceil_coor[:, 0]).sum() == 0 78 | assert (gt_floor_coor[:, 0] != gt_ceil_coor[:, 0]).sum() == 0 79 | 80 | # Eval 3D IoU and height error (in meters) 81 | N = len(dt_floor_coor) 82 | ch = -1.6 83 | dt_floor_xy = post_proc.np_coor2xy(dt_floor_coor, ch, 1024, 512, floorW=1, floorH=1) 84 | gt_floor_xy = post_proc.np_coor2xy(gt_floor_coor, ch, 1024, 512, floorW=1, floorH=1) 85 | dt_poly = Polygon(dt_floor_xy) 86 | gt_poly = Polygon(gt_floor_xy) 87 | if not gt_poly.is_valid: 88 | print('Skip invalid ground truth polygon') 89 | return 90 | 91 | # 2D IoU 92 | try: 93 | area_dt = dt_poly.area 94 | area_gt = gt_poly.area 95 | area_inter = dt_poly.intersection(gt_poly).area 96 | iou2d = area_inter / (area_gt + area_dt - area_inter) 97 | except: 98 | iou2d = 0 99 | 100 | # 3D IoU 101 | try: 102 | cch_dt = post_proc.get_z1(dt_floor_coor[:, 1], dt_ceil_coor[:, 1], ch, 512) 103 | cch_gt = post_proc.get_z1(gt_floor_coor[:, 1], gt_ceil_coor[:, 1], ch, 512) 104 | h_dt = abs(cch_dt.mean() - ch) 105 | h_gt = abs(cch_gt.mean() - ch) 106 | area3d_inter = area_inter * min(h_dt, h_gt) 107 | area3d_pred = area_dt * h_dt 108 | area3d_gt = area_gt * h_gt 109 | iou3d = area3d_inter / (area3d_pred + area3d_gt - area3d_inter) 110 | except: 111 | iou3d = 0 112 | 113 | # rmse & delta_1 114 | gt_layout_depth = layout_2_depth(gt_cor_id, h, w) 115 | try: 116 | dt_layout_depth = layout_2_depth(dt_cor_id, h, w) 117 | except: 118 | dt_layout_depth = np.zeros_like(gt_layout_depth) 119 | rmse = ((gt_layout_depth - dt_layout_depth)**2).mean() ** 0.5 120 | thres = np.maximum(gt_layout_depth/dt_layout_depth, dt_layout_depth/gt_layout_depth) 121 | delta_1 = (thres < 1.25).mean() 122 | 123 | # Add a result 124 | n_corners = len(gt_floor_coor) 125 | if n_corners % 2 == 1: 126 | n_corners = 'odd' 127 | elif n_corners < 10: 128 | n_corners = str(n_corners) 129 | else: 130 | n_corners = '10+' 131 | losses[n_corners]['2DIoU'].append(iou2d) 132 | losses[n_corners]['3DIoU'].append(iou3d) 133 | losses[n_corners]['rmse'].append(rmse) 134 | losses[n_corners]['delta_1'].append(delta_1) 135 | losses['overall']['2DIoU'].append(iou2d) 136 | losses['overall']['3DIoU'].append(iou3d) 137 | losses['overall']['rmse'].append(rmse) 138 | losses['overall']['delta_1'].append(delta_1) 139 | 140 | 141 | if __name__ == '__main__': 142 | 143 | parser =
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 144 | parser.add_argument('--dt_glob', 145 | help='NOTE: Remember to quote your glob path. ' 146 | 'Files assumed to be json from inference.py') 147 | parser.add_argument('--gt_glob', 148 | help='NOTE: Remember to quote your glob path. ' 149 | 'Files assumed to be txt') 150 | parser.add_argument('--w', default=1024, type=int, 151 | help='GT images width') 152 | parser.add_argument('--h', default=512, type=int, 153 | help='GT images height') 154 | args = parser.parse_args() 155 | 156 | # Prepare (gt, dt) pairs 157 | gtdt_pairs = prepare_gtdt_pairs(args.gt_glob, args.dt_glob) 158 | 159 | # Testing 160 | losses = dict([ 161 | (n_corner, {'2DIoU': [], '3DIoU': [], 'rmse': [], 'delta_1': []}) 162 | for n_corner in ['4', '6', '8', '10+', 'odd', 'overall'] 163 | ]) 164 | for gt_path, dt_path in tqdm(gtdt_pairs, desc='Testing'): 165 | # Parse ground truth 166 | with open(gt_path) as f: 167 | gt_cor_id = np.array([l.split() for l in f], np.float32) 168 | 169 | # Parse inferred result 170 | if dt_path.endswith('json'): 171 | with open(dt_path) as f: 172 | dt = json.load(f) 173 | dt_cor_id = np.array(dt['uv'], np.float32) 174 | dt_cor_id[:, 0] *= args.w 175 | dt_cor_id[:, 1] *= args.h 176 | else: 177 | dt_cor_id = np.loadtxt(dt_path, np.float32) 178 | 179 | test_general(dt_cor_id, gt_cor_id, args.w, args.h, losses) 180 | 181 | for k, result in losses.items(): 182 | iou2d = np.array(result['2DIoU']) 183 | iou3d = np.array(result['3DIoU']) 184 | rmse = np.array(result['rmse']) 185 | delta_1 = np.array(result['delta_1']) 186 | if len(iou2d) == 0: 187 | continue 188 | print('GT #Corners: %s (%d instances)' % (k, len(iou2d))) 189 | print(' 2DIoU : %.2f' % (iou2d.mean() * 100)) 190 | print(' 3DIoU : %.2f' % (iou3d.mean() * 100)) 191 | print(' RMSE : %.2f' % (rmse.mean())) 192 | print(' delta^1: %.2f' % (delta_1.mean())) 193 | -------------------------------------------------------------------------------- /infer_depth.py: -------------------------------------------------------------------------------- 1 | import os, sys, time, glob 2 | import argparse 3 | import importlib 4 | from tqdm import tqdm 5 | from imageio import imread, imwrite 6 | import torch 7 | import numpy as np 8 | 9 | from lib.config import config, update_config 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | # Parse args & config 15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument('--cfg', required=True) 17 | parser.add_argument('--pth', required=True) 18 | parser.add_argument('--out', required=True) 19 | parser.add_argument('--inp', required=True) 20 | parser.add_argument('opts', 21 | help='Modify config options using the command-line', 22 | default=None, nargs=argparse.REMAINDER) 23 | args = parser.parse_args() 24 | update_config(config, args) 25 | device = 'cuda' if config.cuda else 'cpu' 26 | 27 | # Parse input paths 28 | rgb_lst = glob.glob(args.inp) 29 | if len(rgb_lst) == 0: 30 | print('No images found') 31 | import sys; sys.exit() 32 | 33 | # Init model 34 | model_file = importlib.import_module(config.model.file) 35 | model_class = getattr(model_file, config.model.modelclass) 36 | net = model_class(**config.model.kwargs) 37 | net.load_state_dict(torch.load(args.pth, map_location=device)) 38 | net = net.eval().to(device) 39 | 40 | # Run inference 41 | with torch.no_grad(): 42 | for path in tqdm(rgb_lst): 43 | rgb = imread(path) 44 | x = torch.from_numpy(rgb).permute(2,0,1)[None].float()
/ 255. 45 | if x.shape[2:] != config.dataset.common_kwargs.hw: 46 | x = torch.nn.functional.interpolate(x, config.dataset.common_kwargs.hw, mode='area') 47 | x = x.to(device) 48 | pred_depth = net.infer(x) 49 | if not torch.is_tensor(pred_depth): 50 | pred_depth = pred_depth.pop('depth') 51 | 52 | fname = os.path.splitext(os.path.split(path)[1])[0] 53 | imwrite( 54 | os.path.join(args.out, f'{fname}.depth.png'), 55 | pred_depth.mul(1000).squeeze().cpu().numpy().astype(np.uint16) 56 | ) 57 | 58 | -------------------------------------------------------------------------------- /infer_layout.py: -------------------------------------------------------------------------------- 1 | import os, sys, time, glob 2 | import argparse 3 | import importlib 4 | from tqdm import tqdm 5 | from imageio import imread, imwrite 6 | import torch 7 | import numpy as np 8 | 9 | from lib.config import config, update_config 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | # Parse args & config 15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument('--cfg', required=True) 17 | parser.add_argument('--pth', required=True) 18 | parser.add_argument('--out', required=True) 19 | parser.add_argument('--inp', required=True) 20 | parser.add_argument('opts', 21 | help='Modify config options using the command-line', 22 | default=None, nargs=argparse.REMAINDER) 23 | args = parser.parse_args() 24 | update_config(config, args) 25 | device = 'cuda' if config.cuda else 'cpu' 26 | 27 | # Parse input paths 28 | rgb_lst = glob.glob(args.inp) 29 | if len(rgb_lst) == 0: 30 | print('No images found') 31 | import sys; sys.exit() 32 | 33 | # Init model 34 | model_file = importlib.import_module(config.model.file) 35 | model_class = getattr(model_file, config.model.modelclass) 36 | net = model_class(**config.model.kwargs) 37 | net.load_state_dict(torch.load(args.pth, map_location=device)) 38 | net = net.eval().to(device) 39 | 40 | # Run inference 41 | with torch.no_grad(): 42 | for path in tqdm(rgb_lst): 43 | rgb = imread(path) 44 | x = torch.from_numpy(rgb).permute(2,0,1)[None].float() / 255. 
45 | x = x.to(device) 46 | cor_id = net.infer(x)['cor_id'] 47 | 48 | fname = os.path.splitext(os.path.split(path)[1])[0] 49 | with open(os.path.join(args.out, f'{fname}.layout.txt'), 'w') as f: 50 | for u, v in cor_id: 51 | f.write(f'{u:.1f} {v:.1f}\n') 52 | 53 | -------------------------------------------------------------------------------- /lib/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from yacs.config import CfgNode as CN 3 | 4 | config = CN() 5 | 6 | config.ckpt_root = 'ckpt' 7 | config.cuda = True 8 | config.cuda_benchmark = True 9 | config.num_workers = 8 10 | 11 | config.dataset = CN() 12 | config.dataset.name = 'PanoCorBonDataset' 13 | config.dataset.common_kwargs = CN(new_allowed=True) 14 | config.dataset.train_kwargs = CN(new_allowed=True) 15 | config.dataset.valid_kwargs = CN(new_allowed=True) 16 | 17 | config.training = CN() 18 | config.training.epoch = 300 19 | config.training.batch_size = 4 20 | config.training.save_every = 100 21 | config.training.optim = 'Adam' 22 | config.training.optim_lr = 0.0001 23 | config.training.optim_betas = (0.9, 0.999) 24 | config.training.weight_decay = 0.0 25 | config.training.wd_group_mode = 'bn and bias' 26 | config.training.optim_milestons = [0.5, 0.9] 27 | config.training.optim_gamma = 0.2 28 | config.training.optim_poly_gamma = -1.0 29 | config.training.fix_encoder_bn = False 30 | 31 | config.model = CN() 32 | config.model.file = 'lib.model.HorizonNet' 33 | config.model.modelclass = 'HorizonNet' 34 | config.model.kwargs = CN(new_allowed=True) 35 | 36 | 37 | def update_config(cfg, args): 38 | cfg.defrost() 39 | 40 | cfg.merge_from_file(args.cfg) 41 | cfg.merge_from_list(args.opts) 42 | 43 | cfg.freeze() 44 | 45 | def infer_exp_id(cfg_path): 46 | cfg_path = cfg_path.split('config/')[-1] 47 | if cfg_path.endswith('.yaml'): 48 | cfg_path = cfg_path[:-len('.yaml')] 49 | return '_'.join(cfg_path.split('/')) 50 | 51 | -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_layout import PanoCorBonDataset 2 | from .dataset_s2d3d_sem import S2d3dSemDataset 3 | from .dataset_depth import CorruptMP3dDepthDataset, MP3dDepthDataset, S2d3dDepthDataset 4 | 5 | -------------------------------------------------------------------------------- /lib/dataset/dataset_depth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | 5 | from imageio import imread 6 | from scipy.spatial.transform import Rotation 7 | from lib.misc.pano_lsd_align import rotatePanorama 8 | 9 | import torch 10 | import torch.utils.data as data 11 | 12 | 13 | class BaseDataset(data.Dataset): 14 | def __init__(self, dmin=0.01, dmax=10, hw=(512, 1024), 15 | rand_rotate=False, rand_flip=False, rand_gamma=False, 16 | rand_pitch=0, rand_roll=0, 17 | fix_pitch=0, fix_roll=0): 18 | self.fname = [] 19 | self.rgb_paths, self.d_paths = [], [] 20 | self.dmin = dmin 21 | self.dmax = dmax 22 | self.hw = hw 23 | self.rand_rotate = rand_rotate 24 | self.rand_flip = rand_flip 25 | self.rand_gamma = rand_gamma 26 | self.rand_pitch = rand_pitch 27 | self.rand_roll = rand_roll 28 | self.fix_pitch = fix_pitch 29 | self.fix_roll = fix_roll 30 | 31 | def __len__(self): 32 | return len(self.rgb_paths) 33 | 34 | def read_rgb(self, path): 35 | return imread(path) 36 | 37 | def read_depth(self, path): 38 | raise 
NotImplementedError 39 | 40 | def __getitem__(self, idx): 41 | # Read data 42 | fname = self.fname[idx] 43 | color = self.read_rgb(self.rgb_paths[idx]) 44 | depth = self.read_depth(self.d_paths[idx]) 45 | 46 | # To tensor and reshape to [C, H, W] 47 | color = torch.from_numpy(color).permute(2,0,1).float() / 255 48 | depth = torch.from_numpy(depth)[None].float() 49 | depth = torch.clamp(depth, max=self.dmax) 50 | 51 | # Resize 52 | if color.shape[1:] != self.hw: 53 | color = torch.nn.functional.interpolate(color[None], self.hw, mode='area')[0] 54 | if depth.shape[1:] != self.hw: 55 | depth = torch.nn.functional.interpolate(depth[None], self.hw, mode='nearest')[0] 56 | 57 | # Data augmentation 58 | if self.rand_rotate: 59 | shift = np.random.randint(self.hw[1]) 60 | color = torch.roll(color, shift, dims=-1) 61 | depth = torch.roll(depth, shift, dims=-1) 62 | 63 | if self.rand_flip and np.random.randint(2): 64 | color = torch.flip(color, dims=[-1]) 65 | depth = torch.flip(depth, dims=[-1]) 66 | 67 | if self.rand_gamma: 68 | p = np.random.uniform(1, 1.2) 69 | if np.random.randint(2) == 0: 70 | p = 1 / p 71 | color = color ** p 72 | 73 | # Rotation augmentation 74 | if self.rand_pitch > 0 or self.rand_roll > 0 or self.fix_pitch != 0 or self.fix_roll > 0: 75 | color = color.permute(1,2,0).numpy() 76 | depth = depth.permute(1,2,0).numpy() 77 | if self.fix_pitch: 78 | rot = self.fix_pitch 79 | vp = Rotation.from_rotvec([rot * np.pi / 180, 0, 0]).as_matrix() 80 | color = rotatePanorama(color, vp, order=0) 81 | elif self.rand_pitch > 0: 82 | rot = np.random.randint(0, self.rand_pitch) 83 | vp = Rotation.from_rotvec([rot * np.pi / 180, 0, 0]).as_matrix() 84 | color = rotatePanorama(color, vp, order=0) 85 | depth = rotatePanorama(depth, vp, order=0) 86 | if self.fix_roll: 87 | rot = self.fix_roll 88 | vp = Rotation.from_rotvec([0, rot * np.pi / 180, 0]).as_matrix() 89 | color = rotatePanorama(color, vp, order=0) 90 | elif self.rand_roll > 0: 91 | rot = np.random.randint(0, self.rand_roll) 92 | vp = Rotation.from_rotvec([0, rot * np.pi / 180, 0]).as_matrix() 93 | color = rotatePanorama(color, vp, order=0) 94 | depth = rotatePanorama(depth, vp, order=0) 95 | color = torch.from_numpy(color).permute(2,0,1).float() 96 | depth = torch.from_numpy(depth).permute(2,0,1).float() 97 | 98 | return {'x': color, 'depth': depth, 'fname': fname.ljust(200)} 99 | 100 | 101 | class CorruptMP3dDepthDataset(BaseDataset): 102 | def __init__(self, root, scene_txt, **kwargs): 103 | super(CorruptMP3dDepthDataset, self).__init__(**kwargs) 104 | 105 | # List all rgbd paths 106 | with open(scene_txt) as f: 107 | scene_split_ids = set(f.read().split()) 108 | for scene in os.listdir(root): 109 | scene_root = os.path.join(root, scene) 110 | if not os.path.isdir(scene_root) or scene not in scene_split_ids: 111 | continue 112 | for cam in os.listdir(scene_root): 113 | cam_root = os.path.join(scene_root, cam) 114 | if not os.path.isdir(cam_root): 115 | continue 116 | self.rgb_paths.append(os.path.join(cam_root, 'color.jpg')) 117 | self.d_paths.append(os.path.join(cam_root, 'depth.npy')) 118 | assert len(self.rgb_paths) == len(self.d_paths) 119 | for path in self.rgb_paths: 120 | self.fname.append('_'.join(path.split('/'))) 121 | 122 | def read_depth(self, path): 123 | depth = np.load(path) 124 | depth[depth == 0.01] = 0 125 | return depth 126 | 127 | 128 | class MP3dDepthDataset(BaseDataset): 129 | def __init__(self, root, scene_txt, **kwargs): 130 | super(MP3dDepthDataset, self).__init__(**kwargs) 131 | 132 | # List all rgbd paths 
133 | with open(scene_txt) as f: 134 | scene_split_ids = set(f.read().split()) 135 | for scene in os.listdir(root): 136 | scene_root = os.path.join(root, scene) 137 | if not os.path.isdir(scene_root) or scene not in scene_split_ids: 138 | continue 139 | self.rgb_paths.extend(sorted(glob.glob(os.path.join(scene_root, '*rgb.png')))) 140 | self.d_paths.extend(sorted(glob.glob(os.path.join(scene_root, '*depth.exr')))) 141 | assert len(self.rgb_paths) == len(self.d_paths) 142 | for path in self.rgb_paths: 143 | self.fname.append('_'.join(path.split('/'))) 144 | 145 | def read_depth(self, path): 146 | import Imath 147 | import OpenEXR 148 | f = OpenEXR.InputFile(path) 149 | dw = f.header()['dataWindow'] 150 | size = (dw.max.x - dw.min.x + 1, dw.max.y - dw.min.y + 1) 151 | depth = np.frombuffer(f.channel('Y', Imath.PixelType(Imath.PixelType.FLOAT)), np.float32) 152 | depth = depth.reshape(size[1], size[0]) 153 | f.close() 154 | return depth.astype(np.float32) 155 | 156 | 157 | class S2d3dDepthDataset(BaseDataset): 158 | def __init__(self, root, scene_txt, **kwargs): 159 | super(S2d3dDepthDataset, self).__init__(**kwargs) 160 | 161 | # List all rgbd paths 162 | with open(scene_txt) as f: 163 | path_pair = [l.strip().split() for l in f] 164 | for rgb_path, dep_path in path_pair: 165 | self.rgb_paths.append(os.path.join(root, rgb_path)) 166 | self.d_paths.append(os.path.join(root, dep_path)) 167 | self.fname.append(os.path.split(rgb_path)[1]) 168 | 169 | def read_depth(self, path): 170 | depth = imread(path) 171 | return np.where(depth==65535, 0, depth/512) 172 | 173 | -------------------------------------------------------------------------------- /lib/dataset/dataset_s2d3d_sem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | from imageio import imread 5 | from shapely.geometry import LineString 6 | 7 | import torch 8 | import torch.utils.data as data 9 | import torch.nn.functional as F 10 | 11 | from lib.misc import panostretch 12 | 13 | __FOLD__ = { 14 | '1_train': ['area_1', 'area_2', 'area_3', 'area_4', 'area_6'], 15 | '1_valid': ['area_5a', 'area_5b'], 16 | '2_train': ['area_1', 'area_3', 'area_5a', 'area_5b', 'area_6'], 17 | '2_valid': ['area_2', 'area_4'], 18 | '3_train': ['area_2', 'area_4', 'area_5a', 'area_5b'], 19 | '3_valid': ['area_1', 'area_3', 'area_6'], 20 | } 21 | 22 | class S2d3dSemDataset(data.Dataset): 23 | NUM_CLASSES = 13 24 | ID2CLASS = ['beam', 'board', 'bookcase', 'ceiling', 'chair', 'clutter', 'column', 'door', 'floor', 'sofa', 'table', 'wall', 'window'] 25 | def __init__(self, root, fold, depth=True, hw=(512, 1024), mask_black=True, flip=False, rotate=False): 26 | assert fold in __FOLD__, 'Unknown fold' 27 | self.depth = depth 28 | self.hw = hw 29 | self.mask_black = mask_black 30 | self.rgb_paths = [] 31 | self.sem_paths = [] 32 | self.dep_paths = [] 33 | for dname in __FOLD__[fold]: 34 | self.rgb_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'rgb', '*png')))) 35 | self.sem_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'semantic', '*png')))) 36 | self.dep_paths.extend(sorted(glob.glob(os.path.join(root, dname, 'depth', '*png')))) 37 | assert len(self.rgb_paths) 38 | assert len(self.rgb_paths) == len(self.sem_paths) 39 | assert len(self.rgb_paths) == len(self.dep_paths) 40 | self.flip = flip 41 | self.rotate = rotate 42 | 43 | def __len__(self): 44 | return len(self.rgb_paths) 45 | 46 | def __getitem__(self, idx): 47 | rgb = 
torch.FloatTensor(imread(self.rgb_paths[idx]) / 255.).permute(2,0,1) 48 | sem = torch.LongTensor(imread(self.sem_paths[idx])) - 1 49 | if self.depth: 50 | dep = imread(self.dep_paths[idx]) 51 | dep = np.where(dep==65535, 0, dep/512) 52 | dep = np.clip(dep, 0, 4) 53 | dep = torch.FloatTensor(dep[None]) 54 | rgb = torch.cat([rgb, dep], 0) 55 | H, W = rgb.shape[1:] 56 | if (H, W) != self.hw: 57 | rgb = F.interpolate(rgb[None], size=self.hw, mode='bilinear', align_corners=False)[0] 58 | sem = F.interpolate(sem[None,None].float(), size=self.hw, mode='nearest')[0,0].long() 59 | 60 | # Random flip 61 | if self.flip and np.random.randint(2) == 0: 62 | rgb = torch.flip(rgb, (-1,)) 63 | sem = torch.flip(sem, (-1,)) 64 | 65 | # Random horizontal rotate 66 | if self.rotate: 67 | dx = np.random.randint(W) 68 | rgb = torch.roll(rgb, dx, dims=-1) 69 | sem = torch.roll(sem, dx, dims=-1) 70 | 71 | # Mask out top-down black 72 | if self.mask_black: 73 | sem[rgb.sum(0) == 0] = -1 74 | 75 | # Convert all data to tensor 76 | out_dict = { 77 | 'x': rgb, 78 | 'sem': sem, 79 | 'fname': os.path.split(self.rgb_paths[idx])[1].ljust(200), 80 | } 81 | return out_dict 82 | 83 | 84 | if __name__ == '__main__': 85 | 86 | import argparse 87 | from tqdm import tqdm 88 | 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument('--root_dir', default='data/valid/') 91 | parser.add_argument('--ith', default=0, type=int, 92 | help='Pick a data id to visualize.' 93 | '-1 for visualize all data') 94 | parser.add_argument('--flip', action='store_true', 95 | help='whether to random flip') 96 | parser.add_argument('--rotate', action='store_true', 97 | help='whether to random horizon rotation') 98 | parser.add_argument('--gamma', action='store_true', 99 | help='whether to random luminance change') 100 | parser.add_argument('--stretch', action='store_true', 101 | help='whether to random pano stretch') 102 | parser.add_argument('--dist_clip', default=20) 103 | parser.add_argument('--out_dir', default='data/vis_dataset') 104 | args = parser.parse_args() 105 | 106 | os.makedirs(args.out_dir, exist_ok=True) 107 | 108 | print('args:') 109 | for key, val in vars(args).items(): 110 | print(' {:16} {}'.format(key, val)) 111 | 112 | dataset = PanoCorBonDataset( 113 | root_dir=args.root_dir, 114 | flip=args.flip, rotate=args.rotate, gamma=args.gamma, stretch=args.stretch) 115 | 116 | # Showing some information about dataset 117 | print('len(dataset): {}'.format(len(dataset))) 118 | batch = dataset[args.ith] 119 | for k, v in batch.items(): 120 | if torch.is_tensor(v): 121 | print(k, v.shape) 122 | else: 123 | print(k, v) 124 | print('=' * 20) 125 | 126 | if args.ith >= 0: 127 | to_visualize = [dataset[args.ith]] 128 | else: 129 | to_visualize = dataset 130 | 131 | import matplotlib.pyplot as plt 132 | cmap = plt.get_cmap('bwr') 133 | for batch in tqdm(to_visualize): 134 | fname = os.path.split(batch['img_path'])[-1] 135 | img = batch['x'].permute(1,2,0).numpy() 136 | y_bon = batch['bon'].numpy() 137 | y_bon = ((y_bon / np.pi + 0.5) * img.shape[0]).round().astype(int) 138 | img[y_bon[0], np.arange(len(y_bon[0])), 1] = 1 139 | img[y_bon[1], np.arange(len(y_bon[1])), 1] = 1 140 | img = (img * 255).astype(np.uint8) 141 | img_pad = np.full((3, 1024, 3), 255, np.uint8) 142 | img_vot = batch['vot'].repeat(30, 1).numpy() 143 | img_vot = (img_vot / args.dist_clip + 1) / 2 144 | vot_mask = (img_vot >= 0) & (img_vot <= 1) 145 | img_vot = (cmap(img_vot)[...,:3] * 255).astype(np.uint8) 146 | img_vot[~vot_mask] = 0 147 | out = 
np.concatenate([img_vot, img_pad, img], 0) 148 | Image.fromarray(out).save(os.path.join(args.out_dir, fname)) 149 | 150 | -------------------------------------------------------------------------------- /lib/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunset1995/HoHoNet/2bbc0866789cf7ad728064bc52aaf1d11b67c885/lib/misc/__init__.py -------------------------------------------------------------------------------- /lib/misc/gen_txt_structured3d.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Help generate txt for train.py 3 | Please contact https://github.com/bertjiazheng/Structured3D for dataset. 4 | ''' 5 | 6 | import os 7 | import glob 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | parser.add_argument('--root', required=True, 12 | help='path to the dataset directory') 13 | parser.add_argument('--train_txt', required=True, 14 | help='path to save txt for train') 15 | parser.add_argument('--valid_txt', required=True, 16 | help='path to save txt for valid') 17 | parser.add_argument('--test_txt', required=True, 18 | help='path to save txt for test') 19 | args = parser.parse_args() 20 | 21 | train_scene = ['scene_%05d' % i for i in range(0, 3000)] 22 | valid_scene = ['scene_%05d' % i for i in range(3000, 3250)] 23 | test_scene = ['scene_%05d' % i for i in range(3250, 3500)] 24 | 25 | # Simple check: all directories exist 26 | for path in train_scene + valid_scene + test_scene: 27 | assert os.path.isdir(os.path.join(args.root, path)), '%s not found' % path 28 | 29 | def gen_pairs(scene_id_lst): 30 | pairs = [] 31 | for scene_id in scene_id_lst: 32 | for fname in os.listdir(os.path.join(args.root, scene_id, 'rgb')): 33 | room_id = os.path.split(fname)[1].split('_')[0] 34 | 35 | img_k = os.path.join(os.path.join(scene_id, 'rgb', fname)) 36 | layout_k = os.path.join(os.path.join(scene_id, 'layout', room_id + '_layout.txt')) 37 | assert os.path.isfile(os.path.join(args.root, img_k)) 38 | assert os.path.isfile(os.path.join(args.root, layout_k)) 39 | pairs.append((img_k, layout_k)) 40 | return pairs 41 | 42 | with open(args.train_txt, 'w') as f: 43 | pairs = gen_pairs(train_scene) 44 | f.write('\n'.join([' '.join(p) for p in pairs])) 45 | 46 | with open(args.valid_txt, 'w') as f: 47 | pairs = gen_pairs(valid_scene) 48 | f.write('\n'.join([' '.join(p) for p in pairs])) 49 | 50 | with open(args.test_txt, 'w') as f: 51 | pairs = gen_pairs(test_scene) 52 | f.write('\n'.join([' '.join(p) for p in pairs])) 53 | -------------------------------------------------------------------------------- /lib/misc/panostretch.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import numpy as np 3 | from scipy.ndimage import map_coordinates 4 | 5 | 6 | def uv_meshgrid(w, h): 7 | uv = np.stack(np.meshgrid(range(w), range(h)), axis=-1) 8 | uv = uv.astype(np.float64) 9 | uv[..., 0] = ((uv[..., 0] + 0.5) / w - 0.5) * 2 * np.pi 10 | uv[..., 1] = ((uv[..., 1] + 0.5) / h - 0.5) * np.pi 11 | return uv 12 | 13 | 14 | @functools.lru_cache() 15 | def _uv_tri(w, h): 16 | uv = uv_meshgrid(w, h) 17 | sin_u = np.sin(uv[..., 0]) 18 | cos_u = np.cos(uv[..., 0]) 19 | tan_v = np.tan(uv[..., 1]) 20 | return sin_u, cos_u, tan_v 21 | 22 | 23 | def uv_tri(w, h): 24 | sin_u, cos_u, tan_v = _uv_tri(w, h) 25 | return sin_u.copy(), cos_u.copy(), tan_v.copy() 26 | 27 | 28 | def 
coorx2u(x, w=1024): 29 | return ((x + 0.5) / w - 0.5) * 2 * np.pi 30 | 31 | 32 | def coory2v(y, h=512): 33 | return ((y + 0.5) / h - 0.5) * np.pi 34 | 35 | 36 | def u2coorx(u, w=1024): 37 | return (u / (2 * np.pi) + 0.5) * w - 0.5 38 | 39 | 40 | def v2coory(v, h=512): 41 | return (v / np.pi + 0.5) * h - 0.5 42 | 43 | 44 | def uv2xy(u, v, z=-50): 45 | c = z / np.tan(v) 46 | x = c * np.cos(u) 47 | y = c * np.sin(u) 48 | return x, y 49 | 50 | 51 | def pano_connect_points(p1, p2, z=-50, w=1024, h=512): 52 | if p1[0] == p2[0]: 53 | return np.array([p1, p2], np.float32) 54 | 55 | u1 = coorx2u(p1[0], w) 56 | v1 = coory2v(p1[1], h) 57 | u2 = coorx2u(p2[0], w) 58 | v2 = coory2v(p2[1], h) 59 | 60 | x1, y1 = uv2xy(u1, v1, z) 61 | x2, y2 = uv2xy(u2, v2, z) 62 | 63 | if abs(p1[0] - p2[0]) < w / 2: 64 | pstart = np.ceil(min(p1[0], p2[0])) 65 | pend = np.floor(max(p1[0], p2[0])) 66 | else: 67 | pstart = np.ceil(max(p1[0], p2[0])) 68 | pend = np.floor(min(p1[0], p2[0]) + w) 69 | coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) 70 | vx = x2 - x1 71 | vy = y2 - y1 72 | us = coorx2u(coorxs, w) 73 | ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) 74 | cs = np.sqrt((x1 + ps * vx) ** 2 + (y1 + ps * vy) ** 2) 75 | vs = np.arctan2(z, cs) 76 | coorys = v2coory(vs, h) 77 | 78 | return np.stack([coorxs, coorys], axis=-1) 79 | 80 | 81 | def pano_stretch(img, corners, kx, ky, order=1): 82 | ''' 83 | img: [H, W, C] 84 | corners: [N, 2] in image coordinate (x, y) format 85 | kx: Stretching along front-back direction 86 | ky: Stretching along left-right direction 87 | order: Interpolation order. 0 for nearest-neighbor. 1 for bilinear. 88 | ''' 89 | 90 | # Process image 91 | sin_u, cos_u, tan_v = uv_tri(img.shape[1], img.shape[0]) 92 | u0 = np.arctan2(sin_u * kx / ky, cos_u) 93 | v0 = np.arctan(tan_v * np.sin(u0) / sin_u * ky) 94 | 95 | refx = (u0 / (2 * np.pi) + 0.5) * img.shape[1] - 0.5 96 | refy = (v0 / np.pi + 0.5) * img.shape[0] - 0.5 97 | 98 | # [TODO]: using opencv remap could probably speedup the process a little 99 | stretched_img = np.stack([ 100 | map_coordinates(img[..., i], [refy, refx], order=order, mode='wrap') 101 | for i in range(img.shape[-1]) 102 | ], axis=-1) 103 | 104 | # Process corners 105 | corners_u0 = coorx2u(corners[:, 0], img.shape[1]) 106 | corners_v0 = coory2v(corners[:, 1], img.shape[0]) 107 | corners_u = np.arctan2(np.sin(corners_u0) * ky / kx, np.cos(corners_u0)) 108 | corners_v = np.arctan(np.tan(corners_v0) * np.sin(corners_u) / np.sin(corners_u0) / ky) 109 | cornersX = u2coorx(corners_u, img.shape[1]) 110 | cornersY = v2coory(corners_v, img.shape[0]) 111 | stretched_corners = np.stack([cornersX, cornersY], axis=-1) 112 | 113 | return stretched_img, stretched_corners 114 | 115 | 116 | def visualize_pano_stretch(stretched_img, stretched_cor, title): 117 | ''' 118 | Helper function for visualizing the effect of pano_stretch 119 | ''' 120 | thikness = 2 121 | color = (0, 255, 0) 122 | for i in range(4): 123 | xys = pano_connect_points(stretched_cor[i*2], stretched_cor[(i*2+2) % 8], z=-50) 124 | xys = xys.astype(int) 125 | blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0] 126 | if len(blue_split) == 0: 127 | cv2.polylines(stretched_img, [xys], False, color, 2) 128 | else: 129 | t = blue_split[0] + 1 130 | cv2.polylines(stretched_img, [xys[:t]], False, color, thikness) 131 | cv2.polylines(stretched_img, [xys[t:]], False, color, thikness) 132 | 133 | for i in range(4): 134 | xys = pano_connect_points(stretched_cor[i*2+1], stretched_cor[(i*2+3) % 8], z=50) 135 | xys = 
xys.astype(int) 136 | blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0] 137 | if len(blue_split) == 0: 138 | cv2.polylines(stretched_img, [xys], False, color, 2) 139 | else: 140 | t = blue_split[0] + 1 141 | cv2.polylines(stretched_img, [xys[:t]], False, color, thikness) 142 | cv2.polylines(stretched_img, [xys[t:]], False, color, thikness) 143 | 144 | cv2.putText(stretched_img, title, (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, 145 | (0, 0, 0), 2, cv2.LINE_AA) 146 | 147 | return stretched_img.astype(np.uint8) 148 | 149 | 150 | if __name__ == '__main__': 151 | 152 | import argparse 153 | import time 154 | from PIL import Image 155 | import cv2 156 | 157 | parser = argparse.ArgumentParser() 158 | parser.add_argument('--i', default='data/valid/img/pano_abpohapclcyuuz.png') 159 | parser.add_argument('--i_gt', default='data/valid/label_cor/pano_abpohapclcyuuz.txt') 160 | parser.add_argument('--o', default='sample_stretched_pano.png') 161 | parser.add_argument('--kx', default=2, type=float, 162 | help='Stretching along front-back direction') 163 | parser.add_argument('--ky', default=1, type=float, 164 | help='Stretching along left-right direction') 165 | args = parser.parse_args() 166 | 167 | img = np.array(Image.open(args.i), np.float64) 168 | with open(args.i_gt) as f: 169 | cor = np.array([line.strip().split() for line in f], np.int32) 170 | stretched_img, stretched_cor = pano_stretch(img, cor, args.kx, args.ky) 171 | 172 | title = 'kx=%3.2f, ky=%3.2f' % (args.kx, args.ky) 173 | visual_stretched_img = visualize_pano_stretch(stretched_img, stretched_cor, title) 174 | Image.fromarray(visual_stretched_img).save(args.o) 175 | -------------------------------------------------------------------------------- /lib/misc/structured3d_extract_zip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from zipfile import ZipFile 4 | from tqdm import tqdm 5 | import imageio 6 | 7 | ''' 8 | Zipfile format assumption: 9 | Structured3D 10 | -- [scene_xxxxx] 11 | -- other something 12 | -- 2D_rendering 13 | -- [image_id] 14 | -- panorama 15 | -- camera_xyz.txt 16 | -- layout.txt 17 | -- [empty|simple|full] 18 | -- depth.png 19 | -- rgb_rawlight.png 20 | -- rgb_coldlight.png 21 | -- rgb_warmlight.png 22 | -- other something 23 | 24 | Output format 25 | outdir 26 | -- [scene_xxxxx] 27 | -- img 28 | -- layout 29 | ''' 30 | 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--zippath', required=True) 33 | parser.add_argument('--style', default='full') 34 | parser.add_argument('--outdir', default='structured3d') 35 | args = parser.parse_args() 36 | 37 | path_format = 'Structured3D/%s/2D_rendering/%s/panorama/%s' 38 | 39 | with ZipFile(args.zippath) as zipf: 40 | id_set = set() 41 | for path in zipf.namelist(): 42 | assert path.startswith('Structured3D') 43 | if path.endswith('camera_xyz.txt'): 44 | path_lst = path.split('/') 45 | scene_id = path_lst[1] 46 | image_id = path_lst[3] 47 | id_set.add((scene_id, image_id)) 48 | 49 | for scene_id, image_id in tqdm(id_set): 50 | path_img = path_format % (scene_id, image_id, '%s/rgb_rawlight.png' % args.style) 51 | path_layout = path_format % (scene_id, image_id, 'layout.txt') 52 | 53 | os.makedirs(os.path.join(args.outdir, scene_id, 'rgb'), exist_ok=True) 54 | os.makedirs(os.path.join(args.outdir, scene_id, 'layout'), exist_ok=True) 55 | 56 | with zipf.open(path_img) as f: 57 | rgb = imageio.imread(f)[..., :3] 58 | imageio.imwrite(os.path.join(args.outdir, scene_id, 'rgb', image_id + 
'_rgb_rawlight.png'), rgb) 59 | with zipf.open(path_layout) as f: 60 | with open(os.path.join(args.outdir, scene_id, 'layout', image_id + '_layout.txt'), 'w') as fo: 61 | fo.write(f.read().decode()) 62 | -------------------------------------------------------------------------------- /lib/misc/structured3d_prepare_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from zipfile import ZipFile 4 | from tqdm import tqdm 5 | import imageio 6 | 7 | ''' 8 | Assume datas is extracted by `misc/structured3d_extract_zip.py`. 9 | That is to said, assuming following structure: 10 | - {in_root}/scene_xxxxx 11 | - rgb/ 12 | - *png 13 | - layout/ 14 | - *txt 15 | 16 | The reorganized structure as follow: 17 | - {out_train_root} 18 | - img/ 19 | - scene_xxxxx_*png (softlink) 20 | - label_cor/ 21 | - scene_xxxxx_*txt (softlink) 22 | - {out_valid_root} ... 23 | - {out_test_root} ... 24 | ''' 25 | TRAIN_SCENE = ['scene_%05d' % i for i in range(0, 3000)] 26 | VALID_SCENE = ['scene_%05d' % i for i in range(3000, 3250)] 27 | TEST_SCENE = ['scene_%05d' % i for i in range(3250, 3500)] 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--in_root', required=True) 31 | parser.add_argument('--out_train_root', default='data/st3d_train_full_raw_light') 32 | parser.add_argument('--out_valid_root', default='data/st3d_valid_full_raw_light') 33 | parser.add_argument('--out_test_root', default='data/st3d_test_full_raw_light') 34 | args = parser.parse_args() 35 | 36 | def prepare_dataset(scene_ids, out_dir): 37 | root_img = os.path.join(out_dir, 'img') 38 | root_cor = os.path.join(out_dir, 'label_cor') 39 | os.makedirs(root_img, exist_ok=True) 40 | os.makedirs(root_cor, exist_ok=True) 41 | for scene_id in tqdm(scene_ids): 42 | source_img_root = os.path.join(args.in_root, scene_id, 'rgb') 43 | source_cor_root = os.path.join(args.in_root, scene_id, 'layout') 44 | for fname in os.listdir(source_cor_root): 45 | room_id = fname.split('_')[0] 46 | source_img_path = os.path.join(args.in_root, scene_id, 'rgb', room_id + '_rgb_rawlight.png') 47 | source_cor_path = os.path.join(args.in_root, scene_id, 'layout', room_id + '_layout.txt') 48 | target_img_path = os.path.join(root_img, '%s_%s.png' % (scene_id, room_id)) 49 | target_cor_path = os.path.join(root_cor, '%s_%s.txt' % (scene_id, room_id)) 50 | assert os.path.isfile(source_img_path) 51 | assert os.path.isfile(source_cor_path) 52 | os.symlink(source_img_path, target_img_path) 53 | os.symlink(source_cor_path, target_cor_path) 54 | 55 | prepare_dataset(TRAIN_SCENE, args.out_train_root) 56 | prepare_dataset(VALID_SCENE, args.out_valid_root) 57 | prepare_dataset(TEST_SCENE, args.out_test_root) 58 | -------------------------------------------------------------------------------- /lib/misc/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | 5 | 6 | def group_weight(module): 7 | # Group module parameters into two group 8 | # One need weight_decay and the other doesn't 9 | group_decay = [] 10 | group_no_decay = [] 11 | for m in module.modules(): 12 | if isinstance(m, nn.Linear): 13 | group_decay.append(m.weight) 14 | if m.bias is not None: 15 | group_no_decay.append(m.bias) 16 | elif isinstance(m, nn.modules.conv._ConvNd): 17 | group_decay.append(m.weight) 18 | if m.bias is not None: 19 | group_no_decay.append(m.bias) 20 | elif isinstance(m, 
nn.modules.batchnorm._BatchNorm): 21 | if m.weight is not None: 22 | group_no_decay.append(m.weight) 23 | if m.bias is not None: 24 | group_no_decay.append(m.bias) 25 | elif isinstance(m, nn.GroupNorm): 26 | if m.weight is not None: 27 | group_no_decay.append(m.weight) 28 | if m.bias is not None: 29 | group_no_decay.append(m.bias) 30 | 31 | assert len(list(module.parameters())) == len(group_decay) + len(group_no_decay) 32 | return [dict(params=group_decay), dict(params=group_no_decay, weight_decay=.0)] 33 | 34 | 35 | def adjust_learning_rate(optimizer, args): 36 | if args.cur_iter < args.warmup_iters: 37 | frac = args.cur_iter / args.warmup_iters 38 | step = args.lr - args.warmup_lr 39 | args.running_lr = args.warmup_lr + step * frac 40 | else: 41 | frac = (float(args.cur_iter) - args.warmup_iters) / (args.max_iters - args.warmup_iters) 42 | scale_running_lr = max((1. - frac), 0.) ** args.lr_pow 43 | args.running_lr = args.lr * scale_running_lr 44 | 45 | for param_group in optimizer.param_groups: 46 | param_group['lr'] = args.running_lr 47 | 48 | 49 | def save_model(net, path, args): 50 | state_dict = OrderedDict({ 51 | 'args': args.__dict__, 52 | 'kwargs': { 53 | 'backbone': net.backbone, 54 | 'use_rnn': net.use_rnn, 55 | }, 56 | 'state_dict': net.state_dict(), 57 | }) 58 | torch.save(state_dict, path) 59 | 60 | 61 | def load_trained_model(Net, path): 62 | state_dict = torch.load(path, map_location='cpu') 63 | net = Net(**state_dict['kwargs']) 64 | net.load_state_dict(state_dict['state_dict']) 65 | return net 66 | -------------------------------------------------------------------------------- /lib/model/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import Resnet 2 | from .simple import SimpleEncoder 3 | from .hardnet import HarDNet 4 | -------------------------------------------------------------------------------- /lib/model/backbone/hardnet.py: -------------------------------------------------------------------------------- 1 | ''' Copy-paste from 2 | https://github.com/PingoLH/Pytorch-HarDNet 3 | ''' 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | class Flatten(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | def forward(self, x): 13 | return x.view(x.data.size(0),-1) 14 | 15 | 16 | 17 | class CombConvLayer(nn.Sequential): 18 | def __init__(self, in_channels, out_channels, kernel=1, stride=1, dropout=0.1, bias=False): 19 | super().__init__() 20 | self.add_module('layer1',ConvLayer(in_channels, out_channels, kernel)) 21 | self.add_module('layer2',DWConvLayer(out_channels, out_channels, stride=stride)) 22 | 23 | def forward(self, x): 24 | return super().forward(x) 25 | 26 | class DWConvLayer(nn.Sequential): 27 | def __init__(self, in_channels, out_channels, stride=1, bias=False): 28 | super().__init__() 29 | out_ch = out_channels 30 | 31 | groups = in_channels 32 | kernel = 3 33 | #print(kernel, 'x', kernel, 'x', out_channels, 'x', out_channels, 'DepthWise') 34 | 35 | self.add_module('dwconv', nn.Conv2d(groups, groups, kernel_size=3, 36 | stride=stride, padding=1, groups=groups, bias=bias)) 37 | self.add_module('norm', nn.BatchNorm2d(groups)) 38 | def forward(self, x): 39 | return super().forward(x) 40 | 41 | class ConvLayer(nn.Sequential): 42 | def __init__(self, in_channels, out_channels, kernel=3, stride=1, dropout=0.1, bias=False): 43 | super().__init__() 44 | out_ch = out_channels 45 | groups = 1 46 | #print(kernel, 'x', kernel, 'x', 
in_channels, 'x', out_channels) 47 | self.add_module('conv', nn.Conv2d(in_channels, out_ch, kernel_size=kernel, 48 | stride=stride, padding=kernel//2, groups=groups, bias=bias)) 49 | self.add_module('norm', nn.BatchNorm2d(out_ch)) 50 | self.add_module('relu', nn.ReLU6(True)) 51 | def forward(self, x): 52 | return super().forward(x) 53 | 54 | 55 | class HarDBlock(nn.Module): 56 | def get_link(self, layer, base_ch, growth_rate, grmul): 57 | if layer == 0: 58 | return base_ch, 0, [] 59 | out_channels = growth_rate 60 | link = [] 61 | for i in range(10): 62 | dv = 2 ** i 63 | if layer % dv == 0: 64 | k = layer - dv 65 | link.append(k) 66 | if i > 0: 67 | out_channels *= grmul 68 | out_channels = int(int(out_channels + 1) / 2) * 2 69 | in_channels = 0 70 | for i in link: 71 | ch,_,_ = self.get_link(i, base_ch, growth_rate, grmul) 72 | in_channels += ch 73 | return out_channels, in_channels, link 74 | 75 | def get_out_ch(self): 76 | return self.out_channels 77 | 78 | def __init__(self, in_channels, growth_rate, grmul, n_layers, keepBase=False, residual_out=False, dwconv=False): 79 | super().__init__() 80 | self.keepBase = keepBase 81 | self.links = [] 82 | layers_ = [] 83 | self.out_channels = 0 # if upsample else in_channels 84 | for i in range(n_layers): 85 | outch, inch, link = self.get_link(i+1, in_channels, growth_rate, grmul) 86 | self.links.append(link) 87 | use_relu = residual_out 88 | if dwconv: 89 | layers_.append(CombConvLayer(inch, outch)) 90 | else: 91 | layers_.append(ConvLayer(inch, outch)) 92 | 93 | if (i % 2 == 0) or (i == n_layers - 1): 94 | self.out_channels += outch 95 | #print("Blk out =",self.out_channels) 96 | self.layers = nn.ModuleList(layers_) 97 | 98 | def forward(self, x): 99 | layers_ = [x] 100 | 101 | for layer in range(len(self.layers)): 102 | link = self.links[layer] 103 | tin = [] 104 | for i in link: 105 | tin.append(layers_[i]) 106 | if len(tin) > 1: 107 | x = torch.cat(tin, 1) 108 | else: 109 | x = tin[0] 110 | out = self.layers[layer](x) 111 | layers_.append(out) 112 | 113 | t = len(layers_) 114 | out_ = [] 115 | for i in range(t): 116 | if (i == 0 and self.keepBase) or \ 117 | (i == t-1) or (i%2 == 1): 118 | out_.append(layers_[i]) 119 | out = torch.cat(out_, 1) 120 | return out 121 | 122 | 123 | 124 | 125 | class HarDNet(nn.Module): 126 | def __init__(self, depth_wise=False, arch=68, pretrained=True, weight_path='', input_height=512): 127 | super().__init__() 128 | first_ch = [32, 64] 129 | second_kernel = 3 130 | max_pool = True 131 | grmul = 1.7 132 | drop_rate = 0.1 133 | 134 | #HarDNet68 135 | ch_list = [ 128, 256, 320, 640, 1024] 136 | gr = [ 14, 16, 20, 40,160] 137 | n_layers = [ 8, 16, 16, 16, 4] 138 | downSamp = [ 1, 0, 1, 1, 0] 139 | 140 | if arch==85: 141 | #HarDNet85 142 | first_ch = [48, 96] 143 | ch_list = [ 192, 256, 320, 480, 720, 1280] 144 | gr = [ 24, 24, 28, 36, 48, 256] 145 | n_layers = [ 8, 16, 16, 16, 16, 4] 146 | downSamp = [ 1, 0, 1, 0, 1, 0] 147 | drop_rate = 0.2 148 | elif arch==39: 149 | #HarDNet39 150 | first_ch = [24, 48] 151 | ch_list = [ 96, 320, 640, 1024] 152 | grmul = 1.6 153 | gr = [ 16, 20, 64, 160] 154 | n_layers = [ 4, 16, 8, 4] 155 | downSamp = [ 1, 1, 1, 0] 156 | 157 | if depth_wise: 158 | second_kernel = 1 159 | max_pool = False 160 | drop_rate = 0.05 161 | 162 | blks = len(n_layers) 163 | self.base = nn.ModuleList([]) 164 | 165 | # First Layer: Standard Conv3x3, Stride=2 166 | self.base.append ( 167 | ConvLayer(in_channels=3, out_channels=first_ch[0], kernel=3, 168 | stride=2, bias=False) ) 169 | 170 | # Second Layer 
171 | self.base.append ( ConvLayer(first_ch[0], first_ch[1], kernel=second_kernel) ) 172 | 173 | # Maxpooling or DWConv3x3 downsampling 174 | if max_pool: 175 | self.base.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) 176 | else: 177 | self.base.append ( DWConvLayer(first_ch[1], first_ch[1], stride=2) ) 178 | 179 | # Build all HarDNet blocks 180 | ch = first_ch[1] 181 | for i in range(blks): 182 | blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) 183 | ch = blk.get_out_ch() 184 | self.base.append ( blk ) 185 | 186 | if i == blks-1 and arch == 85: 187 | self.base.append ( nn.Dropout(0.1)) 188 | 189 | self.base.append ( ConvLayer(ch, ch_list[i], kernel=1) ) 190 | ch = ch_list[i] 191 | if downSamp[i] == 1: 192 | if max_pool: 193 | self.base.append(nn.MaxPool2d(kernel_size=2, stride=2)) 194 | else: 195 | self.base.append ( DWConvLayer(ch, ch, stride=2) ) 196 | 197 | ch = ch_list[blks-1] 198 | self.base.append ( 199 | nn.Sequential( 200 | nn.AdaptiveAvgPool2d((1,1)), 201 | Flatten(), 202 | nn.Dropout(drop_rate), 203 | nn.Linear(ch, 1000) )) 204 | 205 | if pretrained: 206 | if hasattr(torch, 'hub'): 207 | 208 | if arch == 68 and not depth_wise: 209 | checkpoint = 'https://ping-chao.com/hardnet/hardnet68-5d684880.pth' 210 | elif arch == 85 and not depth_wise: 211 | checkpoint = 'https://ping-chao.com/hardnet/hardnet85-a28faa00.pth' 212 | elif arch == 68 and depth_wise: 213 | checkpoint = 'https://ping-chao.com/hardnet/hardnet68ds-632474d2.pth' 214 | else: 215 | checkpoint = 'https://ping-chao.com/hardnet/hardnet39ds-0e6c6fa9.pth' 216 | 217 | self.load_state_dict(torch.hub.load_state_dict_from_url(checkpoint, progress=False)) 218 | else: 219 | postfix = 'ds' if depth_wise else '' 220 | weight_file = '%shardnet%d%s.pth'%(weight_path, arch, postfix) 221 | if not os.path.isfile(weight_file): 222 | print(weight_file,'is not found') 223 | exit(0) 224 | weights = torch.load(weight_file) 225 | self.load_state_dict(weights) 226 | 227 | postfix = 'DS' if depth_wise else '' 228 | print('ImageNet pretrained weights for HarDNet%d%s is loaded'%(arch, postfix)) 229 | 230 | # Patch for HoHoNet 231 | self.base = self.base[:-1] 232 | if arch == 68: 233 | self.out_channels = [128, 320, 640, 1024] 234 | self.checkpoint = [4, 9, 12, 15] 235 | elif arch == 85: 236 | self.out_channels = [192, 320, 720, 1280] 237 | self.checkpoint = [4, 9, 14, 18] 238 | else: 239 | raise NotImplementedError 240 | self.feat_heights = [input_height//4//(2**i) for i in range(4)] 241 | 242 | def forward(self, x): 243 | x_lst = [] 244 | for i, layer in enumerate(self.base): 245 | x = layer(x) 246 | if i in self.checkpoint: 247 | x_lst.append(x) 248 | return x_lst 249 | 250 | -------------------------------------------------------------------------------- /lib/model/backbone/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | 6 | class Resnet(nn.Module): 7 | def __init__(self, backbone='resnet50', coco='', input_extra=0, input_height=512): 8 | super(Resnet, self).__init__() 9 | self.encoder = getattr(models, backbone)(pretrained=True) 10 | del self.encoder.fc, self.encoder.avgpool 11 | if coco: 12 | coco_pretrain = getattr(models.segmentation, coco)(pretrained=True).backbone 13 | self.encoder.load_state_dict(coco_pretrain.state_dict()) 14 | self.out_channels = [256, 512, 1024, 2048] 15 | self.feat_heights = [input_height//4//(2**i) for i in range(4)] 16 | 
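# Note: resnet18/34 are built from BasicBlock, whose four stages output
# 64/128/256/512 channels (a quarter of the Bottleneck widths listed above),
# which is why out_channels is divided by 4 when the backbone name suffix is < 50.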
if int(backbone[6:]) < 50: 17 | self.out_channels = [_//4 for _ in self.out_channels] 18 | 19 | # Patch for extra input channel 20 | if input_extra > 0: 21 | ori_conv1 = self.encoder.conv1 22 | new_conv1 = nn.Conv2d( 23 | 3+input_extra, ori_conv1.out_channels, 24 | kernel_size=ori_conv1.kernel_size, 25 | stride=ori_conv1.stride, 26 | padding=ori_conv1.padding, 27 | bias=ori_conv1.bias) 28 | with torch.no_grad(): 29 | for i in range(0, 3+input_extra, 3): 30 | n = new_conv1.weight[:, i:i+3].shape[1] 31 | new_conv1.weight[:, i:i+n] = ori_conv1.weight[:, :n] 32 | self.encoder.conv1 = new_conv1 33 | 34 | # Prepare for pre/pose down height filtering 35 | self.pre_down = None 36 | self.post_down = None 37 | 38 | def forward(self, x): 39 | features = [] 40 | x = self.encoder.conv1(x) 41 | x = self.encoder.bn1(x) 42 | x = self.encoder.relu(x) 43 | x = self.encoder.maxpool(x) 44 | 45 | if self.pre_down is not None: 46 | x = self.pre_down(x) 47 | x = self.encoder.layer1(x); 48 | if self.post_down is not None: 49 | x = self.post_down(x) 50 | features.append(x) # 1/4 51 | x = self.encoder.layer2(x); features.append(x) # 1/8 52 | x = self.encoder.layer3(x); features.append(x) # 1/16 53 | x = self.encoder.layer4(x); features.append(x) # 1/32 54 | return features 55 | -------------------------------------------------------------------------------- /lib/model/backbone/simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | 6 | class SimpleResBlock(nn.Module): 7 | def __init__(self, a, b, c, s): 8 | super(SimpleResBlock, self).__init__() 9 | self.layer = nn.Sequential( 10 | nn.Conv2d(a, b, 1, bias=False), 11 | nn.BatchNorm2d(b), 12 | nn.ReLU(inplace=True), 13 | nn.Conv2d(b, b, 3, padding=1, stride=s, bias=False), 14 | nn.BatchNorm2d(b), 15 | nn.ReLU(inplace=True), 16 | nn.Conv2d(b, c, 1, bias=False), 17 | nn.BatchNorm2d(c), 18 | ) 19 | self.skip = nn.Sequential( 20 | nn.Conv2d(a, c, 1, stride=s, bias=False), 21 | nn.BatchNorm2d(c), 22 | ) 23 | self.relu = nn.ReLU(inplace=True) 24 | nn.init.constant_(self.layer[-1].weight, 0) 25 | nn.init.constant_(self.layer[-1].bias, 0) 26 | 27 | def forward(self, x): 28 | return self.relu(self.layer(x) + self.skip(x)) 29 | 30 | class SimpleConv3x3Block(nn.Module): 31 | def __init__(self, a, b, c, s): 32 | super(SimpleConv3x3Block, self).__init__() 33 | self.layer = nn.Sequential( 34 | nn.Conv2d(a, c, 3, padding=1, stride=s, bias=False), 35 | nn.BatchNorm2d(c), 36 | nn.ReLU(inplace=True), 37 | nn.Conv2d(c, c, 3, padding=1, bias=False), 38 | nn.BatchNorm2d(c), 39 | nn.ReLU(inplace=True), 40 | ) 41 | 42 | def forward(self, x): 43 | return self.layer(x) 44 | 45 | def SimpleConv3x3MaxBlock(a, b, c, s): 46 | return nn.Sequential( 47 | nn.Conv2d(a, c, 3, padding=1, bias=False), 48 | nn.BatchNorm2d(c), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(c, c, 3, padding=1, bias=False), 51 | nn.BatchNorm2d(c), 52 | nn.ReLU(inplace=True), 53 | nn.MaxPool2d(s, stride=s), 54 | ) 55 | 56 | def SimpleConv3x3lBlock(a, b, c, s): 57 | return nn.Sequential( 58 | nn.Conv2d(a, c, 3, padding=1, bias=False), 59 | nn.BatchNorm2d(c), 60 | nn.ReLU(inplace=True), 61 | nn.Conv2d(c, c, 3, padding=1, stride=s, bias=False), 62 | nn.BatchNorm2d(c), 63 | nn.ReLU(inplace=True), 64 | ) 65 | 66 | 67 | class SimpleEncoder(nn.Module): 68 | def __init__(self, input_extra=0, input_height=512, block='res', expand=1): 69 | super(SimpleEncoder, self).__init__() 70 | 
self.conv_pre = nn.Sequential( 71 | nn.Conv2d(3+input_extra, 16*expand, kernel_size=3, padding=1, bias=False), 72 | nn.BatchNorm2d(16*expand), 73 | nn.ReLU(inplace=True), 74 | ) 75 | 76 | if block == 'res': 77 | Block = SimpleResBlock 78 | elif block == 'conv3x3': 79 | Block = SimpleConv3x3Block 80 | elif block == 'conv3x3l': 81 | Block = SimpleConv3x3lBlock 82 | elif block == 'conv3x3max': 83 | Block = SimpleConv3x3MaxBlock 84 | else: 85 | raise NotImplementedError 86 | self.block0 = Block(16*expand, 16*expand, 32*expand, 2) 87 | self.block1 = Block(32*expand, 32*expand, 64*expand, 2) 88 | self.block2 = Block(64*expand, 64*expand, 128*expand, 2) 89 | self.block3 = Block(128*expand, 128*expand, 256*expand, 2) 90 | self.block4 = Block(256*expand, 256*expand, 256*expand, 2) 91 | 92 | self.out_channels = [64*expand, 128*expand, 256*expand, 256*expand] 93 | self.feat_heights = [input_height//4//(2**i) for i in range(4)] 94 | 95 | def forward(self, x): 96 | features = [] 97 | x = self.conv_pre(x) 98 | x = self.block0(x) 99 | x = self.block1(x); features.append(x) # 1/4 100 | x = self.block2(x); features.append(x) # 1/8 101 | x = self.block3(x); features.append(x) # 1/16 102 | x = self.block4(x); features.append(x) # 1/32 103 | return features 104 | -------------------------------------------------------------------------------- /lib/model/hohonet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from . import backbone 8 | from . import horizon_compression 9 | from . import horizon_refinement 10 | from . import horizon_upsample 11 | from . import modality 12 | from .utils import wrap_lr_pad 13 | 14 | 15 | ''' 16 | HoHoNet 17 | ''' 18 | class HoHoNet(nn.Module): 19 | def __init__(self, emb_dim=256, input_hw=None, input_norm='imagenet', pretrain='', 20 | backbone_config={'module': 'Resnet'}, 21 | decode_config={'module': 'EfficientHeightReduction'}, 22 | refine_config={'module': 'TransEn'}, 23 | upsample_config={'module': 'Upsample1D'}, 24 | modalities_config={}): 25 | super(HoHoNet, self).__init__() 26 | self.input_hw = input_hw 27 | if input_norm == 'imagenet': 28 | self.register_buffer('x_mean', torch.FloatTensor(np.array([0.485, 0.456, 0.406])[None, :, None, None])) 29 | self.register_buffer('x_std', torch.FloatTensor(np.array([0.229, 0.224, 0.225])[None, :, None, None])) 30 | elif input_norm == 'ugscnn': 31 | self.register_buffer('x_mean', torch.FloatTensor(np.array([0.4974898, 0.47918808, 0.42809588, 1.0961773])[None, :, None, None])) 32 | self.register_buffer('x_std', torch.FloatTensor(np.array([0.23762763, 0.23354423, 0.23272438, 0.75536704])[None, :, None, None])) 33 | else: 34 | raise NotImplementedError 35 | 36 | # Encoder 37 | Encoder = getattr(backbone, backbone_config['module']) 38 | Encoder_kwargs = backbone_config.get('kwargs', {}) 39 | self.encoder = Encoder(**Encoder_kwargs) 40 | 41 | # Horizon compression convert backbone features to horizontal feature 42 | # I name the variable as decoder during development and forgot to fix :P 43 | Decoder = getattr(horizon_compression, decode_config['module']) 44 | Decoder_kwargs = decode_config.get('kwargs', {}) 45 | self.decoder = Decoder(self.encoder.out_channels, self.encoder.feat_heights, **Decoder_kwargs) 46 | 47 | # Horizontal feature refinement module 48 | Refinement = getattr(horizon_refinement, refine_config['module']) 49 | Refinement_kwargs = refine_config.get('kwargs', {}) 50 | 
self.horizon_refine = Refinement(self.decoder.out_channels, **Refinement_kwargs) 51 | 52 | # Channel reduction to the shared latent 53 | Upsampler = getattr(horizon_upsample, upsample_config['module']) 54 | Upsampler_kwargs = upsample_config.get('kwargs', {}) 55 | self.emb_shared_latent = Upsampler(self.horizon_refine.out_channels, emb_dim) 56 | 57 | # Instantiate desired modalities 58 | self.modalities = nn.ModuleList([ 59 | getattr(modality, key)(emb_dim, **config) 60 | for key, config in modalities_config.items() 61 | ]) 62 | 63 | # Patch for all conv1d/2d layer's left-right padding 64 | wrap_lr_pad(self) 65 | 66 | # Load pretrained 67 | if pretrain: 68 | print(f'Load pretrained {pretrain}') 69 | st = torch.load(pretrain) 70 | missing_key = self.state_dict().keys() - st.keys() 71 | unknown_key = st.keys() - self.state_dict().keys() 72 | print('Missing key:', missing_key) 73 | print('Unknown key:', unknown_key) 74 | self.load_state_dict(st, strict=False) 75 | 76 | def extract_feat(self, x): 77 | ''' Map the input RGB to the shared latent (by all modalities) ''' 78 | 79 | if self.input_hw: 80 | x = F.interpolate(x, size=self.input_hw, mode='bilinear', align_corners=False) 81 | x = (x - self.x_mean) / self.x_std 82 | # encoder 83 | conv_list = self.encoder(x) 84 | # decoder to get horizontal feature 85 | feat = self.decoder(conv_list) 86 | # refine feat 87 | feat = self.horizon_refine(feat) 88 | # embed the shared latent 89 | feat = self.emb_shared_latent(feat) 90 | return feat 91 | 92 | def call_modality(self, method, *feed_args, **feed_kwargs): 93 | ''' Calling the method implemented in each modality and merge the results ''' 94 | output_dict = {} 95 | for m in self.modalities: 96 | curr_dict = getattr(m, method)(*feed_args, **feed_kwargs) 97 | assert len(output_dict.keys() & curr_dict.keys()) == 0, 'Key collision for different modalities' 98 | output_dict.update(curr_dict) 99 | return output_dict 100 | 101 | def forward(self, x): 102 | feat = self.extract_feat(x) 103 | return self.call_modality('forward', feat) 104 | 105 | def infer(self, x): 106 | feat = self.extract_feat(x) 107 | return self.call_modality('infer', feat) 108 | 109 | def compute_losses(self, batch): 110 | feat = self.extract_feat(batch['x']) 111 | losses = self.call_modality('compute_losses', feat, batch=batch) 112 | losses['total'] = sum(v for k, v in losses.items() if k.startswith('total')) 113 | return losses 114 | 115 | -------------------------------------------------------------------------------- /lib/model/horizon_compression/__init__.py: -------------------------------------------------------------------------------- 1 | from .hc import GlobalHeightStage 2 | from .ehc import EfficientHeightReduction 3 | from .simple import SimpleReduction 4 | -------------------------------------------------------------------------------- /lib/model/horizon_compression/ehc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from ..utils import pano_upsample_w, PanoUpsampleW 6 | 7 | 8 | ''' 9 | EHC 10 | ''' 11 | class EfficientHeightReduction(nn.Module): 12 | def __init__(self, cs, heights, out_ch=1024, fuse_ks=1): 13 | ''' Process 4 blocks from encoder to single multiscale features ''' 14 | super(EfficientHeightReduction, self).__init__() 15 | c1, c2, c3, c4 = cs 16 | h1, h2, h3, h4 = heights 17 | 18 | def EfficientConvCompressH(in_c, out_c, scale, down_h): 19 | return nn.Sequential( 20 | nn.Conv2d(in_c, 
out_c, 3, padding=1, bias=False), 21 | nn.BatchNorm2d(out_c), 22 | nn.ReLU(inplace=True), 23 | PanoUpsampleW(scale), 24 | nn.Conv2d(out_c, out_c, 3, padding=1, bias=False), 25 | nn.BatchNorm2d(out_c), 26 | nn.ReLU(inplace=True), 27 | nn.Conv2d(out_c, out_c, (down_h, 1), groups=out_c, bias=False), 28 | ) 29 | 30 | self.ghc_lst = nn.ModuleList([ 31 | EfficientConvCompressH(c1, c1//4, scale=1, down_h=h1), 32 | EfficientConvCompressH(c2, c2//4, scale=2, down_h=h2), 33 | EfficientConvCompressH(c3, c3//4, scale=4, down_h=h3), 34 | EfficientConvCompressH(c4, c4//4, scale=8, down_h=h4), 35 | ]) 36 | self.fuse = nn.Sequential( 37 | nn.Conv2d((c1+c2+c3+c4)//4, out_ch, fuse_ks, padding=fuse_ks//2, bias=False), 38 | nn.BatchNorm2d(out_ch), 39 | nn.ReLU(inplace=True), 40 | ) 41 | self.out_channels = out_ch 42 | 43 | def forward(self, conv_list): 44 | assert len(conv_list) == 4 45 | feature = torch.cat([ 46 | f(x) for f, x in zip(self.ghc_lst, conv_list) 47 | ], dim=1) 48 | feature = self.fuse(feature).squeeze(2) 49 | return {'1D': feature, 'conv_list': conv_list} 50 | -------------------------------------------------------------------------------- /lib/model/horizon_compression/hc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from ..utils import pano_upsample_w, PanoUpsampleW 6 | 7 | 8 | ''' 9 | Original HC 10 | ''' 11 | class GlobalHeightConv(nn.Module): 12 | def __init__(self, in_c, out_c): 13 | super(GlobalHeightConv, self).__init__() 14 | 15 | def ConvCompressH(in_c, out_c, ks=3): 16 | return nn.Sequential( 17 | nn.Conv2d(in_c, out_c, kernel_size=ks, stride=(2, 1), padding=ks//2), 18 | nn.BatchNorm2d(out_c), 19 | nn.ReLU(inplace=True), 20 | ) 21 | 22 | self.layer = nn.Sequential( 23 | ConvCompressH(in_c, in_c//2), 24 | ConvCompressH(in_c//2, in_c//2), 25 | ConvCompressH(in_c//2, in_c//4), 26 | ConvCompressH(in_c//4, out_c), 27 | ) 28 | 29 | def forward(self, x, out_w): 30 | x = self.layer(x) 31 | assert out_w % x.shape[3] == 0 32 | return pano_upsample_w(x, out_w//x.shape[-1]) 33 | 34 | 35 | class GlobalHeightStage(nn.Module): 36 | def __init__(self, cs, heights, down_h=8): 37 | ''' Process 4 blocks from encoder to single multiscale features ''' 38 | super(GlobalHeightStage, self).__init__() 39 | c1, c2, c3, c4 = cs 40 | h1, h2, h3, h4 = heights 41 | self.ghc_lst = nn.ModuleList([ 42 | GlobalHeightConv(c1, c1//down_h), 43 | GlobalHeightConv(c2, c2//down_h), 44 | GlobalHeightConv(c3, c3//down_h), 45 | GlobalHeightConv(c4, c4//down_h), 46 | ]) 47 | self.out_channels = (c1*h1 + c2*h2 + c3*h3 + c4*h4) // 16 // down_h 48 | 49 | def forward(self, conv_list): 50 | assert len(conv_list) == 4 51 | bs, _, _, out_w = conv_list[0].shape 52 | feature = torch.cat([ 53 | f(x, out_w).reshape(bs, -1, out_w) 54 | for f, x in zip(self.ghc_lst, conv_list) 55 | ], dim=1) 56 | return {'1D': feature} 57 | -------------------------------------------------------------------------------- /lib/model/horizon_compression/simple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from ..utils import pano_upsample_w, PanoUpsampleW 6 | 7 | 8 | ''' 9 | Simple decoder (for s2d3d sem small input size) 10 | ''' 11 | class SimpleReduction(nn.Module): 12 | def __init__(self, cs, heights, out_ch=64): 13 | ''' Process 4 blocks from encoder to single multiscale features ''' 14 | 
super(SimpleReduction, self).__init__() 15 | c1, c2, c3, c4 = cs 16 | h1, h2, h3, h4 = heights 17 | 18 | def EfficientConvCompressH(in_c, out_c, scale, down_h): 19 | return nn.Sequential( 20 | PanoUpsampleW(scale), 21 | nn.Conv2d(in_c, out_c, (down_h, 1), bias=False), 22 | nn.BatchNorm2d(out_c), 23 | nn.ReLU(inplace=True), 24 | ) 25 | 26 | self.ghc_lst = nn.ModuleList([ 27 | EfficientConvCompressH(c1, c1//4, scale=1, down_h=h1), 28 | EfficientConvCompressH(c2, c2//4, scale=2, down_h=h2), 29 | EfficientConvCompressH(c3, c3//4, scale=4, down_h=h3), 30 | EfficientConvCompressH(c4, c4//4, scale=8, down_h=h4), 31 | ]) 32 | self.fuse = nn.Sequential( 33 | nn.Conv2d((c1+c2+c3+c4)//4, out_ch, (1, 9), padding=(0, 4), bias=False), 34 | nn.BatchNorm2d(out_ch), 35 | nn.ReLU(inplace=True), 36 | ) 37 | self.out_channels = out_ch 38 | 39 | def forward(self, conv_list): 40 | assert len(conv_list) == 4 41 | feature = torch.cat([ 42 | f(x) for f, x in zip(self.ghc_lst, conv_list) 43 | ], dim=1) 44 | feature = self.fuse(feature).squeeze(2) 45 | return {'1D': feature} 46 | -------------------------------------------------------------------------------- /lib/model/horizon_refinement/__init__.py: -------------------------------------------------------------------------------- 1 | from .identity import Identity 2 | from .linear import Linear 3 | from .rnn import LSTM, GRU 4 | from .attention import TransEn 5 | -------------------------------------------------------------------------------- /lib/model/horizon_refinement/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | import copy 6 | 7 | 8 | ''' Transformer encoder ''' 9 | class TransformerEncoder(nn.Module): 10 | ''' Adapt from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py ''' 11 | def __init__(self, encoder_layer, num_layers): 12 | super(TransformerEncoder, self).__init__() 13 | self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for i in range(num_layers)]) 14 | self.num_layers = num_layers 15 | 16 | def forward(self, x): 17 | for mod in self.layers: 18 | x = mod(x) 19 | return x 20 | 21 | class TransformerEncoderLayer(nn.Module): 22 | ''' Adapt from https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/transformer.py ''' 23 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, mode='pre'): 24 | super(TransformerEncoderLayer, self).__init__() 25 | self.mode = mode 26 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 27 | # Implementation of Feedforward model 28 | self.linear1 = nn.Linear(d_model, dim_feedforward) 29 | self.dropout = nn.Dropout(dropout) 30 | self.linear2 = nn.Linear(dim_feedforward, d_model) 31 | 32 | self.norm1 = nn.LayerNorm(d_model) 33 | self.norm2 = nn.LayerNorm(d_model) 34 | self.dropout1 = nn.Dropout(dropout) 35 | self.dropout2 = nn.Dropout(dropout) 36 | 37 | self.activation = nn.ReLU(inplace=True) 38 | 39 | def forward(self, x): 40 | if self.mode == 'post': 41 | x2 = self.self_attn(x, x, x)[0] 42 | x = x + self.dropout1(x2) 43 | x = self.norm1(x) 44 | x2 = self.linear2(self.dropout(self.activation(self.linear1(x)))) 45 | x = x + self.dropout2(x2) 46 | x = self.norm2(x) 47 | return x 48 | elif self.mode == 'pre': 49 | x2 = self.norm1(x) 50 | x2 = self.self_attn(x2, x2, x2)[0] 51 | x = x + self.dropout1(x2) 52 | x2 = self.norm2(x) 53 | x2 = self.linear2(self.dropout(self.activation(self.linear1(x2)))) 54 | x = x + 
self.dropout2(x2) 55 | return x 56 | raise NotImplementedError 57 | 58 | class TransEn(nn.Module): 59 | def __init__(self, c_mid, position_encode, nhead=8, num_layers=2, dim_feedforward=2048, mode='pre'): 60 | super(TransEn, self).__init__() 61 | if isinstance(c_mid, (tuple, list)): 62 | c_mid = c_mid[0] 63 | encoder_layer = TransformerEncoderLayer(c_mid, nhead, dim_feedforward, mode=mode) 64 | self.transen = TransformerEncoder(encoder_layer, num_layers) 65 | 66 | import math 67 | max_len, d_model = position_encode, c_mid 68 | pe = torch.zeros(max_len, d_model) 69 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 70 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 71 | pe[:, 0::2] = torch.sin(position * div_term) 72 | pe[:, 1::2] = torch.cos(position * div_term) 73 | self.register_buffer('pos', pe.T[None].contiguous()) 74 | 75 | self.out_channels = c_mid 76 | 77 | def forward(self, feat): 78 | feat1d = feat['1D'] 79 | feat1d = (feat1d + self.pos).permute(2,0,1) 80 | feat1d = self.transen(feat1d).permute(1,2,0) 81 | feat['1D'] = feat1d 82 | return feat 83 | -------------------------------------------------------------------------------- /lib/model/horizon_refinement/identity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Identity(nn.Module): 7 | def __init__(self, c_mid, *args, **kwargs): 8 | super(Identity, self).__init__() 9 | self.out_channels = c_mid 10 | 11 | def forward(self, x): 12 | return x 13 | -------------------------------------------------------------------------------- /lib/model/horizon_refinement/linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def conv1dbnrelu(in_channels, out_channels, **kwargs): 7 | return nn.Sequential( 8 | nn.Conv1d(in_channels, out_channels, **kwargs), 9 | nn.BatchNorm1d(out_channels), 10 | nn.ReLU(inplace=True), 11 | ) 12 | 13 | class Linear(nn.Module): 14 | def __init__(self, c_mid, base_ch=256): 15 | super(Linear, self).__init__() 16 | self.conv_1x1 = conv1dbnrelu(c_mid, base_ch*4, kernel_size=1, bias=False) 17 | self.out_channels = base_ch*4 18 | 19 | def forward(self, feat): 20 | feat = feat['1D'] 21 | feat = self.conv_1x1(feat) 22 | return {'1D': feat} 23 | -------------------------------------------------------------------------------- /lib/model/horizon_refinement/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | ''' RNN ''' 7 | class LSTM(nn.Module): 8 | def __init__(self, c_mid, base_ch=256, num_layers=2, bidirectional=True): 9 | super(LSTM, self).__init__() 10 | self.rnn = nn.LSTM( 11 | c_mid, hidden_size=base_ch, 12 | num_layers=num_layers, bidirectional=bidirectional) 13 | self.out_channels = base_ch * (1+int(bidirectional)) 14 | 15 | def forward(self, feat): 16 | feat = self.rnn(feat.permute(2,0,1))[0].permute(1,2,0).contiguous() 17 | return {'1D': feat} 18 | 19 | class GRU(nn.Module): 20 | def __init__(self, c_mid, base_ch=256, num_layers=2, bidirectional=True): 21 | super(GRU, self).__init__() 22 | self.rnn = nn.GRU( 23 | c_mid, hidden_size=base_ch, 24 | num_layers=num_layers, bidirectional=bidirectional) 25 | self.out_channels = base_ch * (1+int(bidirectional)) 26 | 27 | def 
forward(self, feat): 28 | feat = feat['1D'] 29 | feat = self.rnn(feat.permute(2,0,1))[0].permute(1,2,0).contiguous() 30 | return {'1D': feat} 31 | -------------------------------------------------------------------------------- /lib/model/horizon_upsample/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.nn import Identity 2 | from .upsample1d import Upsample1D 3 | -------------------------------------------------------------------------------- /lib/model/horizon_upsample/upsample1d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from ..utils import PanoUpsampleW 6 | 7 | 8 | class Upsample1D(nn.Sequential): 9 | def __init__(self, ic, oc): 10 | super(Upsample1D, self).__init__( 11 | PanoUpsampleW(4), 12 | nn.Conv1d(ic, oc, 3, padding=1, bias=False), 13 | nn.BatchNorm1d(oc), 14 | nn.ReLU(inplace=True), 15 | ) 16 | 17 | def forward(self, feat): 18 | feat1d = feat['1D'] 19 | for module in self: 20 | feat1d = module(feat1d) 21 | feat['1D'] = feat1d 22 | return feat 23 | -------------------------------------------------------------------------------- /lib/model/modality/__init__.py: -------------------------------------------------------------------------------- 1 | from .depth import DepthEstimator 2 | from .semantic import SemanticSegmenter 3 | from .layout import LayoutEstimator 4 | -------------------------------------------------------------------------------- /lib/model/modality/bases.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def dct(n_components, output_height): 6 | basis = (torch.arange(output_height)[None].float() + 0.5) / output_height * np.pi 7 | basis = torch.arange(0, n_components)[:,None].float() * basis 8 | basis = torch.cos(basis) 9 | return basis 10 | 11 | 12 | def linear(*args, **kwargs): 13 | return None 14 | -------------------------------------------------------------------------------- /lib/model/modality/depth.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from . 
import bases 8 | from ..utils import PanoUpsampleW 9 | 10 | 11 | ''' Dense (per-pixel) depth estimation ''' 12 | class DepthBase(nn.Module): 13 | def __init__(self): 14 | super(DepthBase, self).__init__() 15 | 16 | def infer(self, x_emb): 17 | depth = self(x_emb)['depth'] 18 | return {'depth': depth} 19 | 20 | def compute_losses(self, x_emb, batch): 21 | gt = batch['depth'] 22 | mask = (gt > 0) 23 | 24 | # Forward 25 | pred_dict = self(x_emb) 26 | pred = pred_dict['depth'] 27 | 28 | # Compute losses 29 | losses = {} 30 | l1 = (pred[mask] - gt[mask]).abs() 31 | l2 = (pred[mask] - gt[mask]).pow(2) 32 | losses['mae'] = l1.mean() 33 | losses['rmse'] = l2.mean().sqrt() 34 | losses['delta1'] = (torch.max(pred[mask]/gt[mask], gt[mask]/pred[mask]) < 1.25).float().mean() 35 | 36 | losses['total.depth'] = loss_for_backward(pred_dict['depth1d'], gt, mask, self.loss) 37 | if 'residual' in pred_dict: 38 | with torch.no_grad(): 39 | gt_residual = gt - pred_dict['depth1d'].detach() 40 | losses['total.residual'] = loss_for_backward(pred_dict['residual'], gt_residual, mask, 'l1') 41 | return losses 42 | 43 | 44 | def loss_for_backward(pred, gt, mask, loss): 45 | if loss == 'l1': 46 | return F.l1_loss(pred[mask], gt[mask]) 47 | elif loss == 'l2': 48 | return F.mse_loss(pred[mask], gt[mask]) 49 | elif loss == 'huber': 50 | return F.smooth_l1_loss(pred[mask], gt[mask]) 51 | elif loss == 'berhu': 52 | l1 = (pred[mask] - gt[mask]).abs().mean() 53 | l2 = (pred[mask] - gt[mask]).pow(2).mean() 54 | with torch.no_grad(): 55 | c = max(l1.detach().max() * 0.2, 0.01) 56 | l2c = (l2 + c**2) / (2 * c) 57 | return torch.where(l1<=c, l1, l2c).mean() 58 | else: 59 | raise NotImplementedError 60 | 61 | 62 | class DepthEstimator(DepthBase): 63 | def __init__(self, emb_dim, basis='dct', loss='l1', n_components=64, 64 | init_weight=0.1, init_bias=2.5, output_height=512, 65 | resisual=False, basis_tuning=False): 66 | super(DepthEstimator, self).__init__() 67 | self.loss = loss 68 | 69 | self.output_height = output_height 70 | basis = getattr(bases, basis)(n_components, output_height) 71 | if basis_tuning: 72 | self.basis = nn.Parameter(basis) 73 | else: 74 | self.register_buffer('basis', basis) 75 | 76 | self.estimator = nn.Sequential( 77 | nn.Conv1d(emb_dim, emb_dim, 1), 78 | nn.BatchNorm1d(emb_dim), 79 | nn.ReLU(inplace=True), 80 | nn.Conv1d(emb_dim, n_components, 1, bias=False), 81 | ) 82 | self.bias = nn.Parameter(torch.full([1], init_bias)) 83 | nn.init.normal_(self.estimator[-1].weight, std=init_weight/np.sqrt(emb_dim/2)) 84 | 85 | self.residual = None 86 | if resisual: 87 | self.residual = nn.Sequential( 88 | nn.Conv2d(256, 64, 3, padding=1, bias=False), 89 | nn.BatchNorm2d(64), 90 | nn.ReLU(inplace=True), 91 | nn.Conv2d(64, 1, 1, bias=False), 92 | PanoUpsampleW(4), 93 | nn.UpsamplingBilinear2d(scale_factor=(4,1)), 94 | ) 95 | 96 | def forward(self, x_emb): 97 | ws = self.estimator(x_emb['1D']) 98 | if self.basis is None: 99 | h, w = self.output_height, ws.shape[-1] 100 | depth = self.bias + F.interpolate(ws.unsqueeze(1), size=(h,w), mode='bilinear', align_corners=False) 101 | else: 102 | depth = self.bias + torch.einsum('bkw,kh->bhw', ws, self.basis).unsqueeze(1) 103 | ret_dict = {'depth': depth, 'depth1d': depth} 104 | if self.residual is not None: 105 | residual = 0.1 * self.residual(x_emb['conv_list'][0].detach()) 106 | ret_dict['residual'] = residual 107 | ret_dict['depth'] = depth + residual 108 | return ret_dict 109 | -------------------------------------------------------------------------------- 
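# Minimal sketch (not part of the repository) of the compressed-basis idea used by
# DepthEstimator above: a dense HxW depth map is reconstructed from only n_components
# DCT coefficients per image column. The random `ws` merely stands in for the output
# of the 1D estimator; the shapes and the 2.5 bias follow the defaults above.
import torch
from lib.model.modality import bases  # assumes the repo root is on PYTHONPATH and its deps are installed

n_components, H, W = 64, 512, 1024
basis = bases.dct(n_components, H)                   # [64, 512] cosine basis over image height
ws = 0.01 * torch.randn(1, n_components, W)          # [1, 64, 1024] per-column coefficients
depth = 2.5 + torch.einsum('bkw,kh->bhw', ws, basis).unsqueeze(1)  # [1, 1, 512, 1024]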
/lib/model/modality/layout.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from . import bases 8 | 9 | from lib.misc import panostretch, post_proc 10 | from ..utils import peaks_finding 11 | from scipy.ndimage.filters import maximum_filter 12 | from shapely.geometry import Polygon 13 | 14 | 15 | ''' Layout (per-column) estimation ''' 16 | class LayoutEstimator(nn.Module): 17 | def __init__(self, emb_dim, bon_weight=1., cor_weight=1., bon_loss='l1', cor_loss='bce', bon_scale=1., 18 | init_weight=0.1, dropout=0., oneconv=True, last_ks=1, last_bias=True, 19 | H=512, W=1024, post_force_cuboid=False): 20 | super(LayoutEstimator, self).__init__() 21 | self.bon_loss = bon_loss 22 | self.cor_loss = cor_loss 23 | self.bon_scale = bon_scale 24 | self.bon_weight = bon_weight 25 | self.cor_weight = cor_weight 26 | self.H = H 27 | self.W = W 28 | self.post_force_cuboid = post_force_cuboid 29 | 30 | if oneconv: 31 | self.pred_bon = nn.Conv1d(emb_dim, 2, last_ks, padding=last_ks//2, bias=last_bias) 32 | self.pred_cor = nn.Conv1d(emb_dim, 1, last_ks, padding=last_ks//2, bias=last_bias) 33 | if last_bias: 34 | nn.init.constant_(self.pred_bon.bias[0], -0.478) 35 | nn.init.constant_(self.pred_bon.bias[1], 0.425) 36 | nn.init.constant_(self.pred_cor.bias, -1.) 37 | else: 38 | self.pred_bon = nn.Sequential( 39 | nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False), 40 | nn.BatchNorm1d(emb_dim), 41 | nn.ReLU(inplace=True), 42 | nn.Conv1d(emb_dim, 2, 1), 43 | ) 44 | self.pred_cor = nn.Sequential( 45 | nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False), 46 | nn.BatchNorm1d(emb_dim), 47 | nn.ReLU(inplace=True), 48 | nn.Conv1d(emb_dim, 1, 1), 49 | ) 50 | nn.init.constant_(self.pred_bon[-1].bias[0], -0.478) 51 | nn.init.constant_(self.pred_bon[-1].bias[1], 0.425) 52 | nn.init.constant_(self.pred_cor[-1].bias, -1.) 
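# Note: pred_bon regresses, for every image column, the ceiling and floor boundary
# angles (2 channels, in radians multiplied by bon_scale), while pred_cor outputs a
# per-column corner logit; the constant bias initialization gives both heads a sensible
# starting prediction. infer() below decodes these signals into corner coordinates
# via lib.misc.post_proc.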
53 | self.dropout = None 54 | if dropout > 0: 55 | self.dropout = nn.Dropout(dropout) 56 | 57 | def forward(self, x_emb): 58 | x_emb = x_emb['1D'] 59 | if self.dropout is not None: 60 | x_emb = self.dropout(x_emb) 61 | pred_bon = self.pred_bon(x_emb) 62 | pred_cor = self.pred_cor(x_emb) 63 | return {'bon': pred_bon, 'cor': pred_cor} 64 | 65 | def infer(self, x_emb): 66 | pred = self(x_emb) 67 | pred_bon = pred['bon'] / self.bon_scale 68 | pred_cor = pred['cor'] 69 | H, W = self.H, self.W 70 | 71 | y_bon_ = (pred_bon[0].cpu().numpy() / np.pi + 0.5) * H - 0.5 72 | y_cor_ = pred_cor[0,0].sigmoid().cpu().numpy() 73 | # Init floor/ceil plane 74 | z0 = 50 75 | _, z1 = post_proc.np_refine_by_fix_z(*y_bon_, z0) 76 | 77 | # Detech wall-wall peaks 78 | def find_N_peaks(signal, r, min_v, N): 79 | max_v = maximum_filter(signal, size=r, mode='wrap') 80 | pk_loc = np.where(max_v == signal)[0] 81 | pk_loc = pk_loc[signal[pk_loc] > min_v] 82 | if N is not None: 83 | order = np.argsort(-signal[pk_loc]) 84 | pk_loc = pk_loc[order[:N]] 85 | pk_loc = pk_loc[np.argsort(pk_loc)] 86 | return pk_loc, signal[pk_loc] 87 | min_v = 0 if self.post_force_cuboid else 0.05 88 | r = int(round(W * 0.05 / 2)) 89 | N = 4 if self.post_force_cuboid else None 90 | xs_ = find_N_peaks(y_cor_, r=r, min_v=min_v, N=N)[0] 91 | 92 | # Generate wall-walls 93 | cor, xy_cor = post_proc.gen_ww(xs_, y_bon_[0], z0, tol=abs(0.16 * z1 / 1.6), force_cuboid=self.post_force_cuboid) 94 | if not self.post_force_cuboid: 95 | # Check valid (for fear self-intersection) 96 | xy2d = np.zeros((len(xy_cor), 2), np.float32) 97 | for i in range(len(xy_cor)): 98 | xy2d[i, xy_cor[i]['type']] = xy_cor[i]['val'] 99 | xy2d[i, xy_cor[i-1]['type']] = xy_cor[i-1]['val'] 100 | if not Polygon(xy2d).is_valid: 101 | import sys 102 | print( 103 | 'Fail to generate valid general layout!! 
' 104 | 'Generate cuboid as fallback.', 105 | file=sys.stderr) 106 | xs_ = find_N_peaks(y_cor_, r=r, min_v=0, N=4)[0] 107 | cor, xy_cor = post_proc.gen_ww(xs_, y_bon_[0], z0, tol=abs(0.16 * z1 / 1.6), force_cuboid=True) 108 | 109 | # Expand with btn coory 110 | cor = np.hstack([cor, post_proc.infer_coory(cor[:, 1], z1 - z0, z0)[:, None]]) 111 | # Collect corner position in equirectangular 112 | cor_id = np.zeros((len(cor)*2, 2), np.float32) 113 | for j in range(len(cor)): 114 | cor_id[j*2] = cor[j, 0], cor[j, 1] 115 | cor_id[j*2 + 1] = cor[j, 0], cor[j, 2] 116 | return {'cor_id': cor_id, 'y_bon_': y_bon_, 'y_cor_': y_cor_} 117 | 118 | def compute_losses(self, x_emb, batch): 119 | gt_bon = batch['bon'] * self.bon_scale 120 | gt_vot = batch['vot'] 121 | gt_cor = 0.96 ** gt_vot.abs() 122 | 123 | # Forward 124 | pred = self(x_emb) 125 | 126 | # Compute losses 127 | losses = {} 128 | if self.bon_loss == 'l1': 129 | losses['bon'] = F.l1_loss(pred['bon'], gt_bon) 130 | elif self.bon_loss == 'l2': 131 | losses['bon'] = F.mse_loss(pred['bon'], gt_bon) 132 | else: 133 | raise NotImplementedError 134 | 135 | if self.cor_loss == 'bce': 136 | losses['cor'] = F.binary_cross_entropy_with_logits(pred['cor'], gt_cor) 137 | elif self.cor_loss == 'prfocal': 138 | g, p = gt_cor, pred['cor'] 139 | pos_mask = (g >= 1-1e-6) 140 | B, alpha, beta = len(g), 2, 4 141 | L_pos = -F.logsigmoid(p) * F.sigmoid(-p).pow(alpha) 142 | L_neg = -F.logsigmoid(-p) * F.sigmoid(p).pow(alpha) * (1-g).pow(beta) 143 | L = torch.where(pos_mask, L_pos, L_neg).view(B,-1).sum(-1) / pos_mask.float().view(B,-1).sum(-1) 144 | losses['cor'] = L.mean() 145 | else: 146 | raise NotImplementedError 147 | 148 | losses['total.layout'] = self.bon_weight * losses['bon'] + self.cor_weight * losses['cor'] 149 | with torch.no_grad(): 150 | losses['bon.mae'] = F.l1_loss(pred['bon'], gt_bon) / self.bon_scale 151 | losses['cor.mae'] = F.l1_loss(pred['cor'].sigmoid(), gt_cor) 152 | return losses 153 | -------------------------------------------------------------------------------- /lib/model/modality/semantic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from . 
import bases 8 | 9 | 10 | ''' Dense (per-pixel) semantic segmentation ''' 11 | class SemanticSegmenter(nn.Module): 12 | def __init__(self, emb_dim, num_classes, basis='dct', loss='bce', label_weight='', invalid_ids=[], n_components=64, 13 | last_ks=1, dropout=0, init_weight=0.1, init_bias=None, output_height=512, pre1d=False): 14 | super(SemanticSegmenter, self).__init__() 15 | self.num_classes = num_classes 16 | self.loss = loss 17 | self.n_components = n_components 18 | self.invalid_ids = invalid_ids 19 | if init_bias is None: 20 | if self.loss == 'bce': 21 | init_bias = -np.log(num_classes-1) 22 | else: 23 | init_bias = 0.0 24 | 25 | self.output_height = output_height 26 | self.register_buffer('basis', getattr(bases, basis)(n_components, output_height)) 27 | 28 | self.estimator = nn.Sequential( 29 | nn.Conv1d(emb_dim, emb_dim, last_ks, padding=last_ks//2), 30 | nn.BatchNorm1d(emb_dim), 31 | nn.ReLU(inplace=True), 32 | nn.Conv1d(emb_dim, n_components * num_classes, 1, bias=False), 33 | ) 34 | if dropout > 0: 35 | self.estimator = nn.Sequential(*self.estimator[:-1], nn.Dropout(dropout), self.estimator[-1]) 36 | self.bias = nn.Parameter(torch.full([1, num_classes, 1, 1], init_bias)) 37 | nn.init.normal_(self.estimator[-1].weight, std=init_weight/np.sqrt(emb_dim/2)) 38 | 39 | self.estimator1d = None 40 | if pre1d: 41 | self.estimator1d = nn.Sequential( 42 | nn.Conv1d(emb_dim, emb_dim, last_ks, padding=last_ks//2), 43 | nn.BatchNorm1d(emb_dim), 44 | nn.ReLU(inplace=True), 45 | nn.Conv1d(emb_dim, num_classes, 1), 46 | ) 47 | nn.init.constant_(self.estimator1d[-1].bias, -np.log(10-1)) 48 | 49 | if label_weight: 50 | self.register_buffer('label_weight', torch.load(label_weight).float()) 51 | else: 52 | self.register_buffer('label_weight', torch.ones(num_classes)) 53 | self.label_weight[self.invalid_ids] = 0 54 | self.label_weight *= (num_classes - len(self.invalid_ids)) / self.label_weight.sum() 55 | 56 | def forward(self, x_emb): 57 | x_emb = x_emb['1D'] 58 | B, _, W = x_emb.shape 59 | ws = self.estimator(x_emb).view(B, self.num_classes, self.n_components, W) 60 | if self.basis is None: 61 | h, w = self.output_height, ws.shape[-1] 62 | sem = self.bias + F.interpolate(ws, size=(h,w), mode='bilinear', align_corners=False) 63 | else: 64 | sem = self.bias + torch.einsum('bckw,kh->bchw', ws, self.basis) 65 | sem[:, self.invalid_ids] = -100 66 | 67 | if self.estimator1d is not None: 68 | sem1d = self.estimator1d(x_emb).view(B, self.num_classes, 1, W) 69 | sem1d[:, self.invalid_ids] = -100 70 | sem.permute(0,1,3,2)[sem1d.sigmoid().squeeze(2) < 0.1] = float("-Inf") 71 | return {'sem': sem, 'sem1d': sem1d} 72 | else: 73 | return {'sem': sem} 74 | 75 | def infer(self, x_emb): 76 | return self(x_emb) 77 | 78 | def compute_losses(self, x_emb, batch): 79 | gt = batch['sem'] 80 | mask = (gt >= 0) 81 | B, H, W = gt.shape 82 | if mask.sum() == 0: 83 | return {} 84 | 85 | # Forward 86 | pred = self(x_emb) 87 | pred_sem = pred['sem'] 88 | 89 | # Compute losses 90 | losses = {} 91 | 92 | if 'sem1d' in pred: 93 | pred_sem1d = pred['sem1d'] 94 | gt1d = torch.zeros_like(pred_sem1d) 95 | brcid = torch.stack(torch.meshgrid(torch.arange(gt.shape[0]), torch.arange(gt.shape[1]), torch.arange(gt.shape[2])), -1) 96 | bid, rid, cid = brcid[mask].T 97 | gt1d[bid, gt[mask], 0, cid] = 1 98 | losses['acc.sem1d.fn'] = ((pred_sem1d.sigmoid() < 0.1) & (gt1d == 1)).float().mean() 99 | losses['acc.sem1d.tn'] = ((pred_sem1d.sigmoid() < 0.1) & (gt1d == 0)).float().mean() 100 | losses['total.sem1d'] = 
F.binary_cross_entropy_with_logits(pred_sem1d, gt1d) 101 | 102 | pred_sem = pred_sem.permute(0,2,3,1)[mask] 103 | gt = gt[mask] 104 | if 'sem1d' in pred: 105 | activate = (pred_sem1d.detach().sigmoid() >= 0.1).float().repeat(1,1,H,1) 106 | activate = activate.permute(0,2,3,1)[mask] 107 | else: 108 | activate = torch.ones_like(pred_sem) 109 | losses['acc'] = (pred_sem.argmax(1) == gt).float().mean() 110 | if self.loss == 'bce': 111 | gt_onehot = torch.zeros_like(pred_sem).scatter_(dim=1, index=gt[:,None], src=torch.ones_like(pred_sem)) 112 | bce = F.binary_cross_entropy_with_logits(pred_sem, gt_onehot, reduction='none') 113 | bce = (bce * self.label_weight)[activate.bool()] 114 | losses['total.sem'] = bce.mean() 115 | elif self.loss == 'ce': 116 | ce = F.cross_entropy(pred_sem, gt, weight=self.label_weight, reduction='none') 117 | ce = ce[~torch.isinf(ce) & ~torch.isnan(ce)] 118 | losses['total.sem'] = ce.mean() 119 | elif self.loss.startswith('mse'): 120 | R = float(self.loss[3:]) 121 | gt_R = torch.full_like(pred_sem, -R).scatter_(dim=1, index=gt[:,None], src=torch.full_like(pred_sem, R)) 122 | mse = (pred_sem - gt_R).pow(2) 123 | losses['total.sem'] = (mse * self.label_weight).mean() 124 | else: 125 | raise NotImplementedError 126 | return losses 127 | -------------------------------------------------------------------------------- /lib/model/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import functools 5 | 6 | import scipy 7 | import numpy as np 8 | from scipy.ndimage.filters import maximum_filter 9 | from sklearn.linear_model import HuberRegressor 10 | 11 | 12 | ''' Panorama patch for layers ''' 13 | def lr_pad(x, padding=1): 14 | ''' Pad left/right-most to each other instead of zero padding ''' 15 | return torch.cat([x[..., -padding:], x, x[..., :padding]], dim=-1) 16 | 17 | class LR_PAD(nn.Module): 18 | ''' Pad left/right-most to each other instead of zero padding ''' 19 | def __init__(self, padding=1): 20 | super(LR_PAD, self).__init__() 21 | self.padding = padding 22 | 23 | def forward(self, x): 24 | return lr_pad(x, self.padding) 25 | 26 | def wrap_lr_pad(net): 27 | for name, m in net.named_modules(): 28 | names = name.split('.') 29 | root = functools.reduce(lambda o, i: getattr(o, i), [net] + names[:-1]) 30 | if isinstance(m, nn.Conv2d): 31 | if m.padding[1] == 0: 32 | continue 33 | w_pad = int(m.padding[1]) 34 | m.padding = (m.padding[0], 0) 35 | setattr( 36 | root, names[-1], 37 | nn.Sequential(LR_PAD(w_pad), m) 38 | ) 39 | elif isinstance(m, nn.Conv1d): 40 | if m.padding == (0, ): 41 | continue 42 | w_pad = int(m.padding[0]) 43 | m.padding = (0,) 44 | setattr( 45 | root, names[-1], 46 | nn.Sequential(LR_PAD(w_pad), m) 47 | ) 48 | 49 | def pano_upsample_w(x, s): 50 | if len(x.shape) == 3: 51 | mode = 'linear' 52 | scale_factor = s 53 | elif len(x.shape) == 4: 54 | mode = 'bilinear' 55 | scale_factor = (1, s) 56 | else: 57 | raise NotImplementedError 58 | x = torch.cat([x[...,-1:], x, x[...,:1]], dim=-1) 59 | x = F.interpolate(x, scale_factor=scale_factor, mode=mode, align_corners=False) 60 | x = x[...,s:-s] 61 | return x 62 | 63 | class PanoUpsampleW(nn.Module): 64 | def __init__(self, s): 65 | super(PanoUpsampleW, self).__init__() 66 | self.s = s 67 | 68 | def forward(self, x): 69 | return pano_upsample_w(x, self.s) 70 | 71 | 72 | ''' Testing augmentation helper ''' 73 | def augment(x, flip, rotate, rotate_flip): 74 | aug_type = [''] 75 | x_augmented 
= [x] 76 | if flip: 77 | aug_type.append('flip') 78 | x_augmented.append(x.flip(dims=(-1,))) 79 | for shift in rotate: 80 | aug_type.append('rotate %d' % shift) 81 | x_augmented.append(x.roll(shifts=shift, dims=-1)) 82 | if rotate_flip: 83 | aug_type.append('rotate_flip %d' % shift) 84 | x_augmented.append(x_augmented[-1].flip(dims=(-1,))) 85 | return torch.cat(x_augmented, 0), aug_type 86 | 87 | def augment_undo(pred_augmented, aug_type): 88 | pred_augmented = pred_augmented.cpu().numpy() 89 | assert len(pred_augmented) == len(aug_type), 'Unable to recover testing aug' 90 | pred_final = 0 91 | for pred, aug in zip(pred_augmented, aug_type): 92 | if aug == 'flip': 93 | pred_final += np.flip(pred, axis=-1) 94 | elif aug.startswith('rotate'): 95 | if 'flip' in aug: 96 | pred = np.flip(pred, axis=-1) 97 | shift = int(aug.split()[-1]) 98 | pred_final += np.roll(pred, -shift, axis=-1) 99 | elif aug == '': 100 | pred_final += pred 101 | else: 102 | raise NotImplementedError 103 | 104 | return pred_final / len(aug_type) 105 | 106 | 107 | ''' Post-processing ''' 108 | def peaks_mask_torch(x1d, winsz=7, min_v=0.5): 109 | pad = winsz // 2 110 | x1d_max = F.max_pool1d(torch.cat([x1d[...,-pad:], x1d, x1d[...,:pad]], -1), winsz, stride=1) 111 | return (x1d == x1d_max) & (x1d >= min_v) 112 | 113 | def peaks_finding_torch(x1d, winsz=7, min_v=0.5): 114 | ''' x1d: [B, 1, W] ''' 115 | bid, _, cid = torch.where(peaks_mask_torch(x1d, winsz, min_v)) 116 | return bid, cid 117 | 118 | def peaks_finding(signal, winsz=7, min_v=0.5): 119 | max_v = maximum_filter(signal, size=winsz, mode='wrap') 120 | pk_loc = np.where(max_v == signal)[0] 121 | pk_loc = pk_loc[signal[pk_loc] > min_v] 122 | return pk_loc 123 | -------------------------------------------------------------------------------- /test_depth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import importlib 4 | from natsort import natsorted 5 | from tqdm import tqdm, trange 6 | from collections import Counter 7 | 8 | import numpy as np 9 | from imageio import imwrite 10 | from scipy.spatial.transform import Rotation 11 | from lib.misc.pano_lsd_align import rotatePanorama, panoEdgeDetection 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | from torch.utils.data import DataLoader 17 | 18 | from lib.config import config, update_config, infer_exp_id 19 | from lib import dataset 20 | 21 | 22 | def eval_metric(pred, gt, dmax): 23 | gt = gt.clamp(0.01, dmax) 24 | pred = pred.clamp(0.01, dmax) 25 | mre = ((gt - pred).abs() / gt).mean().item() 26 | mae = (gt - pred).abs().mean().item() 27 | rmse = ((gt - pred)**2).mean().sqrt().item() 28 | rmse_log = ((gt.log10() - pred.log10())**2).mean().sqrt().item() 29 | log10 = (gt.log10() - pred.log10()).abs().mean().item() 30 | 31 | delta = torch.max(pred/gt, gt/pred) 32 | delta_1 = (delta < 1.25).float().mean().item() 33 | delta_2 = (delta < 1.25**2).float().mean().item() 34 | delta_3 = (delta < 1.25**3).float().mean().item() 35 | return { 36 | 'mre': mre, 'mae': mae, 'rmse': rmse, 'rmse_log': rmse_log, 'log10': log10, 37 | 'delta_1': delta_1, 'delta_2': delta_2, 'delta_3': delta_3, 38 | } 39 | 40 | 41 | if __name__ == '__main__': 42 | 43 | # Parse args & config 44 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | parser.add_argument('--cfg', required=True) 46 | parser.add_argument('--pth') 47 | parser.add_argument('--out') 48 | parser.add_argument('--vis_dir') 49 | 
parser.add_argument('--clip', default=10, type=float) 50 | parser.add_argument('--y', action='store_true') 51 | parser.add_argument('--pitch', default=0, type=float) 52 | parser.add_argument('--roll', default=0, type=float) 53 | parser.add_argument('opts', 54 | help='Modify config options using the command-line', 55 | default=None, nargs=argparse.REMAINDER) 56 | args = parser.parse_args() 57 | update_config(config, args) 58 | device = 'cuda' if config.cuda else 'cpu' 59 | 60 | if not args.pth: 61 | from glob import glob 62 | exp_id = infer_exp_id(args.cfg) 63 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id) 64 | args.pth = natsorted(glob(os.path.join(exp_ckpt_root, 'ep*pth')))[-1] 65 | print(f'No pth given, inferring the trained pth: {args.pth}') 66 | 67 | if not args.out: 68 | out = [os.path.splitext(args.pth)[0]] 69 | if args.pitch > 0: 70 | out.append(f'.pitch{args.pitch:.0f}') 71 | if args.roll > 0: 72 | out.append(f'.roll{args.roll:.0f}') 73 | args.out = ''.join(out + ['.npz']) 74 | print(f'No out given, inferring the output path: {args.out}') 75 | if os.path.isfile(args.out) and not args.y: 76 | print(f'{args.out} already exists:') 77 | print(dict(np.load(args.out))) 78 | print('Overwrite these results?', end=' ') 79 | input() 80 | 81 | # Init dataset 82 | DatasetClass = getattr(dataset, config.dataset.name) 83 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs) 84 | config.dataset.valid_kwargs['fix_pitch'] = args.pitch 85 | config.dataset.valid_kwargs['fix_roll'] = args.roll 86 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs) 87 | 88 | # Init network 89 | model_file = importlib.import_module(config.model.file) 90 | model_class = getattr(model_file, config.model.modelclass) 91 | net = model_class(**config.model.kwargs).to(device) 92 | net.load_state_dict(torch.load(args.pth)) 93 | net.eval() 94 | 95 | # Run evaluation 96 | evaluation_metric = Counter() 97 | for batch in tqdm(valid_dataset): 98 | # Add batch dim and move to gpu 99 | color = batch['x'][None].to(device) 100 | depth = batch['depth'][None].to(device) 101 | mask = (depth > 0) 102 | 103 | # feed forward 104 | with torch.no_grad(): 105 | pred_depth = net.infer(color) 106 | if not torch.is_tensor(pred_depth): 107 | viz_dict = pred_depth 108 | pred_depth = viz_dict.pop('depth') 109 | pred_depth = pred_depth.clamp(0.01) 110 | 111 | if args.pitch: 112 | vp = Rotation.from_rotvec([-args.pitch * np.pi / 180, 0, 0]).as_matrix() 113 | pred_depth = pred_depth.squeeze()[...,None].cpu().numpy() 114 | pred_depth = rotatePanorama(pred_depth, vp, order=0)[...,0] 115 | pred_depth = torch.from_numpy(pred_depth[None,None]).to(depth.device) 116 | if args.roll: 117 | vp = Rotation.from_rotvec([0, -args.roll * np.pi / 180, 0]).as_matrix() 118 | pred_depth = pred_depth.squeeze()[...,None].cpu().numpy() 119 | pred_depth = rotatePanorama(pred_depth, vp, order=0)[...,0] 120 | pred_depth = torch.from_numpy(pred_depth[None,None]).to(depth.device) 121 | 122 | if args.vis_dir: 123 | fname = batch['fname'].strip() 124 | os.makedirs(args.vis_dir, exist_ok=True) 125 | rgb = (batch['x'].permute(1,2,0) * 255).cpu().numpy().astype(np.uint8) 126 | dep = pred_depth.squeeze().mul(512).cpu().numpy().astype(np.uint16) 127 | dep[~mask.squeeze().cpu().numpy()] = 0 128 | gtdep = depth.squeeze().mul(512).cpu().numpy().astype(np.uint16) 129 | imwrite(os.path.join(args.vis_dir, fname + '.rgb' + '.jpg'), rgb) 130 | imwrite(os.path.join(args.vis_dir, fname + '.rgb' + '.png'), gtdep) 131 | imwrite(os.path.join(args.vis_dir, fname + '.depth'
+ '.png'), dep) 132 | for k, v in viz_dict.items(): 133 | if v.dtype == np.uint8 or v.dtype == np.uint16: 134 | imwrite(os.path.join(args.vis_dir, fname + '.' + k + '.png'), v) 135 | else: 136 | raise NotImplementedError 137 | 138 | evaluation_metric['N'] += 1 139 | for metric, v in eval_metric(pred_depth[mask], depth[mask], args.clip).items(): 140 | evaluation_metric[metric] += v 141 | 142 | N = evaluation_metric.pop('N') 143 | for metric, v in evaluation_metric.items(): 144 | evaluation_metric[metric] = v / N 145 | for metric, v in evaluation_metric.items(): 146 | print(f'{metric:20s} {v:.4f}') 147 | 148 | np.savez(args.out, **evaluation_metric) 149 | 150 | -------------------------------------------------------------------------------- /test_layout.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import argparse 5 | import importlib 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | from lib.config import config, update_config, infer_exp_id 15 | 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 20 | parser.add_argument('--cfg', required=True) 21 | parser.add_argument('--pth', help='path to load saved checkpoint.') 22 | parser.add_argument('--img_glob', required=True) 23 | parser.add_argument('--output_dir', required=True) 24 | # Augmentation related 25 | parser.add_argument('--flip', action='store_true', 26 | help='whether to perform left-right flip. ' 27 | '# of input x2.') 28 | parser.add_argument('--rotate', nargs='*', default=[], type=int, 29 | help='whether to perform horizontal rotation. ' 30 | 'each element indicates a fraction of the image width. ' 31 | '# of input xlen(rotate).') 32 | # Misc arguments 33 | parser.add_argument('--no_cuda', action='store_true', 34 | help='disable cuda') 35 | parser.add_argument('opts', 36 | help='Modify config options using the command-line', 37 | default=None, nargs=argparse.REMAINDER) 38 | args = parser.parse_args() 39 | 40 | # Init setting 41 | update_config(config, args) 42 | if not args.pth: 43 | exp_id = infer_exp_id(args.cfg) 44 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id) 45 | args.pth = sorted(glob.glob(os.path.join(exp_ckpt_root, '*pth')))[-1] 46 | print(f'--pth is not given. Automatically inferring pth={args.pth}') 47 | device = torch.device('cpu' if args.no_cuda else 'cuda') 48 | 49 | # Prepare images to be processed 50 | paths = sorted(glob.glob(args.img_glob)) 51 | if len(paths) == 0: 52 | print('no images found') 53 | for path in paths: 54 | assert os.path.isfile(path), '%s not found' % path 55 | 56 | # Prepare the trained model 57 | model_file = importlib.import_module(config.model.file) 58 | model_class = getattr(model_file, config.model.modelclass) 59 | net = model_class(**config.model.kwargs) 60 | net.load_state_dict(torch.load(args.pth)) 61 | net = net.to(device).eval() 62 | 63 | # Check target directory 64 | if not os.path.isdir(args.output_dir): 65 | print('Output directory %s does not exist. Creating it.'
% args.output_dir) 66 | os.makedirs(args.output_dir) 67 | 68 | # Inferencing 69 | with torch.no_grad(): 70 | for i_path in tqdm(paths, desc='Inferencing'): 71 | k = os.path.split(i_path)[-1][:-4] 72 | 73 | # Load image 74 | img_pil = Image.open(i_path) 75 | if img_pil.size != (1024, 512): 76 | img_pil = img_pil.resize((1024, 512), Image.BICUBIC) 77 | img_ori = np.array(img_pil)[..., :3].transpose([2, 0, 1]).copy() 78 | x = torch.FloatTensor([img_ori / 255]).to(device) 79 | 80 | # Inferencing corners 81 | net.fname = k 82 | cor_id = net.infer(x)['cor_id'] 83 | 84 | # Output result 85 | with open(os.path.join(args.output_dir, k + '.txt'), 'w') as f: 86 | for x, y in cor_id: 87 | f.write('%d %d\n' % (x, y)) 88 | 89 | -------------------------------------------------------------------------------- /test_sem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import importlib 4 | from natsort import natsorted 5 | from tqdm import tqdm, trange 6 | from collections import Counter 7 | 8 | import numpy as np 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.utils.data import DataLoader 14 | 15 | from lib.config import config, update_config, infer_exp_id 16 | from lib import dataset 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | # Parse args & config 22 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | parser.add_argument('--cfg', required=True) 24 | parser.add_argument('--pth') 25 | parser.add_argument('--out') 26 | parser.add_argument('--vis_dir') 27 | parser.add_argument('--y', action='store_true') 28 | parser.add_argument('--test_hw', type=int, nargs='*') 29 | parser.add_argument('opts', 30 | help='Modify config options using the command-line', 31 | default=None, nargs=argparse.REMAINDER) 32 | args = parser.parse_args() 33 | update_config(config, args) 34 | device = 'cuda' if config.cuda else 'cpu' 35 | 36 | if config.cuda and config.cuda_benchmark: 37 | torch.backends.cudnn.benchmark = False 38 | 39 | # Init global variable 40 | if not args.pth: 41 | from glob import glob 42 | exp_id = infer_exp_id(args.cfg) 43 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id) 44 | args.pth = natsorted(glob(os.path.join(exp_ckpt_root, 'ep*pth')))[-1] 45 | print(f'No pth given, inferring the trained pth: {args.pth}') 46 | 47 | if not args.out: 48 | args.out = os.path.splitext(args.pth)[0] 49 | print(f'No out given, inferring the output dir: {args.out}') 50 | os.makedirs(args.out, exist_ok=True) 51 | if os.path.isfile(os.path.join(args.out, 'cm.npz')) and not args.y: 52 | print(f'{os.path.join(args.out, "cm.npz")} already exists:') 53 | cm = np.load(os.path.join(args.out, 'cm.npz'))['cm'] 54 | inter = np.diag(cm) 55 | union = cm.sum(0) + cm.sum(1) - inter 56 | ious = inter / union 57 | accs = inter / cm.sum(1) 58 | DatasetClass = getattr(dataset, config.dataset.name) 59 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs) 60 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs) 61 | id2class = np.array(valid_dataset.ID2CLASS) 62 | for name, iou, acc in zip(id2class, ious, accs): 63 | print(f'{name:20s}: iou {iou*100:5.2f} / acc {acc*100:5.2f}') 64 | print(f'{"Overall":20s}: iou {ious.mean()*100:5.2f} / acc {accs.mean()*100:5.2f}') 65 | print('Overwrite these results?', end=' ') 66 | input() 67 | 68 | # Init dataset 69 | DatasetClass = getattr(dataset, config.dataset.name) 70 |
config.dataset.valid_kwargs.update(config.dataset.common_kwargs) 71 | if args.test_hw: 72 | input_hw = config.dataset.common_kwargs['hw'] 73 | config.dataset.valid_kwargs['hw'] = args.test_hw 74 | else: 75 | input_hw = None 76 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs) 77 | valid_loader = DataLoader(valid_dataset, 1, 78 | num_workers=config.num_workers, 79 | pin_memory=config.cuda) 80 | 81 | # Init network 82 | model_file = importlib.import_module(config.model.file) 83 | model_class = getattr(model_file, config.model.modelclass) 84 | net = model_class(**config.model.kwargs).to(device) 85 | net.load_state_dict(torch.load(args.pth)) 86 | net = net.to(device).eval() 87 | 88 | # Start eval 89 | cm = 0 90 | num_classes = config.model.kwargs.modalities_config.SemanticSegmenter.num_classes 91 | with torch.no_grad(): 92 | for batch in tqdm(valid_loader, position=1, total=len(valid_loader)): 93 | color = batch['x'].to(device) 94 | sem = batch['sem'].to(device) 95 | mask = (sem >= 0) 96 | if mask.sum() == 0: 97 | continue 98 | 99 | # feed forward & compute losses 100 | if input_hw is not None: 101 | color = F.interpolate(color, size=input_hw, mode='bilinear', align_corners=False) 102 | pred_sem = net.infer(color)['sem'] 103 | if input_hw is not None: 104 | pred_sem = F.interpolate(pred_sem, size=args.test_hw, mode='bilinear', align_corners=False) 105 | 106 | # Visualization 107 | if args.vis_dir: 108 | import matplotlib.pyplot as plt 109 | from imageio import imwrite 110 | cmap = (plt.get_cmap('gist_rainbow')(np.arange(num_classes) / num_classes)[...,:3] * 255).astype(np.uint8) 111 | rgb = (batch['x'][0, :3].permute(1,2,0) * 255).cpu().numpy().astype(np.uint8) 112 | vis_sem = cmap[pred_sem[0].argmax(0).cpu().numpy()] 113 | vis_sem = (rgb * 0.2 + vis_sem * 0.8).astype(np.uint8) 114 | imwrite(os.path.join(args.vis_dir, batch['fname'][0].strip()), vis_sem) 115 | vis_sem = cmap[sem[0].cpu().numpy()] 116 | vis_sem = (rgb * 0.2 + vis_sem * 0.8).astype(np.uint8) 117 | imwrite(os.path.join(args.vis_dir, batch['fname'][0].strip() + '.gt.png'), vis_sem) 118 | 119 | # Log 120 | gt = sem[mask] 121 | pred = pred_sem.argmax(1)[mask] 122 | assert gt.min() >= 0 and gt.max() < num_classes and pred_sem.shape[1] == num_classes 123 | cm += np.bincount((gt * num_classes + pred).cpu().numpy(), minlength=num_classes**2) 124 | 125 | # Summarize 126 | print(' Summarize '.center(50, '=')) 127 | cm = cm.reshape(num_classes, num_classes) 128 | id2class = np.array(valid_dataset.ID2CLASS) 129 | valid_mask = (cm.sum(1) != 0) 130 | cm = cm[valid_mask][:, valid_mask] 131 | id2class = id2class[valid_mask] 132 | inter = np.diag(cm) 133 | union = cm.sum(0) + cm.sum(1) - inter 134 | ious = inter / union 135 | accs = inter / cm.sum(1) 136 | for name, iou, acc in zip(id2class, ious, accs): 137 | print(f'{name:20s}: iou {iou*100:5.2f} / acc {acc*100:5.2f}') 138 | print(f'{"Overall":20s}: iou {ious.mean()*100:5.2f} / acc {accs.mean()*100:5.2f}') 139 | np.savez(os.path.join(args.out, 'cm.npz'), cm=cm) 140 | 141 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import importlib 4 | from tqdm import tqdm, trange 5 | from collections import Counter 6 | 7 | import numpy as np 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.utils.data import DataLoader 13 | 14 | from lib.config import config, update_config, 
infer_exp_id 15 | from lib import dataset 16 | 17 | 18 | def train_loop(net, loader, optimizer): 19 | net.train() 20 | if config.training.fix_encoder_bn: 21 | apply_fn_based_on_key(net.encoder, ['bn'], lambda m: m.eval()) 22 | epoch_losses = Counter() 23 | for iit, batch in tqdm(enumerate(loader, 1), position=1, total=len(loader)): 24 | # Move data to the given computation device 25 | for k, v in batch.items(): 26 | if torch.is_tensor(v): 27 | batch[k] = v.to(device) 28 | 29 | # feed forward & compute losses 30 | losses = net.compute_losses(batch) 31 | if len(losses) == 0: 32 | continue 33 | 34 | # backprop 35 | optimizer.zero_grad() 36 | losses['total'].backward() 37 | optimizer.step() 38 | 39 | # Log 40 | BS = len(batch['x']) 41 | epoch_losses['N'] += BS 42 | for k, v in losses.items(): 43 | if torch.is_tensor(v): 44 | epoch_losses[k] += BS * v.item() 45 | else: 46 | epoch_losses[k] += BS * v 47 | 48 | # Statistic over the epoch 49 | N = epoch_losses.pop('N') 50 | for k, v in epoch_losses.items(): 51 | epoch_losses[k] = v / N 52 | 53 | return epoch_losses 54 | 55 | 56 | def valid_loop(net, loader): 57 | net.eval() 58 | epoch_losses = Counter() 59 | with torch.no_grad(): 60 | for iit, batch in tqdm(enumerate(loader, 1), position=1, total=len(loader)): 61 | for k, v in batch.items(): 62 | if torch.is_tensor(v): 63 | batch[k] = v.to(device) 64 | 65 | # feed forward & compute losses 66 | losses = net.compute_losses(batch) 67 | 68 | # Log 69 | for k, v in losses.items(): 70 | if torch.is_tensor(v): 71 | epoch_losses[k] += float(v.item()) / len(loader) 72 | else: 73 | epoch_losses[k] += v / len(loader) 74 | 75 | return epoch_losses 76 | 77 | 78 | def apply_fn_based_on_key(net, key_lst, fn): 79 | for name, m in net.named_modules(): 80 | if any(k in name for k in key_lst): 81 | fn(m) 82 | 83 | 84 | def group_parameters(net, wd_group_mode): 85 | wd = [] 86 | nowd = [] 87 | for name, p in net.named_parameters(): 88 | if not p.requires_grad: 89 | continue 90 | if wd_group_mode == 'bn and bias': 91 | if 'bn' in name or 'bias' in name: 92 | nowd.append(p) 93 | else: 94 | wd.append(p) 95 | elif wd_group_mode == 'encoder decoder': 96 | if 'feature_extractor' in name: 97 | nowd.append(p) 98 | else: 99 | wd.append(p) 100 | return [{'params': wd}, {'params': nowd, 'weight_decay': 0}] 101 | 102 | 103 | if __name__ == '__main__': 104 | 105 | # Parse args & config 106 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 107 | parser.add_argument('--cfg', required=True) 108 | parser.add_argument('opts', 109 | help='Modify config options using the command-line', 110 | default=None, nargs=argparse.REMAINDER) 111 | args = parser.parse_args() 112 | update_config(config, args) 113 | 114 | # Init global variable 115 | exp_id = infer_exp_id(args.cfg) 116 | exp_ckpt_root = os.path.join(config.ckpt_root, exp_id) 117 | os.makedirs(exp_ckpt_root, exist_ok=True) 118 | device = 'cuda' if config.cuda else 'cpu' 119 | if config.cuda and config.cuda_benchmark: 120 | torch.backends.cudnn.benchmark = True 121 | 122 | # Init dataset 123 | DatasetClass = getattr(dataset, config.dataset.name) 124 | config.dataset.train_kwargs.update(config.dataset.common_kwargs) 125 | config.dataset.valid_kwargs.update(config.dataset.common_kwargs) 126 | train_dataset = DatasetClass(**config.dataset.train_kwargs) 127 | valid_dataset = DatasetClass(**config.dataset.valid_kwargs) 128 | train_loader = DataLoader(train_dataset, config.training.batch_size, 129 | shuffle=True, drop_last=True, 130 | 
num_workers=config.num_workers, 131 | pin_memory=config.cuda, 132 | worker_init_fn=lambda x: np.random.seed()) 133 | valid_loader = DataLoader(valid_dataset, 1, 134 | num_workers=config.num_workers, 135 | pin_memory=config.cuda) 136 | 137 | # Init network 138 | model_file = importlib.import_module(config.model.file) 139 | model_class = getattr(model_file, config.model.modelclass) 140 | net = model_class(**config.model.kwargs).to(device) 141 | if config.training.fix_encoder_bn: 142 | apply_fn_based_on_key(net.encoder, ['bn'], lambda m: m.requires_grad_(False)) 143 | 144 | # Init optimizer 145 | if config.training.optim == 'Adam': 146 | optimizer = torch.optim.Adam( 147 | group_parameters(net, config.training.wd_group_mode), 148 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay) 149 | elif config.training.optim == 'AdamW': 150 | optimizer = torch.optim.AdamW( 151 | group_parameters(net, config.training.wd_group_mode), 152 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay) 153 | elif config.training.optim == 'SGD': 154 | optimizer = torch.optim.SGD( 155 | group_parameters(net, config.training.wd_group_mode), momentum=0.9, 156 | lr=config.training.optim_lr, weight_decay=config.training.weight_decay) 157 | 158 | if config.training.optim_poly_gamma > 0: 159 | def lr_poly_rate(epoch): 160 | return (1 - epoch / config.training.epoch) ** config.training.optim_poly_gamma 161 | scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_poly_rate) 162 | else: 163 | scheduler = torch.optim.lr_scheduler.MultiStepLR( 164 | optimizer, milestones=[int(p * config.training.epoch) for p in config.training.optim_milestons], 165 | gamma=config.training.optim_gamma) 166 | 167 | # Start training 168 | for iep in trange(1, config.training.epoch + 1, position=0): 169 | 170 | # Train phase 171 | epoch_losses = train_loop(net, train_loader, optimizer) 172 | scheduler.step() 173 | print(f'EP[{iep}/{config.training.epoch}] train: ' + 174 | ' \ '.join([f'{k} {v:.3f}' for k, v in epoch_losses.items()])) 175 | 176 | # Periodically save model 177 | if iep % config.training.save_every == 0: 178 | torch.save(net.state_dict(), os.path.join(exp_ckpt_root, f'ep{iep}.pth')) 179 | print('Model saved') 180 | 181 | # Valid phase 182 | epoch_losses = valid_loop(net, valid_loader) 183 | print(f'EP[{iep}/{config.training.epoch}] valid: ' + 184 | ' \ '.join([f'{k} {v:.3f}' for k, v in epoch_losses.items()])) 185 | 186 | -------------------------------------------------------------------------------- /vis_depth.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import open3d as o3d 4 | from imageio import imread 5 | 6 | 7 | def get_uni_sphere_xyz(H, W): 8 | j, i = np.meshgrid(np.arange(H), np.arange(W), indexing='ij') 9 | u = (i+0.5) / W * 2 * np.pi 10 | v = ((j+0.5) / H - 0.5) * np.pi 11 | z = -np.sin(v) 12 | c = np.cos(v) 13 | y = c * np.sin(u) 14 | x = c * np.cos(u) 15 | sphere_xyz = np.stack([x, y, z], -1) 16 | return sphere_xyz 17 | 18 | 19 | if __name__ == '__main__': 20 | 21 | import argparse 22 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | parser.add_argument('--img', required=True, 24 | help='Image texture in equirectangular format') 25 | parser.add_argument('--depth', required=True, 26 | help='Depth map') 27 | parser.add_argument('--scale', default=0.001, type=float, 28 | help='Rescale the depth map') 29 | parser.add_argument('--crop_ratio', 
default=80/512, type=float, 30 | help='Crop ratio for upper and lower part of the image') 31 | parser.add_argument('--crop_z_above', default=1.2, type=float, 32 | help='Filter out 3D points with z coordinate above this value') 33 | args = parser.parse_args() 34 | 35 | # Reading rgb-d 36 | rgb = imread(args.img) 37 | depth = imread(args.depth)[...,None].astype(np.float32) * args.scale 38 | 39 | # Project to 3d 40 | H, W = rgb.shape[:2] 41 | xyz = depth * get_uni_sphere_xyz(H, W) 42 | xyzrgb = np.concatenate([xyz, rgb/255.], 2) 43 | 44 | # Crop the image and flatten 45 | if args.crop_ratio > 0: 46 | assert args.crop_ratio < 1 47 | crop = int(H * args.crop_ratio) 48 | xyzrgb = xyzrgb[crop:-crop] 49 | xyzrgb = xyzrgb.reshape(-1, 6) 50 | 51 | # Crop in 3d 52 | xyzrgb = xyzrgb[xyzrgb[:,2] <= args.crop_z_above] 53 | 54 | # Visualize 55 | pcd = o3d.geometry.PointCloud() 56 | pcd.points = o3d.utility.Vector3dVector(xyzrgb[:, :3]) 57 | pcd.colors = o3d.utility.Vector3dVector(xyzrgb[:, 3:]) 58 | 59 | o3d.visualization.draw_geometries([ 60 | pcd, 61 | o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.3, origin=[0, 0, 0]) 62 | ]) 63 | -------------------------------------------------------------------------------- /vis_layout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import open3d as o3d 4 | from PIL import Image 5 | from scipy.signal import correlate2d 6 | from scipy.ndimage import shift 7 | 8 | from lib.misc.post_proc import np_coor2xy, np_coorx2u, np_coory2v 9 | from eval_layout import layout_2_depth 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | import argparse 15 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument('--img', required=True, 17 | help='Image texture in equirectangular format') 18 | parser.add_argument('--layout', required=True, 19 | help='Txt or json file containing layout corners (cor_id)') 20 | parser.add_argument('--out') 21 | parser.add_argument('--no_vis', action='store_true') 22 | parser.add_argument('--show_ceiling', action='store_true', 23 | help='Render the ceiling (skipped by default)') 24 | parser.add_argument('--ignore_floor', action='store_true', 25 | help='Skip rendering floor') 26 | parser.add_argument('--ignore_wall', action='store_true', 27 | help='Skip rendering wall') 28 | parser.add_argument('--ignore_wireframe', action='store_true', 29 | help='Skip rendering wireframe') 30 | args = parser.parse_args() 31 | 32 | if not args.out and args.no_vis: 33 | print('You may want to export (via --out) or visualize (drop --no_vis)') 34 | import sys; sys.exit() 35 | 36 | # Reading source (texture img, cor_id txt) 37 | equirect_texture = np.array(Image.open(args.img)) 38 | H, W = equirect_texture.shape[:2] 39 | if args.layout.endswith('json'): 40 | with open(args.layout) as f: 41 | inferenced_result = json.load(f) 42 | cor_id = np.array(inferenced_result['uv'], np.float32) 43 | cor_id[:, 0] *= W 44 | cor_id[:, 1] *= H 45 | else: 46 | cor_id = np.loadtxt(args.layout).astype(np.float32) 47 | 48 | # Convert corners to layout 49 | depth, floor_mask, ceil_mask, wall_mask = layout_2_depth(cor_id, H, W, return_mask=True) 50 | coorx, coory = np.meshgrid(np.arange(W), np.arange(H)) 51 | us = np_coorx2u(coorx, W) 52 | vs = np_coory2v(coory, H) 53 | zs = depth * np.sin(vs) 54 | cs = depth * np.cos(vs) 55 | xs = cs * np.sin(us) 56 | ys = -cs * np.cos(us) 57 | 58 | # Aggregate mask 59 | mask = np.ones_like(floor_mask) 60 | if args.ignore_floor: 61 |
mask &= ~floor_mask 62 | if not args.show_ceiling: 63 | mask &= ~ceil_mask 64 | if args.ignore_wall: 65 | mask &= ~wall_mask 66 | 67 | # Prepare ply's points and faces 68 | xyzrgb = np.concatenate([ 69 | xs[...,None], ys[...,None], zs[...,None], 70 | equirect_texture], -1) 71 | xyzrgb = np.concatenate([xyzrgb, xyzrgb[:,[0]]], 1) 72 | mask = np.concatenate([mask, mask[:,[0]]], 1) 73 | lo_tri_template = np.array([ 74 | [0, 0, 0], 75 | [0, 1, 0], 76 | [0, 1, 1]]) 77 | up_tri_template = np.array([ 78 | [0, 0, 0], 79 | [0, 1, 1], 80 | [0, 0, 1]]) 81 | ma_tri_template = np.array([ 82 | [0, 0, 0], 83 | [0, 1, 1], 84 | [0, 1, 0]]) 85 | lo_mask = (correlate2d(mask, lo_tri_template, mode='same') == 3) 86 | up_mask = (correlate2d(mask, up_tri_template, mode='same') == 3) 87 | ma_mask = (correlate2d(mask, ma_tri_template, mode='same') == 3) & (~lo_mask) & (~up_mask) 88 | ref_mask = ( 89 | lo_mask | (correlate2d(lo_mask, np.flip(lo_tri_template, (0,1)), mode='same') > 0) |\ 90 | up_mask | (correlate2d(up_mask, np.flip(up_tri_template, (0,1)), mode='same') > 0) |\ 91 | ma_mask | (correlate2d(ma_mask, np.flip(ma_tri_template, (0,1)), mode='same') > 0) 92 | ) 93 | points = xyzrgb[ref_mask] 94 | 95 | ref_id = np.full(ref_mask.shape, -1, np.int32) 96 | ref_id[ref_mask] = np.arange(ref_mask.sum()) 97 | faces_lo_tri = np.stack([ 98 | ref_id[lo_mask], 99 | ref_id[shift(lo_mask, [1, 0], cval=False, order=0)], 100 | ref_id[shift(lo_mask, [1, 1], cval=False, order=0)], 101 | ], 1) 102 | faces_up_tri = np.stack([ 103 | ref_id[up_mask], 104 | ref_id[shift(up_mask, [1, 1], cval=False, order=0)], 105 | ref_id[shift(up_mask, [0, 1], cval=False, order=0)], 106 | ], 1) 107 | faces_ma_tri = np.stack([ 108 | ref_id[ma_mask], 109 | ref_id[shift(ma_mask, [1, 0], cval=False, order=0)], 110 | ref_id[shift(ma_mask, [0, 1], cval=False, order=0)], 111 | ], 1) 112 | faces = np.concatenate([faces_lo_tri, faces_up_tri, faces_ma_tri]) 113 | 114 | # Dump results ply 115 | if args.out: 116 | ply_header = '\n'.join([ 117 | 'ply', 118 | 'format ascii 1.0', 119 | f'element vertex {len(points):d}', 120 | 'property float x', 121 | 'property float y', 122 | 'property float z', 123 | 'property uchar red', 124 | 'property uchar green', 125 | 'property uchar blue', 126 | f'element face {len(faces):d}', 127 | 'property list uchar int vertex_indices', 128 | 'end_header', 129 | ]) 130 | with open(args.out, 'w') as f: 131 | f.write(ply_header) 132 | f.write('\n') 133 | for x, y, z, r, g, b in points: 134 | f.write(f'{x:.2f} {y:.2f} {z:.2f} {r:.0f} {g:.0f} {b:.0f}\n') 135 | for i, j, k in faces: 136 | f.write(f'3 {i:d} {j:d} {k:d}\n') 137 | 138 | if not args.no_vis: 139 | mesh = o3d.geometry.TriangleMesh() 140 | mesh.vertices = o3d.utility.Vector3dVector(points[:, :3]) 141 | mesh.vertex_colors = o3d.utility.Vector3dVector(points[:, 3:] / 255.) 
142 | mesh.triangles = o3d.utility.Vector3iVector(faces) 143 | draw_geometries = [mesh] 144 | 145 | # Show wireframe 146 | if not args.ignore_wireframe: 147 | # Convert cor_id to 3d xyz 148 | N = len(cor_id) // 2 149 | floor_z = -1.6 150 | floor_xy = np_coor2xy(cor_id[1::2], floor_z, W, H, floorW=1, floorH=1) 151 | c = np.sqrt((floor_xy**2).sum(1)) 152 | v = np_coory2v(cor_id[0::2, 1], H) 153 | ceil_z = (c * np.tan(v)).mean() 154 | 155 | # Prepare wireframe in open3d 156 | assert N == len(floor_xy) 157 | wf_points = [[x, y, floor_z] for x, y in floor_xy] +\ 158 | [[x, y, ceil_z] for x, y in floor_xy] 159 | wf_lines = [[i, (i+1)%N] for i in range(N)] +\ 160 | [[i+N, (i+1)%N+N] for i in range(N)] +\ 161 | [[i, i+N] for i in range(N)] 162 | wf_colors = [[1, 0, 0] for i in range(len(wf_lines))] 163 | wf_line_set = o3d.geometry.LineSet() 164 | wf_line_set.points = o3d.utility.Vector3dVector(wf_points) 165 | wf_line_set.lines = o3d.utility.Vector2iVector(wf_lines) 166 | wf_line_set.colors = o3d.utility.Vector3dVector(wf_colors) 167 | draw_geometries.append(wf_line_set) 168 | 169 | o3d.visualization.draw_geometries(draw_geometries, mesh_show_back_face=True) 170 | --------------------------------------------------------------------------------
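
A minimal sketch of how the panorama helpers defined in lib/model/utils.py above (wrap_lr_pad for horizontal wrap-around padding, augment / augment_undo for horizontal test-time augmentation) could be combined at inference time. The ToyNet model, its infer method, and the chosen rotation shifts below are illustrative assumptions for this sketch, not repository code:

import torch
import torch.nn as nn
from lib.model.utils import wrap_lr_pad, augment, augment_undo

class ToyNet(nn.Module):
    # Hypothetical stand-in for a depth network: a single conv layer whose
    # output has shape [B, 1, H, W]; only the padding behaviour matters here.
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 1, kernel_size=3, padding=1)

    def infer(self, x):
        return self.conv(x)  # [B, 1, H, W] pseudo depth

net = ToyNet().eval()
wrap_lr_pad(net)  # replace zero padding along the width axis with circular left/right padding

x = torch.rand(1, 3, 512, 1024)  # one 1024x512 equirectangular panorama (batch size 1)
x_aug, aug_type = augment(x, flip=True, rotate=[256, 512, 768], rotate_flip=False)
with torch.no_grad():
    pred_aug = net.infer(x_aug)  # [n_aug, 1, H, W]: one prediction per augmented copy
pred = augment_undo(pred_aug, aug_type)  # numpy array, predictions mapped back and averaged

The rotate values are horizontal pixel shifts (x.roll along the width axis), so 256/512/768 correspond to quarter turns of a 1024-wide panorama; augment_undo divides by len(aug_type), so the un-augmented batch size should be 1 for each row of pred_aug to line up with one entry of aug_type.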