├── .gitignore
├── LICENSE
├── README.md
├── configs
    ├── deeplab_cityscapes_seg.yml
    ├── deeplab_kitti_depth.yml
    ├── deeplab_kitti_seg.yml
    ├── dispnet_cityscapes_seg.yml
    ├── dispnet_kitti_depth.yml
    ├── dispnet_kitti_seg.yml
    ├── fcn_cityscapes_seg.yml
    ├── fcn_kitti_depth.yml
    ├── fcn_kitti_seg.yml
    ├── fcrn_cityscapes_seg.yml
    ├── fcrn_kitti_depth.yml
    ├── fcrn_kitti_seg.yml
    ├── frrnA_cityscapes_seg.yml
    ├── frrnA_kitti_depth.yml
    ├── frrnA_kitti_seg.yml
    ├── segnet_cityscapes_seg.yml
    ├── segnet_kitti_depth.yml
    └── segnet_kitti_seg.yml
├── demo_depth.py
├── demo_saliency.py
├── demo_seg.py
├── fcrn_metric_0.png
├── kitti_depth_eval
    ├── __pycache__
    │   └── depth_evaluation_utils.cpython-36.pyc
    ├── depth_evaluation_utils.py
    └── test_files_eigen.txt
├── kitti_train_depth_prepare
    ├── __pycache__
    │   └── kitti_raw_loader.cpython-36.pyc
    ├── kitti_raw_loader.py
    ├── prepare_train_data.py
    ├── static_frames.txt
    └── test_scenes.txt
├── output_predict_img
    ├── deeplab_output_depth.png
    ├── deeplab_output_seg.png
    ├── dispnet_output_depth.png
    ├── dispnet_output_seg.png
    ├── fcn_output_depth.png
    ├── fcn_output_seg.png
    ├── fcrn_output_depth.png
    ├── fcrn_output_seg.png
    ├── frrn_output_depth.png
    ├── frrn_output_seg.png
    ├── segnet_output_depth.png
    └── segnet_output_seg.png
├── ptsemseg
    ├── augmentations
    │   ├── __init__.py
    │   └── augmentations.py
    ├── caffe_pb2.py
    ├── loader
    │   ├── __init__.py
    │   ├── cityscapes_loader_depth.py
    │   ├── cityscapes_loader_seg.py
    │   ├── kitti_loader_depth.py
    │   └── kitti_loader_seg.py
    ├── loss
    │   ├── __init__.py
    │   └── loss.py
    ├── metrics.py
    ├── models
    │   ├── __init__.py
    │   ├── deeplab_depth.py
    │   ├── deeplab_seg.py
    │   ├── dispnet_depth.py
    │   ├── dispnet_seg.py
    │   ├── fcn_depth.py
    │   ├── fcn_seg.py
    │   ├── fcrn_depth.py
    │   ├── fcrn_seg.py
    │   ├── frrn_depth.py
    │   ├── frrn_seg.py
    │   ├── segnet_depth.py
    │   ├── segnet_seg.py
    │   └── utils.py
    ├── optimizers
    │   └── __init__.py
    ├── schedulers
    │   ├── __init__.py
    │   └── schedulers.py
    └── utils.py
├── requirements.txt
├── saliency.py
├── saliency_analysis.py
├── saliency_class_val.py
├── saliency_eval.py
├── saliency_iou.py
├── saliency_results
    ├── BP_saliency_map_fcrn_seg.png
    └── image_pixel_locate_0.png
├── test_depth_cityscapes.py
├── test_depth_kitti.py
├── test_seg_cityscapes.py
├── train.py
└── validate_seg.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 sanweiliti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /configs/deeplab_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: deeplab 3 | task: seg 4 | data: 5 | dataset: cityscapes 6 | train_split: train 7 | val_split: val 8 | img_rows: 512 9 | img_cols: 1024 10 | img_norm: True 11 | # version: cityscapes 12 | path: ../pytorch-semseg/datasets/cityscapes 13 | 14 | training: 15 | train_iters: 200000 16 | batch_size: 2 17 | val_interval: 1500 18 | n_workers: 2 19 | print_interval: 100 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | # augmentations: 27 | # rcrop: [256, 512] 28 | lr_schedule: 29 | resume: 30 | -------------------------------------------------------------------------------- /configs/deeplab_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: deeplab 3 | task: depth 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 128 9 | img_cols: 416 10 | img_norm: True 11 | path: prepared_kitti_train_data 12 | 13 | training: 14 | train_iters: 100000000 15 | batch_size: 4 16 | val_interval: 2000 17 | n_workers: 2 18 | print_interval: 500 19 | optimizer: 20 | name: 'adam' 21 | lr: 1.0e-4 22 | loss: 23 | name: 'scale_invariant_loss' 24 | smooth: True 25 | lr_schedule: 26 | resume: 27 | -------------------------------------------------------------------------------- /configs/deeplab_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: deeplab 3 | task: seg 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 256 9 | img_cols: 832 10 | img_norm: True 11 | # version: cityscapes 12 | path: ../pytorch-semseg/datasets/kitti/semantics 13 | 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | # augmentations: 27 | # rcrop: [256, 512] 28 | lr_schedule: 29 | resume: ../pytorch-semseg/runs/deeplab_cityscapes/11044/deeplab_cityscapes_best_model.pkl 30 | -------------------------------------------------------------------------------- /configs/dispnet_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: dispnet 3 | task: seg 4 | data: 5 | dataset: cityscapes 6 | train_split: train 7 | val_split: val 8 | img_rows: 512 9 | img_cols: 1024 10 | img_norm: True 11 | # version: cityscapes 12 | path: ../pytorch-semseg/datasets/cityscapes 13 | 14 | training: 15 | train_iters: 200000 16 | batch_size: 4 17 | val_interval: 750 18 | n_workers: 2 19 | print_interval: 150 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: 28 | -------------------------------------------------------------------------------- /configs/dispnet_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: dispnet 3 | task: depth 4 | 5 | data: 6 | dataset: kitti 7 | train_split: train 8 | val_split: val 9 | test_split: test 10 | img_rows: 128 11 | img_cols: 416 12 | img_norm: True 13 | path: prepared_kitti_train_data 14 | 15 | training: 16 | train_iters: 1000000 17 | batch_size: 4 18 | 
val_interval: 2000 19 | n_workers: 2 20 | print_interval: 500 21 | optimizer: 22 | name: 'adam' 23 | lr: 1.0e-4 24 | loss: 25 | name: 'scale_invariant_loss' 26 | smooth: True 27 | lr_schedule: 28 | resume: 29 | -------------------------------------------------------------------------------- /configs/dispnet_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: dispnet 3 | task: seg 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 256 9 | img_cols: 832 10 | img_norm: True 11 | # version: cityscapes 12 | path: ../pytorch-semseg/datasets/kitti/semantics 13 | 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: 28 | -------------------------------------------------------------------------------- /configs/fcn_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: fcn 3 | task: seg 4 | data: 5 | dataset: cityscapes 6 | train_split: train 7 | val_split: val 8 | # test_split: test 9 | img_rows: 512 10 | img_cols: 1024 11 | img_norm: True 12 | # version: cityscapes 13 | path: ../pytorch-semseg/datasets/cityscapes 14 | training: 15 | train_iters: 3000000 16 | batch_size: 2 # for test, batch_size === 1 17 | val_interval: 750 18 | n_workers: 2 19 | print_interval: 150 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | #weight_decay: 0.0005 24 | #momentum: 0.9 25 | loss: 26 | name: 'cross_entropy' 27 | size_average: True 28 | lr_schedule: 29 | resume: 30 | #testing: 31 | # trained_model: ./runs/fcn8s_cityscapes/67739_downsample1_batchSize1_withModel/fcn8s_cityscapes_best_model.pkl 32 | -------------------------------------------------------------------------------- /configs/fcn_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: fcn 3 | task: depth 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 128 9 | img_cols: 416 10 | img_norm: True 11 | path: prepared_kitti_train_data 12 | 13 | training: 14 | train_iters: 100000000 15 | batch_size: 4 16 | val_interval: 2000 17 | n_workers: 2 18 | print_interval: 1 19 | optimizer: 20 | name: 'adam' 21 | lr: 1.0e-4 22 | loss: 23 | name: 'scale_invariant_loss' 24 | smooth: True 25 | lr_schedule: 26 | resume: 27 | -------------------------------------------------------------------------------- /configs/fcn_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: fcn 3 | task: seg 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 256 9 | img_cols: 832 10 | img_norm: True 11 | # version: cityscapes 12 | path: ../pytorch-semseg/datasets/kitti/semantics 13 | 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: runs/fcn8s_cityscapes/37333/fcn8s_cityscapes_best_model.pkl 28 | -------------------------------------------------------------------------------- /configs/fcrn_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: 
fcrn 3 | task: seg 4 | data: 5 | dataset: cityscapes 6 | train_split: train 7 | val_split: val 8 | img_rows: 512 9 | img_cols: 1024 10 | img_norm: True 11 | # version: cityscapes 12 | path: datasets/cityscapes 13 | 14 | training: 15 | train_iters: 200000 16 | batch_size: 4 17 | val_interval: 750 18 | n_workers: 2 19 | print_interval: 150 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: 28 | -------------------------------------------------------------------------------- /configs/fcrn_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: fcrn 3 | task: depth 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 128 9 | img_cols: 416 10 | img_norm: True 11 | path: prepared_kitti_train_data 12 | 13 | training: 14 | train_iters: 100000000 15 | batch_size: 4 16 | val_interval: 2000 17 | n_workers: 2 18 | print_interval: 1 19 | optimizer: 20 | name: 'adam' 21 | lr: 1.0e-4 22 | loss: 23 | name: 'scale_invariant_loss' 24 | smooth: True 25 | lr_schedule: 26 | resume: 27 | -------------------------------------------------------------------------------- /configs/fcrn_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: fcrn 3 | task: seg 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 256 9 | img_cols: 832 10 | img_norm: True 11 | # version: cityscapes 12 | path: datasets/kitti/semantics 13 | 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: runs/frrnA_cityscapes_seg/73777/frrn_cityscapes_best_model.pkl -------------------------------------------------------------------------------- /configs/frrnA_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: frrn 3 | model_type: A 4 | task: seg 5 | data: 6 | dataset: cityscapes 7 | train_split: train 8 | val_split: val 9 | img_rows: 256 10 | img_cols: 512 11 | img_norm: True 12 | # version: cityscapes 13 | path: datasets/cityscapes 14 | training: 15 | train_iters: 10000 16 | batch_size: 3 17 | val_interval: 1000 18 | n_workers: 2 19 | print_interval: 200 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-3 23 | loss: 24 | name: 'bootstrapped_cross_entropy' 25 | size_average: True 26 | K: 16384 # 512*256/8 27 | lr_schedule: 28 | resume: runs/frrnA_cityscapes_seg/94260/frrn_cityscapes_best_model.pkl 29 | -------------------------------------------------------------------------------- /configs/frrnA_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: frrn 3 | model_type: A 4 | task: depth 5 | data: 6 | dataset: kitti 7 | train_split: train 8 | val_split: val 9 | img_rows: 128 10 | img_cols: 416 11 | img_norm: True 12 | path: prepared_kitti_train_data 13 | 14 | training: 15 | train_iters: 100000000 16 | batch_size: 4 17 | val_interval: 2000 18 | n_workers: 2 19 | print_interval: 500 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | loss: 24 | name: 'scale_invariant_loss' 25 | smooth: True 26 | lr_schedule: 27 | resume: 28 | -------------------------------------------------------------------------------- 
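Note on the config files: every YAML above (and the remaining configs below) follows the same schema — a model block (arch, task), a data block (dataset, split names, resize resolution img_rows/img_cols, dataset path) and a training block (train_iters, batch_size, optimizer, loss, optional resume checkpoint). A minimal sketch of how such a file could be consumed, assuming PyYAML is available; the get_model call is only a placeholder for the constructors under ptsemseg/models, not the repository's actual train.py:

import yaml

# Illustration only: read one of the configs listed above.
with open("configs/frrnA_kitti_depth.yml") as fp:
    cfg = yaml.safe_load(fp)

arch = cfg["model"]["arch"]                                    # e.g. "frrn"
task = cfg["model"]["task"]                                    # "seg" or "depth"
img_size = (cfg["data"]["img_rows"], cfg["data"]["img_cols"])  # (128, 416) for KITTI depth
lr = cfg["training"]["optimizer"]["lr"]                        # 1.0e-4
loss_name = cfg["training"]["loss"]["name"]                    # "scale_invariant_loss"
# model = get_model(arch, task)  # hypothetical factory; see ptsemseg/models/
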
/configs/frrnA_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: frrn 3 | model_type: A 4 | task: seg 5 | data: 6 | dataset: kitti 7 | train_split: train 8 | val_split: val 9 | img_rows: 256 10 | img_cols: 832 11 | img_norm: True 12 | # version: cityscapes 13 | path: datasets/kitti/semantics 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'bootstrapped_cross_entropy' 25 | size_average: True 26 | K: 26624 # 256*832/8 27 | lr_schedule: 28 | resume: # runs/frrnA_cityscapes/94430/frrnA_cityscapes_best_model.pkl 29 | -------------------------------------------------------------------------------- /configs/segnet_cityscapes_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: segnet 3 | task: seg 4 | data: 5 | dataset: cityscapes 6 | train_split: train 7 | val_split: val 8 | img_rows: 512 9 | img_cols: 1024 10 | img_norm: True 11 | # version: cityscapes 12 | path: datasets/cityscapes 13 | 14 | training: 15 | train_iters: 200000 16 | batch_size: 2 17 | val_interval: 1500 18 | n_workers: 2 19 | print_interval: 300 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-4 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: 28 | -------------------------------------------------------------------------------- /configs/segnet_kitti_depth.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: segnet 3 | task: depth 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 128 9 | img_cols: 416 10 | img_norm: True 11 | path: prepared_kitti_train_data 12 | 13 | training: 14 | train_iters: 100000000 15 | batch_size: 4 16 | val_interval: 2000 17 | n_workers: 2 18 | print_interval: 500 19 | optimizer: 20 | name: 'adam' 21 | lr: 1.0e-4 22 | loss: 23 | name: 'scale_invariant_loss' 24 | smooth: True 25 | lr_schedule: 26 | resume: 27 | -------------------------------------------------------------------------------- /configs/segnet_kitti_seg.yml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: segnet 3 | task: seg 4 | data: 5 | dataset: kitti 6 | train_split: train 7 | val_split: val 8 | img_rows: 256 9 | img_cols: 832 10 | img_norm: True 11 | # version: cityscapes 12 | path: datasets/kitti/semantics 13 | 14 | training: 15 | train_iters: 10000000 16 | batch_size: 4 17 | val_interval: 40 18 | n_workers: 2 19 | print_interval: 10 20 | optimizer: 21 | name: 'adam' 22 | lr: 1.0e-5 23 | loss: 24 | name: 'cross_entropy' 25 | size_average: True 26 | lr_schedule: 27 | resume: ../pytorch-semseg/runs/segnet_cityscapes/66269/segnet_cityscapes_best_model.pkl 28 | -------------------------------------------------------------------------------- /demo_depth.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from scipy.misc import imresize 4 | import argparse 5 | import scipy.misc as m 6 | import matplotlib.pyplot as plot 7 | 8 | from ptsemseg.models.fcn_depth import * 9 | from ptsemseg.models.segnet_depth import * 10 | from ptsemseg.models.frrn_depth import * 11 | from ptsemseg.models.deeplab_depth import * 12 | from ptsemseg.models.fcrn_depth import * 13 | from ptsemseg.models.dispnet_depth import * 14 | 15 | 16 | # depth demo 17 | # 
image resize height and width need to match the training settings of the pretrained model 18 | 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--dataset", default='cityscapes', type=str, choices=["cityscapes", "kitti"]) 21 | 22 | # datasets/kitti/semantics/training/image_2/000193_10.png 23 | parser.add_argument("--img_path", default='datasets/cityscapes/leftImg8bit/train/aachen/aachen_000005_000019_leftImg8bit.png', 24 | type=str, 25 | help='path to the input image') 26 | parser.add_argument("--model_name", type=str, default='fcrn', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 27 | parser.add_argument("--model_path", 28 | default='runs/fcrn_cityscapes_depth/212_512_1024_bs2_smooth1000_berhuLoss/fcrn_cityscapes_best_model.pkl', 29 | type=str, 30 | help='path to the pretrained model') 31 | parser.add_argument("--height", type=int, default=512, help="image resize height") # 256 for kitti 32 | parser.add_argument("--width", type=int, default=1024, help="image resize width") # 832 for kitti 33 | parser.add_argument("--pred_disp", action='store_true', 34 | help="model predicts disparity instead of depth if selected") 35 | 36 | args = parser.parse_args() 37 | 38 | def get_model(model_name): 39 | try: 40 | return { 41 | "fcn": fcn_depth(), 42 | "frrnA": frrn_depth(model_type = "A"), 43 | "segnet": segnet_depth(), 44 | "deeplab": deeplab_depth(), 45 | "dispnet": dispnet_depth(), 46 | "fcrn": fcrn_depth(), 47 | }[model_name] 48 | except: 49 | raise("Model {} not available".format(model_name)) 50 | 51 | @torch.no_grad() 52 | def main(): 53 | img = m.imread(args.img_path).astype(np.float32) 54 | 55 | # input image preprocessing, need to match the training settings of the pretrained model 56 | img = imresize(img, (args.height, args.width)).astype(np.float32) # [128, 416, 3] 57 | img = ((img / 255 - 0.5) / 0.5) 58 | img = np.transpose(img, (2, 0, 1)) 59 | img = torch.from_numpy(img).unsqueeze(0) # tensor [1, 3, 128, 416] 60 | 61 | # load pretrained model 62 | model = get_model(args.model_name) 63 | weights = torch.load(args.model_path, map_location=lambda storage, loc: storage) 64 | model.load_state_dict(weights['model_state']) 65 | model.eval() 66 | 67 | output = model(img).cpu().numpy()[0,0] 68 | 69 | if args.dataset == "kitti": 70 | y1, y2 = int(0.40810811 * output.shape[0]), int(0.99189189 * output.shape[0]) 71 | x1, x2 = int(0 * output.shape[1]), int(1 * output.shape[1]) 72 | output_cut = output[y1:y2, x1:x2] 73 | output_cut = 1/output_cut # TODO: not for dispnet 74 | 75 | output_upper = np.full((y1, args.width), np.min(output_cut), dtype=float) 76 | output_cut = (output_cut - np.min(output_cut)) / np.max(output_cut) 77 | output_final = np.concatenate((output_upper, output_cut), axis=0) 78 | m.imsave("output_predict_img/dispnet_output_depth.png", output_final) # for dispnet 79 | 80 | if args.dataset == "cityscapes": 81 | y1, y2 = int(0.05 * output.shape[0]), int(0.80 * output.shape[0]) 82 | x1, x2 = int(0.05 * output.shape[1]), int(0.99 * output.shape[1]) 83 | output_cut = output[y1:y2, x1:x2] 84 | output_cut = 1/output_cut # TODO: not for dispnet 85 | m.imsave("city_depth.png", output_cut) 86 | plot.imsave("city_depth.png", output_cut, cmap="viridis") 87 | 88 | 89 | 90 | if __name__ == '__main__': 91 | main() -------------------------------------------------------------------------------- /demo_saliency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # 4 | # Author: Kazuto 
Nakashima 5 | # URL: http://kazuto1011.github.io 6 | # Created: 2017-05-18 7 | 8 | from __future__ import print_function 9 | 10 | import argparse 11 | import numpy as np 12 | import torch 13 | import scipy.misc as m 14 | import cv2 15 | import matplotlib.pyplot as plot 16 | 17 | from ptsemseg.models.fcn_seg import * 18 | from ptsemseg.models.segnet_seg import * 19 | from ptsemseg.models.frrn_seg import * 20 | from ptsemseg.models.deeplab_seg import * 21 | from ptsemseg.models.fcrn_seg import * 22 | from ptsemseg.models.dispnet_seg import * 23 | 24 | from ptsemseg.models.fcn_depth import * 25 | from ptsemseg.models.segnet_depth import * 26 | from ptsemseg.models.frrn_depth import * 27 | from ptsemseg.models.deeplab_depth import * 28 | from ptsemseg.models.fcrn_depth import * 29 | from ptsemseg.models.dispnet_depth import * 30 | 31 | from saliency import BackPropagation 32 | 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--image_path", default='datasets/kitti/semantics/training/image_2/000193_10.png', type=str, 36 | help='path to test image') 37 | 38 | parser.add_argument("--model_name", type=str, default='dispnet', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 39 | parser.add_argument("--task", type=str, default="seg", choices=["seg", "depth"]) 40 | parser.add_argument("--model_path", type=str, 41 | default='runs/dispnet_kitti_seg/86732_256_832_cityscaperPretrained_lr5/dispnet_kitti_best_model.pkl', 42 | help='path to pretrained model') 43 | 44 | # the image resolution here should match the pretrained model training resolution 45 | parser.add_argument("--height", type=int, default=256, help="image resize height") 46 | parser.add_argument("--width", type=int, default=832, help="image resize width") 47 | 48 | parser.add_argument("--pos_i", type=int, default=200, help="x coordinate for the pixel to test") 49 | parser.add_argument("--pos_j", type=int, default=160, help="j coordinate for the pixel to test") 50 | 51 | parser.add_argument("--topk", type=int, default=1, 52 | help="top k classes to produce the saliency map for seg (shoud be set to 1 for depth)") 53 | 54 | args = parser.parse_args() 55 | 56 | class_names = [ 57 | "road", 58 | "sidewalk", 59 | "building", 60 | "wall", 61 | "fence", 62 | "pole", 63 | "traffic_light", 64 | "traffic_sign", 65 | "vegetation", 66 | "terrain", 67 | "sky", 68 | "person", 69 | "rider", 70 | "car", 71 | "truck", 72 | "bus", 73 | "train", 74 | "motorcycle", 75 | "bicycle", 76 | ] 77 | 78 | def get_model(model_name, task): 79 | if task == "seg": 80 | try: 81 | return { 82 | "fcn": fcn_seg(n_classes=19), 83 | "frrnA": frrn_seg(model_type = "A", n_classes=19), 84 | "segnet": segnet_seg(n_classes=19), 85 | "deeplab": deeplab_seg(n_classes=19), 86 | "dispnet": dispnet_seg(n_classes=19), 87 | "fcrn": fcrn_seg(n_classes=19), 88 | }[model_name] 89 | except: 90 | raise("Model {} not available".format(model_name)) 91 | elif task == "depth": # TODO: add depth models 92 | try: 93 | return { 94 | "fcn": fcn_depth(), 95 | "frrnA": frrn_depth(model_type = "A"), 96 | "segnet": segnet_depth(), 97 | "deeplab": deeplab_depth(), 98 | "dispnet": dispnet_depth(), 99 | "fcrn": fcrn_depth(), 100 | }[model_name] 101 | except: 102 | raise("Model {} not available".format(model_name)) 103 | 104 | 105 | def image_process(img, task): 106 | # image preprocessing need to match the training settings of the corresponding pretrained model 107 | img = np.array(img, dtype=np.uint8) 108 | img = m.imresize(img, (args.height, args.width)) 109 | 110 | 
#img[args.pos_i, args.pos_j] = 0 111 | 112 | raw_img = img.astype(np.float) 113 | if task == "seg": 114 | img = img[:, :, ::-1] # RGB -> BGR shape: [h, w, 3] 115 | img = img.astype(float) / 255.0 # norm to [0,1] for seg 116 | if task == "depth": 117 | img = ((img.astype(float) / 255 - 0.5) / 0.5) # normalize to [-1, 1] 118 | 119 | # NHWC -> NCHW 120 | img = img.transpose(2, 0, 1) # [3, h, w] 121 | img = torch.from_numpy(img).float() # tensor, shape: [3, h, w] 122 | img = img.unsqueeze(0) 123 | return img, raw_img 124 | 125 | 126 | def pixel_locate(img, pos_i, pos_j): 127 | for p in range(pos_i - 3, pos_i + 4): 128 | for q in range(pos_j-3, pos_j+4): 129 | img[p, q, 0] = 255 130 | img[p, q, 1] = 255 131 | img[p, q, 2] = 255 132 | return img 133 | 134 | 135 | def main(): 136 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 137 | 138 | # Model 139 | model = get_model(args.model_name, args.task) 140 | weights = torch.load(args.model_path, map_location=lambda storage, loc: storage) 141 | model.load_state_dict(weights['model_state']) 142 | model.to(device) 143 | model.eval() 144 | 145 | # Image preprocessing 146 | img = m.imread(args.image_path) 147 | img, raw_img = image_process(img, args.task) 148 | 149 | # ========================================================================= 150 | print('Vanilla Backpropagation and saliency map') 151 | # ========================================================================= 152 | bp = BackPropagation(model=model, task=args.task) 153 | # preds, idx = bp.forward_demo(img.to(device), args.pos_i, args.pos_j) 154 | pred_idx = bp.forward(img.to(device)) 155 | 156 | for i in range(0, args.topk): 157 | bp.backward(pos_i=args.pos_i, pos_j=args.pos_j, idx=pred_idx[args.pos_i, args.pos_j]) 158 | output_vanilla, output_saliency = bp.generate() # [3, h, w] 159 | # m.imsave('saliency_results/vanilla_BP_map_{}_{}.png'.format(args.model_name, args.task), output_vanilla) 160 | for p in range(args.pos_i - 5, args.pos_i + 6): 161 | for q in range(args.pos_j - 5, args.pos_j + 6): 162 | output_saliency[p, q] = np.max(output_saliency) 163 | output_saliency[p, q] = np.max(output_saliency) 164 | plot.imsave('saliency_results/BP_saliency_map_{}_{}.png'.format(args.model_name, args.task), output_saliency, cmap="viridis") 165 | m.imsave('saliency_results/image_pixel_locate.png', pixel_locate(raw_img, pos_i=args.pos_i, pos_j=args.pos_j)) 166 | 167 | 168 | 169 | # output_saliency = (output_saliency - np.min(output_saliency)) / np.max(output_saliency) 170 | # output_saliency = 1 - output_saliency 171 | # heatmap = cv2.applyColorMap(np.uint8(255 * output_saliency), cv2.COLORMAP_JET) 172 | # m.imsave('saliency_results/heatmap_{}_{}.png'.format(args.model_name, args.task), heatmap) 173 | 174 | # if args.task == "seg": 175 | # print('[{:.5f}] {}'.format(preds[i], class_names[idx[i]])) 176 | 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /demo_seg.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import scipy.misc as m 4 | 5 | from ptsemseg.models.fcn_seg import * 6 | from ptsemseg.models.segnet_seg import * 7 | from ptsemseg.models.frrn_seg import * 8 | from ptsemseg.models.deeplab_seg import * 9 | from ptsemseg.models.fcrn_seg import * 10 | from ptsemseg.models.dispnet_seg import * 11 | 12 | # segmentation demo 13 | # image resize height and width need to match the training settings of the 
pretrained model 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--dataset", default='cityscapes', type=str, choices=["cityscapes", "kitti"]) 17 | # datasets/kitti/semantics/training/image_2/000193_10.png 18 | parser.add_argument("--img_path", type=str, 19 | default='datasets/cityscapes/leftImg8bit/train/aachen/aachen_000005_000019_leftImg8bit.png', 20 | help='path to the input image') 21 | parser.add_argument("--model_name", type=str, default='deeplab', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 22 | parser.add_argument("--model_path", type=str, 23 | default='runs/deeplab_cityscapes_seg/11044_513_1025_train_requireF_adam_lr4_batchsize2/deeplab_cityscapes_best_model.pkl', 24 | help='path to the pretrained model') 25 | parser.add_argument("--height", type=int, default=512, help="image resize height") # 256 for kitti 26 | parser.add_argument("--width", type=int, default=1024, help="image resize width") # 832 for kitti 27 | 28 | args = parser.parse_args() 29 | 30 | def get_model(model_name): 31 | try: 32 | return { 33 | "fcn": fcn_seg(n_classes=19), 34 | "frrnA": frrn_seg(n_classes=19, model_type="A"), 35 | "segnet": segnet_seg(n_classes=19), 36 | "deeplab": deeplab_seg(n_classes=19), 37 | "dispnet": dispnet_seg(n_classes=19), 38 | "fcrn": fcrn_seg(n_classes=19), 39 | }[model_name] 40 | except: 41 | raise("Model {} not available".format(model_name)) 42 | 43 | # 19classes, RGB of maskes 44 | colors = [ # [ 0, 0, 0], 45 | [128, 64, 128], 46 | [244, 35, 232], 47 | [70, 70, 70], 48 | [102, 102, 156], 49 | [190, 153, 153], 50 | [153, 153, 153], 51 | [250, 170, 30], 52 | [220, 220, 0], 53 | [107, 142, 35], 54 | [152, 251, 152], 55 | [0, 130, 180], 56 | [220, 20, 60], 57 | [255, 0, 0], 58 | [0, 0, 142], 59 | [0, 0, 70], 60 | [0, 60, 100], 61 | [0, 80, 100], 62 | [0, 0, 230], 63 | [119, 11, 32], 64 | ] 65 | 66 | label_colours = dict(zip(range(19), colors)) 67 | 68 | def decode_segmap_tocolor(temp, n_classes=19): 69 | r = temp.copy() 70 | g = temp.copy() 71 | b = temp.copy() 72 | for l in range(0, n_classes): 73 | r[temp == l] = label_colours[l][0] 74 | g[temp == l] = label_colours[l][1] 75 | b[temp == l] = label_colours[l][2] 76 | 77 | rgb = np.zeros((temp.shape[0], temp.shape[1], 3)) 78 | rgb[:, :, 0] = r / 255.0 79 | rgb[:, :, 1] = g / 255.0 80 | rgb[:, :, 2] = b / 255.0 81 | return rgb 82 | 83 | 84 | @torch.no_grad() 85 | def main(): 86 | img = m.imread(args.img_path) 87 | 88 | # input image preprocessing, need to match the training settings of the pretrained model 89 | img = m.imresize(img, (args.height, args.width)).astype(np.float32) 90 | img = img[:, :, ::-1] 91 | img = img / 255.0 92 | img = img.transpose(2, 0, 1) 93 | img = torch.from_numpy(img).float().unsqueeze(0) 94 | 95 | # load pretrained model 96 | model = get_model(args.model_name) 97 | # weights = torch.load(args.model_path) 98 | weights = torch.load(args.model_path, map_location=lambda storage, loc: storage) 99 | model.load_state_dict(weights['model_state']) 100 | model.eval() 101 | 102 | output = model(img) 103 | pred = np.squeeze(output.data.max(1)[1].cpu().numpy(), axis=0) 104 | 105 | decoded = decode_segmap_tocolor(pred, n_classes=19) 106 | # m.imsave("output_predict_img/dispnet_output_seg.png", decoded) 107 | m.imsave("city_seg.png", decoded) 108 | 109 | 110 | if __name__ == '__main__': 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /fcrn_metric_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/fcrn_metric_0.png -------------------------------------------------------------------------------- /kitti_depth_eval/__pycache__/depth_evaluation_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/kitti_depth_eval/__pycache__/depth_evaluation_utils.cpython-36.pyc -------------------------------------------------------------------------------- /kitti_depth_eval/depth_evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # Mostly based on the code written by Clement Godard: 2 | # https://github.com/mrharicot/monodepth/blob/master/utils/evaluation_utils.py 3 | import numpy as np 4 | from collections import Counter 5 | from path import Path 6 | from scipy.misc import imread 7 | from tqdm import tqdm 8 | from scipy.interpolate import LinearNDInterpolator 9 | import scipy.misc as m 10 | 11 | 12 | class test_framework_KITTI(object): 13 | def __init__(self, root, test_files, seq_length=3, min_depth=1e-3, max_depth=100, step=1): 14 | self.root = root 15 | self.min_depth, self.max_depth = min_depth, max_depth 16 | self.calib_dirs, self.gt_files, self.img_files, self.cams = read_scene_data(self.root, test_files, seq_length, step) 17 | 18 | def __getitem__(self, i): 19 | tgt = imread(self.img_files[i]).astype(np.float32) # input image 20 | depth, depth_fill = generate_depth_map(self.calib_dirs[i], self.gt_files[i], tgt.shape[:2], self.cams[i], interp=True) 21 | # m.imsave('depth_gt.png', depth) 22 | # m.imsave('depth_gt_filled.png', depth_fill) 23 | return {'tgt': tgt, 24 | 'path': self.img_files[i], 25 | 'gt_depth': depth, 26 | 'mask': generate_mask(depth, self.min_depth, self.max_depth) 27 | } 28 | 29 | def __len__(self): 30 | return len(self.img_files) 31 | 32 | 33 | ############################################################################### 34 | # EIGEN 35 | # generate depth ground truths 36 | 37 | def read_scene_data(data_root, test_list): 38 | data_root = Path(data_root) 39 | gt_files = [] 40 | calib_dirs = [] 41 | im_files = [] 42 | cams = [] 43 | # displacements = [] 44 | # demi_length = (seq_length - 1) // 2 45 | # shift_range = step * np.arange(-demi_length, demi_length + 1) 46 | 47 | print('getting test metadata ... 
') 48 | for sample in tqdm(test_list): 49 | tgt_img_path = data_root/sample # path for image 50 | # date: '2011_09_26', scence: '2011_09_26_drive_0002_sync', cam_id: 'img_02', index: '0000000069' 51 | date, scene, cam_id, _, index = sample[:-4].split('/') 52 | vel_path = data_root/date/scene/'velodyne_points'/'data'/'{}.bin'.format(index[:10]) 53 | 54 | if tgt_img_path.isfile(): 55 | gt_files.append(vel_path) 56 | calib_dirs.append(data_root/date) 57 | im_files.append(tgt_img_path) 58 | cams.append(int(cam_id[-2:])) 59 | else: 60 | print('{} missing'.format(tgt_img_path)) 61 | return calib_dirs, gt_files, im_files, cams 62 | 63 | 64 | def load_velodyne_points(file_name): 65 | # adapted from https://github.com/hunse/kitti 66 | points = np.fromfile(file_name, dtype=np.float32).reshape(-1, 4) 67 | points[:,3] = 1 68 | return points 69 | 70 | 71 | def lin_interp(shape, xyd): 72 | # taken from https://github.com/hunse/kitti 73 | m, n = shape 74 | ij, d = xyd[:, 1::-1], xyd[:, 2] 75 | f = LinearNDInterpolator(ij, d, fill_value=0) 76 | J, I = np.meshgrid(np.arange(n), np.arange(m)) 77 | IJ = np.vstack([I.flatten(), J.flatten()]).T 78 | disparity = f(IJ).reshape(shape) 79 | return disparity 80 | 81 | 82 | def read_calib_file(path): 83 | # taken from https://github.com/hunse/kitti 84 | float_chars = set("0123456789.e+- ") 85 | data = {} 86 | with open(path, 'r') as f: 87 | for line in f.readlines(): 88 | key, value = line.split(':', 1) 89 | value = value.strip() 90 | data[key] = value 91 | if float_chars.issuperset(value): 92 | # try to cast to float array 93 | try: 94 | data[key] = np.array(list(map(float, value.split(' ')))) 95 | except ValueError: 96 | # casting error: data[key] already eq. value, so pass 97 | pass 98 | return data 99 | 100 | 101 | def sub2ind(matrixSize, rowSub, colSub): 102 | m, n = matrixSize 103 | return rowSub * (n-1) + colSub - 1 104 | 105 | 106 | def generate_depth_map(calib_dir, velo_file_name, im_shape, cam=2, interp=False): 107 | # load calibration files 108 | cam2cam = read_calib_file(calib_dir/'calib_cam_to_cam.txt') 109 | velo2cam = read_calib_file(calib_dir/'calib_velo_to_cam.txt') 110 | velo2cam = np.hstack((velo2cam['R'].reshape(3,3), velo2cam['T'][..., np.newaxis])) 111 | velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0]))) 112 | 113 | # compute projection matrix velodyne->image plane 114 | R_cam2rect = np.eye(4) 115 | R_cam2rect[:3,:3] = cam2cam['R_rect_00'].reshape(3,3) 116 | P_rect = cam2cam['P_rect_0'+str(cam)].reshape(3,4) 117 | P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam) 118 | 119 | # load velodyne points and remove all behind image plane (approximation) 120 | # each row of the velodyne data is forward, left, up, reflectance 121 | velo = load_velodyne_points(velo_file_name) 122 | velo = velo[velo[:, 0] >= 0, :] 123 | 124 | # project the points to the camera 125 | velo_pts_im = np.dot(P_velo2im, velo.T).T 126 | velo_pts_im[:, :2] = velo_pts_im[:,:2] / velo_pts_im[:,-1:] 127 | 128 | # check if in bounds 129 | # use minus 1 to get the exact same value as KITTI matlab code 130 | velo_pts_im[:, 0] = np.round(velo_pts_im[:,0]) - 1 131 | velo_pts_im[:, 1] = np.round(velo_pts_im[:,1]) - 1 132 | val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0) 133 | val_inds = val_inds & (velo_pts_im[:,0] < im_shape[1]) & (velo_pts_im[:,1] < im_shape[0]) 134 | velo_pts_im = velo_pts_im[val_inds, :] 135 | 136 | # project to image 137 | depth = np.zeros((im_shape)) 138 | depth[velo_pts_im[:, 1].astype(np.int), velo_pts_im[:, 0].astype(np.int)] = 
velo_pts_im[:, 2] 139 | 140 | # find the duplicate points and choose the closest depth 141 | inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0]) 142 | dupe_inds = [item for item, count in Counter(inds).items() if count > 1] 143 | for dd in dupe_inds: 144 | pts = np.where(inds == dd)[0] 145 | x_loc = int(velo_pts_im[pts[0], 0]) 146 | y_loc = int(velo_pts_im[pts[0], 1]) 147 | depth[y_loc, x_loc] = velo_pts_im[pts, 2].min() 148 | depth[depth < 0] = 0 149 | 150 | if interp: 151 | # interpolate the depth map to fill in holes 152 | depth_interp = lin_interp(im_shape, velo_pts_im) 153 | return depth, depth_interp 154 | else: 155 | return depth 156 | 157 | 158 | def generate_mask(gt_depth, min_depth, max_depth): 159 | mask = np.logical_and(gt_depth > min_depth, 160 | gt_depth < max_depth) 161 | # crop used by Garg ECCV16 to reprocude Eigen NIPS14 results 162 | # for that the ground truth is not for the entire image 163 | gt_height, gt_width = gt_depth.shape 164 | crop = np.array([0.40810811 * gt_height, 0.99189189 * gt_height, 165 | 0.03594771 * gt_width, 0.96405229 * gt_width]).astype(np.int32) 166 | 167 | crop_mask = np.zeros(mask.shape) 168 | crop_mask[crop[0]:crop[1],crop[2]:crop[3]] = 1 169 | mask = np.logical_and(mask, crop_mask) 170 | return mask 171 | -------------------------------------------------------------------------------- /kitti_train_depth_prepare/__pycache__/kitti_raw_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/kitti_train_depth_prepare/__pycache__/kitti_raw_loader.cpython-36.pyc -------------------------------------------------------------------------------- /kitti_train_depth_prepare/kitti_raw_loader.py: -------------------------------------------------------------------------------- 1 | # Modified from code of Clement Pinard 2 | # https://github.com/ClementPinard/SfmLearner-Pytorch 3 | 4 | from __future__ import division 5 | import numpy as np 6 | from path import Path 7 | import scipy.misc 8 | from collections import Counter 9 | 10 | 11 | def read_calib_file(path): 12 | # taken from https://github.com/hunse/kitti 13 | float_chars = set("0123456789.e+- ") 14 | data = {} 15 | with open(path, 'r') as f: 16 | for line in f.readlines(): 17 | key, value = line.split(':', 1) 18 | value = value.strip() 19 | data[key] = value 20 | if float_chars.issuperset(value): 21 | # try to cast to float array 22 | try: 23 | data[key] = np.array(list(map(float, value.split(' ')))) 24 | except ValueError: 25 | # casting error: data[key] already eq. 
value, so pass 26 | pass 27 | 28 | return data 29 | 30 | 31 | class KittiRawLoader(object): 32 | def __init__(self, 33 | dataset_dir, 34 | static_frames_file=None, 35 | img_height=128, 36 | img_width=416, 37 | min_speed=2, 38 | ): 39 | dir_path = Path(__file__).realpath().dirname() 40 | test_scene_file = dir_path/'test_scenes.txt' 41 | 42 | self.from_speed = static_frames_file is None 43 | if static_frames_file is not None: 44 | static_frames_file = Path(static_frames_file) 45 | self.collect_static_frames(static_frames_file) 46 | 47 | with open(test_scene_file, 'r') as f: 48 | test_scenes = f.readlines() 49 | self.test_scenes = [t[:-1] for t in test_scenes] 50 | self.dataset_dir = Path(dataset_dir) 51 | self.img_height = img_height 52 | self.img_width = img_width 53 | self.cam_ids = ['02', '03'] 54 | self.date_list = ['2011_09_26', '2011_09_28', '2011_09_29', '2011_09_30', '2011_10_03'] 55 | self.min_speed = min_speed 56 | self.collect_train_folders() 57 | 58 | def collect_static_frames(self, static_frames_file): 59 | with open(static_frames_file, 'r') as f: 60 | frames = f.readlines() 61 | self.static_frames = {} 62 | for fr in frames: 63 | if fr == '\n': 64 | continue 65 | date, drive, frame_id = fr.split(' ') 66 | curr_fid = '%.10d' % (np.int(frame_id[:-1])) 67 | if drive not in self.static_frames.keys(): 68 | self.static_frames[drive] = [] 69 | self.static_frames[drive].append(curr_fid) 70 | 71 | def collect_train_folders(self): 72 | self.scenes = [] 73 | for date in self.date_list: 74 | drive_set = (self.dataset_dir/date).dirs() 75 | for dr in drive_set: 76 | if dr.name[:-5] not in self.test_scenes: 77 | self.scenes.append(dr) 78 | 79 | def collect_scenes(self, drive): 80 | train_scenes = [] 81 | for c in self.cam_ids: 82 | oxts = sorted((drive/'oxts'/'data').files('*.txt')) 83 | scene_data = {'cid': c, 'dir': drive, 'speed': [], 'frame_id': [], 'rel_path': drive.name + '_' + c} 84 | scale = None 85 | 86 | for n, f in enumerate(oxts): 87 | metadata = np.genfromtxt(f) 88 | speed = metadata[8:11] 89 | scene_data['speed'].append(speed) 90 | scene_data['frame_id'].append('{:010d}'.format(n)) 91 | lat = metadata[0] 92 | 93 | if scale is None: 94 | scale = np.cos(lat * np.pi / 180.) 
95 | 96 | sample = self.load_image(scene_data, 0) 97 | if sample is None: 98 | return [] 99 | scene_data['P_rect'] = self.get_P_rect(scene_data, sample[1], sample[2]) 100 | scene_data['intrinsics'] = scene_data['P_rect'][:,:3] 101 | 102 | train_scenes.append(scene_data) 103 | return train_scenes 104 | 105 | def get_scene_imgs(self, scene_data): 106 | def construct_sample(scene_data, i, frame_id): 107 | sample = {"img":self.load_image(scene_data, i)[0], "id":frame_id} 108 | sample['depth'] = self.generate_depth_map(scene_data, i) 109 | return sample 110 | 111 | if self.from_speed: 112 | cum_speed = np.zeros(3) 113 | for i, speed in enumerate(scene_data['speed']): 114 | cum_speed += speed 115 | speed_mag = np.linalg.norm(cum_speed) 116 | if speed_mag > self.min_speed: 117 | frame_id = scene_data['frame_id'][i] 118 | yield construct_sample(scene_data, i, frame_id) 119 | cum_speed *= 0 120 | else: # from static frame file 121 | drive = str(scene_data['dir'].name) 122 | for (i,frame_id) in enumerate(scene_data['frame_id']): 123 | if (drive not in self.static_frames.keys()) or (frame_id not in self.static_frames[drive]): 124 | yield construct_sample(scene_data, i, frame_id) 125 | 126 | def get_P_rect(self, scene_data, zoom_x, zoom_y): 127 | calib_file = scene_data['dir'].parent/'calib_cam_to_cam.txt' 128 | 129 | filedata = self.read_raw_calib_file(calib_file) 130 | P_rect = np.reshape(filedata['P_rect_' + scene_data['cid']], (3, 4)) 131 | P_rect[0] *= zoom_x 132 | P_rect[1] *= zoom_y 133 | return P_rect 134 | 135 | def load_image(self, scene_data, tgt_idx): 136 | img_file = scene_data['dir']/'image_{}'.format(scene_data['cid'])/'data'/scene_data['frame_id'][tgt_idx]+'.png' 137 | if not img_file.isfile(): 138 | return None 139 | img = scipy.misc.imread(img_file) 140 | zoom_y = self.img_height/img.shape[0] 141 | zoom_x = self.img_width/img.shape[1] 142 | img = scipy.misc.imresize(img, (self.img_height, self.img_width)) 143 | return img, zoom_x, zoom_y 144 | 145 | def read_raw_calib_file(self, filepath): 146 | # From https://github.com/utiasSTARS/pykitti/blob/master/pykitti/utils.py 147 | """Read in a calibration file and parse into a dictionary.""" 148 | data = {} 149 | 150 | with open(filepath, 'r') as f: 151 | for line in f.readlines(): 152 | key, value = line.split(':', 1) 153 | # The only non-float values in these files are dates, which 154 | # we don't care about anyway 155 | try: 156 | data[key] = np.array([float(x) for x in value.split()]) 157 | except ValueError: 158 | pass 159 | return data 160 | 161 | def generate_depth_map(self, scene_data, tgt_idx): 162 | # compute projection matrix velodyne->image plane 163 | def sub2ind(matrixSize, rowSub, colSub): 164 | m, n = matrixSize 165 | return rowSub * (n-1) + colSub - 1 166 | 167 | R_cam2rect = np.eye(4) 168 | 169 | calib_dir = scene_data['dir'].parent 170 | cam2cam = self.read_raw_calib_file(calib_dir/'calib_cam_to_cam.txt') 171 | velo2cam = self.read_raw_calib_file(calib_dir/'calib_velo_to_cam.txt') 172 | velo2cam = np.hstack((velo2cam['R'].reshape(3,3), velo2cam['T'][..., np.newaxis])) 173 | velo2cam = np.vstack((velo2cam, np.array([0, 0, 0, 1.0]))) 174 | P_rect = np.copy(scene_data['P_rect']) 175 | R_cam2rect[:3,:3] = cam2cam['R_rect_00'].reshape(3,3) 176 | P_velo2im = np.dot(np.dot(P_rect, R_cam2rect), velo2cam) 177 | 178 | velo_file_name = scene_data['dir']/'velodyne_points'/'data'/'{}.bin'.format(scene_data['frame_id'][tgt_idx]) 179 | 180 | # load velodyne points and remove all behind image plane (approximation) 181 | # each row of 
the velodyne data is forward, left, up, reflectance 182 | velo = np.fromfile(velo_file_name, dtype=np.float32).reshape(-1, 4) 183 | velo[:,3] = 1 184 | velo = velo[velo[:, 0] >= 0, :] 185 | 186 | # project the points to the camera 187 | velo_pts_im = np.dot(P_velo2im, velo.T).T 188 | velo_pts_im[:, :2] = velo_pts_im[:,:2] / velo_pts_im[:,-1:] 189 | 190 | # check if in bounds 191 | # use minus 1 to get the exact same value as KITTI matlab code 192 | velo_pts_im[:, 0] = np.round(velo_pts_im[:,0]) - 1 193 | velo_pts_im[:, 1] = np.round(velo_pts_im[:,1]) - 1 194 | 195 | val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0) 196 | val_inds = val_inds & (velo_pts_im[:, 0] < self.img_width) 197 | val_inds = val_inds & (velo_pts_im[:, 1] < self.img_height) 198 | velo_pts_im = velo_pts_im[val_inds, :] 199 | 200 | # project to image 201 | depth = np.zeros((self.img_height, self.img_width)).astype(np.float32) 202 | depth[velo_pts_im[:, 1].astype(np.int), velo_pts_im[:, 0].astype(np.int)] = velo_pts_im[:, 2] 203 | 204 | # find the duplicate points and choose the closest depth 205 | inds = sub2ind(depth.shape, velo_pts_im[:, 1], velo_pts_im[:, 0]) 206 | dupe_inds = [item for item, count in Counter(inds).items() if count > 1] 207 | for dd in dupe_inds: 208 | pts = np.where(inds == dd)[0] 209 | x_loc = int(velo_pts_im[pts[0], 0]) 210 | y_loc = int(velo_pts_im[pts[0], 1]) 211 | depth[y_loc, x_loc] = velo_pts_im[pts, 2].min() 212 | depth[depth < 0] = 0 213 | return depth 214 | -------------------------------------------------------------------------------- /kitti_train_depth_prepare/prepare_train_data.py: -------------------------------------------------------------------------------- 1 | # Modified from code of Clement Pinard 2 | # https://github.com/ClementPinard/SfmLearner-Pytorch 3 | 4 | import argparse 5 | import scipy.misc 6 | import numpy as np 7 | from joblib import Parallel, delayed 8 | from tqdm import tqdm 9 | from path import Path 10 | from kitti_raw_loader import KittiRawLoader 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--dataset_dir", default='../../kitti', type=str, 14 | help='path to original dataset') 15 | parser.add_argument("--static-frames", default='static_frames.txt', 16 | help="list of imgs to discard for being static, if not set will discard them based on speed \ 17 | (careful, on KITTI some frames have incorrect speed)") 18 | parser.add_argument("--dump-root", type=str, default='../prepared_kitti_train_data', help="Where to dump the data") 19 | parser.add_argument("--height", type=int, default=128, help="image height") 20 | parser.add_argument("--width", type=int, default=416, help="image width") 21 | parser.add_argument("--num-threads", type=int, default=1, help="number of threads to use") 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def dump_example(scene, args): # scene: 2011_0926_drive_0003_sync, ... 
27 | scene_list = data_loader.collect_scenes(scene) # scene_list: ..._02, ..._03 28 | # print(scene) 29 | for scene_data in scene_list: 30 | dump_dir = args.dump_root/scene_data['rel_path'] 31 | dump_dir.makedirs_p() 32 | intrinsics = scene_data['intrinsics'] 33 | dump_cam_file = dump_dir/'cam.txt' 34 | np.savetxt(dump_cam_file, intrinsics) 35 | 36 | # print(dump_dir) 37 | for sample in data_loader.get_scene_imgs(scene_data): # sample: img, id, depth 38 | img, frame_nb = sample["img"], sample["id"] 39 | dump_img_file = dump_dir/'{}.jpg'.format(frame_nb) 40 | scipy.misc.imsave(dump_img_file, img) 41 | dump_depth_file = dump_dir/'{}.npy'.format(frame_nb) 42 | np.save(dump_depth_file, sample["depth"]) 43 | 44 | if len(dump_dir.files('*.jpg')) < 3: 45 | dump_dir.rmtree() 46 | 47 | 48 | def main(): 49 | args.dump_root = Path(args.dump_root) 50 | args.dump_root.mkdir_p() 51 | 52 | global data_loader 53 | 54 | data_loader = KittiRawLoader(args.dataset_dir, 55 | static_frames_file=args.static_frames, 56 | img_height=args.height, 57 | img_width=args.width, 58 | ) 59 | 60 | print('Retrieving frames') 61 | for scene in data_loader.scenes: 62 | print(scene) 63 | if args.num_threads == 1: 64 | for scene in tqdm(data_loader.scenes): 65 | dump_example(scene, args) 66 | else: 67 | Parallel(n_jobs=args.num_threads)(delayed(dump_example)(scene, args) for scene in tqdm(data_loader.scenes)) 68 | 69 | print('Generating train val lists') 70 | np.random.seed(8964) 71 | # to avoid data snooping, we will make two cameras of the same scene to fall in the same set, train or val 72 | subdirs = args.dump_root.dirs() 73 | canonic_prefixes = set([subdir.basename()[:-2] for subdir in subdirs]) 74 | with open(args.dump_root / 'train.txt', 'w') as tf: 75 | with open(args.dump_root / 'val.txt', 'w') as vf: 76 | for pr in tqdm(canonic_prefixes): 77 | corresponding_dirs = args.dump_root.dirs('{}*'.format(pr)) 78 | if np.random.random() < 0.1: 79 | for s in corresponding_dirs: 80 | vf.write('{}\n'.format(s.name)) 81 | else: 82 | for s in corresponding_dirs: 83 | tf.write('{}\n'.format(s.name)) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /kitti_train_depth_prepare/test_scenes.txt: -------------------------------------------------------------------------------- 1 | 2011_09_26_drive_0117 2 | 2011_09_28_drive_0002 3 | 2011_09_26_drive_0052 4 | 2011_09_30_drive_0016 5 | 2011_09_26_drive_0059 6 | 2011_09_26_drive_0027 7 | 2011_09_26_drive_0020 8 | 2011_09_26_drive_0009 9 | 2011_09_26_drive_0013 10 | 2011_09_26_drive_0101 11 | 2011_09_26_drive_0046 12 | 2011_09_26_drive_0029 13 | 2011_09_26_drive_0064 14 | 2011_09_26_drive_0048 15 | 2011_10_03_drive_0027 16 | 2011_09_26_drive_0002 17 | 2011_09_26_drive_0036 18 | 2011_09_29_drive_0071 19 | 2011_10_03_drive_0047 20 | 2011_09_30_drive_0027 21 | 2011_09_26_drive_0086 22 | 2011_09_26_drive_0084 23 | 2011_09_26_drive_0096 24 | 2011_09_30_drive_0018 25 | 2011_09_26_drive_0106 26 | 2011_09_26_drive_0056 27 | 2011_09_26_drive_0023 28 | 2011_09_26_drive_0093 29 | -------------------------------------------------------------------------------- /output_predict_img/deeplab_output_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/deeplab_output_depth.png 
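Aside on the prepared KITTI depth data: the prepare_train_data.py script shown earlier dumps, for every KITTI drive and camera, a folder of <frame_id>.jpg images resized to 128x416, matching <frame_id>.npy depth maps and a cam.txt with the 3x3 intrinsics, plus train.txt/val.txt scene lists — this is the prepared_kitti_train_data directory that the *_kitti_depth.yml configs point to. A small sketch of reading one such sample back, using NumPy/scipy as elsewhere in the repo; the scene folder and frame names here are hypothetical examples:

import numpy as np
import scipy.misc as m

scene_dir = "prepared_kitti_train_data/2011_09_26_drive_0001_sync_02"  # <drive>_<cam_id>
img = m.imread(scene_dir + "/0000000005.jpg")        # uint8, shape [128, 416, 3]
depth = np.load(scene_dir + "/0000000005.npy")       # float32 metres, 0 where no LiDAR return
intrinsics = np.genfromtxt(scene_dir + "/cam.txt")   # 3x3 camera matrix (P_rect[:, :3])
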
-------------------------------------------------------------------------------- /output_predict_img/deeplab_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/deeplab_output_seg.png -------------------------------------------------------------------------------- /output_predict_img/dispnet_output_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/dispnet_output_depth.png -------------------------------------------------------------------------------- /output_predict_img/dispnet_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/dispnet_output_seg.png -------------------------------------------------------------------------------- /output_predict_img/fcn_output_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/fcn_output_depth.png -------------------------------------------------------------------------------- /output_predict_img/fcn_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/fcn_output_seg.png -------------------------------------------------------------------------------- /output_predict_img/fcrn_output_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/fcrn_output_depth.png -------------------------------------------------------------------------------- /output_predict_img/fcrn_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/fcrn_output_seg.png -------------------------------------------------------------------------------- /output_predict_img/frrn_output_depth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/frrn_output_depth.png -------------------------------------------------------------------------------- /output_predict_img/frrn_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/frrn_output_seg.png -------------------------------------------------------------------------------- /output_predict_img/segnet_output_depth.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/segnet_output_depth.png -------------------------------------------------------------------------------- /output_predict_img/segnet_output_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/output_predict_img/segnet_output_seg.png -------------------------------------------------------------------------------- /ptsemseg/augmentations/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from ptsemseg.augmentations.augmentations import * 3 | 4 | logger = logging.getLogger('ptsemseg') 5 | 6 | key2aug = {'gamma': AdjustGamma, 7 | 'hue': AdjustHue, 8 | 'brightness': AdjustBrightness, 9 | 'saturation': AdjustSaturation, 10 | 'contrast': AdjustContrast, 11 | 'rcrop': RandomCrop, 12 | 'hflip': RandomHorizontallyFlip, 13 | 'vflip': RandomVerticallyFlip, 14 | 'scale': Scale, 15 | 'rsize': RandomSized, 16 | 'rsizecrop': RandomSizedCrop, 17 | 'rotate': RandomRotate, 18 | 'translate': RandomTranslate, 19 | 'ccrop': CenterCrop,} 20 | 21 | def get_composed_augmentations(aug_dict): 22 | if aug_dict is None: 23 | logger.info("Using No Augmentations") 24 | return None 25 | 26 | augmentations = [] 27 | for aug_key, aug_param in aug_dict.items(): 28 | augmentations.append(key2aug[aug_key](aug_param)) 29 | logger.info("Using {} aug with params {}".format(aug_key, aug_param)) 30 | return Compose(augmentations) 31 | 32 | 33 | -------------------------------------------------------------------------------- /ptsemseg/loader/__init__.py: -------------------------------------------------------------------------------- 1 | from ptsemseg.loader.cityscapes_loader_seg import cityscapesLoader_seg 2 | from ptsemseg.loader.kitti_loader_seg import kittiLoader_seg 3 | from ptsemseg.loader.kitti_loader_depth import kittiLoader_depth 4 | 5 | 6 | def get_loader(name, task): 7 | if task == "seg": 8 | return { 9 | "cityscapes": cityscapesLoader_seg, 10 | "kitti": kittiLoader_seg 11 | }[name] 12 | elif task == "depth": 13 | return { 14 | "kitti": kittiLoader_depth 15 | }[name] 16 | else: 17 | print("task undefined!") 18 | -------------------------------------------------------------------------------- /ptsemseg/loader/cityscapes_loader_depth.py: -------------------------------------------------------------------------------- 1 | # Mostly borrowed from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import os 4 | import torch 5 | import sys 6 | import numpy as np 7 | import scipy.misc as m 8 | import cv2 9 | import copy 10 | 11 | from torch.utils import data 12 | 13 | from ptsemseg.utils import recursive_glob 14 | from ptsemseg.augmentations import * 15 | 16 | 17 | class cityscapesLoader_depth(data.Dataset): 18 | def __init__( 19 | self, 20 | root, 21 | split="train", 22 | is_transform=True, 23 | img_size=(1024, 2048), 24 | augmentations=None, 25 | img_norm=True, 26 | # version="cityscapes", 27 | ): 28 | """__init__ 29 | 30 | :param root: 31 | :param split: 32 | :param is_transform: 33 | :param img_size: 34 | :param augmentations 35 | """ 36 | self.root = root 37 | self.split = split 38 | self.is_transform = is_transform 39 | self.augmentations = augmentations 40 | self.img_norm = img_norm 41 | self.img_size = ( 42 | img_size if isinstance(img_size, tuple) else (img_size, 
img_size) 43 | ) 44 | # self.mean = np.array(self.mean_rgb[version]) 45 | self.files = {} 46 | 47 | self.images_base = os.path.join(self.root, "leftImg8bit", self.split) 48 | self.annotations_base = os.path.join( 49 | self.root, "disparity", self.split 50 | ) 51 | 52 | self.files[split] = recursive_glob(rootdir=self.images_base, suffix=".png") 53 | 54 | if not self.files[split]: 55 | raise Exception( 56 | "No files for split=[%s] found in %s" % (split, self.images_base) 57 | ) 58 | 59 | print("Found %d %s images" % (len(self.files[split]), split)) 60 | sys.stdout.flush() 61 | 62 | def __len__(self): 63 | """__len__""" 64 | return len(self.files[self.split]) 65 | 66 | def __getitem__(self, index): 67 | """__getitem__ 68 | 69 | :param index: 70 | """ 71 | img_path = self.files[self.split][index].rstrip() 72 | disp_path = os.path.join( 73 | self.annotations_base, 74 | img_path.split(os.sep)[-2], 75 | os.path.basename(img_path)[:-15] + "disparity.png", 76 | ) 77 | 78 | img = m.imread(img_path) # original image size: 1024*2048*3 79 | img = np.array(img, dtype=np.uint8) 80 | 81 | disp = cv2.imread(disp_path, cv2.IMREAD_UNCHANGED).astype(np.float32) # disparity map: [1024, 2056] 82 | disp[disp > 0] = (disp[disp > 0] - 1) / 256 83 | depth = copy.copy(disp) 84 | depth[depth > 0] = (0.209313 * 2262.52) / depth[depth > 0] 85 | depth[depth >= 85] = 0 86 | 87 | if self.augmentations is not None: 88 | img, depth = self.augmentations(img, depth) 89 | 90 | if self.is_transform: 91 | img, depth = self.transform(img, depth) 92 | 93 | return img, depth, img_path 94 | 95 | def transform(self, img, depth): 96 | """transform 97 | 98 | :param img: 99 | :param depth: 100 | """ 101 | img = m.imresize(img, (self.img_size[0], self.img_size[1])) # uint8 with RGB mode 102 | # img = img[:, :, ::-1] # RGB -> BGR [h, w, 3] do not exist for depth task 103 | img = img.astype(np.float32) 104 | if self.img_norm: 105 | img = ((img / 255 - 0.5) / 0.5) # normalize to [-1, 1], different from segmentation 106 | img = img.transpose(2, 0, 1) # [3, h, w] 107 | 108 | depth = depth.astype(np.float32) 109 | depth = m.imresize(depth, (self.img_size[0], self.img_size[1]), "nearest", mode="F") 110 | depth = np.expand_dims(depth, axis=0) 111 | 112 | img = torch.from_numpy(img).float() # tensor, shape: [3, h, w] 113 | depth = torch.from_numpy(depth).float() # tensor, shape: [1, h, w] 114 | 115 | return img, depth 116 | 117 | -------------------------------------------------------------------------------- /ptsemseg/loader/cityscapes_loader_seg.py: -------------------------------------------------------------------------------- 1 | # Mostly borrowed from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import os 4 | import torch 5 | import sys 6 | import numpy as np 7 | import scipy.misc as m 8 | 9 | from torch.utils import data 10 | 11 | from ptsemseg.utils import recursive_glob 12 | from ptsemseg.augmentations import * 13 | 14 | 15 | class cityscapesLoader_seg(data.Dataset): 16 | # 19classes, RGB of maskes 17 | colors = [ # [ 0, 0, 0], 18 | [128, 64, 128], 19 | [244, 35, 232], 20 | [70, 70, 70], 21 | [102, 102, 156], 22 | [190, 153, 153], 23 | [153, 153, 153], 24 | [250, 170, 30], 25 | [220, 220, 0], 26 | [107, 142, 35], 27 | [152, 251, 152], 28 | [0, 130, 180], 29 | [220, 20, 60], 30 | [255, 0, 0], 31 | [0, 0, 142], 32 | [0, 0, 70], 33 | [0, 60, 100], 34 | [0, 80, 100], 35 | [0, 0, 230], 36 | [119, 11, 32], 37 | ] 38 | 39 | label_colours = dict(zip(range(19), colors)) 40 | 41 | # mean_rgb = { 42 | # "pascal": [103.939, 116.779, 
123.68], 43 | # "cityscapes": [0.0, 0.0, 0.0], 44 | # } # pascal mean for PSPNet and ICNet pre-trained model 45 | 46 | def __init__( 47 | self, 48 | root, 49 | split="train", 50 | is_transform=True, 51 | img_size=(1024, 2048), 52 | augmentations=None, 53 | img_norm=True, 54 | saliency_eval_depth=False, 55 | # version="cityscapes", 56 | ): 57 | """__init__ 58 | 59 | :param root: 60 | :param split: 61 | :param is_transform: 62 | :param img_size: 63 | :param augmentations 64 | """ 65 | self.root = root 66 | self.split = split 67 | self.is_transform = is_transform 68 | self.augmentations = augmentations 69 | self.img_norm = img_norm 70 | self.n_classes = 19 71 | self.img_size = ( 72 | img_size if isinstance(img_size, tuple) else (img_size, img_size) 73 | ) 74 | # self.mean = np.array(self.mean_rgb[version]) 75 | self.files = {} 76 | self.saliency_eval_depth = saliency_eval_depth 77 | 78 | self.images_base = os.path.join(self.root, "leftImg8bit", self.split) 79 | self.annotations_base = os.path.join( 80 | self.root, "gtFine", self.split 81 | ) 82 | 83 | self.files[split] = recursive_glob(rootdir=self.images_base, suffix=".png") 84 | 85 | self.void_classes = [0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1] 86 | self.valid_classes = [ 87 | 7, 88 | 8, 89 | 11, 90 | 12, 91 | 13, 92 | 17, 93 | 19, 94 | 20, 95 | 21, 96 | 22, 97 | 23, 98 | 24, 99 | 25, 100 | 26, 101 | 27, 102 | 28, 103 | 31, 104 | 32, 105 | 33, 106 | ] 107 | self.class_names = [ 108 | "unlabelled", 109 | "road", 110 | "sidewalk", 111 | "building", 112 | "wall", 113 | "fence", 114 | "pole", 115 | "traffic_light", 116 | "traffic_sign", 117 | "vegetation", 118 | "terrain", 119 | "sky", 120 | "person", 121 | "rider", 122 | "car", 123 | "truck", 124 | "bus", 125 | "train", 126 | "motorcycle", 127 | "bicycle", 128 | ] 129 | 130 | self.ignore_index = 250 131 | self.class_map = dict(zip(self.valid_classes, range(19))) 132 | self.decode_class_map = dict(zip(range(19), self.valid_classes)) 133 | 134 | if not self.files[split]: 135 | raise Exception( 136 | "No files for split=[%s] found in %s" % (split, self.images_base) 137 | ) 138 | 139 | print("Found %d %s images" % (len(self.files[split]), split)) 140 | sys.stdout.flush() 141 | 142 | def __len__(self): 143 | """__len__""" 144 | return len(self.files[self.split]) 145 | 146 | def __getitem__(self, index): 147 | """__getitem__ 148 | 149 | :param index: 150 | """ 151 | img_path = self.files[self.split][index].rstrip() 152 | lbl_path = os.path.join( 153 | self.annotations_base, 154 | img_path.split(os.sep)[-2], 155 | os.path.basename(img_path)[:-15] + "gtFine_labelIds.png", 156 | ) 157 | 158 | img = m.imread(img_path) # original image size: 1024*2048*3 159 | img = np.array(img, dtype=np.uint8) 160 | 161 | lbl = m.imread(lbl_path) # original label size: 1024*2048 162 | lbl = self.encode_segmap(np.array(lbl, dtype=np.uint8)) 163 | 164 | if self.augmentations is not None: 165 | img, lbl = self.augmentations(img, lbl) 166 | 167 | if self.is_transform: 168 | img, lbl = self.transform(img, lbl) 169 | 170 | return img, lbl, img_path 171 | 172 | def transform(self, img, lbl): 173 | """transform 174 | 175 | :param img: 176 | :param lbl: 177 | """ 178 | img = m.imresize( 179 | img, (self.img_size[0], self.img_size[1]) 180 | ) # uint8 with RGB mode 181 | if self.saliency_eval_depth == False: 182 | img = img[:, :, ::-1] # RGB -> BGR shape: [h, w, 3] 183 | img = img.astype(np.float64) 184 | # img -= self.mean 185 | if self.img_norm: 186 | if self.saliency_eval_depth == False: 187 | img = 
img.astype(float) / 255.0 188 | else: 189 | img = ((img / 255 - 0.5) / 0.5) 190 | img = img.transpose(2, 0, 1) # NHWC -> NCHW [3, h, w] 191 | 192 | classes = np.unique(lbl) # all classes included in this label image 193 | lbl = lbl.astype(float) 194 | lbl = m.imresize(lbl, (self.img_size[0], self.img_size[1]), "nearest", mode="F") 195 | lbl = lbl.astype(int) 196 | 197 | if not np.all(classes == np.unique(lbl)): 198 | print("WARN: resizing labels yielded fewer classes") 199 | 200 | if not np.all(np.unique(lbl[lbl != self.ignore_index]) < self.n_classes): 201 | print("after det", classes, np.unique(lbl)) 202 | raise ValueError("Segmentation map contained invalid class values") 203 | 204 | img = torch.from_numpy(img).float() # tensor, shape: [3, h, w] 205 | lbl = torch.from_numpy(lbl).long() # tensor, shape: [h, w] 206 | 207 | return img, lbl 208 | 209 | def decode_segmap_tocolor(self, temp): 210 | r = temp.copy() 211 | g = temp.copy() 212 | b = temp.copy() 213 | for l in range(0, self.n_classes): 214 | r[temp == l] = self.label_colours[l][0] 215 | g[temp == l] = self.label_colours[l][1] 216 | b[temp == l] = self.label_colours[l][2] 217 | 218 | rgb = np.zeros((temp.shape[0], temp.shape[1], 3)) 219 | rgb[:, :, 0] = r / 255.0 220 | rgb[:, :, 1] = g / 255.0 221 | rgb[:, :, 2] = b / 255.0 222 | return rgb 223 | 224 | def decode_segmap_tolabelId(self, temp): 225 | labels_ID = temp.copy() 226 | for i in range(19): 227 | labels_ID[temp == i] = self.valid_classes[i] 228 | return labels_ID 229 | 230 | def encode_segmap(self, mask): 231 | # Put all void classes to 250 232 | # map valid classes to 0~18 233 | for _voidc in self.void_classes: 234 | mask[mask == _voidc] = self.ignore_index 235 | for _validc in self.valid_classes: 236 | mask[mask == _validc] = self.class_map[_validc] 237 | return mask -------------------------------------------------------------------------------- /ptsemseg/loader/kitti_loader_depth.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | import numpy as np 3 | from path import Path 4 | import scipy.misc as m 5 | import torch 6 | from ptsemseg.augmentations import * 7 | 8 | 9 | def crawl_folders(folders_list): 10 | # taken from https://github.com/ClementPinard/SfmLearner-Pytorch 11 | imgs = [] 12 | depth = [] 13 | for folder in folders_list: 14 | current_imgs = sorted(folder.files('*.jpg')) 15 | current_depth = [] 16 | for img in current_imgs: 17 | d = img.dirname()/(img.name[:-4] + '.npy') 18 | assert(d.isfile()), "depth file {} not found".format(str(d)) 19 | depth.append(d) 20 | imgs.extend(current_imgs) 21 | depth.extend(current_depth) 22 | return imgs, depth 23 | 24 | 25 | class kittiLoader_depth(data.Dataset): 26 | """A sequence data loader where the files are arranged in this way: 27 | root/scene_1/0000000.jpg 28 | root/scene_1/0000000.npy 29 | root/scene_1/0000001.jpg 30 | root/scene_1/0000001.npy 31 | .. 32 | root/scene_2/0000000.jpg 33 | root/scene_2/0000000.npy 34 | . 
35 | 36 | transform functions must take in a list a images and a numpy array which can be None 37 | """ 38 | 39 | def __init__( 40 | self, 41 | root, 42 | split="train", 43 | is_transform=True, 44 | img_size=(128, 416), 45 | augmentations=None, 46 | img_norm=True, 47 | ): 48 | self.root = Path(root) 49 | scene_list_path = self.root/'{}.txt'.format(split) 50 | self.scenes = [self.root/folder[:-1] for folder in open(scene_list_path)] 51 | self.imgs, self.depth = crawl_folders(self.scenes) 52 | self.is_transform = is_transform 53 | self.augmentations = augmentations 54 | self.img_norm = img_norm 55 | self.img_size = ( 56 | img_size if isinstance(img_size, tuple) else (img_size, img_size) 57 | ) 58 | print("number of {} images:".format(split), len(self.imgs)) 59 | 60 | def __len__(self): 61 | return len(self.imgs) 62 | 63 | def __getitem__(self, index): 64 | img = m.imread(self.imgs[index]) # img: [h, w, 3], shape determined by img_height, img_width arguments of prepare_train_data.py 65 | img = np.array(img, dtype=np.uint8) 66 | depth = np.load(self.depth[index]) # depth: [h, w] 67 | 68 | if self.augmentations is not None: 69 | img, depth = self.augmentations(img, depth) 70 | 71 | if self.is_transform: 72 | img, depth = self.transform(img, depth) 73 | 74 | return img, depth, self.imgs[index] 75 | 76 | def transform(self, img, depth): 77 | img = m.imresize(img, (self.img_size[0], self.img_size[1])) # uint8 with RGB mode 78 | img = img.astype(np.float32) 79 | img = np.transpose(img, (2, 0, 1)) # [3, h, w] 80 | 81 | depth = depth.astype(np.float32) 82 | depth = m.imresize(depth, (self.img_size[0], self.img_size[1]), "nearest", mode="F") 83 | depth = np.expand_dims(depth, axis=0) 84 | 85 | if self.img_norm: 86 | img = ((img / 255 - 0.5) / 0.5) # normalize to [-1, 1] 87 | 88 | img = torch.from_numpy(img).float() # [3, h, w] 89 | depth = torch.from_numpy(depth).float() # [1, h, w] 90 | 91 | return img, depth 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /ptsemseg/loader/kitti_loader_seg.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import os 4 | import torch 5 | import sys 6 | import numpy as np 7 | import scipy.misc as m 8 | 9 | from torch.utils import data 10 | 11 | from ptsemseg.augmentations import * 12 | 13 | class kittiLoader_seg(data.Dataset): 14 | # 19classes, RGB of maskes 15 | colors = [ # [ 0, 0, 0], 16 | [128, 64, 128], 17 | [244, 35, 232], 18 | [70, 70, 70], 19 | [102, 102, 156], 20 | [190, 153, 153], 21 | [153, 153, 153], 22 | [250, 170, 30], 23 | [220, 220, 0], 24 | [107, 142, 35], 25 | [152, 251, 152], 26 | [0, 130, 180], 27 | [220, 20, 60], 28 | [255, 0, 0], 29 | [0, 0, 142], 30 | [0, 0, 70], 31 | [0, 60, 100], 32 | [0, 80, 100], 33 | [0, 0, 230], 34 | [119, 11, 32], 35 | ] 36 | 37 | label_colours = dict(zip(range(19), colors)) 38 | 39 | # mean_rgb = { 40 | # "pascal": [103.939, 116.779, 123.68], 41 | # "cityscapes": [0.0, 0.0, 0.0], 42 | # } # pascal mean for PSPNet and ICNet pre-trained model 43 | 44 | def __init__( 45 | self, 46 | root, 47 | split="train", 48 | is_transform=True, 49 | img_size=(375, 1242), 50 | augmentations=None, 51 | img_norm=True, 52 | saliency_eval_depth = False 53 | # version="cityscapes", 54 | ): 55 | """__init__ 56 | 57 | :param root: 58 | :param split: 59 | :param is_transform: 60 | :param img_size: 61 | :param augmentations 62 | """ 63 | self.root = root 64 | self.split = 
split 65 | self.is_transform = is_transform 66 | self.augmentations = augmentations 67 | self.img_norm = img_norm 68 | self.n_classes = 19 69 | self.img_size = ( 70 | img_size if isinstance(img_size, tuple) else (img_size, img_size) 71 | ) 72 | # self.mean = np.array(self.mean_rgb[version]) 73 | self.files = {} 74 | self.saliency_eval_depth = saliency_eval_depth # for later saliency evaluation on depth, always set to False for KITTI segmentation 75 | 76 | if self.split == "test": 77 | self.images_base = os.path.join(self.root, "testing", "image_2") 78 | self.annotations_base = os.path.join(self.root, "training", "semantic") # invalid 79 | else: 80 | self.images_base = os.path.join(self.root, "training", "image_2") 81 | self.annotations_base = os.path.join(self.root, "training", "semantic") 82 | 83 | self.all_files = os.listdir(self.images_base) 84 | self.all_files.sort() 85 | 86 | # split 40 images from the training set as the val set 87 | if self.split == "val": 88 | self.files[split] = self.all_files[::5] # select one img from every 5 imgs into the val set 89 | # 160 training images 90 | if self.split == "train": 91 | self.files[split] = [file_name for file_name in self.all_files if file_name not in self.all_files[::5]] 92 | if self.split == "test": 93 | self.files[split] = self.all_files 94 | 95 | 96 | self.void_classes = [0, 1, 2, 3, 4, 5, 6, 9, 10, 14, 15, 16, 18, 29, 30, -1] 97 | self.valid_classes = [ 98 | 7, 99 | 8, 100 | 11, 101 | 12, 102 | 13, 103 | 17, 104 | 19, 105 | 20, 106 | 21, 107 | 22, 108 | 23, 109 | 24, 110 | 25, 111 | 26, 112 | 27, 113 | 28, 114 | 31, 115 | 32, 116 | 33, 117 | ] 118 | self.class_names = [ 119 | "unlabelled", 120 | "road", 121 | "sidewalk", 122 | "building", 123 | "wall", 124 | "fence", 125 | "pole", 126 | "traffic_light", 127 | "traffic_sign", 128 | "vegetation", 129 | "terrain", 130 | "sky", 131 | "person", 132 | "rider", 133 | "car", 134 | "truck", 135 | "bus", 136 | "train", 137 | "motorcycle", 138 | "bicycle", 139 | ] 140 | 141 | self.ignore_index = 250 142 | self.class_map = dict(zip(self.valid_classes, range(19))) 143 | self.decode_class_map = dict(zip(range(19), self.valid_classes)) 144 | 145 | if not self.files[split]: 146 | raise Exception( 147 | "No files for split=[%s] found in %s" % (split, self.images_base) 148 | ) 149 | 150 | print("Found %d %s images" % (len(self.files[split]), split)) 151 | sys.stdout.flush() 152 | 153 | def __len__(self): 154 | """__len__""" 155 | return len(self.files[self.split]) 156 | 157 | def __getitem__(self, index): 158 | """__getitem__ 159 | 160 | :param index: 161 | """ 162 | path = self.files[self.split][index].rstrip() 163 | img_path = os.path.join(self.images_base, path) 164 | lbl_path = os.path.join(self.annotations_base, path) 165 | 166 | img = m.imread(img_path) # original image size: 375*1242*3 167 | img = np.array(img, dtype=np.uint8) 168 | 169 | lbl = m.imread(lbl_path) # original label size: 375*1242 170 | lbl = self.encode_segmap(np.array(lbl, dtype=np.uint8)) 171 | 172 | if self.augmentations is not None: 173 | img, lbl = self.augmentations(img, lbl) 174 | 175 | if self.is_transform: 176 | img, lbl = self.transform(img, lbl) 177 | 178 | return img, lbl, img_path 179 | 180 | def transform(self, img, lbl): 181 | """transform 182 | 183 | :param img: 184 | :param lbl: 185 | """ 186 | # img: shape: [h, w, 3] 187 | img = m.imresize(img, (self.img_size[0], self.img_size[1])) # uint8 with RGB mode 188 | if self.saliency_eval_depth == False: 189 | img = img[:, :, ::-1] # RGB -> BGR shape: [h, w, 3] 190 | img 
= img.astype(np.float64) 191 | # img -= self.mean 192 | if self.img_norm: 193 | if self.saliency_eval_depth == False: 194 | img = img.astype(float) / 255.0 195 | else: 196 | img = ((img / 255 - 0.5) / 0.5) 197 | # NHWC -> NCHW 198 | img = img.transpose(2, 0, 1) # shape: [3, h, w] 199 | 200 | classes = np.unique(lbl) # all classes included in this label image 201 | lbl = lbl.astype(float) 202 | lbl = m.imresize(lbl, (self.img_size[0], self.img_size[1]), "nearest", mode="F") 203 | lbl = lbl.astype(int) 204 | 205 | if not np.all(classes == np.unique(lbl)): 206 | print("WARN: resizing labels yielded fewer classes") 207 | # sys.stdout.flush() 208 | 209 | if not np.all(np.unique(lbl[lbl != self.ignore_index]) < self.n_classes): 210 | print("after det", classes, np.unique(lbl)) 211 | raise ValueError("Segmentation map contained invalid class values") 212 | 213 | img = torch.from_numpy(img).float() # tensor, shape: [3, h, w] 214 | lbl = torch.from_numpy(lbl).long() # tensor, shape: [h, w] 215 | 216 | return img, lbl 217 | 218 | def decode_segmap_tocolor(self, temp): 219 | r = temp.copy() 220 | g = temp.copy() 221 | b = temp.copy() 222 | for l in range(0, self.n_classes): 223 | r[temp == l] = self.label_colours[l][0] 224 | g[temp == l] = self.label_colours[l][1] 225 | b[temp == l] = self.label_colours[l][2] 226 | 227 | rgb = np.zeros((temp.shape[0], temp.shape[1], 3)) 228 | rgb[:, :, 0] = r / 255.0 229 | rgb[:, :, 1] = g / 255.0 230 | rgb[:, :, 2] = b / 255.0 231 | return rgb 232 | 233 | def decode_segmap_tolabelId(self, temp): 234 | labels_ID = temp.copy() 235 | for i in range(19): 236 | labels_ID[temp == i] = self.valid_classes[i] 237 | return labels_ID 238 | 239 | def encode_segmap(self, mask): 240 | # Put all void classes to 250 241 | # map valid classes to 0~18 242 | for _voidc in self.void_classes: 243 | mask[mask == _voidc] = self.ignore_index 244 | for _validc in self.valid_classes: 245 | mask[mask == _validc] = self.class_map[_validc] 246 | return mask -------------------------------------------------------------------------------- /ptsemseg/loss/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import functools 4 | 5 | from ptsemseg.loss.loss import cross_entropy2d 6 | from ptsemseg.loss.loss import bootstrapped_cross_entropy2d 7 | 8 | from ptsemseg.loss.loss import l1_loss 9 | from ptsemseg.loss.loss import Berhu_loss 10 | from ptsemseg.loss.loss import Huber_loss 11 | from ptsemseg.loss.loss import scale_invariant_loss 12 | 13 | logger = logging.getLogger('ptsemseg') 14 | 15 | key2loss = {'cross_entropy': cross_entropy2d, 16 | 'bootstrapped_cross_entropy': bootstrapped_cross_entropy2d, 17 | 'l1_loss': l1_loss, 18 | 'berhu_loss': Berhu_loss, 19 | 'huber_loss': Huber_loss, 20 | 'scale_invariant_loss': scale_invariant_loss} 21 | 22 | def get_loss_function(cfg): 23 | if cfg['training']['loss'] is None: 24 | if cfg['task'] == "seg": 25 | logger.info("Using default cross entropy loss for segmentation") 26 | return cross_entropy2d 27 | elif cfg['task'] == "depth": 28 | logger.info("Using default scale invariant loss for depth") 29 | return scale_invariant_loss 30 | else: 31 | print("Please specify the loss!") 32 | 33 | else: 34 | loss_dict = cfg['training']['loss'] 35 | loss_name = loss_dict['name'] 36 | loss_params = {k:v for k,v in loss_dict.items() if k != 'name'} 37 | 38 | if loss_name not in key2loss: 39 | raise NotImplementedError('Loss {} not implemented'.format(loss_name)) 40 | 41 | logger.info('Using {} 
with {} params'.format(loss_name, 42 | loss_params)) 43 | return functools.partial(key2loss[loss_name], **loss_params) 44 | -------------------------------------------------------------------------------- /ptsemseg/loss/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 7 | 8 | ########################### for segmentation #################### 9 | 10 | 11 | def cross_entropy2d(input, target, weight=None, size_average=True): 12 | # taken from https://github.com/meetshah1995/pytorch-semseg 13 | n, c, h, w = input.size() 14 | nt, ht, wt = target.size() 15 | 16 | # Handle inconsistent size between input and target 17 | if h > ht and w > wt: # upsample labels 18 | target = target.unsequeeze(1) 19 | target = F.upsample(target, size=(h, w), mode="nearest") 20 | target = target.sequeeze(1) 21 | elif h < ht and w < wt: # upsample images 22 | input = F.upsample(input, size=(ht, wt), mode="bilinear") 23 | elif h != ht and w != wt: 24 | raise Exception("Only support upsampling") 25 | 26 | loss = F.cross_entropy( 27 | input, target, weight=weight, size_average=size_average, ignore_index=250 28 | ) 29 | return loss 30 | 31 | 32 | def bootstrapped_cross_entropy2d(input, 33 | target, 34 | K, 35 | weight=None, 36 | size_average=True): 37 | # taken from https://github.com/meetshah1995/pytorch-semseg 38 | batch_size = input.size()[0] 39 | 40 | def _bootstrap_xentropy_single(input, 41 | target, 42 | K, 43 | weight=None, 44 | size_average=True): 45 | 46 | n, c, h, w = input.size() 47 | loss = F.cross_entropy(input, 48 | target, 49 | weight=weight, 50 | reduce=False, 51 | size_average=False, 52 | ignore_index=250) 53 | loss = loss.view(-1) 54 | topk_loss, _ = loss.topk(K) 55 | reduced_topk_loss = topk_loss.sum() / K 56 | 57 | return reduced_topk_loss 58 | 59 | loss = 0.0 60 | # Bootstrap from each image not entire batch 61 | for i in range(batch_size): 62 | loss += _bootstrap_xentropy_single( 63 | input=torch.unsqueeze(input[i], 0), 64 | target=torch.unsqueeze(target[i], 0), 65 | K=K, 66 | weight=weight, 67 | size_average=size_average, 68 | ) 69 | return loss / float(batch_size) 70 | 71 | 72 | ############################ for depth ########################### 73 | 74 | 75 | def compute_mask(input, target): 76 | # mask out depth values in predicted and target depth which are <= 0 77 | mask = np.logical_and(input.data.cpu().numpy() > 0, target.data.cpu().numpy() > 0) 78 | total_pixel = np.prod(input.size(), dtype=np.float32).item() 79 | total_pixel = total_pixel - np.sum(mask) 80 | mask = torch.from_numpy(mask.astype(int)).float().to(device) 81 | return mask, total_pixel 82 | 83 | 84 | def l1_loss(input, target, smooth=True): 85 | if not input.size() == target.size(): 86 | _, _, H, W = target.size() 87 | input = F.upsample(input, size=(H, W), mode='bilinear') 88 | 89 | # mask out depth values in input and target which are <= 0 90 | mask, total_pixel = compute_mask(input, target) 91 | diff = torch.abs(target - input) 92 | diff = diff * mask 93 | loss = torch.sum(diff) / total_pixel 94 | if smooth: 95 | loss = loss + smooth_loss(input=input) / 1000.0 # empirical weight for smooth loss 96 | return loss 97 | 98 | 99 | def Berhu_loss(input, target, smooth=True): 100 | if not input.size() == target.size(): 101 | _, _, H, W = target.size() 102 | input = F.upsample(input, size=(H, W), mode='bilinear') 103 | 
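    # The BerHu (reverse Huber) penalty computed below is |d| for per-pixel
    # errors with |d| <= c and (d^2 + c^2) / (2c) for |d| > c, where the
    # threshold c is set to one fifth of the largest absolute error in the
    # batch; pixels with non-positive predicted or target depth are masked out.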
104 | # mask out depth values in input and target which are <= 0 105 | mask, total_pixel = compute_mask(input, target) 106 | diff = torch.abs(target - input) 107 | c = torch.max(diff).item() / 5 108 | leq = (diff <= c).float() 109 | l2_losses = (diff ** 2 + c ** 2) / (2 * c) 110 | losses = leq * diff + (1 - leq) * l2_losses 111 | losses = losses * mask 112 | loss = torch.sum(losses) / total_pixel 113 | if smooth: 114 | loss = loss + smooth_loss(input=input) / 1000.0 115 | return loss 116 | 117 | 118 | def Huber_loss(input, target, smooth=True): 119 | if not input.size() == target.size(): 120 | _, _, H, W = target.size() 121 | input = F.upsample(input, size=(H, W), mode='bilinear') 122 | 123 | # mask out depth values in input and target which are <= 0 124 | mask, total_pixel = compute_mask(input, target) 125 | diff = target - input 126 | leq = (diff < 1).float() 127 | l2_losses = diff ** 2 / 2 128 | losses = leq * l2_losses + (1-leq) * (diff - 0.5) 129 | losses = losses * mask 130 | loss = torch.sum(losses) / total_pixel 131 | if smooth: 132 | loss = loss + smooth_loss(input=input) / 1000.0 133 | return loss 134 | 135 | 136 | # input, target: [batch_size, 1, h, w] 137 | def scale_invariant_loss(input, target, smooth=True): 138 | if not input.size() == target.size(): 139 | _, _, H, W = target.size() 140 | input = F.upsample(input, size=(H, W), mode='bilinear') 141 | 142 | # mask out depth values in input and target which are <= 0 143 | mask, total_pixel = compute_mask(input, target) 144 | 145 | first_log = torch.log(torch.clamp(input, min=1e-3)) 146 | second_log = torch.log(torch.clamp(target, min=1e-3)) 147 | diff = first_log - second_log 148 | diff = diff * mask 149 | loss = torch.sum((diff ** 2))/total_pixel - (torch.sum(diff) ** 2)/(total_pixel ** 2) 150 | if smooth: 151 | loss = loss + smooth_loss(input=input) / 1000.0 152 | return loss 153 | 154 | 155 | def gradient(pred): 156 | D_dy = pred[:, :, 1:] - pred[:, :, :-1] 157 | D_dx = pred[:, :, :, 1:] - pred[:, :, :, :-1] 158 | return D_dx, D_dy 159 | 160 | 161 | def smooth_loss(input): 162 | dx, dy = gradient(input) 163 | dx2, dxdy = gradient(dx) 164 | dydx, dy2 = gradient(dy) 165 | loss = dx2.abs().mean() + dxdy.abs().mean() + dydx.abs().mean() + dy2.abs().mean() 166 | return loss -------------------------------------------------------------------------------- /ptsemseg/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class runningScoreSeg(object): 5 | # Adapted from https://github.com/meetshah1995/pytorch-semseg 6 | def __init__(self, n_classes): 7 | self.n_classes = n_classes 8 | self.confusion_matrix = np.zeros((n_classes, n_classes)) 9 | 10 | def _fast_hist(self, label_true, label_pred, n_class): # label_true / label_pred: length (width*height) 11 | mask = (label_true >= 0) & (label_true < n_class) # remove invalid class (class 250) 12 | hist = np.bincount( 13 | n_class * label_true[mask].astype(int) + label_pred[mask], 14 | minlength=n_class ** 2, 15 | ).reshape(n_class, n_class) # [n_classes, n_classes] 16 | return hist 17 | 18 | def update(self, gt, pred): # [batch_size, height, width] 19 | for lt, lp in zip(gt, pred): 20 | self.confusion_matrix += self._fast_hist( 21 | lt.flatten(), lp.flatten(), self.n_classes 22 | ) 23 | 24 | def get_scores(self): 25 | """Returns accuracy score evaluation result. 
26 | - overall accuracy 27 | - mean accuracy 28 | - mean IU 29 | - fwavacc 30 | """ 31 | hist = self.confusion_matrix 32 | acc = np.diag(hist).sum() / hist.sum() 33 | acc_cls = np.diag(hist) / hist.sum(axis=1) 34 | acc_cls = np.nanmean(acc_cls) 35 | iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist)) 36 | mean_iu = np.nanmean(iu) 37 | freq = hist.sum(axis=1) / hist.sum() 38 | fwaviu = (freq[freq > 0] * iu[freq > 0]).sum() 39 | cls_iu = dict(zip(range(self.n_classes), iu)) 40 | 41 | return ( 42 | { 43 | "Overall Acc: \t": acc, 44 | "Mean Acc : \t": acc_cls, 45 | "FreqW IoU : \t": fwaviu, 46 | "Mean IoU : \t": mean_iu, 47 | }, 48 | cls_iu, 49 | ) 50 | 51 | def reset(self): 52 | self.confusion_matrix = np.zeros((self.n_classes, self.n_classes)) 53 | 54 | 55 | class runningScoreDepth(object): 56 | def __init__(self, dataset): 57 | self.error_names = ['abs_diff', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log', 'a1', 'a2', 'a3'] 58 | self.metric_len = len(self.error_names) 59 | self.error_metric = [0 for i in range(self.metric_len)] # [0,0,0,0,...,0] 60 | self.dataset = dataset 61 | self.reset() 62 | 63 | def compute_errors_depth(self, gt, pred, crop=True): # input gt, pred: numpy array, shape: [batch_size, h, w] 64 | abs_diff, abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = 0, 0, 0, 0, 0, 0, 0, 0 65 | batch_size = gt.shape[0] 66 | 67 | ''' 68 | crop used by Garg ECCV16 to reprocude Eigen NIPS14 results 69 | construct a mask of False values, with the same size as target 70 | and then set to True values inside the crop 71 | ''' 72 | if crop: 73 | crop_mask = gt[0] != gt[0] 74 | if self.dataset == 'kitti': 75 | y1, y2 = int(0.40810811 * gt.shape[1]), int(0.99189189 * gt.shape[1]) 76 | x1, x2 = int(0.03594771 * gt.shape[2]), int(0.96405229 * gt.shape[2]) 77 | elif self.dataset == 'cityscapes': 78 | y1, y2 = int(0.05 * gt.shape[1]), int(0.80 * gt.shape[1]) 79 | x1, x2 = int(0.05 * gt.shape[2]), int(0.99 * gt.shape[2]) 80 | crop_mask[y1:y2, x1:x2] = 1 81 | 82 | for current_gt, current_pred in zip(gt, pred): # for each image in a batch 83 | valid = (current_gt > 0) & (current_gt < 80) & (current_pred > 0) & ( 84 | current_pred < 80) # mask out depth not in (0, 80) 85 | if crop: 86 | valid = valid & crop_mask 87 | 88 | valid_gt = current_gt[valid] 89 | valid_pred = current_pred[valid] 90 | 91 | # valid_pred = valid_pred * np.median(valid_gt)/np.median(valid_pred) 92 | 93 | thresh = np.maximum((valid_gt / valid_pred), (valid_pred / valid_gt)) 94 | a1 += (thresh < 1.25).mean() 95 | a2 += (thresh < 1.25 ** 2).mean() 96 | a3 += (thresh < 1.25 ** 3).mean() 97 | 98 | rmse += np.sqrt(np.mean((valid_gt - valid_pred) ** 2)) 99 | rmse_log += np.sqrt(np.mean((np.log(valid_gt) - np.log(valid_pred)) ** 2)) 100 | 101 | abs_diff += np.mean(np.abs(valid_gt - valid_pred)) 102 | abs_rel += np.mean(np.abs(valid_gt - valid_pred) / valid_gt) 103 | 104 | sq_rel += np.mean(((valid_gt - valid_pred) ** 2) / valid_gt) 105 | 106 | return [metric.item() / batch_size for metric in [abs_diff, abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3]] 107 | 108 | def update(self, gt, pred): 109 | self.error_metric = self.compute_errors_depth(gt, pred) 110 | n = 1 111 | self.count += n 112 | for i, v in enumerate(self.error_metric): 113 | self.val[i] = v 114 | self.sum[i] += v * n 115 | self.avg[i] = self.sum[i] / self.count 116 | 117 | def reset(self): 118 | self.val = [0.0]*self.metric_len 119 | self.avg = [0.0]*self.metric_len 120 | self.sum = [0.0]*self.metric_len 121 | self.count = 0 122 | 123 | def get_scores(self): 124 | 
return ({ 125 | "abs diff: \t": self.avg[0], 126 | "abs rel : \t": self.avg[1], 127 | "sq rel : \t": self.avg[2], 128 | "rmse : \t": self.avg[3], 129 | "rmse log : \t": self.avg[4], 130 | "threshold 1 : \t": self.avg[5], 131 | "threshold 2 : \t": self.avg[6], 132 | "threshold 3 : \t": self.avg[7]} 133 | ) 134 | 135 | 136 | class averageMeter(object): 137 | """Computes and stores the average and current value""" 138 | def __init__(self): 139 | self.reset() 140 | 141 | def reset(self): 142 | self.val = 0 143 | self.avg = 0 144 | self.sum = 0 145 | self.count = 0 146 | 147 | def update(self, val, n=1): 148 | self.val = val 149 | self.sum += val * n 150 | self.count += n 151 | self.avg = self.sum / self.count 152 | 153 | -------------------------------------------------------------------------------- /ptsemseg/models/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torchvision.models as models 3 | from collections import OrderedDict 4 | 5 | from ptsemseg.models.fcn_seg import * 6 | from ptsemseg.models.segnet_seg import * 7 | from ptsemseg.models.frrn_seg import * 8 | from ptsemseg.models.deeplab_seg import * 9 | from ptsemseg.models.fcrn_seg import * 10 | from ptsemseg.models.dispnet_seg import * 11 | 12 | from ptsemseg.models.fcn_depth import * 13 | from ptsemseg.models.segnet_depth import * 14 | from ptsemseg.models.frrn_depth import * 15 | from ptsemseg.models.deeplab_depth import * 16 | from ptsemseg.models.fcrn_depth import * 17 | from ptsemseg.models.dispnet_depth import * 18 | 19 | 20 | def get_model(model_dict, task, n_classes): 21 | name = model_dict['arch'] 22 | model = _get_model_instance(name, task) # model: an instance of class fcn8s 23 | param_dict = copy.deepcopy(model_dict) 24 | param_dict.pop('arch') 25 | 26 | if task == "seg": 27 | model = model(n_classes=n_classes, **param_dict) 28 | elif task == "depth": 29 | model = model(**param_dict) 30 | 31 | if name == "frrn": 32 | pass 33 | 34 | elif name == "fcn": 35 | vgg16 = models.vgg16(pretrained=True) 36 | model.init_vgg16_params(vgg16) 37 | 38 | # if you want to load from downloaded pretrained model: 39 | # vgg16 = models.vgg16(pretrained=False) 40 | # vgg16.load_state_dict(torch.load("pretrained_models/vgg16-imagenet.pth")) 41 | # model.init_vgg16_params(vgg16) 42 | 43 | elif name == "segnet": 44 | vgg16 = models.vgg16(pretrained=True) 45 | model.init_vgg16_params(vgg16) 46 | 47 | # if you want to load from downloaded pretrained model: 48 | # vgg16 = models.vgg16(pretrained=False) 49 | # vgg16.load_state_dict(torch.load("pretrained_models/vgg16-imagenet.pth")) 50 | # model.init_vgg16_params(vgg16) 51 | 52 | elif name == "dispnet": 53 | model.init_weights() 54 | 55 | elif name == "deeplab": 56 | resnet101 = models.resnet101(pretrained=True) 57 | initial_state_dict = model.init_resnet101_params(resnet101) 58 | model.load_state_dict(initial_state_dict, strict=False) 59 | 60 | # if you want to load from downloaded pretrained model: 61 | # model_path = 'pretrained_models/resnet101-imagenet.pth' 62 | # new_state_dict = model.init_resnet101_params(model_path) 63 | # model.load_state_dict(new_state_dict, strict=False) 64 | 65 | elif name == "fcrn": 66 | resnet50 = models.resnet50(pretrained=True) 67 | init_state_dict = model.init_resnet50_params(resnet50) 68 | model.load_state_dict(init_state_dict, strict=False) 69 | 70 | # if you want to load from downloaded pretrained model: 71 | # model_path = 'pretrained_models/resnet50-imagenet.pth' 72 | # init_state_dict = 
model.init_resnet50_params(model_path) 73 | # model.load_state_dict(init_state_dict, strict=False) 74 | 75 | else: 76 | print("Model {} not available".format(name)) 77 | 78 | return model 79 | 80 | 81 | def _get_model_instance(name, task): 82 | try: 83 | if task == "seg": 84 | return { 85 | "fcn": fcn_seg, 86 | "segnet": segnet_seg, 87 | "frrn": frrn_seg, 88 | "dispnet": dispnet_seg, 89 | "deeplab": deeplab_seg, 90 | "fcrn": fcrn_seg, 91 | }[name] 92 | elif task == "depth": 93 | return { 94 | "fcn": fcn_depth, 95 | "segnet": segnet_depth, 96 | "frrn": frrn_depth, 97 | "dispnet": dispnet_depth, 98 | "deeplab": deeplab_depth, 99 | "fcrn": fcrn_depth, 100 | }[name] 101 | except: 102 | raise("Model {} not available".format(name)) 103 | -------------------------------------------------------------------------------- /ptsemseg/models/deeplab_depth.py: -------------------------------------------------------------------------------- 1 | # deeplab v2: ResNet101 + ASPP, no multi scale input, for depth 2 | 3 | import torch.nn as nn 4 | import math 5 | import torch 6 | import numpy as np 7 | import torch.nn.functional as F 8 | from collections import OrderedDict 9 | 10 | affine_par = True # allow weights and bias in batch normalization layers or not 11 | learnable_bn_weights = False # allow learnable weights and bias in batch normalization layers or not 12 | 13 | 14 | def conv3x3(in_planes, out_planes, stride=1): 15 | "3x3 convolution with padding" 16 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 17 | padding=1, bias=False) 18 | 19 | 20 | class BasicBlock(nn.Module): 21 | expansion = 1 22 | 23 | def __init__(self, inplanes, planes, stride=1, downsample=None): 24 | super(BasicBlock, self).__init__() 25 | self.conv1 = conv3x3(inplanes, planes, stride) 26 | self.bn1 = nn.BatchNorm2d(planes, affine = affine_par) 27 | self.relu = nn.ReLU(inplace=True) 28 | self.conv2 = conv3x3(planes, planes) 29 | self.bn2 = nn.BatchNorm2d(planes, affine = affine_par) 30 | self.downsample = downsample 31 | self.stride = stride 32 | 33 | def forward(self, x): 34 | residual = x 35 | 36 | out = self.conv1(x) 37 | out = self.bn1(out) 38 | out = self.relu(out) 39 | 40 | out = self.conv2(out) 41 | out = self.bn2(out) 42 | 43 | if self.downsample is not None: 44 | residual = self.downsample(x) 45 | 46 | out += residual 47 | out = self.relu(out) 48 | 49 | return out 50 | 51 | 52 | class Bottleneck(nn.Module): 53 | expansion = 4 54 | 55 | def __init__(self, inplanes, planes, stride=1, dilation_ = 1, downsample=None): 56 | super(Bottleneck, self).__init__() 57 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change 58 | self.bn1 = nn.BatchNorm2d(planes,affine = affine_par) 59 | if learnable_bn_weights == False: 60 | for i in self.bn1.parameters(): 61 | i.requires_grad = False 62 | padding = 1 63 | if dilation_ == 2: 64 | padding = 2 65 | elif dilation_ == 4: 66 | padding = 4 67 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change 68 | padding=padding, bias=False, dilation = dilation_) 69 | self.bn2 = nn.BatchNorm2d(planes,affine = affine_par) 70 | if learnable_bn_weights == False: 71 | for i in self.bn2.parameters(): 72 | i.requires_grad = False 73 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 74 | self.bn3 = nn.BatchNorm2d(planes * 4, affine = affine_par) 75 | if learnable_bn_weights == False: 76 | for i in self.bn3.parameters(): 77 | i.requires_grad = False 78 | self.relu = nn.ReLU(inplace=True) 79 | self.downsample = 
downsample 80 | self.stride = stride 81 | 82 | 83 | 84 | def forward(self, x): 85 | residual = x 86 | 87 | out = self.conv1(x) 88 | out = self.bn1(out) 89 | out = self.relu(out) 90 | 91 | out = self.conv2(out) 92 | out = self.bn2(out) 93 | out = self.relu(out) 94 | 95 | out = self.conv3(out) 96 | out = self.bn3(out) 97 | 98 | if self.downsample is not None: 99 | residual = self.downsample(x) 100 | 101 | out += residual 102 | out = self.relu(out) 103 | 104 | return out 105 | 106 | class Classifier_Module(nn.Module): 107 | 108 | def __init__(self,dilation_series,padding_series): 109 | super(Classifier_Module, self).__init__() 110 | self.conv2d_list = nn.ModuleList() 111 | for dilation,padding in zip(dilation_series,padding_series): 112 | self.conv2d_list.append(nn.Conv2d(2048,1,kernel_size=3,stride=1, padding =padding, dilation = dilation,bias = True)) 113 | 114 | for m in self.conv2d_list: 115 | m.weight.data.normal_(0, 0.01) 116 | 117 | 118 | def forward(self, x): 119 | out = self.conv2d_list[0](x) 120 | for i in range(len(self.conv2d_list)-1): 121 | out += self.conv2d_list[i+1](x) 122 | return out 123 | 124 | 125 | 126 | class ResNet(nn.Module): 127 | def __init__(self, block, layers): 128 | self.inplanes = 64 129 | super(ResNet, self).__init__() 130 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 131 | bias=False) 132 | self.bn1 = nn.BatchNorm2d(64,affine = affine_par) 133 | if learnable_bn_weights == False: 134 | for i in self.bn1.parameters(): 135 | i.requires_grad = False 136 | self.relu = nn.ReLU(inplace=True) 137 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True) # change 138 | self.layer1 = self._make_layer(block, 64, layers[0]) 139 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 140 | self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation__ = 2) 141 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation__ = 4) 142 | self.layer5 = self._make_pred_layer(Classifier_Module, [6,12,18,24],[6,12,18,24]) # ASPP 143 | 144 | for m in self.modules(): 145 | if isinstance(m, nn.Conv2d): 146 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 147 | m.weight.data.normal_(0, 0.01) 148 | elif isinstance(m, nn.BatchNorm2d): 149 | m.weight.data.fill_(1) 150 | m.bias.data.zero_() 151 | 152 | def _make_layer(self, block, planes, blocks, stride=1,dilation__ = 1): 153 | downsample = None 154 | if stride != 1 or self.inplanes != planes * block.expansion or dilation__ == 2 or dilation__ == 4: 155 | downsample = nn.Sequential( 156 | nn.Conv2d(self.inplanes, planes * block.expansion, 157 | kernel_size=1, stride=stride, bias=False), 158 | nn.BatchNorm2d(planes * block.expansion,affine = affine_par), 159 | ) 160 | if learnable_bn_weights == False: 161 | for i in downsample._modules['1'].parameters(): 162 | i.requires_grad = False 163 | layers = [] 164 | layers.append(block(self.inplanes, planes, stride,dilation_=dilation__, downsample = downsample )) 165 | self.inplanes = planes * block.expansion 166 | for i in range(1, blocks): 167 | layers.append(block(self.inplanes, planes,dilation_=dilation__)) 168 | 169 | return nn.Sequential(*layers) 170 | 171 | def _make_pred_layer(self,block, dilation_series, padding_series): 172 | return block(dilation_series,padding_series) 173 | 174 | def forward(self, x): 175 | x = self.conv1(x) 176 | x = self.bn1(x) 177 | x = self.relu(x) 178 | x = self.maxpool(x) 179 | x = self.layer1(x) 180 | x = self.layer2(x) 181 | x = self.layer3(x) 182 | x = self.layer4(x) 183 | 
x = self.layer5(x) 184 | 185 | return x 186 | 187 | class deeplab_depth(nn.Module): 188 | def __init__(self): 189 | super(deeplab_depth,self).__init__() 190 | self.Scale = ResNet(Bottleneck,[3, 4, 23, 3]) 191 | 192 | def forward(self,x): 193 | input_size = x.size()[2:] # x: [batch_size, 3, h, w] 194 | out = self.Scale(x) # for original scale 195 | out = F.interpolate(out, size=input_size, mode='bilinear', align_corners=True) 196 | return out 197 | 198 | # load pretrained resnet101 weights from torchvision resnet101 model 199 | def init_resnet101_params(self, resnet101): 200 | initial_state_dict = resnet101.state_dict() 201 | new_state_dict = OrderedDict() 202 | for k, v in initial_state_dict.items(): 203 | k = 'Scale.' + k 204 | new_state_dict[k] = v 205 | return new_state_dict 206 | 207 | # if you want to load from downloaded pretrained model: 208 | # model_path: path to the downloaded model 209 | # def init_resnet101_params(self, model_path): 210 | # saved_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) 211 | # new_state_dict = OrderedDict() 212 | # for k, v in saved_state_dict.items(): 213 | # k = 'Scale.' + k 214 | # new_state_dict[k] = v 215 | # return new_state_dict 216 | 217 | 218 | -------------------------------------------------------------------------------- /ptsemseg/models/deeplab_seg.py: -------------------------------------------------------------------------------- 1 | # deeplab v2: ResNet101 + ASPP, no multi scale input (for cityscapes) 2 | # for segmentation 3 | 4 | import torch.nn as nn 5 | import math 6 | import torch 7 | import numpy as np 8 | import torch.nn.functional as F 9 | from collections import OrderedDict 10 | 11 | affine_par = True # allow weights and bias in batch normalization layers or not 12 | learnable_bn_weights = False # allow learnable weights and bias in batch normalization layers or not 13 | 14 | 15 | def conv3x3(in_planes, out_planes, stride=1): 16 | "3x3 convolution with padding" 17 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 18 | padding=1, bias=False) 19 | 20 | 21 | class BasicBlock(nn.Module): 22 | expansion = 1 23 | 24 | def __init__(self, inplanes, planes, stride=1, downsample=None): 25 | super(BasicBlock, self).__init__() 26 | self.conv1 = conv3x3(inplanes, planes, stride) 27 | self.bn1 = nn.BatchNorm2d(planes, affine = affine_par) 28 | self.relu = nn.ReLU(inplace=True) 29 | self.conv2 = conv3x3(planes, planes) 30 | self.bn2 = nn.BatchNorm2d(planes, affine = affine_par) 31 | self.downsample = downsample 32 | self.stride = stride 33 | 34 | def forward(self, x): 35 | residual = x 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | 44 | if self.downsample is not None: 45 | residual = self.downsample(x) 46 | 47 | out += residual 48 | out = self.relu(out) 49 | 50 | return out 51 | 52 | 53 | class Bottleneck(nn.Module): 54 | expansion = 4 55 | 56 | def __init__(self, inplanes, planes, stride=1, dilation_ = 1, downsample=None): 57 | super(Bottleneck, self).__init__() 58 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change 59 | self.bn1 = nn.BatchNorm2d(planes,affine = affine_par) 60 | if learnable_bn_weights == False: 61 | for i in self.bn1.parameters(): 62 | i.requires_grad = False 63 | padding = 1 64 | if dilation_ == 2: 65 | padding = 2 66 | elif dilation_ == 4: 67 | padding = 4 68 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change 69 
| padding=padding, bias=False, dilation = dilation_) 70 | self.bn2 = nn.BatchNorm2d(planes,affine = affine_par) 71 | if learnable_bn_weights == False: 72 | for i in self.bn2.parameters(): 73 | i.requires_grad = False 74 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 75 | self.bn3 = nn.BatchNorm2d(planes * 4, affine = affine_par) 76 | if learnable_bn_weights == False: 77 | for i in self.bn3.parameters(): 78 | i.requires_grad = False 79 | self.relu = nn.ReLU(inplace=True) 80 | self.downsample = downsample 81 | self.stride = stride 82 | 83 | 84 | 85 | def forward(self, x): 86 | residual = x 87 | 88 | out = self.conv1(x) 89 | out = self.bn1(out) 90 | out = self.relu(out) 91 | 92 | out = self.conv2(out) 93 | out = self.bn2(out) 94 | out = self.relu(out) 95 | 96 | out = self.conv3(out) 97 | out = self.bn3(out) 98 | 99 | if self.downsample is not None: 100 | residual = self.downsample(x) 101 | 102 | out += residual 103 | out = self.relu(out) 104 | 105 | return out 106 | 107 | class Classifier_Module(nn.Module): 108 | 109 | def __init__(self,dilation_series,padding_series,n_classes): 110 | super(Classifier_Module, self).__init__() 111 | self.conv2d_list = nn.ModuleList() 112 | for dilation,padding in zip(dilation_series,padding_series): 113 | self.conv2d_list.append(nn.Conv2d(2048,n_classes,kernel_size=3,stride=1, padding =padding, dilation = dilation,bias = True)) 114 | 115 | for m in self.conv2d_list: 116 | m.weight.data.normal_(0, 0.01) 117 | 118 | 119 | def forward(self, x): 120 | out = self.conv2d_list[0](x) 121 | for i in range(len(self.conv2d_list)-1): 122 | out += self.conv2d_list[i+1](x) 123 | return out 124 | 125 | 126 | 127 | class ResNet(nn.Module): 128 | def __init__(self, block, layers, n_classes): 129 | self.inplanes = 64 130 | super(ResNet, self).__init__() 131 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 132 | bias=False) 133 | self.bn1 = nn.BatchNorm2d(64,affine = affine_par) 134 | if learnable_bn_weights == False: 135 | for i in self.bn1.parameters(): 136 | i.requires_grad = False 137 | self.relu = nn.ReLU(inplace=True) 138 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True) # change 139 | self.layer1 = self._make_layer(block, 64, layers[0]) 140 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 141 | self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation__ = 2) 142 | self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation__ = 4) 143 | self.layer5 = self._make_pred_layer(Classifier_Module, [6,12,18,24],[6,12,18,24],n_classes) # ASPP 144 | 145 | for m in self.modules(): 146 | if isinstance(m, nn.Conv2d): 147 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 148 | m.weight.data.normal_(0, 0.01) 149 | elif isinstance(m, nn.BatchNorm2d): 150 | m.weight.data.fill_(1) 151 | m.bias.data.zero_() 152 | 153 | def _make_layer(self, block, planes, blocks, stride=1,dilation__ = 1): 154 | downsample = None 155 | if stride != 1 or self.inplanes != planes * block.expansion or dilation__ == 2 or dilation__ == 4: 156 | downsample = nn.Sequential( 157 | nn.Conv2d(self.inplanes, planes * block.expansion, 158 | kernel_size=1, stride=stride, bias=False), 159 | nn.BatchNorm2d(planes * block.expansion,affine = affine_par), 160 | ) 161 | if learnable_bn_weights == False: 162 | for i in downsample._modules['1'].parameters(): 163 | i.requires_grad = False 164 | layers = [] 165 | layers.append(block(self.inplanes, planes, stride,dilation_=dilation__, downsample = 
downsample )) 166 | self.inplanes = planes * block.expansion 167 | for i in range(1, blocks): 168 | layers.append(block(self.inplanes, planes,dilation_=dilation__)) 169 | 170 | return nn.Sequential(*layers) 171 | 172 | def _make_pred_layer(self,block, dilation_series, padding_series,n_classes): 173 | return block(dilation_series,padding_series,n_classes) 174 | 175 | def forward(self, x): 176 | x = self.conv1(x) 177 | x = self.bn1(x) 178 | x = self.relu(x) 179 | x = self.maxpool(x) 180 | x = self.layer1(x) 181 | x = self.layer2(x) 182 | x = self.layer3(x) 183 | x = self.layer4(x) 184 | x = self.layer5(x) 185 | 186 | return x 187 | 188 | class deeplab_seg(nn.Module): 189 | def __init__(self,n_classes=19): 190 | super(deeplab_seg,self).__init__() 191 | self.Scale = ResNet(Bottleneck,[3, 4, 23, 3],n_classes) 192 | 193 | def forward(self,x): 194 | input_size = x.size()[2:] # x: [batch_size, 3, h, w] 195 | out = self.Scale(x) # for original scale 196 | out = F.interpolate(out, size=input_size, mode='bilinear', align_corners=True) 197 | return out 198 | 199 | def init_resnet101_params(self, resnet101): 200 | initial_state_dict = resnet101.state_dict() 201 | new_state_dict = OrderedDict() 202 | for k, v in initial_state_dict.items(): 203 | k = 'Scale.' + k 204 | new_state_dict[k] = v 205 | return new_state_dict 206 | 207 | # if you want to load from downloaded pretrained model: 208 | # model_path: path to the downloaded model 209 | # def init_resnet101_params(self, model_path): 210 | # saved_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) 211 | # new_state_dict = OrderedDict() 212 | # for k, v in saved_state_dict.items(): 213 | # k = 'Scale.' + k 214 | # new_state_dict[k] = v 215 | # return new_state_dict 216 | 217 | 218 | 219 | -------------------------------------------------------------------------------- /ptsemseg/models/dispnet_depth.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/ClementPinard/SfmLearner-Pytorch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn.init import xavier_uniform_, zeros_ 7 | 8 | # DispNetS for depth 9 | 10 | def downsample_conv(in_planes, out_planes, kernel_size=3): 11 | return nn.Sequential( 12 | nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=2, padding=(kernel_size-1)//2), 13 | nn.ReLU(inplace=True), 14 | nn.Conv2d(out_planes, out_planes, kernel_size=kernel_size, padding=(kernel_size-1)//2), 15 | nn.ReLU(inplace=True) 16 | ) 17 | 18 | 19 | def predict_disp(in_planes): 20 | return nn.Sequential( 21 | nn.Conv2d(in_planes, 1, kernel_size=1, padding=0), 22 | nn.Sigmoid() 23 | ) 24 | 25 | 26 | def conv(in_planes, out_planes): 27 | return nn.Sequential( 28 | nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=1), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | 33 | def upconv(in_planes, out_planes): 34 | return nn.Sequential( 35 | nn.ConvTranspose2d(in_planes, out_planes, kernel_size=3, stride=2, padding=1, output_padding=1), 36 | nn.ReLU(inplace=True) 37 | ) 38 | 39 | 40 | def crop_like(input, ref): 41 | assert(input.size(2) >= ref.size(2) and input.size(3) >= ref.size(3)) 42 | return input[:, :, :ref.size(2), :ref.size(3)] 43 | 44 | 45 | class dispnet_depth(nn.Module): 46 | 47 | def __init__(self, alpha=10, beta=0.01): 48 | super(dispnet_depth, self).__init__() 49 | 50 | self.alpha = alpha 51 | self.beta = beta 52 | 53 | conv_planes = [32, 64, 128, 256, 512, 512, 512] 54 | self.conv1 = 
downsample_conv(3, conv_planes[0], kernel_size=7) 55 | self.conv2 = downsample_conv(conv_planes[0], conv_planes[1], kernel_size=5) 56 | self.conv3 = downsample_conv(conv_planes[1], conv_planes[2]) 57 | self.conv4 = downsample_conv(conv_planes[2], conv_planes[3]) 58 | self.conv5 = downsample_conv(conv_planes[3], conv_planes[4]) 59 | self.conv6 = downsample_conv(conv_planes[4], conv_planes[5]) 60 | self.conv7 = downsample_conv(conv_planes[5], conv_planes[6]) 61 | 62 | upconv_planes = [512, 512, 256, 128, 64, 32, 16] 63 | self.upconv7 = upconv(conv_planes[6], upconv_planes[0]) 64 | self.upconv6 = upconv(upconv_planes[0], upconv_planes[1]) 65 | self.upconv5 = upconv(upconv_planes[1], upconv_planes[2]) 66 | self.upconv4 = upconv(upconv_planes[2], upconv_planes[3]) 67 | self.upconv3 = upconv(upconv_planes[3], upconv_planes[4]) 68 | self.upconv2 = upconv(upconv_planes[4], upconv_planes[5]) 69 | self.upconv1 = upconv(upconv_planes[5], upconv_planes[6]) 70 | 71 | self.iconv7 = conv(upconv_planes[0] + conv_planes[5], upconv_planes[0]) 72 | self.iconv6 = conv(upconv_planes[1] + conv_planes[4], upconv_planes[1]) 73 | self.iconv5 = conv(upconv_planes[2] + conv_planes[3], upconv_planes[2]) 74 | self.iconv4 = conv(upconv_planes[3] + conv_planes[2], upconv_planes[3]) 75 | self.iconv3 = conv(1 + upconv_planes[4] + conv_planes[1], upconv_planes[4]) 76 | self.iconv2 = conv(1 + upconv_planes[5] + conv_planes[0], upconv_planes[5]) 77 | self.iconv1 = conv(1 + upconv_planes[6], upconv_planes[6]) 78 | 79 | self.predict_disp4 = predict_disp(upconv_planes[3]) 80 | self.predict_disp3 = predict_disp(upconv_planes[4]) 81 | self.predict_disp2 = predict_disp(upconv_planes[5]) 82 | self.predict_disp1 = predict_disp(upconv_planes[6]) 83 | 84 | def init_weights(self): 85 | for m in self.modules(): 86 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): 87 | xavier_uniform_(m.weight) 88 | if m.bias is not None: 89 | zeros_(m.bias) 90 | 91 | def forward(self, x): 92 | out_conv1 = self.conv1(x) 93 | out_conv2 = self.conv2(out_conv1) 94 | out_conv3 = self.conv3(out_conv2) 95 | out_conv4 = self.conv4(out_conv3) 96 | out_conv5 = self.conv5(out_conv4) 97 | out_conv6 = self.conv6(out_conv5) 98 | out_conv7 = self.conv7(out_conv6) 99 | 100 | out_upconv7 = crop_like(self.upconv7(out_conv7), out_conv6) 101 | concat7 = torch.cat((out_upconv7, out_conv6), 1) 102 | out_iconv7 = self.iconv7(concat7) 103 | 104 | out_upconv6 = crop_like(self.upconv6(out_iconv7), out_conv5) 105 | concat6 = torch.cat((out_upconv6, out_conv5), 1) 106 | out_iconv6 = self.iconv6(concat6) 107 | 108 | out_upconv5 = crop_like(self.upconv5(out_iconv6), out_conv4) 109 | concat5 = torch.cat((out_upconv5, out_conv4), 1) 110 | out_iconv5 = self.iconv5(concat5) 111 | 112 | out_upconv4 = crop_like(self.upconv4(out_iconv5), out_conv3) 113 | concat4 = torch.cat((out_upconv4, out_conv3), 1) 114 | out_iconv4 = self.iconv4(concat4) 115 | disp4 = self.alpha * self.predict_disp4(out_iconv4) + self.beta 116 | 117 | out_upconv3 = crop_like(self.upconv3(out_iconv4), out_conv2) 118 | disp4_up = crop_like(F.interpolate(disp4, scale_factor=2, mode='bilinear', align_corners=True), out_conv2) 119 | concat3 = torch.cat((out_upconv3, out_conv2, disp4_up), 1) 120 | out_iconv3 = self.iconv3(concat3) 121 | disp3 = self.alpha * self.predict_disp3(out_iconv3) + self.beta 122 | 123 | out_upconv2 = crop_like(self.upconv2(out_iconv3), out_conv1) 124 | disp3_up = crop_like(F.interpolate(disp3, scale_factor=2, mode='bilinear', align_corners=True), out_conv1) 125 | concat2 = 
torch.cat((out_upconv2, out_conv1, disp3_up), 1) 126 | out_iconv2 = self.iconv2(concat2) 127 | disp2 = self.alpha * self.predict_disp2(out_iconv2) + self.beta 128 | 129 | out_upconv1 = crop_like(self.upconv1(out_iconv2), x) 130 | disp2_up = crop_like(F.interpolate(disp2, scale_factor=2, mode='bilinear', align_corners=True), x) 131 | concat1 = torch.cat((out_upconv1, disp2_up), 1) 132 | out_iconv1 = self.iconv1(concat1) 133 | disp1 = self.alpha * self.predict_disp1(out_iconv1) + self.beta 134 | 135 | 136 | return disp1 137 | -------------------------------------------------------------------------------- /ptsemseg/models/dispnet_seg.py: -------------------------------------------------------------------------------- 1 | # modified from https://github.com/ClementPinard/SfmLearner-Pytorch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn.init import xavier_uniform_, zeros_ 7 | 8 | ## DispNetS for segmentation 9 | 10 | def downsample_conv(in_planes, out_planes, kernel_size=3): 11 | return nn.Sequential( 12 | nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=2, padding=(kernel_size-1)//2), 13 | nn.ReLU(inplace=True), 14 | nn.Conv2d(out_planes, out_planes, kernel_size=kernel_size, padding=(kernel_size-1)//2), 15 | nn.ReLU(inplace=True) 16 | ) 17 | 18 | 19 | def predict_disp(in_planes, n_classes=19): 20 | return nn.Sequential( 21 | nn.Conv2d(in_planes, n_classes, kernel_size=1, padding=0), 22 | nn.ReLU(inplace=True) # different from dispnet for depth 23 | ) 24 | 25 | 26 | def conv(in_planes, out_planes): 27 | return nn.Sequential( 28 | nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=1), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | 33 | def upconv(in_planes, out_planes): 34 | return nn.Sequential( 35 | nn.ConvTranspose2d(in_planes, out_planes, kernel_size=3, stride=2, padding=1, output_padding=1), 36 | nn.ReLU(inplace=True) 37 | ) 38 | 39 | 40 | def crop_like(input, ref): 41 | assert(input.size(2) >= ref.size(2) and input.size(3) >= ref.size(3)) 42 | return input[:, :, :ref.size(2), :ref.size(3)] 43 | 44 | 45 | class dispnet_seg(nn.Module): 46 | 47 | def __init__(self, alpha=10.0, beta=0, n_classes=19): 48 | super(dispnet_seg, self).__init__() 49 | 50 | self.alpha = alpha 51 | self.beta = beta 52 | self.n_classes = n_classes 53 | 54 | conv_planes = [32, 64, 128, 256, 512, 512, 512] 55 | self.conv1 = downsample_conv(3, conv_planes[0], kernel_size=7) 56 | self.conv2 = downsample_conv(conv_planes[0], conv_planes[1], kernel_size=5) 57 | self.conv3 = downsample_conv(conv_planes[1], conv_planes[2]) 58 | self.conv4 = downsample_conv(conv_planes[2], conv_planes[3]) 59 | self.conv5 = downsample_conv(conv_planes[3], conv_planes[4]) 60 | self.conv6 = downsample_conv(conv_planes[4], conv_planes[5]) 61 | self.conv7 = downsample_conv(conv_planes[5], conv_planes[6]) 62 | 63 | upconv_planes = [512, 512, 256, 128, 64, 32, 16] 64 | self.upconv7 = upconv(conv_planes[6], upconv_planes[0]) 65 | self.upconv6 = upconv(upconv_planes[0], upconv_planes[1]) 66 | self.upconv5 = upconv(upconv_planes[1], upconv_planes[2]) 67 | self.upconv4 = upconv(upconv_planes[2], upconv_planes[3]) 68 | self.upconv3 = upconv(upconv_planes[3], upconv_planes[4]) 69 | self.upconv2 = upconv(upconv_planes[4], upconv_planes[5]) 70 | self.upconv1 = upconv(upconv_planes[5], upconv_planes[6]) 71 | 72 | self.iconv7 = conv(upconv_planes[0] + conv_planes[5], upconv_planes[0]) 73 | self.iconv6 = conv(upconv_planes[1] + conv_planes[4], upconv_planes[1]) 74 | self.iconv5 = 
conv(upconv_planes[2] + conv_planes[3], upconv_planes[2]) 75 | self.iconv4 = conv(upconv_planes[3] + conv_planes[2], upconv_planes[3]) 76 | self.iconv3 = conv(self.n_classes + upconv_planes[4] + conv_planes[1], upconv_planes[4]) 77 | self.iconv2 = conv(self.n_classes + upconv_planes[5] + conv_planes[0], upconv_planes[5]) 78 | self.iconv1 = conv(self.n_classes + upconv_planes[6], upconv_planes[6]) 79 | 80 | self.predict_disp4 = predict_disp(upconv_planes[3], n_classes=self.n_classes) 81 | self.predict_disp3 = predict_disp(upconv_planes[4], n_classes=self.n_classes) 82 | self.predict_disp2 = predict_disp(upconv_planes[5], n_classes=self.n_classes) 83 | self.predict_disp1 = predict_disp(upconv_planes[6], n_classes=self.n_classes) 84 | 85 | def init_weights(self): 86 | for m in self.modules(): 87 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d): 88 | xavier_uniform_(m.weight) 89 | if m.bias is not None: 90 | zeros_(m.bias) 91 | 92 | def forward(self, x): 93 | out_conv1 = self.conv1(x) 94 | out_conv2 = self.conv2(out_conv1) 95 | out_conv3 = self.conv3(out_conv2) 96 | out_conv4 = self.conv4(out_conv3) 97 | out_conv5 = self.conv5(out_conv4) 98 | out_conv6 = self.conv6(out_conv5) 99 | out_conv7 = self.conv7(out_conv6) 100 | 101 | out_upconv7 = crop_like(self.upconv7(out_conv7), out_conv6) 102 | concat7 = torch.cat((out_upconv7, out_conv6), 1) 103 | out_iconv7 = self.iconv7(concat7) 104 | 105 | out_upconv6 = crop_like(self.upconv6(out_iconv7), out_conv5) 106 | concat6 = torch.cat((out_upconv6, out_conv5), 1) 107 | out_iconv6 = self.iconv6(concat6) 108 | 109 | out_upconv5 = crop_like(self.upconv5(out_iconv6), out_conv4) 110 | concat5 = torch.cat((out_upconv5, out_conv4), 1) 111 | out_iconv5 = self.iconv5(concat5) 112 | 113 | out_upconv4 = crop_like(self.upconv4(out_iconv5), out_conv3) 114 | concat4 = torch.cat((out_upconv4, out_conv3), 1) 115 | out_iconv4 = self.iconv4(concat4) 116 | disp4 = self.alpha * self.predict_disp4(out_iconv4) + self.beta 117 | 118 | out_upconv3 = crop_like(self.upconv3(out_iconv4), out_conv2) 119 | disp4_up = crop_like(F.interpolate(disp4, scale_factor=2, mode='bilinear', align_corners=True), out_conv2) 120 | concat3 = torch.cat((out_upconv3, out_conv2, disp4_up), 1) 121 | out_iconv3 = self.iconv3(concat3) 122 | disp3 = self.alpha * self.predict_disp3(out_iconv3) + self.beta 123 | 124 | out_upconv2 = crop_like(self.upconv2(out_iconv3), out_conv1) 125 | disp3_up = crop_like(F.interpolate(disp3, scale_factor=2, mode='bilinear', align_corners=True), out_conv1) 126 | concat2 = torch.cat((out_upconv2, out_conv1, disp3_up), 1) 127 | out_iconv2 = self.iconv2(concat2) 128 | disp2 = self.alpha * self.predict_disp2(out_iconv2) + self.beta 129 | 130 | out_upconv1 = crop_like(self.upconv1(out_iconv2), x) 131 | disp2_up = crop_like(F.interpolate(disp2, scale_factor=2, mode='bilinear', align_corners=True), x) 132 | concat1 = torch.cat((out_upconv1, disp2_up), 1) 133 | out_iconv1 = self.iconv1(concat1) 134 | disp1 = self.alpha * self.predict_disp1(out_iconv1) + self.beta 135 | 136 | 137 | return disp1 138 | -------------------------------------------------------------------------------- /ptsemseg/models/fcn_depth.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import functools 4 | 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from ptsemseg.models.utils import get_upsampling_weight 9 | 10 | # FCN 8s for depth 11 | class 
fcn_depth(nn.Module): 12 | def __init__(self): 13 | super(fcn_depth, self).__init__() 14 | 15 | self.conv_block1 = nn.Sequential( 16 | nn.Conv2d(3, 64, 3, padding=100), 17 | nn.ReLU(inplace=True), 18 | nn.Conv2d(64, 64, 3, padding=1), 19 | nn.ReLU(inplace=True), 20 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 21 | ) 22 | 23 | self.conv_block2 = nn.Sequential( 24 | nn.Conv2d(64, 128, 3, padding=1), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(128, 128, 3, padding=1), 27 | nn.ReLU(inplace=True), 28 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 29 | ) 30 | 31 | self.conv_block3 = nn.Sequential( 32 | nn.Conv2d(128, 256, 3, padding=1), 33 | nn.ReLU(inplace=True), 34 | nn.Conv2d(256, 256, 3, padding=1), 35 | nn.ReLU(inplace=True), 36 | nn.Conv2d(256, 256, 3, padding=1), 37 | nn.ReLU(inplace=True), 38 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 39 | ) 40 | 41 | self.conv_block4 = nn.Sequential( 42 | nn.Conv2d(256, 512, 3, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(512, 512, 3, padding=1), 45 | nn.ReLU(inplace=True), 46 | nn.Conv2d(512, 512, 3, padding=1), 47 | nn.ReLU(inplace=True), 48 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 49 | ) 50 | 51 | self.conv_block5 = nn.Sequential( 52 | nn.Conv2d(512, 512, 3, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.Conv2d(512, 512, 3, padding=1), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(512, 512, 3, padding=1), 57 | nn.ReLU(inplace=True), 58 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 59 | ) 60 | 61 | self.classifier = nn.Sequential( 62 | nn.Conv2d(512, 2048, 7), 63 | nn.ReLU(inplace=True), 64 | # nn.Dropout2d(), 65 | nn.Conv2d(2048, 1024, 1), 66 | nn.ReLU(inplace=True), 67 | # nn.Dropout2d(), 68 | nn.Conv2d(1024, 64, 1), 69 | nn.ReLU(inplace=True), 70 | # nn.Dropout2d(), 71 | ) 72 | 73 | self.score_pool4 = nn.Conv2d(512, 64, 1) 74 | self.score_pool3 = nn.Conv2d(256, 64, 1) 75 | 76 | self.conv3 = nn.Conv2d(64, 1, 1, padding=0) 77 | self.relu = nn.ReLU(inplace=True) 78 | 79 | # deconvolution 80 | self.upscore2 = nn.ConvTranspose2d(64, 64, 4, 81 | stride=2, bias=False) 82 | self.upscore4 = nn.ConvTranspose2d(64, 64, 4, 83 | stride=2, bias=False) 84 | self.upscore8 = nn.ConvTranspose2d(64, 64, 16, 85 | stride=8, bias=False) 86 | 87 | for m in self.modules(): 88 | if isinstance(m, nn.ConvTranspose2d): 89 | m.weight.data.copy_(get_upsampling_weight(m.in_channels, 90 | m.out_channels, 91 | m.kernel_size[0])) 92 | 93 | 94 | def forward(self, x): 95 | conv1 = self.conv_block1(x) 96 | conv2 = self.conv_block2(conv1) 97 | conv3 = self.conv_block3(conv2) 98 | conv4 = self.conv_block4(conv3) 99 | conv5 = self.conv_block5(conv4) 100 | 101 | score = self.classifier(conv5) 102 | 103 | upscore2 = self.upscore2(score) 104 | score_pool4c = self.score_pool4(conv4)[:, :, 5:5+upscore2.size()[2], 105 | 5:5+upscore2.size()[3]] 106 | upscore_pool4 = self.upscore4(upscore2 + score_pool4c) 107 | 108 | score_pool3c = self.score_pool3(conv3)[:, :, 9:9+upscore_pool4.size()[2], 109 | 9:9+upscore_pool4.size()[3]] 110 | 111 | out = self.upscore8(score_pool3c + upscore_pool4)[:, :, 31:31+x.size()[2], 112 | 31:31+x.size()[3]] 113 | out = self.conv3(out) 114 | out = self.relu(out) 115 | return out 116 | 117 | 118 | 119 | 120 | def init_vgg16_params(self, vgg16, copy_fc8=True): 121 | blocks = [ 122 | self.conv_block1, 123 | self.conv_block2, 124 | self.conv_block3, 125 | self.conv_block4, 126 | self.conv_block5, 127 | ] 128 | 129 | ranges = [[0, 4], [5, 9], [10, 16], [17, 23], [24, 29]] 130 | features = list(vgg16.features.children()) 131 | 132 | for idx, conv_block in 
enumerate(blocks): 133 | for l1, l2 in zip(features[ranges[idx][0] : ranges[idx][1]], conv_block): 134 | if isinstance(l1, nn.Conv2d) and isinstance(l2, nn.Conv2d): 135 | assert l1.weight.size() == l2.weight.size() 136 | assert l1.bias.size() == l2.bias.size() 137 | l2.weight.data = l1.weight.data 138 | l2.bias.data = l1.bias.data -------------------------------------------------------------------------------- /ptsemseg/models/fcn_seg.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import functools 4 | 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from ptsemseg.models.utils import get_upsampling_weight 9 | from ptsemseg.loss import cross_entropy2d 10 | 11 | # FCN 8s 12 | # for segmentation 13 | class fcn_seg(nn.Module): 14 | def __init__(self, n_classes=21, learned_billinear=True): 15 | super(fcn_seg, self).__init__() 16 | self.learned_billinear = learned_billinear 17 | self.n_classes = n_classes 18 | self.loss = functools.partial(cross_entropy2d, 19 | size_average=True) 20 | 21 | self.conv_block1 = nn.Sequential( 22 | nn.Conv2d(3, 64, 3, padding=100), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(64, 64, 3, padding=1), 25 | nn.ReLU(inplace=True), 26 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 27 | ) 28 | 29 | self.conv_block2 = nn.Sequential( 30 | nn.Conv2d(64, 128, 3, padding=1), 31 | nn.ReLU(inplace=True), 32 | nn.Conv2d(128, 128, 3, padding=1), 33 | nn.ReLU(inplace=True), 34 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 35 | ) 36 | 37 | self.conv_block3 = nn.Sequential( 38 | nn.Conv2d(128, 256, 3, padding=1), 39 | nn.ReLU(inplace=True), 40 | nn.Conv2d(256, 256, 3, padding=1), 41 | nn.ReLU(inplace=True), 42 | nn.Conv2d(256, 256, 3, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 45 | ) 46 | 47 | self.conv_block4 = nn.Sequential( 48 | nn.Conv2d(256, 512, 3, padding=1), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(512, 512, 3, padding=1), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(512, 512, 3, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 55 | ) 56 | 57 | self.conv_block5 = nn.Sequential( 58 | nn.Conv2d(512, 512, 3, padding=1), 59 | nn.ReLU(inplace=True), 60 | nn.Conv2d(512, 512, 3, padding=1), 61 | nn.ReLU(inplace=True), 62 | nn.Conv2d(512, 512, 3, padding=1), 63 | nn.ReLU(inplace=True), 64 | nn.MaxPool2d(2, stride=2, ceil_mode=True), 65 | ) 66 | 67 | self.classifier = nn.Sequential( 68 | nn.Conv2d(512, 4096, 7), 69 | nn.ReLU(inplace=True), 70 | nn.Dropout2d(), 71 | nn.Conv2d(4096, 4096, 1), 72 | nn.ReLU(inplace=True), 73 | nn.Dropout2d(), 74 | nn.Conv2d(4096, self.n_classes, 1), 75 | ) 76 | 77 | self.score_pool4 = nn.Conv2d(512, self.n_classes, 1) 78 | self.score_pool3 = nn.Conv2d(256, self.n_classes, 1) 79 | 80 | if self.learned_billinear: 81 | # deconvolution 82 | self.upscore2 = nn.ConvTranspose2d(self.n_classes, self.n_classes, 4, 83 | stride=2, bias=False) 84 | self.upscore4 = nn.ConvTranspose2d(self.n_classes, self.n_classes, 4, 85 | stride=2, bias=False) 86 | self.upscore8 = nn.ConvTranspose2d(self.n_classes, self.n_classes, 16, 87 | stride=8, bias=False) 88 | 89 | for m in self.modules(): 90 | if isinstance(m, nn.ConvTranspose2d): 91 | m.weight.data.copy_(get_upsampling_weight(m.in_channels, 92 | m.out_channels, 93 | m.kernel_size[0])) 94 | 95 | 96 | def forward(self, x): 97 | conv1 = self.conv_block1(x) 98 | conv2 = self.conv_block2(conv1) 99 | conv3 = 
self.conv_block3(conv2) 100 | conv4 = self.conv_block4(conv3) 101 | conv5 = self.conv_block5(conv4) 102 | 103 | score = self.classifier(conv5) 104 | 105 | if self.learned_billinear: 106 | upscore2 = self.upscore2(score) 107 | score_pool4c = self.score_pool4(conv4)[:, :, 5:5+upscore2.size()[2], 108 | 5:5+upscore2.size()[3]] 109 | upscore_pool4 = self.upscore4(upscore2 + score_pool4c) 110 | 111 | score_pool3c = self.score_pool3(conv3)[:, :, 9:9+upscore_pool4.size()[2], 112 | 9:9+upscore_pool4.size()[3]] 113 | 114 | out = self.upscore8(score_pool3c + upscore_pool4)[:, :, 31:31+x.size()[2], 115 | 31:31+x.size()[3]] 116 | return out.contiguous() 117 | 118 | 119 | else: 120 | score_pool4 = self.score_pool4(conv4) 121 | score_pool3 = self.score_pool3(conv3) 122 | score = F.upsample(score, score_pool4.size()[2:]) 123 | score += score_pool4 124 | score = F.upsample(score, score_pool3.size()[2:]) 125 | score += score_pool3 126 | out = F.upsample(score, x.size()[2:]) 127 | 128 | return out 129 | 130 | def init_vgg16_params(self, vgg16, copy_fc8=True): 131 | blocks = [ 132 | self.conv_block1, 133 | self.conv_block2, 134 | self.conv_block3, 135 | self.conv_block4, 136 | self.conv_block5, 137 | ] 138 | 139 | ranges = [[0, 4], [5, 9], [10, 16], [17, 23], [24, 29]] 140 | features = list(vgg16.features.children()) 141 | 142 | for idx, conv_block in enumerate(blocks): 143 | for l1, l2 in zip(features[ranges[idx][0] : ranges[idx][1]], conv_block): 144 | if isinstance(l1, nn.Conv2d) and isinstance(l2, nn.Conv2d): 145 | assert l1.weight.size() == l2.weight.size() 146 | assert l1.bias.size() == l2.bias.size() 147 | l2.weight.data = l1.weight.data 148 | l2.bias.data = l1.bias.data 149 | for i1, i2 in zip([0, 3], [0, 3]): 150 | l1 = vgg16.classifier[i1] 151 | l2 = self.classifier[i2] 152 | l2.weight.data = l1.weight.data.view(l2.weight.size()) 153 | l2.bias.data = l1.bias.data.view(l2.bias.size()) 154 | n_class = self.classifier[6].weight.size()[0] 155 | if copy_fc8: 156 | l1 = vgg16.classifier[6] 157 | l2 = self.classifier[6] 158 | l2.weight.data = l1.weight.data[:n_class, :].view(l2.weight.size()) 159 | l2.bias.data = l1.bias.data[:n_class] 160 | -------------------------------------------------------------------------------- /ptsemseg/models/fcrn_depth.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/XPFly1989/FCRN 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional 6 | import math 7 | 8 | ### fcrn for depth ### 9 | # based on ResNet50 10 | class Bottleneck(nn.Module): 11 | expansion = 4 12 | 13 | def __init__(self, inplanes, planes, stride=1, downsample=None): 14 | super(Bottleneck, self).__init__() 15 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 16 | self.bn1 = nn.BatchNorm2d(planes) 17 | 18 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, 19 | bias=False) 20 | self.bn2 = nn.BatchNorm2d(planes) 21 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 22 | self.bn3 = nn.BatchNorm2d(planes * 4) 23 | self.relu = nn.ReLU(inplace=True) 24 | self.downsample = downsample 25 | self.stride = stride 26 | 27 | def forward(self, x): 28 | residual = x 29 | 30 | out = self.conv1(x) 31 | out = self.bn1(out) 32 | out = self.relu(out) 33 | 34 | out = self.conv2(out) 35 | out = self.bn2(out) 36 | out = self.relu(out) 37 | 38 | out = self.conv3(out) 39 | out = self.bn3(out) 40 | 41 | if self.downsample is not None: 42 | residual = self.downsample(x) 43 | 
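# skip connection: the optional 1x1 "downsample" projection above reshapes the
# identity branch to match the bottleneck output, so the element-wise add below
# is the standard ResNet residual sum, applied before the final ReLU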
44 | out += residual 45 | out = self.relu(out) 46 | 47 | return out 48 | 49 | 50 | class UpProject(nn.Module): 51 | 52 | def __init__(self, in_channels, out_channels): 53 | super(UpProject, self).__init__() 54 | 55 | self.conv1_1 = nn.Conv2d(in_channels, out_channels, 3) 56 | self.conv1_2 = nn.Conv2d(in_channels, out_channels, (2, 3)) 57 | self.conv1_3 = nn.Conv2d(in_channels, out_channels, (3, 2)) 58 | self.conv1_4 = nn.Conv2d(in_channels, out_channels, 2) 59 | 60 | self.conv2_1 = nn.Conv2d(in_channels, out_channels, 3) 61 | self.conv2_2 = nn.Conv2d(in_channels, out_channels, (2, 3)) 62 | self.conv2_3 = nn.Conv2d(in_channels, out_channels, (3, 2)) 63 | self.conv2_4 = nn.Conv2d(in_channels, out_channels, 2) 64 | 65 | self.bn1_1 = nn.BatchNorm2d(out_channels) 66 | self.bn1_2 = nn.BatchNorm2d(out_channels) 67 | 68 | self.relu = nn.ReLU(inplace=True) 69 | 70 | self.conv3 = nn.Conv2d(out_channels, out_channels, 3, padding=1) 71 | 72 | self.bn2 = nn.BatchNorm2d(out_channels) 73 | 74 | def forward(self, x): 75 | batch_size = x.size()[0] 76 | out1_1 = self.conv1_1(nn.functional.pad(x, (1, 1, 1, 1))) 77 | #out1_2 = self.conv1_2(nn.functional.pad(x, (1, 1, 0, 1)))#right interleaving padding 78 | out1_2 = self.conv1_2(nn.functional.pad(x, (1, 1, 1, 0)))#author's interleaving pading in github 79 | #out1_3 = self.conv1_3(nn.functional.pad(x, (0, 1, 1, 1)))#right interleaving padding 80 | out1_3 = self.conv1_3(nn.functional.pad(x, (1, 0, 1, 1)))#author's interleaving pading in github 81 | #out1_4 = self.conv1_4(nn.functional.pad(x, (0, 1, 0, 1)))#right interleaving padding 82 | out1_4 = self.conv1_4(nn.functional.pad(x, (1, 0, 1, 0)))#author's interleaving pading in github 83 | 84 | out2_1 = self.conv2_1(nn.functional.pad(x, (1, 1, 1, 1))) 85 | #out2_2 = self.conv2_2(nn.functional.pad(x, (1, 1, 0, 1)))#right interleaving padding 86 | out2_2 = self.conv2_2(nn.functional.pad(x, (1, 1, 1, 0)))#author's interleaving pading in github 87 | #out2_3 = self.conv2_3(nn.functional.pad(x, (0, 1, 1, 1)))#right interleaving padding 88 | out2_3 = self.conv2_3(nn.functional.pad(x, (1, 0, 1, 1)))#author's interleaving pading in github 89 | #out2_4 = self.conv2_4(nn.functional.pad(x, (0, 1, 0, 1)))#right interleaving padding 90 | out2_4 = self.conv2_4(nn.functional.pad(x, (1, 0, 1, 0)))#author's interleaving pading in github 91 | 92 | height = out1_1.size()[2] 93 | width = out1_1.size()[3] 94 | 95 | out1_1_2 = torch.stack((out1_1, out1_2), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 96 | batch_size, -1, height, width * 2) 97 | out1_3_4 = torch.stack((out1_3, out1_4), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 98 | batch_size, -1, height, width * 2) 99 | 100 | out1_1234 = torch.stack((out1_1_2, out1_3_4), dim=-3).permute(0, 1, 3, 2, 4).contiguous().view( 101 | batch_size, -1, height * 2, width * 2) 102 | 103 | out2_1_2 = torch.stack((out2_1, out2_2), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 104 | batch_size, -1, height, width * 2) 105 | out2_3_4 = torch.stack((out2_3, out2_4), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 106 | batch_size, -1, height, width * 2) 107 | 108 | out2_1234 = torch.stack((out2_1_2, out2_3_4), dim=-3).permute(0, 1, 3, 2, 4).contiguous().view( 109 | batch_size, -1, height * 2, width * 2) 110 | 111 | out1 = self.bn1_1(out1_1234) 112 | out1 = self.relu(out1) 113 | out1 = self.conv3(out1) 114 | out1 = self.bn2(out1) 115 | 116 | out2 = self.bn1_2(out2_1234) 117 | 118 | out = out1 + out2 119 | out = self.relu(out) 120 | 121 | return out 122 | 123 | 124 | class 
fcrn_depth(nn.Module): 125 | 126 | def __init__(self): 127 | super(fcrn_depth, self).__init__() 128 | self.inplanes = 64 129 | # self.n_classes = n_classes 130 | 131 | # ResNet with out avrgpool & fc 132 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 133 | self.bn1 = nn.BatchNorm2d(64) 134 | self.relu = nn.ReLU(inplace=True) 135 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 136 | self.layer1 = self._make_layer(Bottleneck, 64, 3, stride=1) 137 | self.layer2 = self._make_layer(Bottleneck, 128, 4, stride=2) 138 | self.layer3 = self._make_layer(Bottleneck, 256, 6, stride=2) 139 | self.layer4 = self._make_layer(Bottleneck, 512, 3, stride=2) 140 | 141 | # Up-Conv layers 142 | self.conv2 = nn.Conv2d(2048, 1024, kernel_size=1, bias=False) 143 | self.bn2 = nn.BatchNorm2d(1024) 144 | 145 | self.up1 = self._make_upproj_layer(UpProject, 1024, 512) 146 | self.up2 = self._make_upproj_layer(UpProject, 512, 256) 147 | self.up3 = self._make_upproj_layer(UpProject, 256, 128) 148 | self.up4 = self._make_upproj_layer(UpProject, 128, 64) 149 | 150 | self.drop = nn.Dropout2d() 151 | 152 | self.conv3 = nn.Conv2d(64, 1, 3, padding=1) 153 | 154 | # initialize 155 | if True: 156 | for m in self.modules(): 157 | if isinstance(m, nn.Conv2d): 158 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 159 | m.weight.data.normal_(0, math.sqrt(2. / n)) 160 | elif isinstance(m, nn.BatchNorm2d): 161 | m.weight.data.fill_(1) 162 | m.bias.data.zero_() 163 | 164 | def _make_layer(self, block, planes, blocks, stride=1): 165 | downsample = None 166 | if stride != 1 or self.inplanes != planes * block.expansion: 167 | downsample = nn.Sequential( 168 | nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, 169 | stride=stride, bias=False), 170 | nn.BatchNorm2d(planes * block.expansion), 171 | ) 172 | 173 | layers = [] 174 | layers.append(block(self.inplanes, planes, stride, downsample)) 175 | self.inplanes = planes * block.expansion 176 | for i in range(1, blocks): 177 | layers.append(block(self.inplanes, planes)) 178 | 179 | return nn.Sequential(*layers) 180 | 181 | def _make_upproj_layer(self, block, in_channels, out_channels): 182 | return block(in_channels, out_channels) 183 | 184 | def forward(self, x): 185 | inp_shape = x.shape[2:] 186 | 187 | x = self.conv1(x) 188 | x = self.bn1(x) 189 | x = self.relu(x) 190 | x = self.maxpool(x) 191 | 192 | x = self.layer1(x) 193 | x = self.layer2(x) 194 | x = self.layer3(x) 195 | x = self.layer4(x) 196 | 197 | x = self.conv2(x) 198 | x = self.bn2(x) 199 | 200 | x = self.up1(x) 201 | x = self.up2(x) 202 | x = self.up3(x) 203 | x = self.up4(x) 204 | 205 | x = self.drop(x) 206 | 207 | x = self.conv3(x) 208 | x = self.relu(x) 209 | 210 | x = nn.functional.interpolate(x, size=inp_shape, mode='bilinear', align_corners=True) 211 | 212 | return x 213 | 214 | def init_resnet50_params(self, resnet50): 215 | initial_state_dict = resnet50.state_dict() 216 | return initial_state_dict 217 | 218 | # if you want to load from downloaded pretrained model: 219 | # def init_resnet50_params(self, model_path): 220 | # init_state_dict = torch.load(model_path) 221 | # return init_state_dict -------------------------------------------------------------------------------- /ptsemseg/models/fcrn_seg.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/XPFly1989/FCRN 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional 6 | import math 7 | 8 | ### fcrn for 
semantic segmentation ### 9 | # based on ResNet50 10 | 11 | class Bottleneck(nn.Module): 12 | expansion = 4 13 | 14 | def __init__(self, inplanes, planes, stride=1, downsample=None): 15 | super(Bottleneck, self).__init__() 16 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 17 | self.bn1 = nn.BatchNorm2d(planes) 18 | 19 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, 20 | bias=False) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 23 | self.bn3 = nn.BatchNorm2d(planes * 4) 24 | self.relu = nn.ReLU(inplace=True) 25 | self.downsample = downsample 26 | self.stride = stride 27 | 28 | def forward(self, x): 29 | residual = x 30 | 31 | out = self.conv1(x) 32 | out = self.bn1(out) 33 | out = self.relu(out) 34 | 35 | out = self.conv2(out) 36 | out = self.bn2(out) 37 | out = self.relu(out) 38 | 39 | out = self.conv3(out) 40 | out = self.bn3(out) 41 | 42 | if self.downsample is not None: 43 | residual = self.downsample(x) 44 | 45 | out += residual 46 | out = self.relu(out) 47 | 48 | return out 49 | 50 | 51 | class UpProject(nn.Module): 52 | 53 | def __init__(self, in_channels, out_channels): 54 | super(UpProject, self).__init__() 55 | 56 | self.conv1_1 = nn.Conv2d(in_channels, out_channels, 3) 57 | self.conv1_2 = nn.Conv2d(in_channels, out_channels, (2, 3)) 58 | self.conv1_3 = nn.Conv2d(in_channels, out_channels, (3, 2)) 59 | self.conv1_4 = nn.Conv2d(in_channels, out_channels, 2) 60 | 61 | self.conv2_1 = nn.Conv2d(in_channels, out_channels, 3) 62 | self.conv2_2 = nn.Conv2d(in_channels, out_channels, (2, 3)) 63 | self.conv2_3 = nn.Conv2d(in_channels, out_channels, (3, 2)) 64 | self.conv2_4 = nn.Conv2d(in_channels, out_channels, 2) 65 | 66 | self.bn1_1 = nn.BatchNorm2d(out_channels) 67 | self.bn1_2 = nn.BatchNorm2d(out_channels) 68 | 69 | self.relu = nn.ReLU(inplace=True) 70 | 71 | self.conv3 = nn.Conv2d(out_channels, out_channels, 3, padding=1) 72 | 73 | self.bn2 = nn.BatchNorm2d(out_channels) 74 | 75 | def forward(self, x): 76 | batch_size = x.size()[0] 77 | out1_1 = self.conv1_1(nn.functional.pad(x, (1, 1, 1, 1))) 78 | #out1_2 = self.conv1_2(nn.functional.pad(x, (1, 1, 0, 1)))#right interleaving padding 79 | out1_2 = self.conv1_2(nn.functional.pad(x, (1, 1, 1, 0)))#author's interleaving pading in github 80 | #out1_3 = self.conv1_3(nn.functional.pad(x, (0, 1, 1, 1)))#right interleaving padding 81 | out1_3 = self.conv1_3(nn.functional.pad(x, (1, 0, 1, 1)))#author's interleaving pading in github 82 | #out1_4 = self.conv1_4(nn.functional.pad(x, (0, 1, 0, 1)))#right interleaving padding 83 | out1_4 = self.conv1_4(nn.functional.pad(x, (1, 0, 1, 0)))#author's interleaving pading in github 84 | 85 | out2_1 = self.conv2_1(nn.functional.pad(x, (1, 1, 1, 1))) 86 | #out2_2 = self.conv2_2(nn.functional.pad(x, (1, 1, 0, 1)))#right interleaving padding 87 | out2_2 = self.conv2_2(nn.functional.pad(x, (1, 1, 1, 0)))#author's interleaving pading in github 88 | #out2_3 = self.conv2_3(nn.functional.pad(x, (0, 1, 1, 1)))#right interleaving padding 89 | out2_3 = self.conv2_3(nn.functional.pad(x, (1, 0, 1, 1)))#author's interleaving pading in github 90 | #out2_4 = self.conv2_4(nn.functional.pad(x, (0, 1, 0, 1)))#right interleaving padding 91 | out2_4 = self.conv2_4(nn.functional.pad(x, (1, 0, 1, 0)))#author's interleaving pading in github 92 | 93 | height = out1_1.size()[2] 94 | width = out1_1.size()[3] 95 | 96 | out1_1_2 = torch.stack((out1_1, out1_2), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 
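# fast up-projection (as in FCRN): the stack/permute/view calls below interleave the
# four phase-shifted conv outputs, first out*_1 with out*_2 and out*_3 with out*_4
# along the width, then the two interleaved maps along the height, which doubles
# both spatial dimensions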
97 | batch_size, -1, height, width * 2) 98 | out1_3_4 = torch.stack((out1_3, out1_4), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 99 | batch_size, -1, height, width * 2) 100 | 101 | out1_1234 = torch.stack((out1_1_2, out1_3_4), dim=-3).permute(0, 1, 3, 2, 4).contiguous().view( 102 | batch_size, -1, height * 2, width * 2) 103 | 104 | out2_1_2 = torch.stack((out2_1, out2_2), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 105 | batch_size, -1, height, width * 2) 106 | out2_3_4 = torch.stack((out2_3, out2_4), dim=-3).permute(0, 1, 3, 4, 2).contiguous().view( 107 | batch_size, -1, height, width * 2) 108 | 109 | out2_1234 = torch.stack((out2_1_2, out2_3_4), dim=-3).permute(0, 1, 3, 2, 4).contiguous().view( 110 | batch_size, -1, height * 2, width * 2) 111 | 112 | out1 = self.bn1_1(out1_1234) 113 | out1 = self.relu(out1) 114 | out1 = self.conv3(out1) 115 | out1 = self.bn2(out1) 116 | 117 | out2 = self.bn1_2(out2_1234) 118 | 119 | out = out1 + out2 120 | out = self.relu(out) 121 | 122 | return out 123 | 124 | 125 | class fcrn_seg(nn.Module): 126 | 127 | def __init__(self, n_classes=19): 128 | super(fcrn_seg, self).__init__() 129 | self.inplanes = 64 130 | self.n_classes = n_classes 131 | 132 | # ResNet with out avrgpool & fc 133 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 134 | self.bn1 = nn.BatchNorm2d(64) 135 | self.relu = nn.ReLU(inplace=True) 136 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 137 | self.layer1 = self._make_layer(Bottleneck, 64, 3, stride=1) 138 | self.layer2 = self._make_layer(Bottleneck, 128, 4, stride=2) 139 | self.layer3 = self._make_layer(Bottleneck, 256, 6, stride=2) 140 | self.layer4 = self._make_layer(Bottleneck, 512, 3, stride=2) 141 | 142 | # Up-Conv layers 143 | self.conv2 = nn.Conv2d(2048, 1024, kernel_size=1, bias=False) 144 | self.bn2 = nn.BatchNorm2d(1024) 145 | 146 | self.up1 = self._make_upproj_layer(UpProject, 1024, 512) 147 | self.up2 = self._make_upproj_layer(UpProject, 512, 256) 148 | self.up3 = self._make_upproj_layer(UpProject, 256, 128) 149 | self.up4 = self._make_upproj_layer(UpProject, 128, 64) 150 | 151 | self.drop = nn.Dropout2d() 152 | 153 | # for segmentation 154 | self.conv3 = nn.Conv2d(64, self.n_classes, 3, padding=1) 155 | 156 | # initialize 157 | if True: 158 | for m in self.modules(): 159 | if isinstance(m, nn.Conv2d): 160 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 161 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 162 | elif isinstance(m, nn.BatchNorm2d): 163 | m.weight.data.fill_(1) 164 | m.bias.data.zero_() 165 | 166 | def _make_layer(self, block, planes, blocks, stride=1): 167 | downsample = None 168 | if stride != 1 or self.inplanes != planes * block.expansion: 169 | downsample = nn.Sequential( 170 | nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, 171 | stride=stride, bias=False), 172 | nn.BatchNorm2d(planes * block.expansion), 173 | ) 174 | 175 | layers = [] 176 | layers.append(block(self.inplanes, planes, stride, downsample)) 177 | self.inplanes = planes * block.expansion 178 | for i in range(1, blocks): 179 | layers.append(block(self.inplanes, planes)) 180 | 181 | return nn.Sequential(*layers) 182 | 183 | def _make_upproj_layer(self, block, in_channels, out_channels): 184 | return block(in_channels, out_channels) 185 | 186 | def forward(self, x): 187 | inp_shape = x.shape[2:] 188 | 189 | x = self.conv1(x) 190 | x = self.bn1(x) 191 | x = self.relu(x) 192 | x = self.maxpool(x) 193 | 194 | x = self.layer1(x) 195 | x = self.layer2(x) 196 | x = self.layer3(x) 197 | x = self.layer4(x) 198 | 199 | x = self.conv2(x) 200 | x = self.bn2(x) 201 | 202 | x = self.up1(x) 203 | x = self.up2(x) 204 | x = self.up3(x) 205 | x = self.up4(x) 206 | 207 | x = self.drop(x) 208 | 209 | x = self.conv3(x) 210 | x = self.relu(x) 211 | 212 | x = nn.functional.interpolate(x, size=inp_shape, mode='bilinear', align_corners=True) 213 | 214 | return x 215 | 216 | def init_resnet50_params(self, resnet50): 217 | initial_state_dict = resnet50.state_dict() 218 | return initial_state_dict 219 | 220 | # if you want to load from downloaded pretrained model: 221 | # def init_resnet50_params(self, model_path): 222 | # init_state_dict = torch.load(model_path, map_location=lambda storage, loc: storage) 223 | # return init_state_dict -------------------------------------------------------------------------------- /ptsemseg/models/frrn_depth.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import functools 7 | 8 | from ptsemseg.models.utils import * 9 | 10 | # frrn for depth 11 | 12 | # n_blocks, channel, scale 13 | frrn_specs_dic = { 14 | "A": { 15 | "encoder": [[3, 96, 2], [4, 192, 4], [2, 384, 8], [2, 384, 16]], 16 | "decoder": [[2, 192, 8], [2, 192, 4], [2, 48, 2]], 17 | }, 18 | "B": { 19 | "encoder": [[3, 96, 2], [4, 192, 4], [2, 384, 8], [2, 384, 16], [2, 384, 32]], 20 | "decoder": [[2, 192, 16], [2, 192, 8], [2, 192, 4], [2, 96, 2]], 21 | }, 22 | } 23 | 24 | 25 | class frrn_depth(nn.Module): 26 | """ 27 | Full Resolution Residual Networks for Semantic Segmentation 28 | URL: https://arxiv.org/abs/1611.08323 29 | 30 | References: 31 | 1) Original Author's code: https://github.com/TobyPDE/FRRN 32 | 2) TF implementation by @kiwonjoon: https://github.com/hiwonjoon/tf-frrn 33 | """ 34 | 35 | def __init__(self, 36 | model_type="A", 37 | group_norm=False, 38 | n_groups=16): 39 | super(frrn_depth, self).__init__() 40 | self.model_type = model_type 41 | self.group_norm = group_norm 42 | self.n_groups = n_groups 43 | 44 | if self.group_norm: 45 | self.conv1 = conv2DGroupNormRelu(3, 48, 5, 1, 2) 46 | else: 47 | self.conv1 = conv2DBatchNormRelu(3, 48, 5, 1, 2) 48 | 49 | self.up_residual_units = [] 50 | self.down_residual_units = [] 51 | for i in range(3): 52 | self.up_residual_units.append(RU(channels=48, 53 | kernel_size=3, 54 | 
strides=1, 55 | group_norm=self.group_norm, 56 | n_groups=self.n_groups)) 57 | self.down_residual_units.append(RU(channels=48, 58 | kernel_size=3, 59 | strides=1, 60 | group_norm=self.group_norm, 61 | n_groups=self.n_groups)) 62 | 63 | self.up_residual_units = nn.ModuleList(self.up_residual_units) 64 | self.down_residual_units = nn.ModuleList(self.down_residual_units) 65 | 66 | self.split_conv = nn.Conv2d( 67 | 48, 32, kernel_size=1, padding=0, stride=1, bias=False 68 | ) 69 | 70 | # each spec is as (n_blocks, channels, scale) 71 | self.encoder_frru_specs = frrn_specs_dic[self.model_type]["encoder"] 72 | 73 | self.decoder_frru_specs = frrn_specs_dic[self.model_type]["decoder"] 74 | 75 | # encoding 76 | prev_channels = 48 77 | self.encoding_frrus = {} 78 | for n_blocks, channels, scale in self.encoder_frru_specs: 79 | for block in range(n_blocks): 80 | key = "_".join(map(str, ["encoding_frru", n_blocks, channels, scale, block])) 81 | setattr(self, key, FRRU(prev_channels=prev_channels, 82 | out_channels=channels, 83 | scale=scale, 84 | group_norm=self.group_norm, 85 | n_groups=self.n_groups),) 86 | prev_channels = channels 87 | 88 | # decoding 89 | self.decoding_frrus = {} 90 | for n_blocks, channels, scale in self.decoder_frru_specs: 91 | # pass through decoding FRRUs 92 | for block in range(n_blocks): 93 | key = "_".join(map(str, ["decoding_frru", n_blocks, channels, scale, block])) 94 | setattr(self, key, FRRU(prev_channels=prev_channels, 95 | out_channels=channels, 96 | scale=scale, 97 | group_norm=self.group_norm, 98 | n_groups=self.n_groups),) 99 | prev_channels = channels 100 | 101 | self.merge_conv = nn.Conv2d( 102 | prev_channels + 32, 48, kernel_size=1, padding=0, stride=1, bias=False 103 | ) 104 | 105 | self.predict = nn.Conv2d( 106 | 48, 1, kernel_size=3, padding=1, stride=1, bias=True 107 | ) 108 | 109 | def forward(self, x): 110 | 111 | # pass to initial conv 112 | x = self.conv1(x) 113 | 114 | # pass through residual units 115 | for i in range(3): 116 | x = self.up_residual_units[i](x) 117 | 118 | # divide stream 119 | y = x 120 | z = self.split_conv(x) 121 | 122 | prev_channels = 48 123 | # encoding 124 | for n_blocks, channels, scale in self.encoder_frru_specs: 125 | # maxpool bigger feature map 126 | y_pooled = F.max_pool2d(y, stride=2, kernel_size=2, padding=0) 127 | # pass through encoding FRRUs 128 | for block in range(n_blocks): 129 | key = "_".join( 130 | map(str, ["encoding_frru", n_blocks, channels, scale, block]) 131 | ) 132 | y, z = getattr(self, key)(y_pooled, z) 133 | prev_channels = channels 134 | 135 | # decoding 136 | for n_blocks, channels, scale in self.decoder_frru_specs: 137 | # bilinear upsample smaller feature map 138 | upsample_size = torch.Size([_s * 2 for _s in y.size()[-2:]]) 139 | y_upsampled = F.upsample(y, size=upsample_size, mode="bilinear", align_corners=True) 140 | # pass through decoding FRRUs 141 | for block in range(n_blocks): 142 | key = "_".join( 143 | map(str, ["decoding_frru", n_blocks, channels, scale, block]) 144 | ) 145 | y, z = getattr(self, key)(y_upsampled, z) 146 | prev_channels = channels 147 | 148 | # merge streams 149 | x = torch.cat([F.upsample(y, scale_factor=2, mode="bilinear", align_corners=True), z], dim=1) 150 | x = self.merge_conv(x) 151 | 152 | # pass through residual units 153 | for i in range(3): 154 | x = self.down_residual_units[i](x) 155 | 156 | # final 1x1 conv to get depth 157 | x = self.predict(x) 158 | 159 | return x 160 | -------------------------------------------------------------------------------- 
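Because the full-resolution stream z is merged back in before the final prediction layer, frrn_depth returns a one-channel map at the input resolution. Below is a minimal smoke test for the model above; it is a sketch only, assuming the conv2DBatchNormRelu / RU / FRRU helpers imported from ptsemseg.models.utils behave as in the upstream pytorch-semseg code, and that the input height and width are divisible by 16 (the deepest scale in the "A" spec).

import torch
from ptsemseg.models.frrn_depth import frrn_depth

# eval mode so the BatchNorm layers accept a batch of one
model = frrn_depth(model_type="A").eval()
with torch.no_grad():
    dummy = torch.randn(1, 3, 256, 512)   # NCHW; 256 and 512 are both divisible by 16
    pred = model(dummy)
print(pred.shape)                         # expected: torch.Size([1, 1, 256, 512])

The segmentation variant in the next file follows the same layout, except the final convolution produces n_classes channels instead of one.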
/ptsemseg/models/frrn_seg.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import functools 7 | 8 | # for segmentation 9 | 10 | from ptsemseg.models.utils import * 11 | from ptsemseg.loss import bootstrapped_cross_entropy2d 12 | 13 | # n_blocks, channel, scale 14 | frrn_specs_dic = { 15 | "A": { 16 | "encoder": [[3, 96, 2], [4, 192, 4], [2, 384, 8], [2, 384, 16]], 17 | "decoder": [[2, 192, 8], [2, 192, 4], [2, 48, 2]], 18 | }, 19 | "B": { 20 | "encoder": [[3, 96, 2], [4, 192, 4], [2, 384, 8], [2, 384, 16], [2, 384, 32]], 21 | "decoder": [[2, 192, 16], [2, 192, 8], [2, 192, 4], [2, 96, 2]], 22 | }, 23 | } 24 | 25 | 26 | class frrn_seg(nn.Module): 27 | """ 28 | Full Resolution Residual Networks for Semantic Segmentation 29 | URL: https://arxiv.org/abs/1611.08323 30 | 31 | References: 32 | 1) Original Author's code: https://github.com/TobyPDE/FRRN 33 | 2) TF implementation by @kiwonjoon: https://github.com/hiwonjoon/tf-frrn 34 | """ 35 | 36 | def __init__(self, 37 | n_classes=19, 38 | model_type=None, 39 | group_norm=False, 40 | n_groups=16): 41 | super(frrn_seg, self).__init__() 42 | self.n_classes = n_classes 43 | self.model_type = model_type 44 | self.group_norm = group_norm 45 | self.n_groups = n_groups 46 | 47 | if self.group_norm: 48 | self.conv1 = conv2DGroupNormRelu(3, 48, 5, 1, 2) 49 | else: 50 | self.conv1 = conv2DBatchNormRelu(3, 48, 5, 1, 2) 51 | 52 | self.up_residual_units = [] 53 | self.down_residual_units = [] 54 | for i in range(3): 55 | self.up_residual_units.append(RU(channels=48, 56 | kernel_size=3, 57 | strides=1, 58 | group_norm=self.group_norm, 59 | n_groups=self.n_groups)) 60 | self.down_residual_units.append(RU(channels=48, 61 | kernel_size=3, 62 | strides=1, 63 | group_norm=self.group_norm, 64 | n_groups=self.n_groups)) 65 | 66 | self.up_residual_units = nn.ModuleList(self.up_residual_units) 67 | self.down_residual_units = nn.ModuleList(self.down_residual_units) 68 | 69 | self.split_conv = nn.Conv2d( 70 | 48, 32, kernel_size=1, padding=0, stride=1, bias=False 71 | ) 72 | 73 | # each spec is as (n_blocks, channels, scale) 74 | self.encoder_frru_specs = frrn_specs_dic[self.model_type]["encoder"] 75 | 76 | self.decoder_frru_specs = frrn_specs_dic[self.model_type]["decoder"] 77 | 78 | # encoding 79 | prev_channels = 48 80 | self.encoding_frrus = {} 81 | for n_blocks, channels, scale in self.encoder_frru_specs: 82 | for block in range(n_blocks): 83 | key = "_".join(map(str, ["encoding_frru", n_blocks, channels, scale, block])) 84 | setattr(self, key, FRRU(prev_channels=prev_channels, 85 | out_channels=channels, 86 | scale=scale, 87 | group_norm=self.group_norm, 88 | n_groups=self.n_groups),) 89 | prev_channels = channels 90 | 91 | # decoding 92 | self.decoding_frrus = {} 93 | for n_blocks, channels, scale in self.decoder_frru_specs: 94 | # pass through decoding FRRUs 95 | for block in range(n_blocks): 96 | key = "_".join(map(str, ["decoding_frru", n_blocks, channels, scale, block])) 97 | setattr(self, key, FRRU(prev_channels=prev_channels, 98 | out_channels=channels, 99 | scale=scale, 100 | group_norm=self.group_norm, 101 | n_groups=self.n_groups),) 102 | prev_channels = channels 103 | 104 | self.merge_conv = nn.Conv2d( 105 | prev_channels + 32, 48, kernel_size=1, padding=0, stride=1, bias=False 106 | ) 107 | 108 | self.classif_conv = nn.Conv2d( 109 | 48, self.n_classes, kernel_size=3, 
padding=1, stride=1, bias=True 110 | ) 111 | 112 | def forward(self, x): 113 | 114 | # pass to initial conv 115 | x = self.conv1(x) 116 | 117 | # pass through residual units 118 | for i in range(3): 119 | x = self.up_residual_units[i](x) 120 | 121 | # divide stream 122 | y = x 123 | z = self.split_conv(x) 124 | 125 | prev_channels = 48 126 | # encoding 127 | for n_blocks, channels, scale in self.encoder_frru_specs: 128 | # maxpool bigger feature map 129 | y_pooled = F.max_pool2d(y, stride=2, kernel_size=2, padding=0) 130 | # pass through encoding FRRUs 131 | for block in range(n_blocks): 132 | key = "_".join( 133 | map(str, ["encoding_frru", n_blocks, channels, scale, block]) 134 | ) 135 | y, z = getattr(self, key)(y_pooled, z) 136 | prev_channels = channels 137 | 138 | # decoding 139 | for n_blocks, channels, scale in self.decoder_frru_specs: 140 | # bilinear upsample smaller feature map 141 | upsample_size = torch.Size([_s * 2 for _s in y.size()[-2:]]) 142 | y_upsampled = F.upsample(y, size=upsample_size, mode="bilinear", align_corners=True) 143 | # pass through decoding FRRUs 144 | for block in range(n_blocks): 145 | key = "_".join( 146 | map(str, ["decoding_frru", n_blocks, channels, scale, block]) 147 | ) 148 | y, z = getattr(self, key)(y_upsampled, z) 149 | prev_channels = channels 150 | 151 | # merge streams 152 | x = torch.cat([F.upsample(y, scale_factor=2, mode="bilinear", align_corners=True), z], dim=1) 153 | x = self.merge_conv(x) 154 | 155 | # pass through residual units 156 | for i in range(3): 157 | x = self.down_residual_units[i](x) 158 | 159 | # final 1x1 conv to get classification 160 | x = self.classif_conv(x) 161 | 162 | return x 163 | -------------------------------------------------------------------------------- /ptsemseg/models/segnet_depth.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import torch.nn as nn 4 | 5 | from ptsemseg.models.utils import * 6 | 7 | # segnet for depth 8 | 9 | class segnet_depth(nn.Module): 10 | def __init__(self, in_channels=3, is_unpooling=True): 11 | super(segnet_depth, self).__init__() 12 | 13 | self.in_channels = in_channels 14 | self.is_unpooling = is_unpooling 15 | 16 | self.down1 = segnetDown2(self.in_channels, 64) 17 | self.down2 = segnetDown2(64, 128) 18 | self.down3 = segnetDown3(128, 256) 19 | self.down4 = segnetDown3(256, 512) 20 | self.down5 = segnetDown3(512, 512) 21 | 22 | self.up5 = segnetUp3(512, 512) 23 | self.up4 = segnetUp3(512, 256) 24 | self.up3 = segnetUp3(256, 128) 25 | self.up2 = segnetUp2(128, 64) 26 | self.up1 = segnetUp2(64, 1) 27 | 28 | def forward(self, inputs): 29 | 30 | down1, indices_1, unpool_shape1 = self.down1(inputs) 31 | down2, indices_2, unpool_shape2 = self.down2(down1) 32 | down3, indices_3, unpool_shape3 = self.down3(down2) 33 | down4, indices_4, unpool_shape4 = self.down4(down3) 34 | down5, indices_5, unpool_shape5 = self.down5(down4) 35 | 36 | up5 = self.up5(down5, indices_5, unpool_shape5) 37 | up4 = self.up4(up5, indices_4, unpool_shape4) 38 | up3 = self.up3(up4, indices_3, unpool_shape3) 39 | up2 = self.up2(up3, indices_2, unpool_shape2) 40 | up1 = self.up1(up2, indices_1, unpool_shape1) 41 | 42 | return up1 43 | 44 | def init_vgg16_params(self, vgg16): 45 | blocks = [self.down1, self.down2, self.down3, self.down4, self.down5] 46 | 47 | ranges = [[0, 4], [5, 9], [10, 16], [17, 23], [24, 29]] 48 | features = list(vgg16.features.children()) 49 | 50 | vgg_layers = [] 51 | for _layer in 
features: 52 | if isinstance(_layer, nn.Conv2d): 53 | vgg_layers.append(_layer) 54 | 55 | merged_layers = [] 56 | for idx, conv_block in enumerate(blocks): 57 | if idx < 2: 58 | units = [conv_block.conv1.cbr_unit, conv_block.conv2.cbr_unit] 59 | else: 60 | units = [ 61 | conv_block.conv1.cbr_unit, 62 | conv_block.conv2.cbr_unit, 63 | conv_block.conv3.cbr_unit, 64 | ] 65 | for _unit in units: 66 | for _layer in _unit: 67 | if isinstance(_layer, nn.Conv2d): 68 | merged_layers.append(_layer) 69 | 70 | assert len(vgg_layers) == len(merged_layers) 71 | 72 | for l1, l2 in zip(vgg_layers, merged_layers): 73 | if isinstance(l1, nn.Conv2d) and isinstance(l2, nn.Conv2d): 74 | assert l1.weight.size() == l2.weight.size() 75 | assert l1.bias.size() == l2.bias.size() 76 | l2.weight.data = l1.weight.data 77 | l2.bias.data = l1.bias.data 78 | -------------------------------------------------------------------------------- /ptsemseg/models/segnet_seg.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import torch.nn as nn 4 | 5 | from ptsemseg.models.utils import * 6 | 7 | # for segmentation 8 | 9 | class segnet_seg(nn.Module): 10 | def __init__(self, n_classes=21, in_channels=3, is_unpooling=True): 11 | super(segnet_seg, self).__init__() 12 | 13 | self.in_channels = in_channels 14 | self.is_unpooling = is_unpooling 15 | 16 | self.down1 = segnetDown2(self.in_channels, 64) 17 | self.down2 = segnetDown2(64, 128) 18 | self.down3 = segnetDown3(128, 256) 19 | self.down4 = segnetDown3(256, 512) 20 | self.down5 = segnetDown3(512, 512) 21 | 22 | self.up5 = segnetUp3(512, 512) 23 | self.up4 = segnetUp3(512, 256) 24 | self.up3 = segnetUp3(256, 128) 25 | self.up2 = segnetUp2(128, 64) 26 | self.up1 = segnetUp2(64, n_classes) 27 | 28 | def forward(self, inputs): 29 | 30 | down1, indices_1, unpool_shape1 = self.down1(inputs) 31 | down2, indices_2, unpool_shape2 = self.down2(down1) 32 | down3, indices_3, unpool_shape3 = self.down3(down2) 33 | down4, indices_4, unpool_shape4 = self.down4(down3) 34 | down5, indices_5, unpool_shape5 = self.down5(down4) 35 | 36 | up5 = self.up5(down5, indices_5, unpool_shape5) 37 | up4 = self.up4(up5, indices_4, unpool_shape4) 38 | up3 = self.up3(up4, indices_3, unpool_shape3) 39 | up2 = self.up2(up3, indices_2, unpool_shape2) 40 | up1 = self.up1(up2, indices_1, unpool_shape1) 41 | 42 | return up1 43 | 44 | def init_vgg16_params(self, vgg16): 45 | blocks = [self.down1, self.down2, self.down3, self.down4, self.down5] 46 | 47 | ranges = [[0, 4], [5, 9], [10, 16], [17, 23], [24, 29]] 48 | features = list(vgg16.features.children()) 49 | 50 | vgg_layers = [] 51 | for _layer in features: 52 | if isinstance(_layer, nn.Conv2d): 53 | vgg_layers.append(_layer) 54 | 55 | merged_layers = [] 56 | for idx, conv_block in enumerate(blocks): 57 | if idx < 2: 58 | units = [conv_block.conv1.cbr_unit, conv_block.conv2.cbr_unit] 59 | else: 60 | units = [ 61 | conv_block.conv1.cbr_unit, 62 | conv_block.conv2.cbr_unit, 63 | conv_block.conv3.cbr_unit, 64 | ] 65 | for _unit in units: 66 | for _layer in _unit: 67 | if isinstance(_layer, nn.Conv2d): 68 | merged_layers.append(_layer) 69 | 70 | assert len(vgg_layers) == len(merged_layers) 71 | 72 | for l1, l2 in zip(vgg_layers, merged_layers): 73 | if isinstance(l1, nn.Conv2d) and isinstance(l2, nn.Conv2d): 74 | assert l1.weight.size() == l2.weight.size() 75 | assert l1.bias.size() == l2.bias.size() 76 | l2.weight.data = l1.weight.data 77 | l2.bias.data = 
l1.bias.data 78 | -------------------------------------------------------------------------------- /ptsemseg/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import functools 4 | 5 | from torch.optim import SGD 6 | from torch.optim import Adam 7 | from torch.optim import ASGD 8 | from torch.optim import Adamax 9 | from torch.optim import Adadelta 10 | from torch.optim import Adagrad 11 | from torch.optim import RMSprop 12 | 13 | logger = logging.getLogger('ptsemseg') 14 | 15 | key2opt = {'sgd': SGD, 16 | 'adam': Adam, 17 | 'asgd': ASGD, 18 | 'adamax': Adamax, 19 | 'adadelta': Adadelta, 20 | 'adagrad': Adagrad, 21 | 'rmsprop': RMSprop,} 22 | 23 | def get_optimizer(cfg): 24 | if cfg['training']['optimizer'] is None: 25 | logger.info("Using SGD optimizer") 26 | return SGD 27 | 28 | else: 29 | opt_name = cfg['training']['optimizer']['name'] 30 | if opt_name not in key2opt: 31 | raise NotImplementedError('Optimizer {} not implemented'.format(opt_name)) 32 | 33 | logger.info('Using {} optimizer'.format(opt_name)) 34 | return key2opt[opt_name] 35 | -------------------------------------------------------------------------------- /ptsemseg/schedulers/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from ptsemseg.schedulers.schedulers import * 3 | 4 | logger = logging.getLogger('ptsemseg') 5 | 6 | key2scheduler = {'constant_lr': ConstantLR, 7 | 'poly_lr': PolynomialLR, 8 | 'multi_step': MultiStepLR, 9 | 'cosine_annealing': CosineAnnealingLR, 10 | 'exp_lr': ExponentialLR} 11 | 12 | 13 | def get_scheduler(optimizer, scheduler_dict): 14 | if scheduler_dict is None: 15 | logger.info('Using No LR Scheduling') 16 | return ConstantLR(optimizer) 17 | 18 | s_type = scheduler_dict['name'] 19 | scheduler_dict.pop('name') 20 | 21 | logger.info('Using {} scheduler with {} params'.format(s_type, 22 | scheduler_dict)) 23 | 24 | warmup_dict = {} 25 | if 'warmup_iters' in scheduler_dict: 26 | # This can be done in a more pythonic way... 
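# the warmup_* keys are collected into warmup_dict here and popped from
# scheduler_dict below, so only the base scheduler's own kwargs reach its
# constructor; WarmUpLR then wraps that base scheduler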
27 | warmup_dict['warmup_iters'] = scheduler_dict.get('warmup_iters', 100) 28 | warmup_dict['mode'] = scheduler_dict.get('warmup_mode', 'linear') 29 | warmup_dict['gamma'] = scheduler_dict.get('warmup_factor', 0.2) 30 | 31 | logger.info('Using Warmup with {} iters {} gamma and {} mode'.format( 32 | warmup_dict['warmup_iters'], 33 | warmup_dict['gamma'], 34 | warmup_dict['mode'])) 35 | 36 | scheduler_dict.pop('warmup_iters', None) 37 | scheduler_dict.pop('warmup_mode', None) 38 | scheduler_dict.pop('warmup_factor', None) 39 | 40 | base_scheduler = key2scheduler[s_type](optimizer, **scheduler_dict) 41 | return WarmUpLR(optimizer, base_scheduler, **warmup_dict) 42 | 43 | return key2scheduler[s_type](optimizer, **scheduler_dict) 44 | -------------------------------------------------------------------------------- /ptsemseg/schedulers/schedulers.py: -------------------------------------------------------------------------------- 1 | # taken from https://github.com/meetshah1995/pytorch-semseg 2 | 3 | import torch 4 | 5 | from torch.optim.lr_scheduler import _LRScheduler 6 | from torch.optim.lr_scheduler import MultiStepLR 7 | from torch.optim.lr_scheduler import ExponentialLR 8 | from torch.optim.lr_scheduler import CosineAnnealingLR 9 | from torch.optim.lr_scheduler import ReduceLROnPlateau 10 | 11 | 12 | class ConstantLR(_LRScheduler): 13 | def __init__(self, optimizer, last_epoch=-1): 14 | super(ConstantLR, self).__init__(optimizer, last_epoch) 15 | 16 | def get_lr(self): 17 | return [base_lr for base_lr in self.base_lrs] 18 | 19 | 20 | class PolynomialLR(_LRScheduler): 21 | def __init__(self, optimizer, max_iter, decay_iter=1, 22 | gamma=0.9, last_epoch=-1): 23 | self.decay_iter = decay_iter 24 | self.max_iter = max_iter 25 | self.gamma = gamma 26 | super(PolynomialLR, self).__init__(optimizer, last_epoch) 27 | 28 | def get_lr(self): 29 | if self.last_epoch % self.decay_iter or self.last_epoch % self.max_iter: 30 | return [base_lr for base_lr in self.base_lrs] 31 | else: 32 | factor = (1 - self.last_epoch / float(self.max_iter)) ** self.gamma 33 | return [base_lr * factor for base_lr in self.base_lrs] 34 | 35 | class WarmUpLR(_LRScheduler): 36 | def __init__(self, optimizer, scheduler, mode='linear', 37 | warmup_iters=100, gamma=0.2, last_epoch=-1): 38 | self.mode = mode 39 | self.scheduler = scheduler 40 | self.warmup_iters = warmup_iters 41 | self.gamma = gamma 42 | super(WarmUpLR, self).__init__(optimizer, last_epoch) 43 | 44 | def get_lr(self): 45 | cold_lrs = self.scheduler.get_lr() 46 | 47 | if self.last_epoch < self.warmup_iters: 48 | if self.mode == 'linear': 49 | alpha = self.last_epoch / float(self.warmup_iters) 50 | factor = self.gamma * (1 - alpha) + alpha 51 | 52 | elif self.mode == 'constant': 53 | factor = self.gamma 54 | else: 55 | raise KeyError('WarmUp type {} not implemented'.format(self.mode)) 56 | 57 | return [factor * base_lr for base_lr in cold_lrs] 58 | 59 | return cold_lrs 60 | -------------------------------------------------------------------------------- /ptsemseg/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Misc Utility functions 3 | taken from https://github.com/meetshah1995/pytorch-semseg 4 | """ 5 | import os 6 | import logging 7 | import datetime 8 | import numpy as np 9 | 10 | from collections import OrderedDict 11 | 12 | def recursive_glob(rootdir=".", suffix=""): 13 | """Performs recursive glob with given suffix and rootdir 14 | :param rootdir is the root directory 15 | :param suffix is the suffix 
to be searched 16 | """ 17 | image_paths = [] 18 | for looproot, _, filenames in os.walk(rootdir): 19 | for filename in filenames: 20 | if filename.endswith(suffix): 21 | image_paths.append(os.path.join(looproot, filename)) 22 | return image_paths 23 | 24 | 25 | def alpha_blend(input_image, segmentation_mask, alpha=0.5): 26 | """Alpha Blending utility to overlay RGB masks on RBG images 27 | :param input_image is a np.ndarray with 3 channels 28 | :param segmentation_mask is a np.ndarray with 3 channels 29 | :param alpha is a float value 30 | 31 | """ 32 | blended = np.zeros(input_image.size, dtype=np.float32) 33 | blended = input_image * alpha + segmentation_mask * (1 - alpha) 34 | return blended 35 | 36 | 37 | def get_logger(logdir): 38 | logger = logging.getLogger('ptsemseg') 39 | ts = str(datetime.datetime.now()).split('.')[0].replace(" ", "_") 40 | ts = ts.replace(":", "_").replace("-","_") 41 | file_path = os.path.join(logdir, 'run_{}.log'.format(ts)) 42 | hdlr = logging.FileHandler(file_path) 43 | formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') 44 | hdlr.setFormatter(formatter) 45 | logger.addHandler(hdlr) 46 | logger.setLevel(logging.INFO) 47 | return logger 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==2.2.2 2 | numpy==1.14.2 3 | scipy==1.0.1 4 | torch==0.4.1 5 | torchvision==0.2.1 6 | tqdm==4.23.0 7 | pydensecrf 8 | protobuf 9 | tensorboardX 10 | blessings 11 | progressbar2 12 | path.py 13 | imageio 14 | -------------------------------------------------------------------------------- /saliency.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/kazuto1011/grad-cam-pytorch 2 | 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn import functional as F 9 | 10 | 11 | class _PropagationBase(object): 12 | def __init__(self, model, task): 13 | super(_PropagationBase, self).__init__() 14 | self.device = next(model.parameters()).device 15 | self.model = model 16 | self.image = None 17 | self.task = task 18 | 19 | def _encode_one_hot(self, pos_i, pos_j, idx): 20 | one_hot = torch.FloatTensor(self.preds.size()).zero_() 21 | one_hot[0][idx][pos_i][pos_j] = 1.0 22 | return one_hot.to(self.device) 23 | 24 | def forward(self, image): 25 | self.image = image.requires_grad_() 26 | self.model.zero_grad() # Sets gradients of all model parameters to zero 27 | self.preds = self.model(self.image) # [1, 19, h, w] 28 | 29 | self.height = image.size()[2] 30 | self.width = image.size()[3] 31 | if self.task == "seg": 32 | self.pred_idx = np.squeeze(self.preds.data.max(1)[1].cpu().numpy(), axis=0) # [h, w] 33 | if self.task == "depth": 34 | self.pred_idx = np.zeros((self.height, self.width), dtype=int) 35 | 36 | return self.pred_idx 37 | 38 | def backward(self, pos_i, pos_j, idx): 39 | one_hot = self._encode_one_hot(pos_i, pos_j, idx) # [1, 19, h, w] 40 | self.preds.backward(gradient=one_hot, retain_graph=True) # Computes the gradient of current tensor w.r.t. 
graph leaves 41 | 42 | 43 | class BackPropagation(_PropagationBase): 44 | def generate(self): 45 | # produce vanilla bp map 46 | image_grads_vanilla = self.image.grad.detach().cpu().numpy().copy() # [1, 3, h, w] 47 | output_vanilla_bp = image_grads_vanilla.transpose(0,2,3,1)[0] 48 | 49 | # produce bp saliency map 50 | image_grads_abs = np.abs(image_grads_vanilla) 51 | output_saliency = image_grads_abs.transpose(0,2,3,1)[0] 52 | output_saliency = np.max(output_saliency, axis=2) 53 | self.image.grad.data.zero_() 54 | 55 | return output_vanilla_bp, output_saliency # [h, w, 3] -------------------------------------------------------------------------------- /saliency_analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import normaltest, wilcoxon 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | num_img = 100 7 | num_metric = 6 8 | model_name = "fcn" # TODO: to modify for other models, options: ["fcn", "frrn", "segnet", "deeplab", "fcrn", "dispnet"] 9 | 10 | ### fcrn ### 11 | seg_fcrn_pixel = np.load("saliency_eval_pixel/seg_fcrn_pixel.npy")[0:num_img] # [100, 6, 1078] 12 | depth_fcrn_pixel = np.load("saliency_eval_pixel/depth_fcrn_pixel.npy")[0:num_img] # 13 | fcrn_iou_pixel = np.load("saliency_eval_pixel/fcrn_iou.npy")[0:num_img] # [100, 4, 266] # 4 thresholds, 266 pixels for each image 14 | 15 | 16 | ## deeplab ### 17 | seg_deeplab_pixel = np.load("saliency_eval_pixel/seg_deeplab_pixel.npy")[0:num_img] # [100, 6, 1078] 18 | depth_deeplab_pixel = np.load("saliency_eval_pixel/depth_deeplab_pixel.npy")[0:num_img] # 19 | deeplab_iou_pixel = np.load("saliency_eval_pixel/deeplab_iou.npy")[0:num_img] 20 | 21 | 22 | ### dispnet ### 23 | seg_dispnet_pixel = np.load("saliency_eval_pixel/seg_dispnet_pixel.npy")[0:num_img] # 24 | depth_dispnet_pixel = np.load("saliency_eval_pixel/depth_dispnet_pixel.npy")[0:num_img] # 25 | dispnet_iou_pixel = np.load("saliency_eval_pixel/dispnet_iou.npy")[0:num_img] 26 | 27 | 28 | ### frrn ### 29 | seg_frrn_pixel = np.load("saliency_eval_pixel/seg_frrnA_pixel.npy")[0:num_img] # 30 | depth_frrn_pixel = np.load("saliency_eval_pixel/depth_frrnA_pixel.npy")[0:num_img] # 31 | frrn_iou_pixel = np.load("saliency_eval_pixel/frrnA_iou.npy")[0:num_img] 32 | 33 | 34 | ### segnet ### 35 | seg_segnet_pixel = np.load("saliency_eval_pixel/seg_segnet_pixel.npy")[0:num_img] # 36 | depth_segnet_pixel = np.load("saliency_eval_pixel/depth_segnet_pixel.npy")[0:num_img] # 37 | segnet_iou_pixel = np.load("saliency_eval_pixel/segnet_iou.npy")[0:num_img] 38 | 39 | ### fcn ### 40 | seg_fcn_pixel = np.load("saliency_eval_pixel/seg_fcn_pixel.npy")[0:num_img] # 41 | depth_fcn_pixel = np.load("saliency_eval_pixel/depth_fcn_pixel.npy")[0:num_img] # 42 | fcn_iou_pixel = np.load("saliency_eval_pixel/fcn_iou.npy")[0:num_img] 43 | 44 | # range for hist for 6 pixel radius metrics 45 | range_hist_min = [0, 0, 0, 0.01, 0.0002, 0.000010] # for all metrics 46 | range_hist_max = [0.35, 0.35, 0.35, 0.10, 0.0010, 0.00004] 47 | # range_hist_min = [0.08, 0, 0, 0.01, 0, 0.000010] # for threshold=0.1 48 | # range_hist_max = [0.35, 0.35, 0.35, 0.08, 0.0010, 0.00004] 49 | 50 | if model_name == "fcn": 51 | seg_pixel = seg_fcn_pixel 52 | depth_pixel = depth_fcn_pixel 53 | iou_pixel = fcn_iou_pixel 54 | if model_name == "frrn": 55 | seg_pixel = seg_frrn_pixel 56 | depth_pixel = depth_frrn_pixel 57 | iou_pixel = frrn_iou_pixel 58 | if model_name == "segnet": 59 | seg_pixel = seg_segnet_pixel 60 | depth_pixel = depth_segnet_pixel 61 | 
iou_pixel = segnet_iou_pixel 62 | if model_name == "deeplab": 63 | seg_pixel = seg_deeplab_pixel 64 | depth_pixel = depth_deeplab_pixel 65 | iou_pixel = deeplab_iou_pixel 66 | if model_name == "fcrn": 67 | seg_pixel = seg_fcrn_pixel 68 | depth_pixel = depth_fcrn_pixel 69 | iou_pixel = fcrn_iou_pixel 70 | if model_name == "dispnet": 71 | seg_pixel = seg_dispnet_pixel 72 | depth_pixel = depth_dispnet_pixel 73 | iou_pixel = dispnet_iou_pixel 74 | 75 | threshold__pixel = [0.1, 0.5, 0.9, 0.1, 0.5, 0.9] 76 | name_pixel = ['act_d', 'act_d', 'act_d', 'act_ratio', 'act_ratio', 'act_ratio'] 77 | 78 | 79 | ### significant analysis ### 80 | for i in range(num_metric): 81 | mean_seg_img_list = [] # store mean value over all pixels for each image, length: 100 (100*1) 82 | mean_depth_img_list = [] 83 | p_value_list = [] 84 | count = 0 85 | seg_pixel_metric = seg_pixel[:, i, :] # [100, 1078] metric map i for 100 images, 1078 pixels/image 86 | depth_pixel_metric = depth_pixel[:, i, :] 87 | 88 | for k in range(num_img): 89 | p_value = wilcoxon(seg_pixel_metric[k], depth_pixel_metric[k])[1] 90 | p_value_list.append(p_value) 91 | mean_seg = np.mean(seg_pixel_metric[k]) 92 | mean_depth = np.mean(depth_pixel_metric[k]) 93 | mean_seg_img_list.append(mean_seg) 94 | mean_depth_img_list.append(mean_depth) 95 | if mean_seg < mean_depth and p_value < 0.05: 96 | count += 1 97 | 98 | n, bins, patches = plt.hist(x=mean_seg_img_list, bins='auto', color='red', 99 | range=(range_hist_min[i], range_hist_max[i]), alpha=0.5) 100 | plt.grid(axis='y', alpha=0.75) 101 | plt.xlabel('mean value of {}>={} of each image'.format(name_pixel[i], threshold__pixel[i])) 102 | plt.ylabel('number of images') 103 | plt.title('{}: {}>={}'.format(model_name, name_pixel[i], threshold__pixel[i])) 104 | plt.text(23, 45, r'$\mu=15, b=3$') 105 | plt.hist(x=mean_depth_img_list, bins=bins, range=(range_hist_min[i], range_hist_max[i]), color='blue', alpha=0.5) 106 | plt.ylim(0, 35) 107 | if i == 0 or i == 3: 108 | plt.savefig('saliency_eval_hist/{}_metric_{}.png'.format(model_name, i)) 109 | plt.show() 110 | 111 | print("metric ", i) 112 | print("number of images fitting assumption:",count/num_img) 113 | print("mean seg:", np.mean(np.array(mean_seg_img_list))) # mean over all pixels in all images 114 | print("mean depth:", np.mean(np.array(mean_depth_img_list))) 115 | 116 | 117 | for i in range(4): 118 | iou_pixel_metric = iou_pixel[:, i, :] 119 | iou_img_metric = np.mean(iou_pixel_metric) 120 | print("metric ", i) 121 | print("mean iou over all pixels over all imgs: ", iou_img_metric) 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /saliency_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import numpy as np 5 | import torch 6 | import scipy.misc as m 7 | import cv2 8 | from torch.utils import data 9 | from tqdm import tqdm 10 | from joblib import Parallel, delayed 11 | 12 | from ptsemseg.models.fcn_seg import * 13 | from ptsemseg.models.segnet_seg import * 14 | from ptsemseg.models.frrn_seg import * 15 | from ptsemseg.models.deeplab_seg import * 16 | from ptsemseg.models.fcrn_seg import * 17 | from ptsemseg.models.dispnet_seg import * 18 | 19 | from ptsemseg.models.fcn_depth import * 20 | from ptsemseg.models.segnet_depth import * 21 | from ptsemseg.models.frrn_depth import * 22 | from ptsemseg.models.deeplab_depth import * 23 | from ptsemseg.models.fcrn_depth import * 24 | from 
ptsemseg.models.dispnet_depth import * 25 | 26 | from saliency import BackPropagation 27 | from ptsemseg.loader.kitti_loader_seg import kittiLoader_seg 28 | 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--data_path", default='datasets/kitti/semantics/', type=str, 32 | help='path to test images') 33 | parser.add_argument("--model_name", type=str, default='deeplab', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 34 | parser.add_argument("--task", type=str, default="depth", choices=["seg", "depth"]) 35 | parser.add_argument("--model_path", type=str, 36 | default='runs/deeplab_kitti_depth/2976_256_832_smooth1000_init_BNfreeze/deeplab_kitti_best_model.pkl', 37 | help='path to pretrained model') 38 | 39 | # the image resolution here should match the pretrained model training resolution 40 | parser.add_argument("--height", type=int, default=256, help="image resize height") 41 | parser.add_argument("--width", type=int, default=832, help="image resize width") 42 | parser.add_argument("--sample_rate", type=int, default=10, help="sample rate for eval") 43 | parser.add_argument("--num_image", type=int, default=100, help="number of images to evaluate") 44 | 45 | 46 | args = parser.parse_args() 47 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 48 | 49 | def get_model(model_name, task): 50 | if task == "seg": 51 | try: 52 | return { 53 | "fcn": fcn_seg(n_classes=19), 54 | "frrnA": frrn_seg(model_type = "A", n_classes=19), 55 | "segnet": segnet_seg(n_classes=19), 56 | "deeplab": deeplab_seg(n_classes=19), 57 | "dispnet": dispnet_seg(n_classes=19), 58 | "fcrn": fcrn_seg(n_classes=19), 59 | }[model_name] 60 | except: 61 | raise("Model {} not available".format(model_name)) 62 | elif task == "depth": 63 | try: 64 | return { 65 | "fcn": fcn_depth(), 66 | "frrnA": frrn_depth(model_type = "A"), 67 | "segnet": segnet_depth(), 68 | "deeplab": deeplab_depth(), 69 | "dispnet": dispnet_depth(), 70 | "fcrn": fcrn_depth(), 71 | }[model_name] 72 | except: 73 | raise("Model {} not available".format(model_name)) 74 | 75 | 76 | def most_act_dis(saliency_map, pos_i, pos_j): 77 | # distance between the most activated pixel and current pixel 78 | height, width = saliency_map.shape 79 | most_act_pos = np.where(saliency_map == np.max(saliency_map)) 80 | all_dist = 0 81 | for i in range(most_act_pos[0].shape[0]): 82 | dist = np.sqrt((most_act_pos[0][i]-pos_i) ** 2 + (most_act_pos[1][i]-pos_j) ** 2) 83 | all_dist = all_dist + dist 84 | result = (all_dist / len(most_act_pos[0])) / np.sqrt(height ** 2 + width ** 2) 85 | return result 86 | 87 | 88 | # biggest distance between current pixel and all pixels with value >= threshold 89 | # number of pixels >= threshold / number of total_pixels 90 | def largest_radius(saliency_map, pos_i, pos_j, threshold=0.2): 91 | height, width = saliency_map.shape 92 | 93 | act_pixel_pos = np.where(saliency_map >= threshold) 94 | all_dist = np.zeros(act_pixel_pos[0].shape[0]) 95 | 96 | if act_pixel_pos[0].shape[0] == 0: 97 | return 0, 0 98 | for i in range(act_pixel_pos[0].shape[0]): 99 | all_dist[i] = np.sqrt((act_pixel_pos[0][i]-pos_i) ** 2 + (act_pixel_pos[1][i]-pos_j) ** 2) 100 | radius = np.max(all_dist) / np.sqrt(height ** 2 + width ** 2) 101 | part = act_pixel_pos[0].shape[0] / (height * width) 102 | return radius, part 103 | 104 | 105 | def calculate(image, label, bp, args): 106 | pred_idx = bp.forward(image.to(device)) # predict lbl / depth: [h, w] 107 | 108 | img_radius1, img_radius2, img_radius3 = [], [], [] 109 | 
img_part1, img_part2, img_part3 = [], [], [] 110 | 111 | y1, y2 = int(0.40810811 * args.height), int(0.99189189 * args.height) 112 | x1, x2 = int(0.03594771 * args.width), int(0.96405229 * args.width) 113 | total_pixel = 0 114 | 115 | for pos_i in tqdm(range(y1+args.sample_rate, y2, args.sample_rate)): 116 | for pos_j in tqdm(range(x1+args.sample_rate, x2, args.sample_rate)): 117 | bp.backward(pos_i=pos_i, pos_j=pos_j, idx=pred_idx[pos_i, pos_j]) 118 | output_vanilla, output_saliency = bp.generate() # output_saliency: [h, w] 119 | 120 | output_saliency = output_saliency[y1:y2, x1:x2] 121 | # normalized saliency map for a pixel in an image 122 | if np.max(output_saliency) > 0: 123 | output_saliency = (output_saliency - np.min(output_saliency)) / np.max(output_saliency) 124 | radius1, part1 = largest_radius(output_saliency, pos_i=pos_i-y1, pos_j=pos_j-x1, threshold=0.1) 125 | radius2, part2 = largest_radius(output_saliency, pos_i=pos_i-y1, pos_j=pos_j-x1, threshold=0.5) 126 | radius3, part3 = largest_radius(output_saliency, pos_i=pos_i-y1, pos_j=pos_j-x1, threshold=0.9) 127 | 128 | img_radius1.append(radius1) 129 | img_radius2.append(radius2) 130 | img_radius3.append(radius3) 131 | img_part1.append(part1) 132 | img_part2.append(part2) 133 | img_part3.append(part3) 134 | total_pixel += 1 135 | 136 | return img_radius1, img_radius2, img_radius3, \ 137 | img_part1, img_part2, img_part3 138 | 139 | 140 | 141 | def main(): 142 | # Model 143 | model = get_model(args.model_name, args.task) 144 | weights = torch.load(args.model_path) 145 | # weights = torch.load(args.model_path, map_location=lambda storage, loc: storage) 146 | model.load_state_dict(weights['model_state']) 147 | model.to(device) 148 | model.eval() 149 | 150 | depth_flag = False 151 | if args.task == 'depth': 152 | depth_flag = True 153 | 154 | loader = kittiLoader_seg( 155 | root=args.data_path, 156 | split='train', 157 | is_transform=True, 158 | img_size=(args.height, args.width), 159 | augmentations=None, 160 | img_norm=True, 161 | saliency_eval_depth=depth_flag 162 | ) 163 | 164 | testloader = data.DataLoader(loader, 165 | batch_size=1, 166 | num_workers=0, 167 | shuffle=False) 168 | 169 | bp = BackPropagation(model=model, task=args.task) 170 | result_img = [] 171 | for i, (image, label, img_path) in enumerate(testloader): 172 | print(img_path) 173 | img_eval_res = calculate(image=image, label=label, bp=bp, args=args) 174 | result_img.append(img_eval_res) 175 | result_img_out = np.array(result_img, dtype=float) # [num_image, num_metrics, num_pixels_for_each_image] 176 | np.save("saliency_eval_pixel/{}_{}_pixel_try.npy".format(args.task, args.model_name), result_img_out) 177 | if i >= args.num_image: 178 | break 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | -------------------------------------------------------------------------------- /saliency_iou.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import numpy as np 5 | import torch 6 | import scipy.misc as m 7 | import cv2 8 | from torch.utils import data 9 | from tqdm import tqdm 10 | from joblib import Parallel, delayed 11 | 12 | from ptsemseg.models.fcn_seg import * 13 | from ptsemseg.models.segnet_seg import * 14 | from ptsemseg.models.frrn_seg import * 15 | from ptsemseg.models.deeplab_seg import * 16 | from ptsemseg.models.fcrn_seg import * 17 | from ptsemseg.models.dispnet_seg import * 18 | 19 | from ptsemseg.models.fcn_depth import * 20 | from 
ptsemseg.models.segnet_depth import * 21 | from ptsemseg.models.frrn_depth import * 22 | from ptsemseg.models.deeplab_depth import * 23 | from ptsemseg.models.fcrn_depth import * 24 | from ptsemseg.models.dispnet_depth import * 25 | 26 | from saliency import BackPropagation 27 | from ptsemseg.loader.kitti_loader_seg import kittiLoader_seg 28 | 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument("--data_path", default='datasets/kitti/semantics/', type=str, 32 | help='path to test images') 33 | parser.add_argument("--model_name", type=str, default='fcn', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 34 | parser.add_argument("--model_seg_path", type=str, 35 | default='runs/fcn8s_kitti_seg/12543_256_832_cityscaperPretrained_lr5/fcn8s_kitti_best_model.pkl', 36 | help='path to pretrained model') 37 | parser.add_argument("--model_depth_path", type=str, 38 | default='runs/fcn_kitti_depth/2972_256_832_bs4_smooth1000/fcn_kitti_best_model.pkl', 39 | help='path to pretrained model') 40 | 41 | # the image resolution here should match the pretrained model training resolution 42 | # here the segmentation model and depth model should have the same training resolution 43 | parser.add_argument("--height", type=int, default=256, help="image resize height") 44 | parser.add_argument("--width", type=int, default=832, help="image resize width") 45 | parser.add_argument("--sample_rate", type=int, default=20, help="sample rate for eval") 46 | parser.add_argument("--num_image", type=int, default=100, help="number of images to evaluate") 47 | 48 | 49 | args = parser.parse_args() 50 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 51 | 52 | def get_model(model_name, task): 53 | if task == "seg": 54 | try: 55 | return { 56 | "fcn": fcn_seg(n_classes=19), 57 | "frrnA": frrn_seg(model_type="A", n_classes=19), 58 | "segnet": segnet_seg(n_classes=19), 59 | "deeplab": deeplab_seg(n_classes=19), 60 | "dispnet": dispnet_seg(n_classes=19), 61 | "fcrn": fcrn_seg(n_classes=19), 62 | }[model_name] 63 | except: 64 | raise("Model {} not available".format(model_name)) 65 | elif task == "depth": 66 | try: 67 | return { 68 | "fcn": fcn_depth(), 69 | "frrnA": frrn_depth(model_type = "A"), 70 | "segnet": segnet_depth(), 71 | "deeplab": deeplab_depth(), 72 | "dispnet": dispnet_depth(), 73 | "fcrn": fcrn_depth(), 74 | }[model_name] 75 | except: 76 | raise("Model {} not available".format(model_name)) 77 | 78 | 79 | def saliency_iou(saliency_seg, saliency_depth, threshold): 80 | mask_seg = np.logical_not(saliency_seg < threshold) 81 | mask_depth = np.logical_not(saliency_depth < threshold) 82 | union = np.logical_or(mask_seg, mask_depth) 83 | inter = np.logical_and(mask_seg, mask_depth) 84 | iou = np.sum(inter) / np.sum(union) 85 | return iou 86 | 87 | 88 | def calculate_overlap(img_seg, img_depth, bp_seg, bp_depth, args): 89 | pred_seg = bp_seg.forward(img_seg.to(device)) # predict lbl / depth: [h, w] 90 | pred_depth = bp_depth.forward(img_depth.to(device)) 91 | 92 | img_iou1, img_iou2, img_iou3, img_iou4 = [], [], [], [] 93 | 94 | y1, y2 = int(0.40810811 * args.height), int(0.99189189 * args.height) 95 | x1, x2 = int(0.03594771 * args.width), int(0.96405229 * args.width) 96 | total_pixel = 0 97 | 98 | for pos_i in tqdm(range(y1+args.sample_rate, y2, args.sample_rate)): 99 | for pos_j in tqdm(range(x1+args.sample_rate, x2, args.sample_rate)): 100 | bp_seg.backward(pos_i=pos_i, pos_j=pos_j, idx=pred_seg[pos_i, pos_j]) 101 | bp_depth.backward(pos_i=pos_i, pos_j=pos_j, 
idx=pred_depth[pos_i, pos_j]) 102 | _, output_saliency_seg = bp_seg.generate() # output_saliency: [h, w] 103 | _, output_saliency_depth = bp_depth.generate() 104 | 105 | output_saliency_seg = output_saliency_seg[y1:y2, x1:x2] 106 | output_saliency_depth = output_saliency_depth[y1:y2, x1:x2] 107 | # normalized saliency map for a pixel in an image 108 | if np.max(output_saliency_seg) > 0: 109 | output_saliency_seg = (output_saliency_seg - np.min(output_saliency_seg)) / np.max(output_saliency_seg) 110 | if np.max(output_saliency_depth) > 0: 111 | output_saliency_depth = (output_saliency_depth - np.min(output_saliency_depth)) / np.max(output_saliency_depth) 112 | 113 | iou1 = saliency_iou(saliency_seg=output_saliency_seg, saliency_depth=output_saliency_depth, threshold=0.05) 114 | iou2 = saliency_iou(saliency_seg=output_saliency_seg, saliency_depth=output_saliency_depth, threshold=0.1) 115 | iou3 = saliency_iou(saliency_seg=output_saliency_seg, saliency_depth=output_saliency_depth, threshold=0.5) 116 | iou4 = saliency_iou(saliency_seg=output_saliency_seg, saliency_depth=output_saliency_depth, threshold=0.9) 117 | 118 | total_pixel += 1 119 | img_iou1.append(iou1) 120 | img_iou2.append(iou2) 121 | img_iou3.append(iou3) 122 | img_iou4.append(iou4) 123 | 124 | return img_iou1, img_iou2, img_iou3, img_iou4 # list, for all pixels evaluated 125 | 126 | 127 | 128 | def main(): 129 | # seg Model and depth Model 130 | model_seg = get_model(args.model_name, task="seg") 131 | weights_seg = torch.load(args.model_seg_path) 132 | # weights = torch.load(args.model_seg_path, map_location=lambda storage, loc: storage) 133 | model_seg.load_state_dict(weights_seg['model_state']) 134 | model_seg.to(device) 135 | model_seg.eval() 136 | 137 | model_depth = get_model(args.model_name, task="depth") 138 | weights_depth = torch.load(args.model_depth_path) 139 | # weights = torch.load(args.model_depth_path, map_location=lambda storage, loc: storage) 140 | model_depth.load_state_dict(weights_depth['model_state']) 141 | model_depth.to(device) 142 | model_depth.eval() 143 | 144 | loader_seg = kittiLoader_seg( 145 | root=args.data_path, 146 | split='train', 147 | is_transform=True, 148 | img_size=(args.height, args.width), 149 | augmentations=None, 150 | img_norm=True, 151 | saliency_eval_depth=False 152 | ) 153 | 154 | loader_depth = kittiLoader_seg( 155 | root=args.data_path, 156 | split='train', 157 | is_transform=True, 158 | img_size=(args.height, args.width), 159 | augmentations=None, 160 | img_norm=True, 161 | saliency_eval_depth=True 162 | ) 163 | 164 | testloader_seg = data.DataLoader(loader_seg, 165 | batch_size=1, 166 | num_workers=0, 167 | shuffle=False) 168 | testloader_depth = data.DataLoader(loader_depth, 169 | batch_size=1, 170 | num_workers=0, 171 | shuffle=False) 172 | 173 | bp_seg = BackPropagation(model=model_seg, task="seg") 174 | bp_depth = BackPropagation(model=model_depth, task="depth") 175 | result_img = [] 176 | 177 | for i, (image_seg, label_seg, img_path_seg) in enumerate(testloader_seg): 178 | for j, (image_depth, _, img_path_depth) in enumerate(testloader_depth): 179 | if i == j: 180 | print(img_path_seg) 181 | img_iou = calculate_overlap(img_seg=image_seg, img_depth=image_depth, bp_seg=bp_seg, bp_depth=bp_depth, args=args) 182 | result_img.append(img_iou) 183 | result_img_out = np.array(result_img, dtype=float) # [num_image, num_metrics=4, num_pixels_for_each_image] 184 | np.save("saliency_eval_pixel/{}_iou_try.npy".format(args.model_name), result_img_out) 185 | 186 | if i >= args.num_image: 
187 | break 188 | else: 189 | continue 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /saliency_results/BP_saliency_map_fcrn_seg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/saliency_results/BP_saliency_map_fcrn_seg.png -------------------------------------------------------------------------------- /saliency_results/image_pixel_locate_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanweiliti/Segmentation-MonoDepth-Pytorch/d1a3de8d10c60fe9d3b86b585e0f0089555fc8a6/saliency_results/image_pixel_locate_0.png -------------------------------------------------------------------------------- /test_depth_cityscapes.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import torch 3 | import argparse 4 | 5 | from torch.utils import data 6 | from tqdm import tqdm 7 | 8 | from ptsemseg.models import get_model 9 | from ptsemseg.loader import get_loader 10 | from ptsemseg.metrics import runningScoreDepth, averageMeter 11 | 12 | 13 | def count_parameters(model): 14 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 15 | 16 | 17 | def test(cfg, args): 18 | # Setup device 19 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 20 | 21 | # Setup Dataloader 22 | data_loader = get_loader(cfg['data']['dataset'], cfg['task']) 23 | data_path = cfg['data']['path'] 24 | 25 | loader = data_loader( 26 | data_path, 27 | split=cfg['data']['test_split'], 28 | is_transform=True, 29 | img_size=(cfg['data']['img_rows'], 30 | cfg['data']['img_cols']), 31 | img_norm=cfg['data']['img_norm'] 32 | ) 33 | 34 | n_classes = 0 35 | running_metrics_val = runningScoreDepth(cfg['data']['dataset']) 36 | 37 | testloader = data.DataLoader(loader, 38 | batch_size=cfg['training']['batch_size'], 39 | num_workers=0) 40 | 41 | # Load Model 42 | model = get_model(cfg['model'], cfg['task'], n_classes=n_classes).to(device) 43 | #weights = torch.load(cfg['testing']['trained_model']) 44 | weights = torch.load(cfg['testing']['trained_model'], map_location=lambda storage, loc: storage) 45 | model.load_state_dict(weights["model_state"]) 46 | model.eval() 47 | model.to(device) 48 | 49 | with torch.no_grad(): 50 | for i, (images, labels, img_path) in tqdm(enumerate(testloader)): 51 | images = images.to(device) 52 | labels = labels.to(device) 53 | 54 | outputs = model(images) # [batch_size, n_classes, height, width] 55 | if cfg['model']['arch'] == "dispnet" and cfg['task'] == "depth": 56 | outputs = 1 / outputs 57 | 58 | pred = outputs.squeeze(1).data.cpu().numpy() 59 | gt = labels.data.squeeze(1).cpu().numpy() 60 | 61 | running_metrics_val.update(gt=gt, pred=pred) 62 | 63 | val_result = running_metrics_val.get_scores() 64 | for k, v in val_result.items(): 65 | print(k, v) 66 | 67 | 68 | if __name__ == "__main__": 69 | parser = argparse.ArgumentParser(description="Hyperparams") 70 | parser.add_argument( 71 | "--config", 72 | nargs="?", 73 | type=str, 74 | default="configs/fcn_cityscapes_depth.yml", 75 | help="Config file to be used", 76 | ) 77 | 78 | args = parser.parse_args() 79 | 80 | with open(args.config) as fp: 81 | cfg = yaml.load(fp) 82 | 83 | test(cfg, args) 84 | 85 | 
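For reference, the overlap score recorded per pixel in saliency_iou.py above reduces to a simple operation: the two saliency maps are min-max normalized, binarized at a common threshold, and the IoU of the resulting masks is taken. The snippet below is a minimal, self-contained restatement of that metric under this reading; the helper name thresholded_saliency_iou and the toy arrays are illustrative only and are not part of the repository.

    import numpy as np

    def thresholded_saliency_iou(sal_seg, sal_depth, threshold):
        # binarize both normalized saliency maps at the same threshold
        mask_seg = sal_seg >= threshold
        mask_depth = sal_depth >= threshold
        union = np.logical_or(mask_seg, mask_depth).sum()
        inter = np.logical_and(mask_seg, mask_depth).sum()
        # guard against an empty union, which the original script does not special-case
        return inter / union if union > 0 else 0.0

    # toy 3x3 maps already scaled to [0, 1]
    sal_seg = np.array([[0.0, 0.2, 0.9],
                        [0.1, 0.8, 0.7],
                        [0.0, 0.1, 0.6]])
    sal_depth = np.array([[0.0, 0.1, 0.8],
                          [0.2, 0.9, 0.1],
                          [0.0, 0.0, 0.7]])
    print(thresholded_saliency_iou(sal_seg, sal_depth, threshold=0.5))  # 3 / 4 = 0.75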
-------------------------------------------------------------------------------- /test_depth_kitti.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from scipy.misc import imresize 4 | from scipy.ndimage.interpolation import zoom 5 | import numpy as np 6 | from path import Path 7 | import argparse 8 | from tqdm import tqdm 9 | 10 | from ptsemseg.models.fcn_depth import * 11 | from ptsemseg.models.segnet_depth import * 12 | from ptsemseg.models.frrn_depth import * 13 | from ptsemseg.models.deeplab_depth import * 14 | from ptsemseg.models.fcrn_depth import * 15 | from ptsemseg.models.dispnet_depth import * 16 | 17 | from kitti_depth_eval.depth_evaluation_utils import test_framework_KITTI as test_framework 18 | 19 | 20 | parser = argparse.ArgumentParser(description='Script for depth testing with corresponding groundTruth', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | parser.add_argument("--model_name", type=str, default='dispnet', choices=["fcn", "frrnA", "segnet", "deeplab", "dispnet", "fcrn"]) 23 | parser.add_argument("--model_path", default='runs/frrn_kitti_depth/33888_128_416_bs4_smooth1000/frrn_kitti_best_model.pkl', 24 | type=str, help="pretrained model path") 25 | parser.add_argument("--img_height", default=128, type=int, help="Image height") 26 | parser.add_argument("--img_width", default=416, type=int, help="Image width") 27 | parser.add_argument("--min-depth", default=1e-3) 28 | parser.add_argument("--max-depth", default=80) 29 | parser.add_argument("--pred_disp", action='store_true', 30 | help="model predicts disparity instead of depth if selected") 31 | 32 | parser.add_argument("--dataset_dir", default='../kitti', type=str, help="Kitti raw dataset directory") 33 | parser.add_argument("--dataset_list", default='kitti_depth_eval/test_files_eigen.txt', 34 | type=str, help="Kitti test dataset list file") 35 | 36 | device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 37 | 38 | 39 | def get_depth_model(model_name): 40 | try: 41 | return { 42 | "fcn": fcn_depth(), 43 | "frrnA": frrn_depth(model_type = "A"), 44 | "segnet": segnet_depth(), 45 | "deeplab": deeplab_depth(), 46 | "dispnet": dispnet_depth(), 47 | "fcrn": fcrn_depth(), 48 | }[model_name] 49 | except: 50 | raise("Model {} not available".format(model_name)) 51 | 52 | 53 | @torch.no_grad() 54 | def main(): 55 | args = parser.parse_args() 56 | 57 | model = get_depth_model(args.model_name).to(device) 58 | weights = torch.load(args.model_path) 59 | # weights = torch.load(args.model_path, map_location=lambda storage, loc: storage) 60 | model.load_state_dict(weights['model_state']) 61 | model.eval() 62 | 63 | seq_length = 0 64 | 65 | dataset_dir = Path(args.dataset_dir) 66 | with open(args.dataset_list, 'r') as f: 67 | test_files = list(f.read().splitlines()) 68 | 69 | framework = test_framework(dataset_dir, test_files, seq_length, args.min_depth, args.max_depth) 70 | 71 | print('{} files to test'.format(len(test_files))) 72 | errors = np.zeros((2, 7, len(test_files)), np.float32) 73 | 74 | 75 | for j, sample in enumerate(tqdm(framework)): 76 | tgt_img = sample['tgt'] # [375, 1242, 3] ndarray, original RGB image 77 | 78 | h,w,_ = tgt_img.shape 79 | if h != args.img_height or w != args.img_width: 80 | tgt_img = imresize(tgt_img, (args.img_height, args.img_width)).astype(np.float32) 81 | 82 | tgt_img = np.transpose(tgt_img, (2, 0, 1)) 83 | tgt_img = torch.from_numpy(tgt_img).unsqueeze(0) 84 | tgt_img = ((tgt_img/255 - 
0.5)/0.5).to(device) # normalize to [-1, 1] 85 | 86 | pred = model(tgt_img).cpu().numpy()[0,0] 87 | gt_depth = sample['gt_depth'] 88 | 89 | if args.pred_disp: 90 | pred_depth = 1 / pred 91 | else: 92 | pred_depth = pred 93 | 94 | # upsample to gt depth resolution, [375, 1242] 95 | # and mask out pixels with depth not in [min_depth, max_depth] 96 | pred_depth_zoomed = zoom(pred_depth, 97 | (gt_depth.shape[0]/pred_depth.shape[0], 98 | gt_depth.shape[1]/pred_depth.shape[1]) 99 | ).clip(args.min_depth, args.max_depth) 100 | if sample['mask'] is not None: 101 | pred_depth_zoomed = pred_depth_zoomed[sample['mask']] 102 | gt_depth = gt_depth[sample['mask']] 103 | 104 | errors[1, :, j] = compute_errors(gt_depth, pred_depth_zoomed) 105 | 106 | mean_errors = errors.mean(2) 107 | error_names = ['abs_rel','sq_rel','rms','log_rms','a1','a2','a3'] 108 | 109 | print("Results : ") 110 | print("{:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}, {:>10}".format(*error_names)) 111 | print("{:10.4f}, {:10.4f}, {:10.4f}, {:10.4f}, {:10.4f}, {:10.4f}, {:10.4f}".format(*mean_errors[1])) 112 | 113 | 114 | def compute_errors(gt, pred): 115 | thresh = np.maximum((gt / pred), (pred / gt)) 116 | a1 = (thresh < 1.25 ).mean() 117 | a2 = (thresh < 1.25 ** 2).mean() 118 | a3 = (thresh < 1.25 ** 3).mean() 119 | 120 | rmse = (gt - pred) ** 2 121 | rmse = np.sqrt(rmse.mean()) 122 | 123 | rmse_log = (np.log(gt) - np.log(pred)) ** 2 124 | rmse_log = np.sqrt(rmse_log.mean()) 125 | 126 | abs_rel = np.mean(np.abs(gt - pred) / gt) 127 | sq_rel = np.mean(((gt - pred)**2) / gt) 128 | 129 | return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 130 | 131 | 132 | if __name__ == '__main__': 133 | main() 134 | -------------------------------------------------------------------------------- /test_seg_cityscapes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import torch 5 | import argparse 6 | import timeit 7 | import numpy as np 8 | import scipy.misc as m 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torchvision.models as models 12 | 13 | from torch.backends import cudnn 14 | from torch.utils import data 15 | 16 | from tqdm import tqdm 17 | 18 | from ptsemseg.models import get_model 19 | from ptsemseg.loader import get_loader 20 | from ptsemseg.metrics import runningScoreSeg 21 | from ptsemseg.utils import convert_state_dict 22 | 23 | torch.backends.cudnn.benchmark = True 24 | 25 | # test code for cityscapes segmentation 26 | def test(cfg, args): 27 | 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | 30 | # Setup Dataloader 31 | data_loader = get_loader(cfg['data']['dataset'], cfg['task']) 32 | data_path = cfg['data']['path'] 33 | 34 | loader = data_loader( 35 | data_path, 36 | split=cfg['data']['test_split'], 37 | is_transform=True, 38 | img_size=(cfg['data']['img_rows'], 39 | cfg['data']['img_cols']), 40 | img_norm=cfg['data']['img_norm'] 41 | ) 42 | 43 | n_classes = loader.n_classes 44 | 45 | testloader = data.DataLoader(loader, 46 | batch_size=cfg['training']['batch_size'], 47 | num_workers=0) 48 | 49 | # Setup Model 50 | model = get_model(cfg['model'], cfg['task'], n_classes=n_classes).to(device) 51 | weights = torch.load(cfg['testing']['trained_model'], map_location=lambda storage, loc: storage) 52 | model.load_state_dict(weights["model_state"]) 53 | 54 | model.eval() 55 | model.to(device) 56 | 57 | for i, (images, labels, img_path) in tqdm(enumerate(testloader)): 58 | images = 
images.to(device) 59 | 60 | outputs = model(images) 61 | pred = np.squeeze(outputs.data.max(1)[1].cpu().numpy(), axis=0) 62 | 63 | decoded = loader.decode_segmap_tocolor(pred) # color segmentation mask 64 | decoded_labelID = loader.decode_segmap_tolabelId(pred) # segmentation mask of labelIDs for online test 65 | print("Classes found: ", np.unique(decoded_labelID)) 66 | 67 | # m.imsave("output.png", decoded) 68 | 69 | out_file_name = [img_path[0][39:-16], '*.png'] 70 | out_file_name = ''.join(out_file_name) 71 | out_path = os.path.join(args.out_path, out_file_name) 72 | 73 | decoded_labelID = m.imresize(decoded_labelID, (1024, 2048), "nearest", mode="F") 74 | m.toimage(decoded_labelID, high=np.max(decoded_labelID), low=np.min(decoded_labelID)).save(out_path) 75 | print("Segmentation Mask Saved at: {}".format(out_path)) 76 | 77 | 78 | if __name__ == "__main__": 79 | parser = argparse.ArgumentParser(description="Hyperparams") 80 | parser.add_argument( 81 | "--config", 82 | nargs="?", 83 | type=str, 84 | default="configs/fcn8s_cityscapes.yml", 85 | help="Config file to be used", 86 | ) 87 | 88 | parser.add_argument( 89 | "--out_path", 90 | nargs="?", 91 | type=str, 92 | default="./test_output/fcn8s_cityscapes", 93 | help="Path of the output segmap", 94 | ) 95 | 96 | args = parser.parse_args() 97 | 98 | with open(args.config) as fp: 99 | cfg = yaml.load(fp) 100 | 101 | test(cfg, args) 102 | -------------------------------------------------------------------------------- /validate_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import yaml 4 | import torch 5 | import argparse 6 | import timeit 7 | import numpy as np 8 | import scipy.misc as misc 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torchvision.models as models 12 | 13 | from torch.backends import cudnn 14 | from torch.utils import data 15 | 16 | from tqdm import tqdm 17 | 18 | from ptsemseg.models import get_model 19 | from ptsemseg.loader import get_loader 20 | from ptsemseg.metrics import runningScoreSeg 21 | 22 | torch.backends.cudnn.benchmark = True 23 | 24 | ### for segmentation validation 25 | 26 | def validate(cfg, args): 27 | 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | 30 | # Setup Dataloader 31 | data_loader = get_loader(cfg['data']['dataset'], cfg['task']) 32 | data_path = cfg['data']['path'] 33 | 34 | loader = data_loader( 35 | data_path, 36 | split=cfg['data']['val_split'], 37 | is_transform=True, 38 | img_norm=cfg['data']['img_norm'], 39 | img_size=(cfg['data']['img_rows'], 40 | cfg['data']['img_cols']), 41 | ) 42 | 43 | n_classes = loader.n_classes 44 | valloader = data.DataLoader(loader, 45 | batch_size=cfg['training']['batch_size'], 46 | num_workers=0) 47 | running_metrics = runningScoreSeg(n_classes) 48 | 49 | # Setup Model 50 | 51 | model = get_model(cfg['model'], cfg['task'], n_classes).to(device) 52 | state = torch.load(args.model_path)["model_state"] 53 | #state = torch.load(args.model_path, map_location=lambda storage, loc: storage)["model_state"] 54 | model.load_state_dict(state) 55 | model.to(device) 56 | model.eval() 57 | 58 | with torch.no_grad(): 59 | for i, (images, labels, images_path) in enumerate(valloader): 60 | images = images.to(device) 61 | outputs = model(images) 62 | pred = outputs.data.max(1)[1].cpu().numpy() 63 | gt = labels.numpy() 64 | running_metrics.update(gt, pred) 65 | 66 | score, class_iou = running_metrics.get_scores() 67 | 68 | for k, v in 
score.items(): 69 | print(k, v) 70 | 71 | for i in range(n_classes): 72 | print(i, class_iou[i]) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser(description="Hyperparams") 77 | parser.add_argument( 78 | "--config", 79 | nargs="?", 80 | type=str, 81 | default="configs/segnet_kitti_seg.yml", 82 | help="Config file to be used", 83 | ) 84 | parser.add_argument( 85 | "--model_path", 86 | nargs="?", 87 | type=str, 88 | default="runs/segnet_kitti_seg/3574_256_832_cityscaperPretrained_lr5/segnet_kitti_best_model.pkl", 89 | help="Path to the saved model", 90 | ) 91 | args = parser.parse_args() 92 | 93 | with open(args.config) as fp: 94 | cfg = yaml.load(fp) 95 | 96 | validate(cfg, args) 97 | --------------------------------------------------------------------------------
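The seven numbers printed by test_depth_kitti.py are the standard depth-evaluation metrics used with the Eigen test split: absolute relative error, squared relative error, RMSE, RMSE in log space, and the accuracy ratios delta < 1.25, 1.25^2, 1.25^3. A quick standalone check of those formulas on made-up depths (the values below are illustrative only and unrelated to KITTI) could look like this, reusing the same arithmetic as compute_errors above:

    import numpy as np

    def compute_errors(gt, pred):
        # same formulas as compute_errors in test_depth_kitti.py
        thresh = np.maximum(gt / pred, pred / gt)
        a1 = (thresh < 1.25).mean()
        a2 = (thresh < 1.25 ** 2).mean()
        a3 = (thresh < 1.25 ** 3).mean()
        rmse = np.sqrt(((gt - pred) ** 2).mean())
        rmse_log = np.sqrt(((np.log(gt) - np.log(pred)) ** 2).mean())
        abs_rel = np.mean(np.abs(gt - pred) / gt)
        sq_rel = np.mean(((gt - pred) ** 2) / gt)
        return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

    gt = np.array([2.0, 5.0, 10.0, 40.0])     # hypothetical ground-truth depths (meters)
    pred = np.array([2.2, 4.5, 12.0, 44.0])   # hypothetical predictions
    abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = compute_errors(gt, pred)
    print(abs_rel)  # 0.125 -> mean of [0.1, 0.1, 0.2, 0.1]
    print(a1)       # 1.0   -> every ratio max(gt/pred, pred/gt) is below 1.25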