├── .gitignore ├── Data ├── cmap_kitti.npy └── cmap_nyud.npy ├── Examples ├── Example_KITTI_Segm_Depth │ ├── 000099.png │ └── segm_gt_000099.png ├── Example_NYUDv2_Segm_Depth │ ├── 000464.png │ ├── depth_gt_000464.png │ └── segm_gt_000464.png └── Example_NYUDv2_Segm_Depth_SurfNorm │ ├── 000433.png │ ├── depth_gt_000433.png │ ├── norm_gt_000433.png │ └── segm_gt_000433.png ├── Images ├── LWRN_General.PNG ├── LW_CRP_RCU_FUSION_Blocks.PNG ├── Network_architecture.PNG ├── ResNEt_34_Arch.png └── Skip_Connection.PNG ├── README.md ├── Weights ├── ExpKITTI_joint.ckpt ├── ExpNYUDKITTI_joint.ckpt ├── ExpNYUD_joint.ckpt └── ExpNYUD_three.ckpt ├── eval_KITTI.py ├── eval_KITTI_NYUD.py ├── eval_NYUDv2.py ├── eval_NYUDv2_SurfNorm.py └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Data/cmap_kitti.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Data/cmap_kitti.npy -------------------------------------------------------------------------------- /Data/cmap_nyud.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Data/cmap_nyud.npy -------------------------------------------------------------------------------- /Examples/Example_KITTI_Segm_Depth/000099.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_KITTI_Segm_Depth/000099.png -------------------------------------------------------------------------------- /Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png -------------------------------------------------------------------------------- /Images/LWRN_General.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/LWRN_General.PNG -------------------------------------------------------------------------------- /Images/LW_CRP_RCU_FUSION_Blocks.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/LW_CRP_RCU_FUSION_Blocks.PNG -------------------------------------------------------------------------------- /Images/Network_architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/Network_architecture.PNG -------------------------------------------------------------------------------- /Images/ResNEt_34_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/ResNEt_34_Arch.png -------------------------------------------------------------------------------- /Images/Skip_Connection.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/Skip_Connection.PNG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EE8204---Real-Time-Multi-Task-Learning 2 | ## Real-Time Joint Semantic Segmentation and Depth Estimation Using Asymmetric Annotations Implementation 3 | 4 | ### Summary 5 | This repository provides a 
python implementation using PyTorch for the following research paper: 6 | Title: Real-Time Joint Semantic Segmentation and Depth Estimation Using Asymmetric Annotations 7 | Link: https://arxiv.org/abs/1809.04766 8 | 9 | This page will serve as a guide to explain the paper, but also to walk anyone interested in the paper through the steps needed to implement the network in Python. 10 | Currently the implementation is in PyTorch, but in the future I would like to convert the implementation to TensorFlow. 11 | 12 | ### Intro 13 | The focus of this paper is to accomplish semantic segmentation and depth estimation using asymmetrical data sets. An asymmetrical data set is simply a data set that contains labels for one of the tasks but not all of them. In the case of this paper the data sets used are the NYUDv2 indoor and KITTI outdoor images. The data set images may have labelled data referencing the semantic map or the depth information. 14 | 15 | In order to accomplish the goal of performing both semantic segmentation and depth estimation, the authors of the paper (V. Nekrasov et al.) utilize the following two techniques: 16 | 1. Multi-task Learning - Used to create a network that can accomplish both Semantic Segmentation and Depth Estimation 17 | 2. Knowledge Distillation - Used to estimate missing label information in data sets based on an expert pre-trained teacher network 18 | 19 | Data Sets Used to Train the Weights 20 | A previous network was used with the NYUDv2 and KITTI outdoor data sets to pre-train the weights. These weights were then used in this implementation to show how the network can quickly perform semantic segmentation and depth estimation. 21 | 22 | Dependencies 23 | * --find-links https://download.pytorch.org/whl/torch_stable.html 24 | * torch===1.6.0 25 | * torchvision==0.7.0 26 | * numpy 27 | * opencv-python 28 | * jupyter 29 | * matplotlib 30 | * Pillow 31 | 32 | ### Network Architecture 33 | 34 | The network architecture found in this paper can be broken down into four major parts: 35 | 1. Encoder Network 36 | 2. Light-Weight Refine Network 37 | 3. Chained Residual Pooling blocks 38 | 4. Task-specific Convolution 39 | - Segmentation 40 | - Depth Estimation 41 | 42 | A visual summary of the network architecture is provided below, taken directly from the paper. 43 | 44 | ![Network Architecture](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/Network_architecture.PNG?raw=true) 45 | 46 | The encoder network is built from the ResNet architecture and supports ResNet [50, 101, 152]. The ResNet architecture for the 34-layer network is shown below. 47 | 48 | ![ResNet_34](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/ResNEt_34_Arch.png?raw=true) 49 | 50 | The ResNet architecture employs residual learning, which in short is a skip connection that allows the input to a group of layers to bypass those layers and be added back to their output. This can be expressed mathematically as H(x) = F(x) + x, where x is the input and F(x) is the input after the convolution, batch normalization, and activation stages have been performed (possibly also pooling for up/down sampling). 51 | 52 | ![Skip_Connection](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/Skip_Connection.PNG?raw=true) 53 | 54 | The encoder network (ResNet) can be broken down into smaller chunks as seen in the ResNet 34 architecture.
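To make the skip connection concrete, the following is a minimal PyTorch sketch of the basic residual block described above. It is an illustration only, not code from this repository (the class name BasicResidualBlock and its arguments are hypothetical).

```
import torch.nn as nn

# Minimal sketch (not part of this repository): a ResNet-style basic block
# implementing H(x) = F(x) + x, where F(x) is two conv / batch-norm stages.
class BasicResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        # If the block changes the spatial size or channel count, project x with a
        # 1x1 convolution so it can still be added to F(x).
        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels))

    def forward(self, x):
        identity = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)  # H(x) = F(x) + x
```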
The basics for a 34 layer ResNet are: 55 | 56 | 1.Input Image 57 | - 224 x 224 x 3 image (RGB) 58 | 59 | 2. Convolution Layer 1 (grouping layer 1) 60 | - Input: Input image 61 | - Input image = x 62 | - Conv: 7x7 kernel, 64 feature maps, stride 2, padding = 3 63 | - size = 112 x 112 x 64 (row x column x feature maps) 64 | - Batch normalization 65 | - Max Pooling, stride 2 66 | - size = 56 x 56 x 64 (row x column x feature maps) 67 | - Output: Grouping1_Conv_1 68 | - size = 56 x 56 x 64 (row x column x feature maps) 69 | - let Grouping1_Conv1 = x_G1 70 | 71 | 3. Convolution Layer 2 (grouping layer 2) 72 | - Input: Grouping1_Conv_1, skip connection to after 2 convolutional layers 73 | - Input: x_1 = x_G1 74 | - Conv: 3x3 kernel, 64 feature maps, stride 1, padding = 1 75 | - size = 56 x 56 x 64 (row x column x feature maps) 76 | - Conv: 3x3 kernel, 64 feature maps, stride 1, padding = 1 77 | - size = 56 x 56 x 64 (row x column x feature maps) 78 | - Output: Output_after_2_convolutions + Grouping1_Conv_1 79 | - Let the output after two convolutions be F1(x) = x_G2_Conv1 80 | - Output of skip connection is H1(x) = F1(x) + x_1 81 | - H1(x) = x_G2_Conv1 + x_G1 82 | - Let H1(x) = x_G2_SL1 83 | - Input: Output of previous skip connection H1(x) = x_G2_SL1, skip connection to after 2 convolutional layers 84 | - Input: x_2 = x_G2_SL1 85 | - Conv: 3x3 kernel, 64 feature maps, stride 1 86 | - size = 56 x 56 x 64 (row x column x feature maps) 87 | - Conv: 3x3 kernel, 64 feature maps, stride 1 88 | - size = 56 x 56 x 64 (row x column x feature maps) 89 | - Output: Output_after_2_convolutions + x_G2_SL1 90 | - Let the output after two convolutions be F2(x) = x_G2_Conv2 91 | - Output of skip connection is H2(x) = F2(x) + x_2 92 | - H2(x) = x_G2_Conv2 + x_G2_SL1 93 | - Let H2(x) = x_G2_SL2 94 | - Input: Output of previous skip connection H2(x) = x_G2_SL2, skip connection to after 2 convolutional layers 95 | - Input: x_3 = x_G2_SL2 96 | - Conv: 3x3 kernel, 64 feature maps, stride 1 97 | - size = 56 x 56 x 64 (row x column x feature maps) 98 | - Conv: 3x3 kernel, 64 feature maps, stride 1 99 | - size = 56 x 56 x 64 (row x column x feature maps) 100 | - Output: Output_after_2_convolutions + x_G2_SL2 101 | - Let the output after two convolutions be F3(x) = x_G2_Conv3 102 | - Output of skip connection is H3(x) = F3(x) + x_3 103 | - H3(x) = x_G2_Conv3 + x_G2_SL2 104 | - Let H3(x) = x_G2_SL3 105 | 106 | 4. 
Convolution Layer 3 (Grouping Layer 3) 107 | - Input: Output of previous skip connection H3(x) = x_G2_SL3, skip connection to after 2 convolutional layers 108 | - Input: x_4 = x_G2_SL3 109 | - Conv: 3x3 kernel, 128 feature maps, stride 2 110 | - size = 28 x 28 x 128 (row x column x feature maps) 111 | - Conv: 3x3 kernel, 128 feature maps, stride 1 112 | - size = 28 x 28 x 128 (row x column x feature maps) 113 | - Output: Output_after_2_convolutions + x_G2_SL3 114 | - Let the output after two convolutions be F4(x) = x_G3_Conv1 115 | - Output of skip connection is H4(x) = F4(x) + x_4 116 | - H4(x) = x_G3_Conv1 + x_G2_SL3 117 | - Let H4(x) = x_G3_SL1 118 | - Input: Output of previous skip connection H4(x) = x_G3_SL1, skip connection to after 2 convolutional layers 119 | - Input: x_5 = x_G3_SL1 120 | - Conv: 3x3 kernel, 128 feature maps, stride 1 121 | - size = 28 x 28 x 128 (row x column x feature maps) 122 | - Conv: 3x3 kernel, 128 feature maps, stride 1 123 | - size = 28 x 28 x 128 (row x column x feature maps) 124 | - Output: Output_after_2_convolutions + x_G3_SL1 125 | - Let the output after two convolutions be F5(x) = x_G3_Conv2 126 | - Output of skip connection is H5(x) = F5(x) + x_5 127 | - H5(x) = x_G3_Conv2 + x_G3_SL1 128 | - Let H5(x) = x_G3_SL2 129 | - Input: Output of previous skip connection H5(x) = x_G3_SL2, skip connection to after 2 convolutional layers 130 | - Input: x_6 = x_G3_SL2 131 | - Conv: 3x3 kernel, 128 feature maps, stride 1 132 | - size = 28 x 28 x 128 (row x column x feature maps) 133 | - Conv: 3x3 kernel, 128 feature maps, stride 1 134 | - size = 28 x 28 x 128 (row x column x feature maps) 135 | - Output: Output_after_2_convolutions + x_G3_SL2 136 | - Let the output after two convolutions be F6(x) = x_G3_Conv3 137 | - Output of skip connection is H6(x) = F6(x) + x_6 138 | - H6(x) = x_G3_Conv3 + x_G3_SL2 139 | - Let H6(x) = x_G3_SL3 140 | - Input: Output of previous skip connection H6(x) = x_G3_SL3, skip connection to after 2 convolutional layers 141 | - Input: x_7 = x_G3_SL3 142 | - Conv: 3x3 kernel, 128 feature maps, stride 1 143 | - size = 28 x 28 x 128 (row x column x feature maps) 144 | - Conv: 3x3 kernel, 128 feature maps, stride 1 145 | - size = 28 x 28 x 128 (row x column x feature maps) 146 | - Output: Output_after_2_convolutions + x_G3_SL3 147 | - Let the output after two convolutions be F7(x) = x_G3_Conv4 148 | - Output of skip connection is H7(x) = F7(x) + x_7 149 | - H7(x) = x_G3_Conv4 + x_G3_SL3 150 | - Let H7(x) = x_G3_SL4 151 | 152 | 5.
Convolution Layer 4 (Grouping Layer 4) 153 | - Input: Output of previous skip connection H7(x) = x_G3_SL4, skip connection to after 2 convolutional layers 154 | - Input: x_8 = x_G3_SL4 155 | - Conv: 3x3 kernel, 256 feature maps, stride 2 156 | - size = 14 x 14 x 256 (row x column x feature maps) 157 | - Conv: 3x3 kernel, 256 feature maps, stride 1 158 | - size = 14 x 14 x 256 (row x column x feature maps) 159 | - Output: Output_after_2_convolutions + x_G3_SL4 160 | - Let the output after two convolutions be F8(x) = x_G4_Conv1 161 | - Output of skip connection is H8(x) = F8(x) + x_8 162 | - H8(x) = x_G4_Conv1 + x_G3_SL4 163 | - Let H8(x) = x_G4_SL1 164 | - Input: Output of previous skip connection H8(x) = x_G4_SL1, skip connection to after 2 convolutional layers 165 | - Input: x_9 = x_G4_SL1 166 | - Conv: 3x3 kernel, 256 feature maps, stride 1 167 | - size = 14 x 14 x 256 (row x column x feature maps) 168 | - Conv: 3x3 kernel, 256 feature maps, stride 1 169 | - size = 14 x 14 x 256 (row x column x feature maps) 170 | - Output: Output_after_2_convolutions + x_G4_SL1 171 | - Let the output after two convolutions be F9(x) = x_G4_Conv2 172 | - Output of skip connection is H9(x) = F9(x) + x_9 173 | - H9(x) = x_G4_Conv2 + x_G4_SL1 174 | - Let H9(x) = x_G4_SL2 175 | - Input: Output of previous skip connection H9(x) = x_G4_SL2, skip connection to after 2 convolutional layers 176 | - Input: x_10 = x_G4_SL2 177 | - Conv: 3x3 kernel, 256 feature maps, stride 1 178 | - size = 14 x 14 x 256 (row x column x feature maps) 179 | - Conv: 3x3 kernel, 256 feature maps, stride 1 180 | - size = 14 x 14 x 256 (row x column x feature maps) 181 | - Output: Output_after_2_convolutions + x_G4_SL2 182 | - Let the output after two convolutions be F10(x) = x_G4_Conv3 183 | - Output of skip connection is H10(x) = F10(x) + x_10 184 | - H10(x) = x_G4_Conv3 + x_G4_SL2 185 | - Let H10(x) = x_G4_SL3 186 | - Input: Output of previous skip connection H10(x) = x_G4_SL3, skip connection to after 2 convolutional layers 187 | - Input: x_11 = x_G4_SL3 188 | - Conv: 3x3 kernel, 256 feature maps, stride 1 189 | - size = 14 x 14 x 256 (row x column x feature maps) 190 | - Conv: 3x3 kernel, 256 feature maps, stride 1 191 | - size = 14 x 14 x 256 (row x column x feature maps) 192 | - Output: Output_after_2_convolutions + x_G4_SL3 193 | - Let the output after two convolutions be F11(x) = x_G4_Conv4 194 | - Output of skip connection is H11(x) = F11(x) + x_11 195 | - H11(x) = x_G4_Conv4 + x_G4_SL3 196 | - Let H11(x) = x_G4_SL4 197 | - Input: Output of previous skip connection H11(x) = x_G4_SL4, skip connection to after 2 convolutional layers 198 | - Input: x_12 = x_G4_SL4 199 | - Conv: 3x3 kernel, 256 feature maps, stride 1 200 | - size = 14 x 14 x 256 (row x column x feature maps) 201 | - Conv: 3x3 kernel, 256 feature maps, stride 1 202 | - size = 14 x 14 x 256 (row x column x feature maps) 203 | - Output: Output_after_2_convolutions + x_G4_SL4 204 | - Let the output after two convolutions be F12(x) = x_G4_Conv5 205 | - Output of skip connection is H12(x) = F12(x) + x_12 206 | - H12(x) = x_G4_Conv5 + x_G4_SL4 207 | - Let H12(x) = x_G4_SL5 208 | - Input: Output of previous skip connection H12(x) = x_G4_SL5, skip connection to after 2 convolutional layers 209 | - Input: x_13 = x_G4_SL5 210 | - Conv: 3x3 kernel, 256 feature maps, stride 1 211 | - size = 14 x 14 x 256 (row x column x feature maps) 212 | - Conv: 3x3 kernel, 256 feature maps, stride 1 213 | - size = 14 x 14 x 256 (row x column x feature maps) 214 | - Output: 
Output_after_2_convolutions + x_G4_SL5 215 | - Let the output after two convolutions be F13(x) = x_G4_Conv6 216 | - Output of skip connection is H13(x) = F13(x) + x_13 217 | - H13(x) = x_G4_Conv6 + x_G4_SL5 218 | - Let H13(x) = x_G4_SL6 219 | 220 | 6. Convolution Layer 5 (Grouping Layer 5) 221 | - Input: Output of previous skip connection H13(x) = x_G4_SL6, skip connection to after 2 convolutional layers 222 | - Input: x_14 = x_G4_SL6 223 | - Conv: 3x3 kernel, 512 feature maps, stride 2 224 | - size = 7 x 7 x 512 (row x column x feature maps) 225 | - Conv: 3x3 kernel, 512 feature maps, stride 1 226 | - size = 7 x 7 x 512 (row x column x feature maps) 227 | - Output: Output_after_2_convolutions + x_G4_SL6 228 | - Let the output after two convolutions be F14(x) = x_G5_Conv1 229 | - Output of skip connection is H14(x) = F14(x) + x_14 230 | - H14(x) = x_G5_Conv1 + x_G4_SL6 231 | - Let H14(x) = x_G5_SL1 232 | - Input: Output of previous skip connection H14(x) = x_G5_SL1, skip connection to after 2 convolutional layers 233 | - Input: x_15 = x_G5_SL1 234 | - Conv: 3x3 kernel, 512 feature maps, stride 1 235 | - size = 7 x 7 x 512 (row x column x feature maps) 236 | - Conv: 3x3 kernel, 512 feature maps, stride 1 237 | - size = 7 x 7 x 512 (row x column x feature maps) 238 | - Output: Output_after_2_convolutions + x_G5_SL1 239 | - Let the output after two convolutions be F15(x) = x_G5_Conv2 240 | - Output of skip connection is H15(x) = F15(x) + x_15 241 | - H15(x) = x_G5_Conv2 + x_G5_SL1 242 | - Let H15(x) = x_G5_SL2 243 | - Input: Output of previous skip connection H15(x) = x_G5_SL2, skip connection to after 2 convolutional layers 244 | - Input: x_16 = x_G5_SL2 245 | - Conv: 3x3 kernel, 512 feature maps, stride 1 246 | - size = 7 x 7 x 512 (row x column x feature maps) 247 | - Conv: 3x3 kernel, 512 feature maps, stride 1 248 | - size = 7 x 7 x 512 (row x column x feature maps) 249 | - Output: Output_after_2_convolutions + x_G5_SL2 250 | - Let the output after two convolutions be F16(x) = x_G5_Conv3 251 | - Output of skip connection is H16(x) = F16(x) + x_16 252 | - H16(x) = x_G5_Conv3 + x_G5_SL2 253 | - Let H16(x) = x_G5_SL3 254 | 255 | 7. Average Pooling (Global Average Pooling) 256 | - Global Average Pooling is applied to the output of the last convolutional grouping in the ResNet 34. This global average pooling takes the 7 x 7 x 512 tensor of feature maps and averages each 7 x 7 feature map into a 1 x 1 feature map of depth 512. The output of this layer is a feature map of 1 x 1 x 512. 257 | 258 | 8. Fully Connected Layer 259 | - The output of the Global Average Pooling is then connected to each output neuron. Each feature map in the 1 x 1 x 512 is connected to the output neurons, making this a fully connected layer. 260 | 261 | The ResNet 34 architecture shown above can be extended to 50/101/152 layers. This example is just to illustrate the encoder architecture used in the paper. The different ResNet architectures can all be used to determine which architecture gives the best results. For each ResNet architecture, the decoder architecture, which uses the Light Weight RefineNet, will need to be updated. 262 | 263 | The encoder passes the output directly to the Light Weight RefineNet at the output of the encoder and through chained residual pooling blocks.
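These chained residual pooling (CRP) blocks are implemented in this repository's model.py: each stage applies a 5x5 max pooling with stride 1 (so the spatial size is unchanged) followed by a 1x1 convolution, and the result of every stage is summed back onto the block's input. Below is a simplified sketch of that computation, re-expressed with nn.ModuleList for readability; the repository's version registers its 1x1 convolutions with setattr/getattr and has an optional grouped-convolution flag, but the forward pass is the same.

```
import torch.nn as nn

# Simplified sketch of the chained_residual_pooling block from model.py:
# repeated (max-pool -> 1x1 conv) stages, each summed back onto the input.
class CRPBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_stages):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels if i == 0 else out_channels, out_channels,
                      kernel_size=1, bias=False)
            for i in range(n_stages)])
        # 5x5 max pooling, stride 1, padding 2: spatial size is preserved.
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)

    def forward(self, x):
        top = x
        for conv in self.convs:
            top = conv(self.maxpool(top))  # pool, then mix channels with a 1x1 conv
            x = x + top                    # accumulate each stage onto the input
        return x
```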
The Light Weight RefineNet implementation is used as a decoder with an architecture as described in the following paper https://arxiv.org/pdf/1810.03272.pdf where modifications are made to the original RefineNet to make it more desirable for real-time semantic segmentation. A basic idea of the architecture is shown in the picture below: 264 | 265 | ![Light Weight RefineNet](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/LWRN_General.PNG?raw=true) 266 | 267 | The main change this paper makes to the architecture of the Light Weight Refine Network is that the Residual Convolution Units (RCU) that connect the output of each encoder feature map to the decoder are replaced with Chained Residual Pooling (CRP) blocks. This means that at the input of each decoder layered grouping, the corresponding feature map from the encoder will pass through the CRP blocks before following the Light Weight Refine Network architecture of passing through a light-weight fusion block, a light-weight CRP block, and a light-weight RCU block, before all being added together to create an output image the same size as the input image. 268 | 269 | ![Light Weight RefineNet](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/LW_CRP_RCU_FUSION_Blocks.PNG?raw=true) 270 | 271 | Finally, the paper makes use of two task branches at the output of the Light Weight Refine Network. Each branch has the same architecture: a 1x1 depthwise convolution followed by a 3x3 convolution. Using multi-task learning, each branch performs a single task such as semantic segmentation or depth estimation. 272 | 273 | ### Paper Implementation 274 | 275 | How to run the code 276 | In order to get this code to run I recommend copying the entire repository to your local drive. Create a folder and then use the Python venv module to create a local copy of the Python interpreter. 277 | 278 | The dependencies for this project are listed in the Dependencies section above. This specific implementation was done on Windows 7 Ultimate 64-bit, using a pretrained network for real-time semantic segmentation and depth estimation. The pretrained networks can be found in the Weights folder. It is possible to pre-train a network with specific weights using a different architecture, but in this implementation I decided to follow the original authors' methodology to try to get results as close as possible to the original paper. 279 | 280 | Example of project directory creation in the command line: 281 | ``` 282 | mkdir my_Project 283 | python -m venv my_Project\venv 284 | ``` 285 | Extract this repository into the folder you created, but do not place it inside the venv folder. Once the requirements.txt file is in your selected folder, run the following commands in the command prompt to download the required libraries/frameworks: 286 | ``` 287 | my_Project\venv\Scripts\activate.bat 288 | pip install -r requirements.txt 289 | ``` 290 | 291 | Once all the dependencies are installed you will be able to add additional images to the Examples folder and then update the folder path within the evaluation Python file by changing the img_Path variable to the path of your new image. 292 | 293 | This implementation makes use of checkpoint files and pretrained weights for the network. The pretrained weights are determined by using the Light Weight Refine Net with a ResNet encoder to determine the optimal weights for semantic segmentation.
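As a concrete illustration, the snippet below shows how the model and one of the checkpoints in the Weights folder are loaded; it mirrors what eval_NYUDv2.py in this repository already does (the map_location argument is an optional addition for CPU-only machines).

```
import torch
from model import network

# Build the network (NYUDv2: 40 classes, 2 tasks = segmentation + depth) and
# load the pretrained checkpoint, as done in eval_NYUDv2.py.
model = network(num_classes=40, num_tasks=2)
if torch.cuda.is_available():
    model = model.cuda()
model.eval()

checkpoint = torch.load('Weights/ExpNYUD_joint.ckpt',
                        map_location=None if torch.cuda.is_available() else 'cpu')
model.load_state_dict(checkpoint['state_dict'])
```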
The new network built specifically for this paper then makes use of those weights and multi-task learning to accurately estimate the missing label data for either semantic mapping or depth estimation. 294 | 295 | ### Conclusions 296 | 297 | This paper provides a deep dive for the reader if they want to become familiar with encoder-decoder networks, ResNet, RefineNet and the Light Weight Refine Net. Using pretrained weights significantly speeds up the time to process an input image into its respective semantic and depth mapping. A full pipeline implementation from ResNet to Light Weight Refine Net is the next goal of work to test the real-time capabilities of this method from start to finish. 298 | 299 | The ease with which the network performs multi-task learning using the pre-trained weights is a great indicator of possible future work. Aside from the full pipeline implementation, it would be interesting to try to change the network architecture to use different encoders and decoders to see if this method can still be improved. 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Weights/ExpKITTI_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpKITTI_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUDKITTI_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUDKITTI_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUD_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUD_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUD_three.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUD_three.ckpt -------------------------------------------------------------------------------- /eval_KITTI.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 12:20:05 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | color_Map = np.load('Data/cmap_kitti.npy') 18 | depth_Coeff = 800. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 80. 24 | min_Depth = 0.
25 | num_CLASSES = 6 26 | num_TASKS = 2 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpKITTI_joint.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_Path = 'Examples/Example_KITTI_Segm_Depth/000099.png' 40 | img = np.array(Image.open(img_Path)) 41 | gt_segm = np.array(Image.open('Examples\Example_KITTI_Segm_Depth\segm_gt_000099.png')) 42 | 43 | with torch.no_grad(): 44 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad = False).float() 45 | if has_Cuda: 46 | img_var = img_var.cuda() 47 | segm, depth = model_Object(img_var) 48 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 49 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 50 | segm = color_Map[segm.argmax(axis=2)].astype(np.uint8) 51 | depth = np.abs(depth) 52 | plt.figure(figsize=(18, 12)) 53 | plt.subplot(141) 54 | plt.imshow(img) 55 | plt.title('orig img') 56 | plt.axis('off') 57 | plt.subplot(142) 58 | plt.imshow(gt_segm) 59 | plt.title('gt segm') 60 | plt.axis('off') 61 | plt.subplot(143) 62 | plt.imshow(segm) 63 | plt.title('pred segm') 64 | plt.axis('off') 65 | plt.subplot(144) 66 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 67 | plt.title('pred depth') 68 | plt.axis('off'); 69 | -------------------------------------------------------------------------------- /eval_KITTI_NYUD.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 21:44:34 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | cmap_Kitti = np.load('Data/cmap_kitti.npy') 19 | depth_Coeff_Nyud = 5000. # to convert into metres 20 | depth_Coeff_Kitti = 800. 21 | has_Cuda = torch.cuda.is_available() 22 | img_Scale = 1./255 23 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 24 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 25 | max_Depth_Nyud = 8. 26 | min_Depth_Nyud = 0. 27 | max_Depth_Kitti = 80. 28 | min_Depth_Kitti = 0. 
29 | num_CLASSES = 46 30 | num_CLASSES_NYUD = 40 31 | num_CLASSES_KITTI = 6 32 | num_TASKS = 2 # segm + depth 33 | 34 | def pre_Processing(img): 35 | return (img * img_Scale - img_Mean) / img_Std 36 | 37 | model_Object = network(num_classes = num_CLASSES, num_tasks = num_TASKS) 38 | if has_Cuda: 39 | _ = model_Object.cuda() 40 | _ = model_Object.eval() 41 | 42 | check_Point = torch.load('Weights/ExpNYUDKITTI_joint.ckpt') 43 | model_Object.load_state_dict(check_Point['state_dict']) 44 | 45 | # NYUD 46 | img_path = 'Examples/Example_NYUDv2_Segm_Depth/000464.png' 47 | img_nyud = np.array(Image.open(img_path)) 48 | gt_segm_nyud = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png')) 49 | 50 | # KITTI 51 | img_path = 'Examples/Example_KITTI_Segm_Depth/000099.png' 52 | img_kitti = np.array(Image.open(img_path)) 53 | gt_segm_kitti = np.array(Image.open('Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png')) 54 | 55 | with torch.no_grad(): 56 | # nyud 57 | img_var = Variable(torch.from_numpy(pre_Processing(img_nyud).transpose(2, 0, 1)[None]), requires_grad=False).float() 58 | if has_Cuda: 59 | img_var = img_var.cuda() 60 | segm, depth = model_Object(img_var) 61 | segm = cv2.resize(segm[0, :(num_CLASSES_NYUD)].cpu().data.numpy().transpose(1, 2, 0), img_nyud.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 62 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img_nyud.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 63 | segm_nyud = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 64 | depth_nyud = np.abs(depth) 65 | # kitti 66 | img_var = Variable(torch.from_numpy(pre_Processing(img_kitti).transpose(2, 0, 1)[None]), requires_grad=False).float() 67 | if has_Cuda: 68 | img_var = img_var.cuda() 69 | segm, depth = model_Object(img_var) 70 | segm = cv2.resize(segm[0, (num_CLASSES_NYUD):(num_CLASSES_NYUD + num_CLASSES_KITTI)].cpu().data.numpy().transpose(1, 2, 0), img_kitti.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 71 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img_kitti.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 72 | segm_kitti = cmap_Kitti[segm.argmax(axis=2)].astype(np.uint8) 73 | depth_kitti = np.abs(depth) 74 | 75 | plt.figure(figsize=(18, 12)) 76 | plt.subplot(141) 77 | plt.imshow(img_nyud) 78 | plt.title('NYUD: img') 79 | plt.axis('off') 80 | plt.subplot(142) 81 | plt.imshow(cmap_Nyud[gt_segm_nyud + 1]) 82 | plt.title('NYUD: gt segm') 83 | plt.axis('off') 84 | plt.subplot(143) 85 | plt.imshow(segm_nyud) 86 | plt.title('NYUD: pred segm') 87 | plt.axis('off') 88 | plt.subplot(144) 89 | plt.imshow(depth_nyud, cmap='plasma', vmin=min_Depth_Nyud, vmax=max_Depth_Nyud) 90 | plt.title('NYUD: pred depth') 91 | plt.axis('off') 92 | plt.figure(figsize=(18,12)) 93 | plt.subplot(141) 94 | plt.imshow(img_kitti) 95 | plt.title('KITTI: img') 96 | plt.axis('off') 97 | plt.subplot(142) 98 | plt.imshow(gt_segm_kitti) 99 | plt.title('KITTI: gt segm') 100 | plt.axis('off') 101 | plt.subplot(143) 102 | plt.imshow(segm_kitti) 103 | plt.title('KITTI: pred segm') 104 | plt.axis('off') 105 | plt.subplot(144) 106 | plt.imshow(depth_kitti, cmap='plasma', vmin=min_Depth_Kitti, vmax=max_Depth_Kitti) 107 | plt.title('KITTI: pred depth') 108 | plt.axis('off'); 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /eval_NYUDv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 22:28:03 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 
| import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | depth_Coeff = 5000. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 8. 24 | min_Depth = 0. 25 | num_CLASSES = 40 26 | num_TASKS = 2 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpNYUD_joint.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_path = 'Examples/Example_NYUDv2_Segm_Depth/000464.png' 40 | img = np.array(Image.open(img_path)) 41 | gt_segm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png')) 42 | gt_depth = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png')) 43 | 44 | 45 | with torch.no_grad(): 46 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad=False).float() 47 | if has_Cuda: 48 | img_var = img_var.cuda() 49 | segm, depth = model_Object(img_var) 50 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 51 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 52 | segm = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 53 | depth = np.abs(depth) 54 | plt.figure(figsize=(18, 12)) 55 | plt.subplot(151) 56 | plt.imshow(img) 57 | plt.title('orig img') 58 | plt.axis('off') 59 | plt.subplot(152) 60 | plt.imshow(cmap_Nyud[gt_segm + 1]) 61 | plt.title('gt segm') 62 | plt.axis('off') 63 | plt.subplot(153) 64 | plt.imshow(segm) 65 | plt.title('pred segm') 66 | plt.axis('off') 67 | plt.subplot(154) 68 | plt.imshow(gt_depth / depth_Coeff, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 69 | plt.title('gt depth') 70 | plt.axis('off') 71 | plt.subplot(155) 72 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 73 | plt.title('pred depth') 74 | plt.axis('off'); -------------------------------------------------------------------------------- /eval_NYUDv2_SurfNorm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 22:43:26 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | depth_Coeff = 5000. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 8. 24 | min_Depth = 0. 
25 | num_CLASSES = 40 26 | num_TASKS = 3 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpNYUD_three.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_path = 'Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png' 40 | img = np.array(Image.open(img_path)) 41 | gt_segm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png')) 42 | gt_depth = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png')) 43 | gt_norm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png')) 44 | 45 | with torch.no_grad(): 46 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad=False).float() 47 | if has_Cuda: 48 | img_var = img_var.cuda() 49 | segm, depth, norm = model_Object(img_var) 50 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 51 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 52 | norm = cv2.resize(norm[0].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 53 | segm = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 54 | depth = np.abs(depth) 55 | out_norm = norm / np.linalg.norm(norm, axis=2, keepdims=True) 56 | ## xzy->RGB ## 57 | out_norm[:, :, 0] = ((out_norm[:, :, 0] + 1.) / 2.) * 255. 58 | out_norm[:, :, 1] = ((out_norm[:, :, 1] + 1.) / 2.) * 255. 59 | out_norm[:, :, 2] = ((1. - out_norm[:, :, 2]) / 2.) * 255. 60 | out_norm = out_norm.astype(np.uint8) 61 | 62 | plt.figure(figsize=(18, 12)) 63 | plt.subplot(171) 64 | plt.imshow(img) 65 | plt.title('orig img') 66 | plt.axis('off') 67 | plt.subplot(172) 68 | plt.imshow(cmap_Nyud[gt_segm + 1]) 69 | plt.title('gt segm') 70 | plt.axis('off') 71 | plt.subplot(173) 72 | plt.imshow(segm) 73 | plt.title('pred segm') 74 | plt.axis('off') 75 | plt.subplot(174) 76 | plt.imshow(gt_depth / depth_Coeff, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 77 | plt.title('gt depth') 78 | plt.axis('off') 79 | plt.subplot(175) 80 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 81 | plt.title('pred depth') 82 | plt.axis('off') 83 | plt.subplot(176) 84 | plt.imshow(gt_norm) 85 | plt.title('gt norm') 86 | plt.axis('off') 87 | plt.subplot(177) 88 | plt.imshow(out_norm) 89 | plt.title('pred norm') 90 | plt.axis('off'); -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 
6 | """ 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import math 12 | 13 | #Creating a 3x3 convolution definition 14 | def conv3x3(in_channel, out_channel, stride = 1, bias = False, dilation = 1, groups = 1): 15 | "Creating a method for 2D convolution of a 3x3 kernel" 16 | #Inputs to the definition are: 17 | #Input channel size - Number of channels in the input image 18 | #Output channel size - Number of channels produced by the convolution 19 | #Stride = 1, Stride of the convolution 20 | #bias = False, Setting the bias to be learnable or not 21 | #Dilation = 1, Spacing between kernel elements during convolution 22 | #Groups = 1, Number of blocked connections from input channels to output channels 23 | return nn.Conv2d(in_channel, out_channel, kernel_size = 3, stride = stride, padding = dilation, dilation = dilation, bias = bias, groups = groups) 24 | 25 | #Creating a 1x1 convolution definition 26 | def conv1x1(in_channel, out_channel, stride = 1, bias = False, groups = 1): 27 | "Creating a method for 2D convolution of a 1x1 kernel" 28 | #Inputs to the definition are: 29 | #Input channel size - Number of channels in the input image 30 | #Output channel size - Number of channels produced by the convolution 31 | #Stride = 1, Stride of the convolution 32 | #Groups = 1, Number of blocked connections from input channels to output channels 33 | return nn.Conv2d(in_channel, out_channel, kernel_size = 1, stride = stride, padding = 0, bias = bias, groups = groups) 34 | 35 | #Creating a batch normalization definition 36 | def batch_norm(num_features): 37 | "Creating a method for 2D batch normalization" 38 | #Inputs to batchnorm2d: 39 | #Number of features - An expected input of size C 40 | #Eps - Denominator value in batch norm equation added for stability 41 | #Momentum - Value used to calculate running mean and running var computation 42 | #Affine - Boolean value that when set to true, the module has learnable affine parameters 43 | return nn.BatchNorm2d(num_features, eps = 1e-5, momentum = 0.1, affine = True) 44 | 45 | #Creating the conv-bn-relu sequence 46 | def con_bn_act(in_channel, out_channel, kernel_size, stride = 1, groups = 1, act = True): 47 | "Creating a method for the convolution, batch normalization, and activation using ReLU using the PyTorch nn.Sequential function" 48 | if act: 49 | return nn.Sequential(nn.Conv2d(in_channel, out_channel, kernel_size, stride = stride, padding = int(kernel_size / 2.), groups = groups, bias = False), batch_norm(out_channel), nn.ReLU6(inplace = True)) 50 | else: 51 | return nn.Sequential(nn.Conv2d(in_channel, out_channel, kernel_size, stride = stride, padding = int(kernel_size / 2.), groups = groups, bias = False), batch_norm(out_channel)) 52 | 53 | 54 | #Creating the Chained Residual Pooling class 55 | #This class is a child class of the PyTorch Parent nn.Module 56 | class chained_residual_pooling(nn.Module): 57 | "This is the Chained Residual Pooling class" 58 | #Constructor method __init__() is used to initialize class variables 59 | #Input channel size 60 | #Output channel size 61 | #Number of stages 62 | #Groups 63 | def __init__(self, in_channel, out_channel, n_stages, groups = False): 64 | #Using the super function we are able to call the __init__() method of the nn.Module parent 65 | #In this case super(chained_residual_pooling, self).__init__() = super().__init__() 66 | #This is becuase the first argument of super is the same as the class we are calling from within 67 | super(chained_residual_pooling, self).__init__() 68 | for i in 
range(n_stages): 69 | setattr(self, "{}_{}".format(i + 1, 'outvar_dimred'), conv1x1(in_channel if (i == 0) else out_channel, out_channel, stride = 1, bias=False, groups = in_channel if groups else 1)) 70 | 71 | #Initializing class variables for object instantiating using the self parameter 72 | #Using self will allow the current instance of the class to be linked the object calling the class 73 | self.stride = 1 #Setting the stride 74 | self.n_stages = n_stages #Setting the number of stages 75 | self.maxpool = nn.MaxPool2d(kernel_size = 5, stride = 1, padding = 2) #Defining maxpool as the PyTorch nn.MaxPool2d method 76 | 77 | def forward(self, x): 78 | top = x 79 | for i in range(self.n_stages): 80 | top = self.maxpool(top) 81 | top = getattr(self, '{}_{}'.format(i + 1, 'outvar_dimred'))(top) 82 | x = top + x 83 | return x 84 | 85 | #Creating the Inverted Residual block 86 | #This block was taken directly from the paper in the link with minor adjustments to variable names 87 | class Inverted_Residual_Block(nn.Module): 88 | """Inverted Residual Block from https://arxiv.org/abs/1801.04381""" 89 | def __init__(self, in_channel, out_channel, expansion_factor, stride = 1): 90 | super(Inverted_Residual_Block, self).__init__() 91 | intermed_channel = in_channel * expansion_factor 92 | self.residual = (in_channel == out_channel) and (stride == 1) 93 | self.output = nn.Sequential(con_bn_act(in_channel, intermed_channel, 1), 94 | con_bn_act(intermed_channel, intermed_channel, 3, stride = stride, groups = intermed_channel), 95 | con_bn_act(intermed_channel, out_channel, 1, act = False)) 96 | 97 | def forward(self, x): 98 | residual = x 99 | out = self.output(x) 100 | if self.residual: 101 | return (out + residual) 102 | else: 103 | return out 104 | 105 | 106 | #Creating the network Architecture 107 | class network_Arch(nn.Module): 108 | """"Real Time Semantic Segmenataion and Depth Estimation Neural Network Arch""" 109 | mobile_Net_Config = [[1, 16, 1, 1], 110 | [6, 24, 2, 2], 111 | [6, 32, 3, 2], 112 | [6, 64, 4, 2], 113 | [6, 96, 3, 1], 114 | [6, 160, 3, 2], 115 | [6, 320, 1, 1], 116 | ] 117 | 118 | in_channel = 32 119 | num_layers = len(mobile_Net_Config) 120 | def __init__(self, num_classes, num_tasks = 2): 121 | super(network_Arch, self).__init__() 122 | self.num_tasks = num_tasks 123 | assert self.num_tasks in [2, 3], "Number of tasks supported is either 2 or 3, got {}".format(self.num_tasks) 124 | 125 | self.layer1 = con_bn_act(3, self.in_channel, kernel_size=3, stride=2) 126 | c_layer = 2 127 | for t,c,n,s in (self.mobile_Net_Config): 128 | layers = [] 129 | for idx in range(n): 130 | layers.append(Inverted_Residual_Block(self.in_channel, c, expansion_factor = t, stride = s if idx == 0 else 1)) 131 | self.in_channel = c 132 | setattr(self, 'layer{}'.format(c_layer), nn.Sequential(*layers)) 133 | c_layer += 1 134 | 135 | #Creating the Leight-Weight Refine Network Architecture 136 | self.conv8 = conv1x1(320, 256, bias=False) #in_channel = 320, out_channel = 256 137 | self.conv7 = conv1x1(160, 256, bias=False) #in_channel = 160, out_channel = 256 138 | self.conv6 = conv1x1(96, 256, bias=False) #in_channel = 96, out_channel = 256 139 | self.conv5 = conv1x1(64, 256, bias=False) #in_channel = 64, out_channel = 256 140 | self.conv4 = conv1x1(32, 256, bias=False) #in_channel = 32, out_channel = 256 141 | self.conv3 = conv1x1(24, 256, bias=False) #in_channel = 24, out_channel = 256 142 | self.crp4 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 143 | 
self.crp3 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 144 | self.crp2 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 145 | self.crp1 = self._make_crp(256, 256, 4, groups=True) #in_channel = 256, out_channel = 256, stages = 4, groups = True 146 | 147 | self.conv_adapt4 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 148 | self.conv_adapt3 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 149 | self.conv_adapt2 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 150 | 151 | self.pre_depth = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256, groups = 256 152 | self.depth = conv3x3(256, 1, bias=True) #in_channel = 256, out_channel = 1, bias = True 153 | 154 | self.pre_segm = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256, groups = 256 155 | self.segm = conv3x3(256, num_classes, bias=True) #in_channel = 256, out_channel = num_classes 156 | self.relu = nn.ReLU6(inplace=True) #nn.ReLU6 is a call to the PyTorch method ReLU6 setting inplace = True 157 | 158 | if self.num_tasks == 3: 159 | self.pre_normal = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256 160 | self.normal = conv3x3(256, 3, bias=True) #in_channel = 256, out_channel = 3, bias = True 161 | self._initialize_weights() #Using the object to link the initialized weights instantiated with the method 162 | 163 | def forward(self, x): 164 | x = self.layer1(x) 165 | x = self.layer2(x) # x / 2 166 | l3 = self.layer3(x) # 24, x / 4 167 | l4 = self.layer4(l3) # 32, x / 8 168 | l5 = self.layer5(l4) # 64, x / 16 169 | l6 = self.layer6(l5) # 96, x / 16 170 | l7 = self.layer7(l6) # 160, x / 32 171 | l8 = self.layer8(l7) # 320, x / 32 172 | l8 = self.conv8(l8) 173 | l7 = self.conv7(l7) 174 | l7 = self.relu(l8 + l7) 175 | l7 = self.crp4(l7) 176 | l7 = self.conv_adapt4(l7) 177 | l7 = nn.Upsample(size = l6.size()[2:], mode='bilinear', align_corners = False)(l7) 178 | 179 | l6 = self.conv6(l6) 180 | l5 = self.conv5(l5) 181 | l5 = self.relu(l5 + l6 + l7) 182 | l5 = self.crp3(l5) 183 | l5 = self.conv_adapt3(l5) 184 | l5 = nn.Upsample(size = l4.size()[2:], mode='bilinear', align_corners = False)(l5) 185 | 186 | l4 = self.conv4(l4) 187 | l4 = self.relu(l5 + l4) 188 | l4 = self.crp2(l4) 189 | l4 = self.conv_adapt2(l4) 190 | l4 = nn.Upsample(size = l3.size()[2:], mode='bilinear', align_corners = False)(l4) 191 | 192 | l3 = self.conv3(l3) 193 | l3 = self.relu(l3 + l4) 194 | l3 = self.crp1(l3) 195 | 196 | out_segm = self.pre_segm(l3) 197 | out_segm = self.relu(out_segm) 198 | out_segm = self.segm(out_segm) 199 | 200 | out_d = self.pre_depth(l3) 201 | out_d = self.relu(out_d) 202 | out_d = self.depth(out_d) 203 | 204 | if self.num_tasks == 3: 205 | out_n = self.pre_normal(l3) 206 | out_n = self.relu(out_n) 207 | out_n = self.normal(out_n) 208 | return out_segm, out_d, out_n 209 | else: 210 | return out_segm, out_d 211 | 212 | def _initialize_weights(self): 213 | for m in self.modules(): 214 | if isinstance(m, nn.Conv2d): 215 | m.weight.data.normal_(0, 0.01) 216 | if m.bias is not None: 217 | m.bias.data.zero_() 218 | elif isinstance(m, nn.BatchNorm2d): 219 | m.weight.data.fill_(1) 220 | m.bias.data.zero_() 221 | 222 | def _make_crp(self, in_channel, out_channel, stages, groups = False): 223 | layers = [chained_residual_pooling(in_channel, out_channel,stages, groups = groups)] 224 | return nn.Sequential(*layers) 225 | 226 | 
def network(num_classes, num_tasks): 227 | """Constructs the network by calling the network architecture class. This call will return the network model. 228 | Args: 229 | num_classes (int): the number of classes for the segmentation head to output. 230 | num_tasks (int): the number of tasks, either 2 - segm + depth, or 3 - segm + depth + surface normals 231 | """ 232 | model = network_Arch(num_classes, num_tasks) 233 | return model --------------------------------------------------------------------------------
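For completeness, here is a short usage sketch of the network() factory defined in model.py above. It is illustrative only and not a file in this repository; the 480 x 640 input size is an arbitrary example, and the evaluation scripts resize the raw outputs back to the input resolution with cv2.resize.

```
import torch
from model import network

# Illustrative usage sketch: build the two-task network and run a dummy forward pass.
model = network(num_classes=40, num_tasks=2)
model.eval()

with torch.no_grad():
    dummy = torch.randn(1, 3, 480, 640)  # one RGB image, arbitrary 480 x 640 size
    segm, depth = model(dummy)

# The task heads operate at roughly 1/4 of the input resolution.
print(segm.shape)   # expected: torch.Size([1, 40, 120, 160])
print(depth.shape)  # expected: torch.Size([1, 1, 120, 160])
```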