├── .gitignore ├── Data ├── cmap_kitti.npy └── cmap_nyud.npy ├── Examples ├── Example_KITTI_Segm_Depth │ ├── 000099.png │ └── segm_gt_000099.png ├── Example_NYUDv2_Segm_Depth │ ├── 000464.png │ ├── depth_gt_000464.png │ └── segm_gt_000464.png └── Example_NYUDv2_Segm_Depth_SurfNorm │ ├── 000433.png │ ├── depth_gt_000433.png │ ├── norm_gt_000433.png │ └── segm_gt_000433.png ├── Images ├── LWRN_General.PNG ├── LW_CRP_RCU_FUSION_Blocks.PNG ├── Network_architecture.PNG ├── ResNEt_34_Arch.png └── Skip_Connection.PNG ├── README.md ├── Weights ├── ExpKITTI_joint.ckpt ├── ExpNYUDKITTI_joint.ckpt ├── ExpNYUD_joint.ckpt └── ExpNYUD_three.ckpt ├── eval_KITTI.py ├── eval_KITTI_NYUD.py ├── eval_NYUDv2.py ├── eval_NYUDv2_SurfNorm.py └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Data/cmap_kitti.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Data/cmap_kitti.npy -------------------------------------------------------------------------------- /Data/cmap_nyud.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Data/cmap_nyud.npy -------------------------------------------------------------------------------- /Examples/Example_KITTI_Segm_Depth/000099.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_KITTI_Segm_Depth/000099.png -------------------------------------------------------------------------------- /Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png -------------------------------------------------------------------------------- /Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png -------------------------------------------------------------------------------- /Images/LWRN_General.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/LWRN_General.PNG -------------------------------------------------------------------------------- /Images/LW_CRP_RCU_FUSION_Blocks.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/LW_CRP_RCU_FUSION_Blocks.PNG -------------------------------------------------------------------------------- /Images/Network_architecture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/Network_architecture.PNG -------------------------------------------------------------------------------- /Images/ResNEt_34_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/ResNEt_34_Arch.png -------------------------------------------------------------------------------- /Images/Skip_Connection.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Images/Skip_Connection.PNG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EE8204---Real-Time-Multi-Task-Learning 2 | ## Real-Time Joint Semantic Segmentation and Depth Estimation Using Asymmetric Annotations Implementation 3 | 4 | ### Summary 5 | This repository provides a 
python implementation using PyTorch for the following research paper: 6 | Title: Real-Time Joint Semantic Segmentation and Depth Estimation Using Asymmetric Annotations 7 | Link: https://arxiv.org/abs/1809.04766 8 | 9 | This page will serve as a guide to explain the paper, but also to walk anyone interested in the paper through the steps needed to implement the network in Python. 10 | Currently the implementation is in PyTorch, but in the future I would like to convert the implementation to TensorFlow. 11 | 12 | ### Intro 13 | The focus of this paper is to accomplish semantic segmentation and depth estimation using asymmetrical data sets. An asymmetrical data set is simply a data set that contains labels for one of the tasks but not all of them. In the case of this paper the data sets used are the NYUDv2 indoor and KITTI outdoor images. The data set images may have labelled data referencing the semantic map or the depth information. 14 | 15 | In order to accomplish the goal of performing both semantic segmentation and depth estimation, the authors of the paper (V. Nekrasov et al.) utilize the following two techniques: 16 | 1. Multi-task Learning - Used to create a network that can accomplish both Semantic Segmentation and Depth Estimation 17 | 2. Knowledge Distillation - Used to estimate missing label information in data sets based on an expert pre-trained teacher network 18 | 19 | Data Sets Used to Train the Weights 20 | A previous network was used with the NYUDv2 and KITTI outdoor data sets to pre-train the weights. These weights were then used in this implementation to show how the network can quickly perform semantic segmentation and depth estimation. 21 | 22 | Dependencies 23 | * --find-links https://download.pytorch.org/whl/torch_stable.html 24 | * torch===1.6.0 25 | * torchvision==0.7.0 26 | * numpy 27 | * opencv-python 28 | * jupyter 29 | * matplotlib 30 | * Pillow 31 | 32 | ### Network Architecture 33 | 34 | The network architecture found in this paper can be broken down into four major parts: 35 | 1. Encoder Network 36 | 2. Light-Weight Refine Network 37 | 3. Chained Residual Pooling blocks 38 | 4. Task-specific Convolution 39 | - Segmentation 40 | - Depth Estimation 41 | 42 | A visual summary of the network architecture is provided below, taken directly from the paper. 43 | 44 | ![Network Architecture](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/Network_architecture.PNG?raw=true) 45 | 46 | The encoder network is built from the ResNet architecture and supports ResNet [50, 101, 152]. The ResNet architecture for the 34-layer network is shown below. 47 | 48 | ![ResNet_34](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/ResNEt_34_Arch.png?raw=true) 49 | 50 | The ResNet architecture employs residual learning, which in short is a skip connection that allows the input to a group of layers to bypass those layers and be added back to their output. This can be expressed mathematically as H(x) = F(x) + x, where x is the input and F(x) is the input after the convolution, batch normalization, and activation stages have been performed (possibly also pooling for up/down sampling). 51 | 52 | ![Skip_Connection](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/Skip_Connection.PNG?raw=true) 53 | 54 | The encoder network (ResNet) can be broken down into smaller chunks as seen in the ResNet 34 architecture.
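To make the skip connection concrete, the following is a minimal PyTorch sketch of the basic residual block described above. It is an illustration only, not code from this repository (the class name BasicResidualBlock and its arguments are hypothetical).

```
import torch.nn as nn

# Minimal sketch (not part of this repository): a ResNet-style basic block
# implementing H(x) = F(x) + x, where F(x) is two conv / batch-norm stages.
class BasicResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        # If the block changes the spatial size or channel count, project x with a
        # 1x1 convolution so it can still be added to F(x).
        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels))

    def forward(self, x):
        identity = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)  # H(x) = F(x) + x
```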
The basics for a 34 layer ResNet are: 55 | 56 | 1.Input Image 57 | - 224 x 224 x 3 image (RGB) 58 | 59 | 2. Convolution Layer 1 (grouping layer 1) 60 | - Input: Input image 61 | - Input image = x 62 | - Conv: 7x7 kernel, 64 feature maps, stride 2, padding = 3 63 | - size = 112 x 112 x 64 (row x column x feature maps) 64 | - Batch normalization 65 | - Max Pooling, stride 2 66 | - size = 56 x 56 x 64 (row x column x feature maps) 67 | - Output: Grouping1_Conv_1 68 | - size = 56 x 56 x 64 (row x column x feature maps) 69 | - let Grouping1_Conv1 = x_G1 70 | 71 | 3. Convolution Layer 2 (grouping layer 2) 72 | - Input: Grouping1_Conv_1, skip connection to after 2 convolutional layers 73 | - Input: x_1 = x_G1 74 | - Conv: 3x3 kernel, 64 feature maps, stride 1, padding = 1 75 | - size = 56 x 56 x 64 (row x column x feature maps) 76 | - Conv: 3x3 kernel, 64 feature maps, stride 1, padding = 1 77 | - size = 56 x 56 x 64 (row x column x feature maps) 78 | - Output: Output_after_2_convolutions + Grouping1_Conv_1 79 | - Let the output after two convolutions be F1(x) = x_G2_Conv1 80 | - Output of skip connection is H1(x) = F1(x) + x_1 81 | - H1(x) = x_G2_Conv1 + x_G1 82 | - Let H1(x) = x_G2_SL1 83 | - Input: Output of previous skip connection H1(x) = x_G2_SL1, skip connection to after 2 convolutional layers 84 | - Input: x_2 = x_G2_SL1 85 | - Conv: 3x3 kernel, 64 feature maps, stride 1 86 | - size = 56 x 56 x 64 (row x column x feature maps) 87 | - Conv: 3x3 kernel, 64 feature maps, stride 1 88 | - size = 56 x 56 x 64 (row x column x feature maps) 89 | - Output: Output_after_2_convolutions + x_G2_SL1 90 | - Let the output after two convolutions be F2(x) = x_G2_Conv2 91 | - Output of skip connection is H2(x) = F2(x) + x_2 92 | - H2(x) = x_G2_Conv2 + x_G2_SL1 93 | - Let H2(x) = x_G2_SL2 94 | - Input: Output of previous skip connection H2(x) = x_G2_SL2, skip connection to after 2 convolutional layers 95 | - Input: x_3 = x_G2_SL2 96 | - Conv: 3x3 kernel, 64 feature maps, stride 1 97 | - size = 56 x 56 x 64 (row x column x feature maps) 98 | - Conv: 3x3 kernel, 64 feature maps, stride 1 99 | - size = 56 x 56 x 64 (row x column x feature maps) 100 | - Output: Output_after_2_convolutions + x_G2_SL2 101 | - Let the output after two convolutions be F3(x) = x_G2_Conv3 102 | - Output of skip connection is H3(x) = F3(x) + x_3 103 | - H3(x) = x_G2_Conv3 + x_G2_SL2 104 | - Let H3(x) = x_G2_SL3 105 | 106 | 4. 
Convolution Layer 3 (Grouping Layer 3) 107 | - Input: Output of previous skip connection H3(x) = x_G2_SL3, skip connection to after 2 convolutional layers 108 | - Input: x_4 = x_G2_SL3 109 | - Conv: 3x3 kernel, 128 feature maps, stride 2 110 | - size = 28 x 28 x 128 (row x column x feature maps) 111 | - Conv: 3x3 kernel, 128 feature maps, stride 1 112 | - size = 28 x 28 x 128 (row x column x feature maps) 113 | - Output: Output_after_2_convolutions + x_G2_SL3 114 | - Let the output after two convolutions be F4(x) = x_G3_Conv1 115 | - Output of skip connection is H4(x) = F4(x) + x_4 116 | - H4(x) = x_G3_Conv1 + x_G2_SL3 117 | - Let H4(x) = x_G3_SL1 118 | - Input: Output of previous skip connection H4(x) = x_G3_SL1, skip connection to after 2 convolutional layers 119 | - Input: x_5 = x_G3_SL1 120 | - Conv: 3x3 kernel, 128 feature maps, stride 1 121 | - size = 28 x 28 x 128 (row x column x feature maps) 122 | - Conv: 3x3 kernel, 128 feature maps, stride 1 123 | - size = 28 x 28 x 128 (row x column x feature maps) 124 | - Output: Output_after_2_convolutions + x_G3_SL1 125 | - Let the output after two convolutions be F5(x) = x_G3_Conv2 126 | - Output of skip connection is H5(x) = F5(x) + x_5 127 | - H5(x) = x_G3_Conv2 + x_G3_SL1 128 | - Let H5(x) = x_G3_SL2 129 | - Input: Output of previous skip connection H5(x) = x_G3_SL2, skip connection to after 2 convolutional layers 130 | - Input: x_6 = x_G3_SL2 131 | - Conv: 3x3 kernel, 128 feature maps, stride 1 132 | - size = 28 x 28 x 128 (row x column x feature maps) 133 | - Conv: 3x3 kernel, 128 feature maps, stride 1 134 | - size = 28 x 28 x 128 (row x column x feature maps) 135 | - Output: Output_after_2_convolutions + x_G3_SL2 136 | - Let the output after two convolutions be F6(x) = x_G3_Conv3 137 | - Output of skip connection is H6(x) = F6(x) + x_6 138 | - H6(x) = x_G3_Conv3 + x_G3_SL2 139 | - Let H6(x) = x_G3_SL3 140 | - Input: Output of previous skip connection H6(x) = x_G3_SL3, skip connection to after 2 convolutional layers 141 | - Input: x_7 = x_G3_SL3 142 | - Conv: 3x3 kernel, 128 feature maps, stride 1 143 | - size = 28 x 28 x 128 (row x column x feature maps) 144 | - Conv: 3x3 kernel, 128 feature maps, stride 1 145 | - size = 28 x 28 x 128 (row x column x feature maps) 146 | - Output: Output_after_2_convolutions + x_G3_SL3 147 | - Let the output after two convolutions be F7(x) = x_G3_Conv4 148 | - Output of skip connection is H7(x) = F7(x) + x_7 149 | - H7(x) = x_G3_Conv4 + x_G3_SL3 150 | - Let H7(x) = x_G3_SL4 151 | 152 | 5.
Convolution Layer 4 (Grouping Layer 4) 153 | - Input: Output of previous skip connection H7(x) = x_G3_SL4, skip connection to after 2 convolutional layers 154 | - Input: x_8 = x_G3_SL4 155 | - Conv: 3x3 kernel, 256 feature maps, stride 2 156 | - size = 14 x 14 x 256 (row x column x feature maps) 157 | - Conv: 3x3 kernel, 256 feature maps, stride 1 158 | - size = 14 x 14 x 256 (row x column x feature maps) 159 | - Output: Output_after_2_convolutions + x_G3_SL4 160 | - Let the output after two convolutions be F8(x) = x_G4_Conv1 161 | - Output of skip connection is H8(x) = F8(x) + x_8 162 | - H8(x) = x_G4_Conv1 + x_G3_SL4 163 | - Let H8(x) = x_G4_SL1 164 | - Input: Output of previous skip connection H8(x) = x_G4_SL1, skip connection to after 2 convolutional layers 165 | - Input: x_9 = x_G4_SL1 166 | - Conv: 3x3 kernel, 256 feature maps, stride 1 167 | - size = 14 x 14 x 256 (row x column x feature maps) 168 | - Conv: 3x3 kernel, 256 feature maps, stride 1 169 | - size = 14 x 14 x 256 (row x column x feature maps) 170 | - Output: Output_after_2_convolutions + x_G4_SL1 171 | - Let the output after two convolutions be F9(x) = x_G4_Conv2 172 | - Output of skip connection is H9(x) = F9(x) + x_9 173 | - H9(x) = x_G4_Conv2 + x_G4_SL1 174 | - Let H9(x) = x_G4_SL2 175 | - Input: Output of previous skip connection H9(x) = x_G4_SL2, skip connection to after 2 convolutional layers 176 | - Input: x_10 = x_G4_SL2 177 | - Conv: 3x3 kernel, 256 feature maps, stride 1 178 | - size = 14 x 14 x 256 (row x column x feature maps) 179 | - Conv: 3x3 kernel, 256 feature maps, stride 1 180 | - size = 14 x 14 x 256 (row x column x feature maps) 181 | - Output: Output_after_2_convolutions + x_G4_SL2 182 | - Let the output after two convolutions be F10(x) = x_G4_Conv3 183 | - Output of skip connection is H10(x) = F10(x) + x_10 184 | - H10(x) = x_G4_Conv3 + x_G4_SL2 185 | - Let H10(x) = x_G4_SL3 186 | - Input: Output of previous skip connection H10(x) = x_G4_SL3, skip connection to after 2 convolutional layers 187 | - Input: x_11 = x_G4_SL3 188 | - Conv: 3x3 kernel, 256 feature maps, stride 1 189 | - size = 14 x 14 x 256 (row x column x feature maps) 190 | - Conv: 3x3 kernel, 256 feature maps, stride 1 191 | - size = 14 x 14 x 256 (row x column x feature maps) 192 | - Output: Output_after_2_convolutions + x_G4_SL3 193 | - Let the output after two convolutions be F11(x) = x_G4_Conv4 194 | - Output of skip connection is H11(x) = F11(x) + x_11 195 | - H11(x) = x_G4_Conv4 + x_G4_SL3 196 | - Let H11(x) = x_G4_SL4 197 | - Input: Output of previous skip connection H11(x) = x_G4_SL4, skip connection to after 2 convolutional layers 198 | - Input: x_12 = x_G4_SL4 199 | - Conv: 3x3 kernel, 256 feature maps, stride 1 200 | - size = 14 x 14 x 256 (row x column x feature maps) 201 | - Conv: 3x3 kernel, 256 feature maps, stride 1 202 | - size = 14 x 14 x 256 (row x column x feature maps) 203 | - Output: Output_after_2_convolutions + x_G4_SL4 204 | - Let the output after two convolutions be F12(x) = x_G4_Conv5 205 | - Output of skip connection is H12(x) = F12(x) + x_12 206 | - H12(x) = x_G4_Conv5 + x_G4_SL4 207 | - Let H12(x) = x_G4_SL5 208 | - Input: Output of previous skip connection H12(x) = x_G4_SL5, skip connection to after 2 convolutional layers 209 | - Input: x_13 = x_G4_SL5 210 | - Conv: 3x3 kernel, 256 feature maps, stride 1 211 | - size = 14 x 14 x 256 (row x column x feature maps) 212 | - Conv: 3x3 kernel, 256 feature maps, stride 1 213 | - size = 14 x 14 x 256 (row x column x feature maps) 214 | - Output: 
Output_after_2_convolutions + x_G4_SL5 215 | - Let the output after two convolutions be F13(x) = x_G4_Conv6 216 | - Output of skip connection is H13(x) = F13(x) + x_13 217 | - H13(x) = x_G4_Conv6 + x_G4_SL5 218 | - Let H13(x) = x_G4_SL6 219 | 220 | 6. Convolution Layer 5 (Grouping Layer 5) 221 | - Input: Output of previous skip connection H13(x) = x_G4_SL6, skip connection to after 2 convolutional layers 222 | - Input: x_14 = x_G4_SL6 223 | - Conv: 3x3 kernel, 512 feature maps, stride 2 224 | - size = 7 x 7 x 512 (row x column x feature maps) 225 | - Conv: 3x3 kernel, 512 feature maps, stride 1 226 | - size = 7 x 7 x 512 (row x column x feature maps) 227 | - Output: Output_after_2_convolutions + x_G4_SL6 228 | - Let the output after two convolutions be F14(x) = x_G5_Conv1 229 | - Output of skip connection is H14(x) = F14(x) + x_14 230 | - H14(x) = x_G5_Conv1 + x_G4_SL6 231 | - Let H14(x) = x_G5_SL1 232 | - Input: Output of previous skip connection H14(x) = x_G5_SL1, skip connection to after 2 convolutional layers 233 | - Input: x_15 = x_G5_SL1 234 | - Conv: 3x3 kernel, 512 feature maps, stride 1 235 | - size = 7 x 7 x 512 (row x column x feature maps) 236 | - Conv: 3x3 kernel, 512 feature maps, stride 1 237 | - size = 7 x 7 x 512 (row x column x feature maps) 238 | - Output: Output_after_2_convolutions + x_G5_SL1 239 | - Let the output after two convolutions be F15(x) = x_G5_Conv2 240 | - Output of skip connection is H15(x) = F15(x) + x_15 241 | - H15(x) = x_G5_Conv2 + x_G5_SL1 242 | - Let H15(x) = x_G5_SL2 243 | - Input: Output of previous skip connection H15(x) = x_G5_SL2, skip connection to after 2 convolutional layers 244 | - Input: x_16 = x_G5_SL2 245 | - Conv: 3x3 kernel, 512 feature maps, stride 1 246 | - size = 7 x 7 x 512 (row x column x feature maps) 247 | - Conv: 3x3 kernel, 512 feature maps, stride 1 248 | - size = 7 x 7 x 512 (row x column x feature maps) 249 | - Output: Output_after_2_convolutions + x_G5_SL2 250 | - Let the output after two convolutions be F16(x) = x_G5_Conv3 251 | - Output of skip connection is H16(x) = F16(x) + x_16 252 | - H16(x) = x_G5_Conv3 + x_G5_SL2 253 | - Let H16(x) = x_G5_SL3 254 | 255 | 7. Average Pooling (Global Average Pooling) 256 | - Global Average Pooling is applied to the output of the last convolutional grouping in the ResNet 34. This global average pooling takes the 7 x 7 x 512 tensor of feature maps and averages each 7 x 7 feature map into a 1 x 1 feature map of depth 512. The output of this layer is a feature map of 1 x 1 x 512. 257 | 258 | 8. Fully Connected Layer 259 | - The output of the Global Average Pooling is then connected to each output neuron. Each feature map in the 1 x 1 x 512 is connected to the output neurons, making this a fully connected layer. 260 | 261 | The ResNet 34 architecture shown above can be extended to 50/101/152 layers. This example is just to illustrate the encoder architecture used in the paper. The different ResNet architectures can all be used to determine which architecture gives the best results. For each ResNet architecture, the decoder architecture, which uses the Light Weight RefineNet, will need to be updated. 262 | 263 | The encoder passes the output directly to the Light Weight RefineNet at the output of the encoder and through chained residual pooling blocks.
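These chained residual pooling (CRP) blocks are implemented in this repository's model.py: each stage applies a 5x5 max pooling with stride 1 (so the spatial size is unchanged) followed by a 1x1 convolution, and the result of every stage is summed back onto the block's input. Below is a simplified sketch of that computation, re-expressed with nn.ModuleList for readability; the repository's version registers its 1x1 convolutions with setattr/getattr and has an optional grouped-convolution flag, but the forward pass is the same.

```
import torch.nn as nn

# Simplified sketch of the chained_residual_pooling block from model.py:
# repeated (max-pool -> 1x1 conv) stages, each summed back onto the input.
class CRPBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_stages):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels if i == 0 else out_channels, out_channels,
                      kernel_size=1, bias=False)
            for i in range(n_stages)])
        # 5x5 max pooling, stride 1, padding 2: spatial size is preserved.
        self.maxpool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)

    def forward(self, x):
        top = x
        for conv in self.convs:
            top = conv(self.maxpool(top))  # pool, then mix channels with a 1x1 conv
            x = x + top                    # accumulate each stage onto the input
        return x
```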
The Light Weight RefineNet implementation is used as a decoder with an architecture as described in the following paper https://arxiv.org/pdf/1810.03272.pdf where modifications are made to the original RefineNet to make it more desirable for real-time semantic segmentation. A basic idea of the architecture is shown in the picture below: 264 | 265 | ![Light Weight RefineNet](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/LWRN_General.PNG?raw=true) 266 | 267 | The main change this paper makes to the architecture of the Light Weight Refine Network is that the Residual Convolution Units (RCU) that connect the output of each encoder feature map to the decoder are replaced with Chained Residual Pooling (CRP) blocks. This means that at the input of each decoder layered grouping, the corresponding feature map from the encoder will pass through the CRP blocks before following the Light Weight Refine Network architecture of passing through a light-weight fusion block, a light-weight CRP block, and a light-weight RCU block, before all being added together to create an output image the same size as the input image. 268 | 269 | ![Light Weight RefineNet](https://github.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/blob/master/Images/LW_CRP_RCU_FUSION_Blocks.PNG?raw=true) 270 | 271 | Finally, the paper makes use of two task branches at the output of the Light Weight Refine Network. Each branch has the same architecture: a 1x1 depthwise convolution followed by a 3x3 convolution. Using multi-task learning, each branch performs a single task such as semantic segmentation or depth estimation. 272 | 273 | ### Paper Implementation 274 | 275 | How to run the code 276 | In order to get this code to run I recommend copying the entire repository to your local drive. Create a folder and then use the Python venv module to create a local copy of the Python interpreter. 277 | 278 | The dependencies for this project are listed in the Dependencies section above. This specific implementation was done on Windows 7 Ultimate 64-bit, using a pretrained network for real-time semantic segmentation and depth estimation. The pretrained networks can be found in the Weights folder. It is possible to pre-train a network with specific weights using a different architecture, but in this implementation I decided to follow the original authors' methodology to try to get results as close as possible to the original paper. 279 | 280 | Example of project directory creation in the command line: 281 | ``` 282 | mkdir my_Project 283 | python -m venv my_Project\venv 284 | ``` 285 | Extract this repository into the folder you created, but do not place it inside the venv folder. Once the requirements.txt file is in your selected folder, run the following commands in the command prompt to download the required libraries/frameworks: 286 | ``` 287 | my_Project\venv\Scripts\activate.bat 288 | pip install -r requirements.txt 289 | ``` 290 | 291 | Once all the dependencies are installed you will be able to add additional images to the Examples folder and then update the folder path within the evaluation Python file by changing the img_Path variable to the path of your new image. 292 | 293 | This implementation makes use of checkpoint files and pretrained weights for the network. The pretrained weights are determined by using the Light Weight Refine Net with a ResNet encoder to determine the optimal weights for semantic segmentation.
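As a concrete illustration, the snippet below shows how the model and one of the checkpoints in the Weights folder are loaded; it mirrors what eval_NYUDv2.py in this repository already does (the map_location argument is an optional addition for CPU-only machines).

```
import torch
from model import network

# Build the network (NYUDv2: 40 classes, 2 tasks = segmentation + depth) and
# load the pretrained checkpoint, as done in eval_NYUDv2.py.
model = network(num_classes=40, num_tasks=2)
if torch.cuda.is_available():
    model = model.cuda()
model.eval()

checkpoint = torch.load('Weights/ExpNYUD_joint.ckpt',
                        map_location=None if torch.cuda.is_available() else 'cpu')
model.load_state_dict(checkpoint['state_dict'])
```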
The new network built specifically for this paper then makes use of those weights and multi-task learning to accurately estimate the missing label data for either semantic mapping or depth estimation. 294 | 295 | ### Conclusions 296 | 297 | This paper provides a deep dive for the reader if they want to become familiar with encoder-decoder networks, ResNet, RefineNet and the Light Weight Refine Net. Using pretrained weights significantly speeds up the time to process an input image into its respective semantic and depth mapping. A full pipeline implementation from ResNet to Light Weight Refine Net is the next goal of work to test the real-time capabilities of this method from start to finish. 298 | 299 | The ease with which the network performs multi-task learning using the pre-trained weights is a great indicator of possible future work. Aside from the full pipeline implementation, it would be interesting to try to change the network architecture to use different encoders and decoders to see if this method can still be improved. 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Weights/ExpKITTI_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpKITTI_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUDKITTI_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUDKITTI_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUD_joint.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUD_joint.ckpt -------------------------------------------------------------------------------- /Weights/ExpNYUD_three.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AleksBanbur/EE8204---Real-Time-Multi-Task-Learning/cd59557486ee1ed292e95886ed15b9a0fe586941/Weights/ExpNYUD_three.ckpt -------------------------------------------------------------------------------- /eval_KITTI.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 12:20:05 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | color_Map = np.load('Data/cmap_kitti.npy') 18 | depth_Coeff = 800. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 80. 24 | min_Depth = 0.
25 | num_CLASSES = 6 26 | num_TASKS = 2 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpKITTI_joint.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_Path = 'Examples/Example_KITTI_Segm_Depth/000099.png' 40 | img = np.array(Image.open(img_Path)) 41 | gt_segm = np.array(Image.open('Examples\Example_KITTI_Segm_Depth\segm_gt_000099.png')) 42 | 43 | with torch.no_grad(): 44 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad = False).float() 45 | if has_Cuda: 46 | img_var = img_var.cuda() 47 | segm, depth = model_Object(img_var) 48 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 49 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 50 | segm = color_Map[segm.argmax(axis=2)].astype(np.uint8) 51 | depth = np.abs(depth) 52 | plt.figure(figsize=(18, 12)) 53 | plt.subplot(141) 54 | plt.imshow(img) 55 | plt.title('orig img') 56 | plt.axis('off') 57 | plt.subplot(142) 58 | plt.imshow(gt_segm) 59 | plt.title('gt segm') 60 | plt.axis('off') 61 | plt.subplot(143) 62 | plt.imshow(segm) 63 | plt.title('pred segm') 64 | plt.axis('off') 65 | plt.subplot(144) 66 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 67 | plt.title('pred depth') 68 | plt.axis('off'); 69 | -------------------------------------------------------------------------------- /eval_KITTI_NYUD.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 21:44:34 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | cmap_Kitti = np.load('Data/cmap_kitti.npy') 19 | depth_Coeff_Nyud = 5000. # to convert into metres 20 | depth_Coeff_Kitti = 800. 21 | has_Cuda = torch.cuda.is_available() 22 | img_Scale = 1./255 23 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 24 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 25 | max_Depth_Nyud = 8. 26 | min_Depth_Nyud = 0. 27 | max_Depth_Kitti = 80. 28 | min_Depth_Kitti = 0. 
29 | num_CLASSES = 46 30 | num_CLASSES_NYUD = 40 31 | num_CLASSES_KITTI = 6 32 | num_TASKS = 2 # segm + depth 33 | 34 | def pre_Processing(img): 35 | return (img * img_Scale - img_Mean) / img_Std 36 | 37 | model_Object = network(num_classes = num_CLASSES, num_tasks = num_TASKS) 38 | if has_Cuda: 39 | _ = model_Object.cuda() 40 | _ = model_Object.eval() 41 | 42 | check_Point = torch.load('Weights/ExpNYUDKITTI_joint.ckpt') 43 | model_Object.load_state_dict(check_Point['state_dict']) 44 | 45 | # NYUD 46 | img_path = 'Examples/Example_NYUDv2_Segm_Depth/000464.png' 47 | img_nyud = np.array(Image.open(img_path)) 48 | gt_segm_nyud = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png')) 49 | 50 | # KITTI 51 | img_path = 'Examples/Example_KITTI_Segm_Depth/000099.png' 52 | img_kitti = np.array(Image.open(img_path)) 53 | gt_segm_kitti = np.array(Image.open('Examples/Example_KITTI_Segm_Depth/segm_gt_000099.png')) 54 | 55 | with torch.no_grad(): 56 | # nyud 57 | img_var = Variable(torch.from_numpy(pre_Processing(img_nyud).transpose(2, 0, 1)[None]), requires_grad=False).float() 58 | if has_Cuda: 59 | img_var = img_var.cuda() 60 | segm, depth = model_Object(img_var) 61 | segm = cv2.resize(segm[0, :(num_CLASSES_NYUD)].cpu().data.numpy().transpose(1, 2, 0), img_nyud.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 62 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img_nyud.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 63 | segm_nyud = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 64 | depth_nyud = np.abs(depth) 65 | # kitti 66 | img_var = Variable(torch.from_numpy(pre_Processing(img_kitti).transpose(2, 0, 1)[None]), requires_grad=False).float() 67 | if has_Cuda: 68 | img_var = img_var.cuda() 69 | segm, depth = model_Object(img_var) 70 | segm = cv2.resize(segm[0, (num_CLASSES_NYUD):(num_CLASSES_NYUD + num_CLASSES_KITTI)].cpu().data.numpy().transpose(1, 2, 0), img_kitti.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 71 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img_kitti.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 72 | segm_kitti = cmap_Kitti[segm.argmax(axis=2)].astype(np.uint8) 73 | depth_kitti = np.abs(depth) 74 | 75 | plt.figure(figsize=(18, 12)) 76 | plt.subplot(141) 77 | plt.imshow(img_nyud) 78 | plt.title('NYUD: img') 79 | plt.axis('off') 80 | plt.subplot(142) 81 | plt.imshow(cmap_Nyud[gt_segm_nyud + 1]) 82 | plt.title('NYUD: gt segm') 83 | plt.axis('off') 84 | plt.subplot(143) 85 | plt.imshow(segm_nyud) 86 | plt.title('NYUD: pred segm') 87 | plt.axis('off') 88 | plt.subplot(144) 89 | plt.imshow(depth_nyud, cmap='plasma', vmin=min_Depth_Nyud, vmax=max_Depth_Nyud) 90 | plt.title('NYUD: pred depth') 91 | plt.axis('off') 92 | plt.figure(figsize=(18,12)) 93 | plt.subplot(141) 94 | plt.imshow(img_kitti) 95 | plt.title('KITTI: img') 96 | plt.axis('off') 97 | plt.subplot(142) 98 | plt.imshow(gt_segm_kitti) 99 | plt.title('KITTI: gt segm') 100 | plt.axis('off') 101 | plt.subplot(143) 102 | plt.imshow(segm_kitti) 103 | plt.title('KITTI: pred segm') 104 | plt.axis('off') 105 | plt.subplot(144) 106 | plt.imshow(depth_kitti, cmap='plasma', vmin=min_Depth_Kitti, vmax=max_Depth_Kitti) 107 | plt.title('KITTI: pred depth') 108 | plt.axis('off'); 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /eval_NYUDv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 22:28:03 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 
| import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | depth_Coeff = 5000. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 8. 24 | min_Depth = 0. 25 | num_CLASSES = 40 26 | num_TASKS = 2 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpNYUD_joint.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_path = 'Examples/Example_NYUDv2_Segm_Depth/000464.png' 40 | img = np.array(Image.open(img_path)) 41 | gt_segm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/segm_gt_000464.png')) 42 | gt_depth = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth/depth_gt_000464.png')) 43 | 44 | 45 | with torch.no_grad(): 46 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad=False).float() 47 | if has_Cuda: 48 | img_var = img_var.cuda() 49 | segm, depth = model_Object(img_var) 50 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 51 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 52 | segm = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 53 | depth = np.abs(depth) 54 | plt.figure(figsize=(18, 12)) 55 | plt.subplot(151) 56 | plt.imshow(img) 57 | plt.title('orig img') 58 | plt.axis('off') 59 | plt.subplot(152) 60 | plt.imshow(cmap_Nyud[gt_segm + 1]) 61 | plt.title('gt segm') 62 | plt.axis('off') 63 | plt.subplot(153) 64 | plt.imshow(segm) 65 | plt.title('pred segm') 66 | plt.axis('off') 67 | plt.subplot(154) 68 | plt.imshow(gt_depth / depth_Coeff, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 69 | plt.title('gt depth') 70 | plt.axis('off') 71 | plt.subplot(155) 72 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 73 | plt.title('pred depth') 74 | plt.axis('off'); -------------------------------------------------------------------------------- /eval_NYUDv2_SurfNorm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Aug 14 22:43:26 2020 4 | 5 | @author: abanbur 6 | """ 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | from PIL import Image 11 | import numpy as np 12 | from model import network 13 | import cv2 14 | import torch 15 | from torch.autograd import Variable 16 | 17 | cmap_Nyud = np.load('Data/cmap_nyud.npy') 18 | depth_Coeff = 5000. # Converts into meters 19 | has_Cuda = torch.cuda.is_available() 20 | img_Scale = 1./255 21 | img_Mean = np.array([0.485, 0.456, 0.406]).reshape((1, 1, 3)) 22 | img_Std = np.array([0.229, 0.224, 0.225]).reshape((1, 1, 3)) 23 | max_Depth = 8. 24 | min_Depth = 0. 
25 | num_CLASSES = 40 26 | num_TASKS = 3 # segm + depth 27 | 28 | def pre_Processing(img): 29 | return (img * img_Scale - img_Mean) / img_Std 30 | 31 | model_Object = network(num_classes=num_CLASSES, num_tasks=num_TASKS) 32 | if has_Cuda: 33 | _ = model_Object.cuda() 34 | _ = model_Object.eval() 35 | 36 | check_Point = torch.load('Weights/ExpNYUD_three.ckpt') 37 | model_Object.load_state_dict(check_Point['state_dict']) 38 | 39 | img_path = 'Examples/Example_NYUDv2_Segm_Depth_SurfNorm/000433.png' 40 | img = np.array(Image.open(img_path)) 41 | gt_segm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/segm_gt_000433.png')) 42 | gt_depth = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/depth_gt_000433.png')) 43 | gt_norm = np.array(Image.open('Examples/Example_NYUDv2_Segm_Depth_SurfNorm/norm_gt_000433.png')) 44 | 45 | with torch.no_grad(): 46 | img_var = Variable(torch.from_numpy(pre_Processing(img).transpose(2, 0, 1)[None]), requires_grad=False).float() 47 | if has_Cuda: 48 | img_var = img_var.cuda() 49 | segm, depth, norm = model_Object(img_var) 50 | segm = cv2.resize(segm[0, :num_CLASSES].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 51 | depth = cv2.resize(depth[0, 0].cpu().data.numpy(), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 52 | norm = cv2.resize(norm[0].cpu().data.numpy().transpose(1, 2, 0), img.shape[:2][::-1], interpolation=cv2.INTER_CUBIC) 53 | segm = cmap_Nyud[segm.argmax(axis=2) + 1].astype(np.uint8) 54 | depth = np.abs(depth) 55 | out_norm = norm / np.linalg.norm(norm, axis=2, keepdims=True) 56 | ## xzy->RGB ## 57 | out_norm[:, :, 0] = ((out_norm[:, :, 0] + 1.) / 2.) * 255. 58 | out_norm[:, :, 1] = ((out_norm[:, :, 1] + 1.) / 2.) * 255. 59 | out_norm[:, :, 2] = ((1. - out_norm[:, :, 2]) / 2.) * 255. 60 | out_norm = out_norm.astype(np.uint8) 61 | 62 | plt.figure(figsize=(18, 12)) 63 | plt.subplot(171) 64 | plt.imshow(img) 65 | plt.title('orig img') 66 | plt.axis('off') 67 | plt.subplot(172) 68 | plt.imshow(cmap_Nyud[gt_segm + 1]) 69 | plt.title('gt segm') 70 | plt.axis('off') 71 | plt.subplot(173) 72 | plt.imshow(segm) 73 | plt.title('pred segm') 74 | plt.axis('off') 75 | plt.subplot(174) 76 | plt.imshow(gt_depth / depth_Coeff, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 77 | plt.title('gt depth') 78 | plt.axis('off') 79 | plt.subplot(175) 80 | plt.imshow(depth, cmap='plasma', vmin=min_Depth, vmax=max_Depth) 81 | plt.title('pred depth') 82 | plt.axis('off') 83 | plt.subplot(176) 84 | plt.imshow(gt_norm) 85 | plt.title('gt norm') 86 | plt.axis('off') 87 | plt.subplot(177) 88 | plt.imshow(out_norm) 89 | plt.title('pred norm') 90 | plt.axis('off'); -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Spyder Editor 4 | 5 | This is a temporary script file. 
6 | """ 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import math 12 | 13 | #Creating a 3x3 convolution definition 14 | def conv3x3(in_channel, out_channel, stride = 1, bias = False, dilation = 1, groups = 1): 15 | "Creating a method for 2D convolution of a 3x3 kernel" 16 | #Inputs to the definition are: 17 | #Input channel size - Number of channels in the input image 18 | #Output channel size - Number of channels produced by the convolution 19 | #Stride = 1, Stride of the convolution 20 | #bias = False, Setting the bias to be learnable or not 21 | #Dilation = 1, Spacing between kernel elements during convolution 22 | #Groups = 1, Number of blocked connections from input channels to output channels 23 | return nn.Conv2d(in_channel, out_channel, kernel_size = 3, stride = stride, padding = dilation, dilation = dilation, bias = bias, groups = groups) 24 | 25 | #Creating a 1x1 convolution definition 26 | def conv1x1(in_channel, out_channel, stride = 1, bias = False, groups = 1): 27 | "Creating a method for 2D convolution of a 1x1 kernel" 28 | #Inputs to the definition are: 29 | #Input channel size - Number of channels in the input image 30 | #Output channel size - Number of channels produced by the convolution 31 | #Stride = 1, Stride of the convolution 32 | #Groups = 1, Number of blocked connections from input channels to output channels 33 | return nn.Conv2d(in_channel, out_channel, kernel_size = 1, stride = stride, padding = 0, bias = bias, groups = groups) 34 | 35 | #Creating a batch normalization definition 36 | def batch_norm(num_features): 37 | "Creating a method for 2D batch normalization" 38 | #Inputs to batchnorm2d: 39 | #Number of features - An expected input of size C 40 | #Eps - Denominator value in batch norm equation added for stability 41 | #Momentum - Value used to calculate running mean and running var computation 42 | #Affine - Boolean value that when set to true, the module has learnable affine parameters 43 | return nn.BatchNorm2d(num_features, eps = 1e-5, momentum = 0.1, affine = True) 44 | 45 | #Creating the conv-bn-relu sequence 46 | def con_bn_act(in_channel, out_channel, kernel_size, stride = 1, groups = 1, act = True): 47 | "Creating a method for the convolution, batch normalization, and activation using ReLU using the PyTorch nn.Sequential function" 48 | if act: 49 | return nn.Sequential(nn.Conv2d(in_channel, out_channel, kernel_size, stride = stride, padding = int(kernel_size / 2.), groups = groups, bias = False), batch_norm(out_channel), nn.ReLU6(inplace = True)) 50 | else: 51 | return nn.Sequential(nn.Conv2d(in_channel, out_channel, kernel_size, stride = stride, padding = int(kernel_size / 2.), groups = groups, bias = False), batch_norm(out_channel)) 52 | 53 | 54 | #Creating the Chained Residual Pooling class 55 | #This class is a child class of the PyTorch Parent nn.Module 56 | class chained_residual_pooling(nn.Module): 57 | "This is the Chained Residual Pooling class" 58 | #Constructor method __init__() is used to initialize class variables 59 | #Input channel size 60 | #Output channel size 61 | #Number of stages 62 | #Groups 63 | def __init__(self, in_channel, out_channel, n_stages, groups = False): 64 | #Using the super function we are able to call the __init__() method of the nn.Module parent 65 | #In this case super(chained_residual_pooling, self).__init__() = super().__init__() 66 | #This is becuase the first argument of super is the same as the class we are calling from within 67 | super(chained_residual_pooling, self).__init__() 68 | for i in 
range(n_stages): 69 | setattr(self, "{}_{}".format(i + 1, 'outvar_dimred'), conv1x1(in_channel if (i == 0) else out_channel, out_channel, stride = 1, bias=False, groups = in_channel if groups else 1)) 70 | 71 | #Initializing class variables for object instantiating using the self parameter 72 | #Using self will allow the current instance of the class to be linked the object calling the class 73 | self.stride = 1 #Setting the stride 74 | self.n_stages = n_stages #Setting the number of stages 75 | self.maxpool = nn.MaxPool2d(kernel_size = 5, stride = 1, padding = 2) #Defining maxpool as the PyTorch nn.MaxPool2d method 76 | 77 | def forward(self, x): 78 | top = x 79 | for i in range(self.n_stages): 80 | top = self.maxpool(top) 81 | top = getattr(self, '{}_{}'.format(i + 1, 'outvar_dimred'))(top) 82 | x = top + x 83 | return x 84 | 85 | #Creating the Inverted Residual block 86 | #This block was taken directly from the paper in the link with minor adjustments to variable names 87 | class Inverted_Residual_Block(nn.Module): 88 | """Inverted Residual Block from https://arxiv.org/abs/1801.04381""" 89 | def __init__(self, in_channel, out_channel, expansion_factor, stride = 1): 90 | super(Inverted_Residual_Block, self).__init__() 91 | intermed_channel = in_channel * expansion_factor 92 | self.residual = (in_channel == out_channel) and (stride == 1) 93 | self.output = nn.Sequential(con_bn_act(in_channel, intermed_channel, 1), 94 | con_bn_act(intermed_channel, intermed_channel, 3, stride = stride, groups = intermed_channel), 95 | con_bn_act(intermed_channel, out_channel, 1, act = False)) 96 | 97 | def forward(self, x): 98 | residual = x 99 | out = self.output(x) 100 | if self.residual: 101 | return (out + residual) 102 | else: 103 | return out 104 | 105 | 106 | #Creating the network Architecture 107 | class network_Arch(nn.Module): 108 | """"Real Time Semantic Segmenataion and Depth Estimation Neural Network Arch""" 109 | mobile_Net_Config = [[1, 16, 1, 1], 110 | [6, 24, 2, 2], 111 | [6, 32, 3, 2], 112 | [6, 64, 4, 2], 113 | [6, 96, 3, 1], 114 | [6, 160, 3, 2], 115 | [6, 320, 1, 1], 116 | ] 117 | 118 | in_channel = 32 119 | num_layers = len(mobile_Net_Config) 120 | def __init__(self, num_classes, num_tasks = 2): 121 | super(network_Arch, self).__init__() 122 | self.num_tasks = num_tasks 123 | assert self.num_tasks in [2, 3], "Number of tasks supported is either 2 or 3, got {}".format(self.num_tasks) 124 | 125 | self.layer1 = con_bn_act(3, self.in_channel, kernel_size=3, stride=2) 126 | c_layer = 2 127 | for t,c,n,s in (self.mobile_Net_Config): 128 | layers = [] 129 | for idx in range(n): 130 | layers.append(Inverted_Residual_Block(self.in_channel, c, expansion_factor = t, stride = s if idx == 0 else 1)) 131 | self.in_channel = c 132 | setattr(self, 'layer{}'.format(c_layer), nn.Sequential(*layers)) 133 | c_layer += 1 134 | 135 | #Creating the Leight-Weight Refine Network Architecture 136 | self.conv8 = conv1x1(320, 256, bias=False) #in_channel = 320, out_channel = 256 137 | self.conv7 = conv1x1(160, 256, bias=False) #in_channel = 160, out_channel = 256 138 | self.conv6 = conv1x1(96, 256, bias=False) #in_channel = 96, out_channel = 256 139 | self.conv5 = conv1x1(64, 256, bias=False) #in_channel = 64, out_channel = 256 140 | self.conv4 = conv1x1(32, 256, bias=False) #in_channel = 32, out_channel = 256 141 | self.conv3 = conv1x1(24, 256, bias=False) #in_channel = 24, out_channel = 256 142 | self.crp4 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 143 | 
self.crp3 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 144 | self.crp2 = self._make_crp(256, 256, 4, groups=False) #in_channel = 256, out_channel = 256, stages = 4 145 | self.crp1 = self._make_crp(256, 256, 4, groups=True) #in_channel = 256, out_channel = 256, stages = 4, groups = True 146 | 147 | self.conv_adapt4 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 148 | self.conv_adapt3 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 149 | self.conv_adapt2 = conv1x1(256, 256, bias=False) #in_channel = 256, out_channel = 256 150 | 151 | self.pre_depth = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256, groups = 256 152 | self.depth = conv3x3(256, 1, bias=True) #in_channel = 256, out_channel = 1, bias = True 153 | 154 | self.pre_segm = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256, groups = 256 155 | self.segm = conv3x3(256, num_classes, bias=True) #in_channel = 256, out_channel = num_classes 156 | self.relu = nn.ReLU6(inplace=True) #nn.ReLU6 is a call to the PyTorch method ReLU6 setting inplace = True 157 | 158 | if self.num_tasks == 3: 159 | self.pre_normal = conv1x1(256, 256, groups=256, bias=False) #in_channel = 256, out_channel = 256 160 | self.normal = conv3x3(256, 3, bias=True) #in_channel = 256, out_channel = 3, bias = True 161 | self._initialize_weights() #Using the object to link the initialized weights instantiated with the method 162 | 163 | def forward(self, x): 164 | x = self.layer1(x) 165 | x = self.layer2(x) # x / 2 166 | l3 = self.layer3(x) # 24, x / 4 167 | l4 = self.layer4(l3) # 32, x / 8 168 | l5 = self.layer5(l4) # 64, x / 16 169 | l6 = self.layer6(l5) # 96, x / 16 170 | l7 = self.layer7(l6) # 160, x / 32 171 | l8 = self.layer8(l7) # 320, x / 32 172 | l8 = self.conv8(l8) 173 | l7 = self.conv7(l7) 174 | l7 = self.relu(l8 + l7) 175 | l7 = self.crp4(l7) 176 | l7 = self.conv_adapt4(l7) 177 | l7 = nn.Upsample(size = l6.size()[2:], mode='bilinear', align_corners = False)(l7) 178 | 179 | l6 = self.conv6(l6) 180 | l5 = self.conv5(l5) 181 | l5 = self.relu(l5 + l6 + l7) 182 | l5 = self.crp3(l5) 183 | l5 = self.conv_adapt3(l5) 184 | l5 = nn.Upsample(size = l4.size()[2:], mode='bilinear', align_corners = False)(l5) 185 | 186 | l4 = self.conv4(l4) 187 | l4 = self.relu(l5 + l4) 188 | l4 = self.crp2(l4) 189 | l4 = self.conv_adapt2(l4) 190 | l4 = nn.Upsample(size = l3.size()[2:], mode='bilinear', align_corners = False)(l4) 191 | 192 | l3 = self.conv3(l3) 193 | l3 = self.relu(l3 + l4) 194 | l3 = self.crp1(l3) 195 | 196 | out_segm = self.pre_segm(l3) 197 | out_segm = self.relu(out_segm) 198 | out_segm = self.segm(out_segm) 199 | 200 | out_d = self.pre_depth(l3) 201 | out_d = self.relu(out_d) 202 | out_d = self.depth(out_d) 203 | 204 | if self.num_tasks == 3: 205 | out_n = self.pre_normal(l3) 206 | out_n = self.relu(out_n) 207 | out_n = self.normal(out_n) 208 | return out_segm, out_d, out_n 209 | else: 210 | return out_segm, out_d 211 | 212 | def _initialize_weights(self): 213 | for m in self.modules(): 214 | if isinstance(m, nn.Conv2d): 215 | m.weight.data.normal_(0, 0.01) 216 | if m.bias is not None: 217 | m.bias.data.zero_() 218 | elif isinstance(m, nn.BatchNorm2d): 219 | m.weight.data.fill_(1) 220 | m.bias.data.zero_() 221 | 222 | def _make_crp(self, in_channel, out_channel, stages, groups = False): 223 | layers = [chained_residual_pooling(in_channel, out_channel,stages, groups = groups)] 224 | return nn.Sequential(*layers) 225 | 226 | 
def network(num_classes, num_tasks): 227 | """Constructs the network by calling the network architecture class. This call will return the network model. 228 | Args: 229 | num_classes (int): the number of classes for the segmentation head to output. 230 | num_tasks (int): the number of tasks, either 2 - segm + depth, or 3 - segm + depth + surface normals 231 | """ 232 | model = network_Arch(num_classes, num_tasks) 233 | return model --------------------------------------------------------------------------------
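For completeness, here is a short usage sketch of the network() factory defined in model.py above. It is illustrative only and not a file in this repository; the 480 x 640 input size is an arbitrary example, and the evaluation scripts resize the raw outputs back to the input resolution with cv2.resize.

```
import torch
from model import network

# Illustrative usage sketch: build the two-task network and run a dummy forward pass.
model = network(num_classes=40, num_tasks=2)
model.eval()

with torch.no_grad():
    dummy = torch.randn(1, 3, 480, 640)  # one RGB image, arbitrary 480 x 640 size
    segm, depth = model(dummy)

# The task heads operate at roughly 1/4 of the input resolution.
print(segm.shape)   # expected: torch.Size([1, 40, 120, 160])
print(depth.shape)  # expected: torch.Size([1, 1, 120, 160])
```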