├── modules
    ├── gitignore
    ├── AbG
    │   └── requirements.txt
    ├── AbM
    │   └── requirements.txt
    ├── AfN
    │   └── requirements.txt
    ├── DfN
    │   └── requirements.txt
    └── Efficient-ResNet
    │   └── gitClone
├── tools
    ├── gitignore
    └── requirements.txt
├── data
    ├── ADE20K
    │   └── ADE20K_val.mlx
    ├── BDD100K
    │   └── bdd100k_val.mlx
    ├── CamVid
    │   ├── CamVid_NetworkMetrics.mat
    │   ├── camVidConfig.py
    │   └── readMe.md
    └── Cityscapes
    │   ├── cityscapesConfig.py
    │   ├── LICENSE
    │   └── readMe.md
├── models
    ├── segmentation
    │   ├── getStarted
    │   ├── importSegmentation.py
    │   └── readMe.md
    └── classification
    │   ├── getStarted
    │   ├── lstm.m
    │   ├── fcn.m
    │   ├── cnn.m
    │   ├── importClassification.py
    │   ├── lstm.py
    │   ├── fcn.py
    │   ├── cnn.py
    │   ├── cnn_d3_v2.m
    │   ├── cnn_d3_v2.py
    │   └── readMe.md
├── libs
    ├── requirements.txt
    └── readMe.md
├── LICENSE
├── ImageClassification.ipynb
├── README.md
├── requirements.txt
└── ImageSegmentation.ipynb


/modules/gitignore:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/tools/gitignore:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/data/ADE20K/ADE20K_val.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/ADE20K/ADE20K_val.mlx


--------------------------------------------------------------------------------
/data/BDD100K/bdd100k_val.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/BDD100K/bdd100k_val.mlx


--------------------------------------------------------------------------------
/data/CamVid/CamVid_NetworkMetrics.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/CamVid/CamVid_NetworkMetrics.mat


--------------------------------------------------------------------------------
/models/segmentation/getStarted:
--------------------------------------------------------------------------------
1 | #SERNet-Former uses versions of Efficient-ResNet as the baseline architectures in semantic segmentation tasks.
2 | 


--------------------------------------------------------------------------------
/models/classification/getStarted:
--------------------------------------------------------------------------------
1 | #SERNet-Former applies versions of Efficient-ResNet as the baseline architectures
2 | #for classification tasks.
3 | 


--------------------------------------------------------------------------------
/libs/requirements.txt:
--------------------------------------------------------------------------------
 1 | pytorch
 2 | torchvision
 3 | matplotlib
 4 | numpy
 5 | packaging
 6 | prettytable
 7 | scipy
 8 | codecov
 9 | flake8
10 | ftfy
11 | interrogate
12 | pytest
13 | regex
14 | yapf
15 | 


--------------------------------------------------------------------------------
/modules/AbG/requirements.txt:
--------------------------------------------------------------------------------
1 | Serdar Erişen, 2024.
2 | All rights are reserved!
3 | 
4 | Details are TBD
5 | 
6 | The code runs with MATLAB
7 | 
8 | Versions for different languages are being developed.
9 | 


--------------------------------------------------------------------------------
/tools/requirements.txt:
--------------------------------------------------------------------------------
 1 | pytorch
 2 | torchvision
 3 | matplotlib
 4 | numpy
 5 | packaging
 6 | prettytable
 7 | scipy
 8 | codecov
 9 | flake8
10 | ftfy
11 | interrogate
12 | pytest
13 | regex
14 | yapf
15 | 


--------------------------------------------------------------------------------
/models/segmentation/importSegmentation.py:
--------------------------------------------------------------------------------
1 | import torchvision.models as models
2 | 	deeplabv3_resnet50 = models.deeplabv3_resnet50(pretrained=True)
3 | 	deeplabv3_resnet101 = models.deeplabv3_resnet101(pretrained=True)


--------------------------------------------------------------------------------
/modules/AbM/requirements.txt:
--------------------------------------------------------------------------------
 1 | Serdar Erişen, 2024.
 2 | All rights are reserved!
 3 | 
 4 | Details are TBD
 5 | 
 6 | The code runs with MATLAB
 7 | 
 8 | Versions for different languages are being developed.
 9 | 
10 | 


--------------------------------------------------------------------------------
/modules/AfN/requirements.txt:
--------------------------------------------------------------------------------
 1 | Serdar Erişen, 2024.
 2 | All rights are reserved!
 3 | 
 4 | Details are TBD
 5 | 
 6 | The code runs with MATLAB
 7 | 
 8 | Versions for different languages are being developed.
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/modules/DfN/requirements.txt:
--------------------------------------------------------------------------------
 1 | Serdar Erişen, 2024.
 2 | All rights are reserved!
 3 | 
 4 | Details are TBD
 5 | 
 6 | The code runs with MATLAB
 7 | 
 8 | Versions for different languages are being developed.
 9 | 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/modules/Efficient-ResNet/gitClone:
--------------------------------------------------------------------------------
1 | #You can clone the repository of Efficient-ResNet https://github.com/serdarch/Efficient-ResNet.git 
2 | #into your environment.
3 | 
4 | git clone https://github.com/serdarch/Efficient-ResNet.git
5 | 


--------------------------------------------------------------------------------
/models/classification/lstm.m:
--------------------------------------------------------------------------------
1 | layers = [
2 |     sequenceInputLayer(numFeatures)
3 |     lstmLayer(hiddenSize, 'OutputMode', 'last')
4 |     fullyConnectedLayer(numClasses)
5 |     softmaxLayer
6 |     classificationLayer
7 | ];
8 | 


--------------------------------------------------------------------------------
/models/classification/fcn.m:
--------------------------------------------------------------------------------
 1 | hiddenSize = 100;
 2 | layers = [
 3 |     imageInputLayer([28 28 1])
 4 |     fullyConnectedLayer(hiddenSize)
 5 |     reluLayer
 6 |     fullyConnectedLayer(numClasses)
 7 |     softmaxLayer
 8 |     classificationLayer
 9 | ];
10 | 


--------------------------------------------------------------------------------
/models/classification/cnn.m:
--------------------------------------------------------------------------------
 1 | layers = [
 2 |     imageInputLayer([28 28 1])
 3 |     convolution2dLayer(5, 20)
 4 |     reluLayer
 5 |     maxPooling2dLayer(2, 'Stride', 2)
 6 |     fullyConnectedLayer(numClasses)
 7 |     softmaxLayer
 8 |     classificationLayer
 9 | ];
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | All rights reserved. 
 2 | 
 3 | Copyright (c) 2024 Serdar Erişen
 4 | 
 5 | The copyright holder reserves all the rights provided by the copyright law, 
 6 | such as distribution, performance, and the creation of derivative works.
 7 | 
 8 | This repository is being developed only to backup and augment the different language options 
 9 | of the SERNet-Former and increase the research capacities throughout 
10 | the open-source codes and datasets, 
11 | but not directly shares the original details of the network.
12 | 


--------------------------------------------------------------------------------
/models/classification/importClassification.py:
--------------------------------------------------------------------------------
 1 | import torchvision.models as models
 2 | 	resnet50 = models.resnet50(pretrained=True)
 3 | 	resnet101 = models.resnet101(pretrained=True)
 4 | 	resnext101 = models.resnext101_64X4D1(pretrained=True)
 5 | 	efficientnet_b6 = models.efficientnet_b6(pretrained=True)
 6 | 	regnet_y_128gg = models.regnet_Y_128GF_SWAG_E2E_V1(pretrained=True)
 7 | 	vit_b_16 = models.vit_b_16_SWAG_E2E_V16(pretrained=True)
 8 | 	vit_b_32 = models.vit_b_32(pretrained=True)
 9 | 	vit_l_16 = models.vit_l_16(pretrained=True)
10 | 


--------------------------------------------------------------------------------
/models/classification/lstm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class YourModel(nn.Module):
 5 |     def __init__(self, num_features, hidden_size, num_classes):
 6 |         super(YourModel, self).__init__()
 7 |         self.lstm = nn.LSTM(num_features, hidden_size, batch_first=True)
 8 |         self.fc = nn.Linear(hidden_size, num_classes)
 9 |         self.softmax = nn.Softmax(dim=1)  # Softmax across classes
10 | 
11 |     def forward(self, x):
12 |         _, (h_n, _) = self.lstm(x)
13 |         x = self.fc(h_n.squeeze(0))
14 |         x = self.softmax(x)
15 |         return x
16 | 
17 | # Example usage:
18 | # model = YourModel(num_features=numFeatures, hidden_size=hiddenSize, num_classes=numClasses)
19 | 


--------------------------------------------------------------------------------
/data/CamVid/camVidConfig.py:
--------------------------------------------------------------------------------
 1 | # @package training
 2 | # Those arguments defines the training hyper-parameters
 3 | epochs: 80
 4 | num_workers: 1
 5 | batch_size: 3
 6 | shuffle: True
 7 | cuda: 0
 8 | precompute_multi_scale: False # Compute multiscate features on cpu for faster training / inference
 9 | optim:
10 | 	base_lr: 0.001
11 | 	optimizer:
12 | 	class: SGD
13 | 	params:
14 | 	lr: ${training.optim.base_lr}
15 | 	lr_scheduler: ${lr_scheduler_v}
16 | 	bn_scheduler:
17 | 	bn_policy: "step_decay"
18 | 	params: 
19 | 	bn_momentum: 0.9
20 | 	bn_decay: 0.95
21 | 	decay_step: 10
22 | 	bn_clip: 1
23 | weight_name: "latest" # can be named/changed according to the shared model weights
24 | enable_cudnn: False
25 | checkpoint_dir: "..."
26 | 


--------------------------------------------------------------------------------
/data/Cityscapes/cityscapesConfig.py:
--------------------------------------------------------------------------------
 1 | # @package training
 2 | # Those arguments defines the training hyper-parameters
 3 | epochs: 80
 4 | num_workers: 1
 5 | batch_size: 1
 6 | shuffle: True
 7 | cuda: 0
 8 | precompute_multi_scale: False # Compute multiscate features on cpu for faster training / inference
 9 | optim:
10 | 	base_lr: 0.0005
11 | 	optimizer:
12 | 	class: SGD
13 | 	params:
14 | 	lr: ${training.optim.base_lr}
15 | 	lr_scheduler: ${lr_scheduler}
16 | 	bn_scheduler:
17 | 	bn_policy: "step_decay"
18 | 	params: 
19 | 	bn_momentum: 0.9
20 | 	bn_decay: 0.95
21 | 	decay_step: 10
22 | 	bn_clip: 1
23 | weight_name: "latest" # can be named/changed according to the shared model weights
24 | enable_cudnn: False
25 | checkpoint_dir: "..."
26 | 


--------------------------------------------------------------------------------
/models/classification/fcn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class YourModel(nn.Module):
 5 |     def __init__(self, hidden_size, num_classes):
 6 |         super(YourModel, self).__init__()
 7 |         self.fc1 = nn.Linear(28 * 28 * 1, hidden_size)
 8 |         self.relu = nn.ReLU()
 9 |         self.fc2 = nn.Linear(hidden_size, num_classes)
10 |         self.softmax = nn.Softmax(dim=1)  # Softmax across classes
11 | 
12 |     def forward(self, x):
13 |         x = x.view(-1, 28 * 28 * 1)  # Flatten the input images
14 |         x = self.fc1(x)
15 |         x = self.relu(x)
16 |         x = self.fc2(x)
17 |         x = self.softmax(x)
18 |         return x
19 | 
20 | # Example usage:
21 | # model = YourModel(hidden_size=100, num_classes=numClasses)
22 | 


--------------------------------------------------------------------------------
/models/classification/cnn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class YourModel(nn.Module):
 5 |     def __init__(self, num_classes):
 6 |         super(YourModel, self).__init__()
 7 |         self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
 8 |         self.relu = nn.ReLU()
 9 |         self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
10 |         self.fc = nn.Linear(20 * 12 * 12, num_classes)  # Assuming input image size is 28x28
11 | 
12 |     def forward(self, x):
13 |         x = self.conv1(x)
14 |         x = self.relu(x)
15 |         x = self.pool(x)
16 |         x = x.view(-1, 20 * 12 * 12)  # Flatten the tensor for fully connected layer
17 |         x = self.fc(x)
18 |         return x
19 | 
20 | # Example usage:
21 | # model = YourModel(num_classes=numClasses)
22 | 


--------------------------------------------------------------------------------
/models/segmentation/readMe.md:
--------------------------------------------------------------------------------
 1 | # Model Zoo
 2 | 
 3 | The models are the open-source segmentation baseline checkpoints 
 4 | 
 5 | <table><tbody>
 6 | <!-- START TABLE -->
 7 | <!-- TABLE HEADER -->
 8 | <th valign="bottom">Baseline</th>
 9 | <th valign="bottom">Download</th>
10 | <!-- TABLE BODY -->
11 | <!-- ROW: 1 -->
12 | <tr><td align="left">DeepLab_v3 ResNet-50</td>
13 | <td align="center"><a href="https://download.pytorch.org/models/deeplabv3_resnet50_coco-cd0a2569.pth">model</a></td>
14 | </tr>
15 | <!-- ROW: 2 -->
16 | <tr><td align="left">DeepLab_v3 ResNet-101</td>
17 | <td align="center"><a href="https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth">model</a></td>
18 | </tr>
19 | <!-- ROW: 3 -->
20 | <tr><td align="left">HR-Net W48</td>
21 | <td align="center"><a href="https://huggingface.co/spaces/serdarerisen/SERNet-Former/blob/main/cityscapes_trainval.pth">model</a></td>
22 | </tr>
23 | </tbody></table>
24 | 


--------------------------------------------------------------------------------
/data/Cityscapes/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Serdar Erişen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/models/classification/cnn_d3_v2.m:
--------------------------------------------------------------------------------
 1 | CNN_D3_layers = [
 2 |     imageInputLayer([1 6 1], "Name", "imageinput")
 3 |     convolution2dLayer([1 1], 4, "Name", "conv", "Padding", "same")
 4 |     batchNormalizationLayer("Name", "batchnorm")
 5 |     reluLayer("Name", "relu")
 6 |     globalMaxPooling2dLayer("Name", "gmpool")
 7 |     convolution2dLayer([1 1], 8, "Name", "conv_1", "Padding", "same")
 8 |     convolution2dLayer([1 1], 8, "Name", "conv_2", "Padding", "same")
 9 |     batchNormalizationLayer("Name", "batchnorm_1")
10 |     reluLayer("Name", "relu_1")
11 |     globalMaxPooling2dLayer("Name", "gmpool_1")
12 |     convolution2dLayer([1 1], 16, "Name", "conv_3", "Padding", "same")
13 |     convolution2dLayer([1 1], 16, "Name", "conv_4", "Padding", "same")
14 |     batchNormalizationLayer("Name", "batchnorm_2")
15 |     convolution2dLayer([1 1], 32, "Name", "conv_5", "Padding", "same")
16 |     convolution2dLayer([1 1], 32, "Name", "conv_6", "Padding", "same")
17 |     batchNormalizationLayer("Name", "batchnorm_3")
18 |     reluLayer("Name", "relu_2")
19 |     fullyConnectedLayer(4, "Name", "fc")
20 |     softmaxLayer("Name", "softmax")
21 |     classificationLayer("Name", "classoutput")
22 | ];
23 | 
24 | 


--------------------------------------------------------------------------------
/models/classification/cnn_d3_v2.py:
--------------------------------------------------------------------------------
 1 | # Python code for CNN_D3_v2 architecture
 2 | # Serdar Erisen, 2024
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 |  
 7 | class CNN_D3_v2(nn.Module):
 8 |      def __init__(self):
 9 |         super(CNN_D3_v2, self).__init__()
10 |         self.conv = nn.Conv2d(1, 4, kernel_size=1, padding='same')
11 |         self.batchnorm = nn.BatchNorm2d(4)
12 |         self.relu = nn.ReLU()
13 |         self.gmpool = nn.AdaptiveMaxPool2d((1, 1))
14 |          
15 |         self.conv_1 = nn.Conv2d(4, 8, kernel_size=1, padding='same')
16 |         self.conv_2 = nn.Conv2d(8, 8, kernel_size=1, padding='same')
17 |         self.batchnorm_1 = nn.BatchNorm2d(8)
18 |         self.relu_1 = nn.ReLU()
19 |         self.gmpool_1 = nn.AdaptiveMaxPool2d((1, 1))
20 |          
21 |         self.conv_3 = nn.Conv2d(8, 16, kernel_size=1, padding='same')
22 |         self.conv_4 = nn.Conv2d(16, 16, kernel_size=1, padding='same')
23 |         self.batchnorm_2 = nn.BatchNorm2d(16)
24 |          
25 |         self.conv_5 = nn.Conv2d(16, 32, kernel_size=1, padding='same')
26 |         self.conv_6 = nn.Conv2d(32, 32, kernel_size=1, padding='same')
27 |         self.batchnorm_3 = nn.BatchNorm2d(32)
28 |         self.relu_2 = nn.ReLU()
29 |          
30 |         self.fc = nn.Linear(32, 4)
31 |         self.softmax = nn.Softmax(dim=1)
32 |  
33 |      def forward(self, x):
34 |         x = self.conv(x)
35 |         x = self.batchnorm(x)
36 |         x = self.relu(x)
37 |         x = self.gmpool(x)
38 |          
39 |         x = self.conv_1(x)
40 |         x = self.conv_2(x)
41 |         x = self.batchnorm_1(x)
42 |         x = self.relu_1(x)
43 |         x = self.gmpool_1(x)
44 |         
45 |         x = self.conv_3(x)
46 |         x = self.conv_4(x)
47 |         x = self.batchnorm_2(x)
48 |         
49 |         x = self.conv_5(x)
50 |         x = self.conv_6(x)
51 |         x = self.batchnorm_3(x)
52 |         x = self.relu_2(x)
53 |         
54 |         x = torch.flatten(x, 1)
55 |         
56 |         x = self.fc(x)
57 |         x = self.softmax(x)
58 |         
59 |         return x
60 | 
61 | # Example usage:
62 | # model = CNN_D3_v2()
63 | # # Print model architecture
64 | # print(model)
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/data/Cityscapes/readMe.md:
--------------------------------------------------------------------------------
 1 | ⁰# Cityscapes dataset
 2 | 
 3 | Cityscapes is one of the most challenging datasets for the semantic segmentation of urban street scenes. 
 4 | 
 5 | It contains high-quality pixel-level annotations for 5000 images, as well as coarsely annotated 20000 images. 
 6 | 
 7 | The dataset contains diverse stereo video sequences with the sizes of 1024 by 2048 pixels, 
 8 | recorded during the daytime of 50 European cities visited in several months (spring, summer, and fall) 
 9 | with good or average weather conditions. 
10 | 
11 | The dataset of 5000 fine annotations is divided into three sets: 2975 for training, 500 for validation, and 1525 for testing. 
12 | 
13 | The dataset includes semantic, instance-wise, and dense pixel annotations of 30 classes grouped into eight categories. 
14 | 
15 | However, most literature uses annotations with 20 classes, 19 of which are semantic labels containing objects and stuff, 
16 | in addition to one additional void class for do-not-care regions.
17 | # Models
18 | <table><tbody>
19 | <!-- START TABLE -->
20 | <!-- TABLE HEADER -->
21 | <th valign="bottom">Model / Method</th>
22 | <th valign="bottom">Baseline</th>
23 | <th valign="bottom">mIoU</th>
24 | <!-- TABLE BODY -->
25 | <!-- ROW: 2 -->
26 | <tr><td align="left">SERNet-Former</td>
27 | <td align="left">ResNet-50</td>
28 | <td align="center">73.31</td>
29 | </tr>
30 | <!-- ROW: 3 -->
31 | <tr><td align="left">SERNet-Former</td>
32 | <td align="left">Efficient-ResNet_R101</td>
33 | <td align="center">77.04</td>
34 | <!-- ROW: 4 -->
35 | <tr><td align="left">SERNet-Former</td>
36 | <td align="left">Efficient-ResNet [final]</td>
37 | <td align="center">84.83</td>
38 | </tr>
39 | </tbody></table>
40 | 
41 | ## Please cite
42 | 
43 | ```bibtex
44 | @inproceedings{Cordts2016CVPR,
45 |   title={The cityscapes dataset for semantic urban scene understanding},
46 |   author={M. Cordts and M. Omran and S. Ramos and T. Rehfeld and M. Enzweiler and R. Benenson and U. Franke and S. Roth, and B. Schiele},
47 |   booktitle={CVPR},
48 |   year={2016},
49 | }
50 | 
51 | @article{Erisen2024SERNetFormer,
52 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
53 |   author={Erişen, Serdar},
54 |   journal={arXiv preprint arXiv:2401.15741},
55 |   year={2024}
56 | }
57 | ```
58 | 


--------------------------------------------------------------------------------
/models/classification/readMe.md:
--------------------------------------------------------------------------------
 1 | # Model Zoo
 2 | 
 3 | The models are the open-source checkpoints pretrained on ImageNet dataset 
 4 | 
 5 | <table><tbody>
 6 | <!-- START TABLE -->
 7 | <!-- TABLE HEADER -->
 8 | <th valign="bottom">Baseline</th>
 9 | <th valign="bottom">Download</th>
10 | <!-- TABLE BODY -->
11 | <!-- ROW: 1 -->
12 | <tr><td align="left">ResNet-50</td>
13 | <td align="center"><a href="https://download.pytorch.org/models/resnet50-11ad3fa6.pth">model</a></td>
14 | </tr>
15 | <!-- ROW: 2 -->
16 | <tr><td align="left">ResNet-101</td>
17 | <td align="center"><a href="https://download.pytorch.org/models/resnet101-cd907fc2.pth">model</a></td>
18 | </tr>
19 | <!-- ROW: 3 -->
20 | <tr><td align="left">Swin_V2_S</td>
21 | <td align="center"><a href=""https://download.pytorch.org/models/swin_v2_s-637d8ceb.pth"">model</a></td>
22 | </tr>
23 | <!-- ROW: 4 -->
24 | <tr><td align="left">Swin_V2_B</td>
25 | <td align="center"><a href=""https://download.pytorch.org/models/swin_v2_b-781e5279.pth"">model</a></td>
26 | </tr>
27 | <!-- ROW: 5 -->
28 | <tr><td align="left">ViT_B_16_SWAG_E2E_V1</td>
29 | <td align="center"><a href="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth">model</a></td>
30 | </tr>
31 |   <!-- ROW: 6 -->
32 | <tr><td align="left">ViT_H_14_SWAG_E2E_V1</td>
33 | <td align="center"><a href="https://download.pytorch.org/models/vit_h_14_swag-80465313.pth">model</a></td>
34 | </tr>
35 | <!-- ROW: 7 -->
36 | <tr><td align="left">EfficientNet_B6</td>
37 | <td align="center"><a href="https://download.pytorch.org/models/efficientnet_b6_lukemelas-24a108a5.pth">model</a></td>
38 | </tr>  
39 |  <!-- ROW: 8 -->
40 |   <tr><td align="left">EfficientNet_V2_L</td>
41 | <td align="center"><a href="https://download.pytorch.org/models/efficientnet_v2_l-59c71312.pth">model</a></td>
42 | </tr>
43 |   <!-- ROW: 9 -->
44 | <tr><td align="left">RegNet_Y_128GF_SWAG_E2E_V1</td>
45 | <td align="center"><a href="https://download.pytorch.org/models/regnet_y_128gf_swag-c8ce3e52.pth">model</a></td>
46 | </tr>
47 | <!-- ROW: 10 -->
48 |   <tr><td align="left">ResNeXt101_64X4D</td>
49 | <td align="center"><a href="https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth">model</a></td>
50 | </tr>
51 | <!-- ROW: 11 -->
52 |   <tr><td align="left">CNN-D3</td>
53 | <td align="center"><a href="https://github.com/serdarch/CNN-D3/blob/main/model/CNN_D3.mat">model</a></td>
54 | </tr>
55 | </tbody></table>
56 | 


--------------------------------------------------------------------------------
/libs/readMe.md:
--------------------------------------------------------------------------------
 1 | # Model Zoo
 2 | 
 3 | ## Classification Networks 
 4 | The models are the open-source checkpoints pretrained on ImageNet dataset 
 5 | 
 6 | <table><tbody>
 7 | <!-- START TABLE -->
 8 | <!-- TABLE HEADER -->
 9 | <th valign="bottom">Baseline</th>
10 | <th valign="bottom">Download</th>
11 | <!-- TABLE BODY -->
12 | <!-- ROW: 1 -->
13 | <tr><td align="left">ResNet-50</td>
14 | <td align="center"><a href="https://download.pytorch.org/models/resnet50-11ad3fa6.pth">model</a></td>
15 | </tr>
16 | <!-- ROW: 2 -->
17 | <tr><td align="left">ResNet-101</td>
18 | <td align="center"><a href="https://download.pytorch.org/models/resnet101-cd907fc2.pth">model</a></td>
19 | </tr>
20 | <!-- ROW: 3 -->
21 | <tr><td align="left">Swin_V2_S</td>
22 | <td align="center"><a href=""https://download.pytorch.org/models/swin_v2_s-637d8ceb.pth"">model</a></td>
23 | </tr>
24 | <!-- ROW: 4 -->
25 | <tr><td align="left">Swin_V2_B</td>
26 | <td align="center"><a href=""https://download.pytorch.org/models/swin_v2_b-781e5279.pth"">model</a></td>
27 | </tr>
28 | <!-- ROW: 5 -->
29 | <tr><td align="left">ViT_B_16_SWAG_E2E_V1</td>
30 | <td align="center"><a href="https://download.pytorch.org/models/vit_b_16_swag-9ac1b537.pth">model</a></td>
31 | </tr>
32 |   <!-- ROW: 6 -->
33 | <tr><td align="left">ViT_H_14_SWAG_E2E_V1</td>
34 | <td align="center"><a href="https://download.pytorch.org/models/vit_h_14_swag-80465313.pth">model</a></td>
35 | </tr>
36 | <!-- ROW: 7 -->
37 | <tr><td align="left">EfficientNet_B6</td>
38 | <td align="center"><a href="https://download.pytorch.org/models/efficientnet_b6_lukemelas-24a108a5.pth">model</a></td>
39 | </tr>  
40 |  <!-- ROW: 8 -->
41 |   <tr><td align="left">EfficientNet_V2_L</td>
42 | <td align="center"><a href="https://download.pytorch.org/models/efficientnet_v2_l-59c71312.pth">model</a></td>
43 | </tr>
44 |   <!-- ROW: 9 -->
45 | <tr><td align="left">RegNet_Y_128GF_SWAG_E2E_V1</td>
46 | <td align="center"><a href="https://download.pytorch.org/models/regnet_y_128gf_swag-c8ce3e52.pth">model</a></td>
47 | </tr>
48 | <!-- ROW: 10 -->
49 |   <tr><td align="left">ResNeXt101_64X4D</td>
50 | <td align="center"><a href="https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth">model</a></td>
51 | </tr>
52 | </tbody></table>
53 | 
54 | ## Segmentation Networks
55 | The models are the open-source segmentation baseline checkpoints pretrained on COCO dataset 
56 | <table><tbody>
57 | <!-- START TABLE -->
58 | <!-- TABLE HEADER -->
59 | <th valign="bottom">Baseline</th>
60 | <th valign="bottom">Download</th>
61 | <!-- TABLE BODY -->
62 | <!-- ROW: 1 -->
63 | <tr><td align="left">DeepLab_v3 ResNet-50</td>
64 | <td align="center"><a href="https://download.pytorch.org/models/deeplabv3_resnet50_coco-cd0a2569.pth">model</a></td>
65 | </tr>
66 | <!-- ROW: 2 -->
67 | <tr><td align="left">DeepLab_v3 ResNet-101</td>
68 | <td align="center"><a href="https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth">model</a></td>
69 | </tr>
70 | </tbody></table>
71 | 
72 |  
73 | </tbody></table>
74 | 


--------------------------------------------------------------------------------
/data/CamVid/readMe.md:
--------------------------------------------------------------------------------
 1 | # CamVid Dataset
 2 | 
 3 | The Cambridge-driving Labelled Video Database (CamVid) is one of the first scene understanding databases, 
 4 | and it is based on the motion-based video collections of driving scenes recorded for semantic segmentation of object classes. 
 5 | 
 6 | This database contains 701 frames with sizes of 720 by 960 pixels that were captured in five video sequences, 
 7 | shot via the fixed-position CCTV-style cameras mounted on a car. The densely annotated images were manually 
 8 | generated through 32 classes and merged into 11 classes later. 
 9 | 
10 | The original dataset is divided into 367 training, 101 validation, and 233 test images, as most literature practiced. 
11 | 
12 | ## Model metrics
13 | 
14 | <table><tbody>
15 | <!-- START TABLE -->
16 | <!-- TABLE HEADER -->
17 | <th valign="bottom">Model</th>
18 | <th valign="bottom">Metrics File</th>
19 | <!-- TABLE BODY -->
20 | <!-- ROW: 1 -->
21 | <tr><td align="left">SERNet-Former [checkpoint]</td>
22 | <td align="center"><a href="https://huggingface.co/spaces/serdarerisen/SERNet-Former/blob/main/CamVid_NetworkMetrics_Checkpoint.mat">download</a></td>
23 | </tr>
24 | <!-- ROW: 2 -->
25 | <tr><td align="left">SERNet-Former [final]</td>
26 | <td align="center"><a href="https://huggingface.co/spaces/serdarerisen/SERNet-Former/blob/main/CamVid_NetworkMetrics_Final.mat">download</a></td>
27 | </tr>
28 | </tbody></table>
29 | 
30 | ## Ablation works
31 | 
32 | <table><tbody>
33 | <!-- START TABLE -->
34 | <!-- TABLE HEADER -->
35 | <th valign="bottom">AbM</th>
36 | <th valign="bottom">DbN</th>
37 | <th valign="bottom">AfN1</th>
38 | <th valign="bottom">AfN2</th>
39 | <th valign="bottom">mIoU %</th>
40 | <!-- TABLE BODY -->
41 | <!-- ROW: 1 -->
42 | <tr><td align="left">N</td>
43 | <td align="left">Y</td>
44 | <td align="left">Y</td>
45 | <td align="left">Y</td>
46 | <td align="left">81.22</td>
47 | </tr>
48 | <!-- ROW: 2 -->
49 | <tr><td align="left">Y</td>
50 | <td align="left">N</td>
51 | <td align="left">Y</td>
52 | <td align="left">Y</td>
53 | <td align="left">80.56</td>
54 | </tr>
55 | <!-- ROW: 3 -->
56 | <tr><td align="left">Y</td>
57 | <td align="left">Y</td>
58 | <td align="left">N</td>
59 | <td align="left">Y</td>
60 | <td align="left">78.99</td>
61 | </tr>
62 | <!-- ROW: 4 -->
63 | <tr><td align="left">Y</td>
64 | <td align="left">Y</td>
65 | <td align="left">Y</td>
66 | <td align="left">N</td>
67 | <td align="left">75.37</td>
68 | </tr>
69 | <!-- ROW: 5 -->
70 | <tr><td align="left">Y</td>
71 | <td align="left">Y</td>
72 | <td align="left">Y</td>
73 | <td align="left">Y</td>
74 | <td align="left">82.88</td>
75 | </tr>
76 | </tbody></table>
77 | 
78 | 
79 | ## Please cite
80 | 
81 | ```bibtex
82 | @article{Brostow2019,
83 |   title={Semantic object classes in video: A high-definition ground truth database},
84 |   author={G. J. Brostow and J. Fauqueur and R. Cipolla},
85 |   journal={Pattern Recognition Letters},
86 |   volume=90
87 |   pages=119-133
88 |   year={2019}
89 | }
90 | 
91 | @article{Erisen2024SERNetFormer,
92 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
93 |   author={Erişen, Serdar},
94 |   journal={arXiv preprint arXiv:2401.15741},
95 |   year={2024}
96 | }
97 | ```
98 | 


--------------------------------------------------------------------------------
/ImageClassification.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "authorship_tag": "ABX9TyN57eNgsltG+a1X/fNILjb8",
  8 |       "include_colab_link": true
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "markdown",
 21 |       "metadata": {
 22 |         "id": "view-in-github",
 23 |         "colab_type": "text"
 24 |       },
 25 |       "source": [
 26 |         "<a href=\"https://colab.research.google.com/github/serdarch/SERNet-Former/blob/master/ImageClassification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 27 |       ]
 28 |     },
 29 |     {
 30 |       "cell_type": "markdown",
 31 |       "source": [
 32 |         "**Image classification**\n",
 33 |         "\n",
 34 |         "Image classification tutorial based on the pretrained PyTorch baselines.\n",
 35 |         "Used model: ViT_h_14 with Weights IMAGENET1K_SWAG_E2E_V1\n",
 36 |         "\n",
 37 |         "Please save a copy of this tutorial into your own environment/drive folder."
 38 |       ],
 39 |       "metadata": {
 40 |         "id": "7kG9ZBiN238_"
 41 |       }
 42 |     },
 43 |     {
 44 |       "cell_type": "code",
 45 |       "source": [
 46 |         "from torchvision.io import read_image"
 47 |       ],
 48 |       "metadata": {
 49 |         "id": "UB3maNwt0roV"
 50 |       },
 51 |       "execution_count": 2,
 52 |       "outputs": []
 53 |     },
 54 |     {
 55 |       "cell_type": "code",
 56 |       "source": [
 57 |         "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg"
 58 |       ],
 59 |       "metadata": {
 60 |         "id": "5_vXR0dT0r9L"
 61 |       },
 62 |       "execution_count": 1,
 63 |       "outputs": []
 64 |     },
 65 |     {
 66 |       "cell_type": "code",
 67 |       "source": [
 68 |         "im1=read_image(\"input.jpg\")"
 69 |       ],
 70 |       "metadata": {
 71 |         "id": "-hGZui_z0w97"
 72 |       },
 73 |       "execution_count": 3,
 74 |       "outputs": []
 75 |     },
 76 |     {
 77 |       "cell_type": "code",
 78 |       "execution_count": 4,
 79 |       "metadata": {
 80 |         "colab": {
 81 |           "base_uri": "https://localhost:8080/"
 82 |         },
 83 |         "id": "th3lBuOHzu6e",
 84 |         "outputId": "229ec5c7-86b0-4eb4-81fd-cf126d1d9c36"
 85 |       },
 86 |       "outputs": [
 87 |         {
 88 |           "output_type": "stream",
 89 |           "name": "stderr",
 90 |           "text": [
 91 |             "Downloading: \"https://download.pytorch.org/models/vit_h_14_swag-80465313.pth\" to /root/.cache/torch/hub/checkpoints/vit_h_14_swag-80465313.pth\n",
 92 |             "100%|██████████| 2.36G/2.36G [01:57<00:00, 21.6MB/s]\n"
 93 |           ]
 94 |         },
 95 |         {
 96 |           "output_type": "stream",
 97 |           "name": "stdout",
 98 |           "text": [
 99 |             "airliner: 91.4%\n"
100 |           ]
101 |         }
102 |       ],
103 |       "source": [
104 |         "from torchvision.models import vit_h_14, ViT_H_14_Weights\n",
105 |         "\n",
106 |         "# Step 1: Initialize model with the best available weights\n",
107 |         "weights = ViT_H_14_Weights.DEFAULT\n",
108 |         "model = vit_h_14(weights='DEFAULT')\n",
109 |         "model.eval()\n",
110 |         "\n",
111 |         "# Step 2: Initialize the inference transforms\n",
112 |         "preprocess = weights.transforms()\n",
113 |         "\n",
114 |         "# Step 3: Apply inference preprocessing transforms\n",
115 |         "batch = preprocess(im1).unsqueeze(0)\n",
116 |         "\n",
117 |         "# Step 4: Use the model and print the predicted category\n",
118 |         "prediction = model(batch).squeeze(0).softmax(0)\n",
119 |         "class_id = prediction.argmax().item()\n",
120 |         "score = prediction[class_id].item()\n",
121 |         "category_name = weights.meta[\"categories\"][class_id]\n",
122 |         "print(f\"{category_name}: {100 * score:.1f}%\") #airliner: ~91.4%"
123 |       ]
124 |     }
125 |   ]
126 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SERNet-Former
  2 | <div align="center">
  3 | 
  4 | [![[CVPR 2024 Workshops] YouTube Video](https://img.shields.io/badge/CVPRW'24-YouTube-blue)](https://youtu.be/XXzMkotcdb4?feature=shared)
  5 | [![CVPR 2024 Workshop](https://img.shields.io/badge/CVPR'24-Workshop-yellow)](https://equivision.github.io/index.html#papers)
  6 | [![ArXiv paper](https://img.shields.io/badge/SERNetFormer-ArXiv-red)](https://doi.org/10.48550/arXiv.2401.15741)
  7 | [![CVMI 2024](https://img.shields.io/badge/CVMI-2024-blue)](https://cvmi2024.iiita.ac.in/AcceptedPapers.php)
  8 | 
  9 | </div>
 10 | 
 11 | [CVPR 2024 Workshops] SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks
 12 | 
 13 | ## Tutorials
 14 | Various implementations of SERNet-Former with different baselines for Multi-tasking (without our additional methods) is now online. 
 15 | 
 16 | The example deploys ViT_h_14 baseline with 'Weights' 'IMAGENET1K_SWAG_E2E_V1' and simple U-Net decoder architecture.
 17 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/185TZK796f425vsduhpm9NcCrtMS4rIkb#scrollTo=q3e5V2NephbJ&forceEdit=true&sandboxMode=true)
 18 | 
 19 | 
 20 | Please also see the tutorials for 
 21 | 
 22 | Image Segmentation based on DeepLabV3+_ResNet101 baseline 
 23 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sMRRcUsFaUwHSvIch9Koqxb4ogtgfVFs#scrollTo=-Gei9D03SWmM&forceEdit=true&sandboxMode=true)
 24 | 
 25 | 
 26 | Image Classification based on ViT_h_14 baseline 
 27 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Nj82jyovcQcuZotx-pRSBzd1uEXCbOp4#scrollTo=7kG9ZBiN238)
 28 | 
 29 | 
 30 | ## News
 31 | - `16 May 2024`   [CVPR 2024 Workshops] The article "SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks" is now accepted to CVPR 2024 Workshops. Equivariant Vision: From Theory to Practice
 32 | - `January 2024`   SERNet-Former set state-of-the-art result on Cityscapes validation dataset for pixel-level segmentation: 87.35 % mIoU
 33 | - `January 2024`   SERNet-Former set state-of-the-art result on CamVid dataset: 84.62 % mIoU
 34 | - `January 2024`   SERNet-Former ranked as the seventh on Cityscapes test dataset for pixel-level segmentation according to PapersWithCode.com: 84.83 % mIoU
 35 | 
 36 | 
 37 | ## GitHub Badges
 38 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-ade20k-val)](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k-val?p=sernet-former-semantic-segmentation-by)
 39 | 
 40 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-bdd100k-val)](https://paperswithcode.com/sota/semantic-segmentation-on-bdd100k-val?p=sernet-former-semantic-segmentation-by)
 41 | 
 42 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-camvid)](https://paperswithcode.com/sota/semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by)
 43 | 
 44 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/2d-semantic-segmentation-on-camvid)](https://paperswithcode.com/sota/2d-semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by)
 45 | 
 46 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by)
 47 | 
 48 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/2d-semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/2d-semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by)
 49 | 
 50 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-cityscapes)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes?p=sernet-former-semantic-segmentation-by)
 51 | 
 52 | ## SERNet-Former Conceptual 
 53 | 
 54 | [![Efficient-ResNet](https://img.shields.io/badge/github-EfficientResNet-black)](https://github.com/serdarch/Efficient-ResNet)
 55 | 
 56 | ![Figure1](https://github.com/serdarch/SERNet-Former/assets/61043858/084416d7-f982-4f46-b1bf-871aed81557b)
 57 | 
 58 | (a) Attention-boosting Gate (AbG) and Attention-boosting Module (AbM) are fused into the encoder part. 
 59 | 
 60 | (b) Attention-fusion Network (AfN), introduced into the decoder
 61 | 
 62 | ## Experiment Results
 63 | 
 64 | ### CamVid Dataset 
 65 | 
 66 | The breakdown of class accuracies on CamVid dataset
 67 | 
 68 | <table><tbody>
 69 | <!-- TABLE HEADER -->
 70 | <th valign="bottom">Model</th>
 71 | <th valign="bottom">Baseline Architecture</th>  
 72 | <th valign="bottom">Building</th>
 73 | <th valign="bottom">Tree</th>
 74 | <th valign="bottom">Sky</th>
 75 | <th valign="bottom">Car</th>
 76 | <th valign="bottom">Sign</th>
 77 | <th valign="bottom">Road</th>
 78 | <th valign="bottom">Pedestrian</th>
 79 | <th valign="bottom">Fence</th>
 80 | <th valign="bottom">Pole</th>
 81 | <th valign="bottom">Sidewalk</th>
 82 | <th valign="bottom">Bicycle</th>
 83 | <th valign="bottom">mIoU</th>
 84 | <!-- TABLE BODY -->
 85 | <!-- ROW: 1 -->
 86 | <tr>
 87 | <td align="center">SERNet-Former</td>
 88 | <td align="center">Efficient-ResNet</td>
 89 | <td align="center">93.0</td>
 90 | <td align="center">88.8</td>
 91 | <td align="center">95.1</td>
 92 | <td align="center">91.9</td>
 93 | <td align="center">73.9</td>
 94 | <td align="center">97.7</td>
 95 | <td align="center">76.4</td>
 96 | <td align="center">83.4</td>
 97 | <td align="center">57.3</td>
 98 | <td align="center">90.3</td>
 99 | <td align="center">83.1</td>
100 | <td align="center">84.62</td>
101 | </tr>
102 | </tbody></table>
103 | 
104 | 
105 | The experiment outcomes on CamVid dataset
106 | 
107 | ![camvid_output](https://github.com/serdarch/SERNet-Former/assets/61043858/f11f44a6-b245-43f1-b323-2f107f0b330e)
108 | 
109 | ### Cityscapes
110 | 
111 | <table><tbody>
112 | <!-- START TABLE -->
113 | <!-- TABLE HEADER -->
114 | <th valign="bottom">Model</th>
115 | <th valign="bottom">Baseline Architecture</th>
116 | <th valign="bottom">road</th>
117 | <th valign="bottom">sidewalk</th>
118 | <th valign="bottom">building</th>
119 | <th valign="bottom">wall</th>
120 | <th valign="bottom">fence</th>
121 | <th valign="bottom">pole</th>
122 | <th valign="bottom">traffic light</th>
123 | <th valign="bottom">traffic sign</th>
124 | <th valign="bottom">vegetation</th>
125 | <th valign="bottom">terrain</th>
126 | <th valign="bottom">sky</th>
127 | <th valign="bottom">person</th>
128 | <th valign="bottom">rider</th>
129 | <th valign="bottom">car</th>
130 | <th valign="bottom">truck</th>
131 | <th valign="bottom">bus</th>
132 | <th valign="bottom">train</th>
133 | <th valign="bottom">motorcycle</th>
134 | <th valign="bottom">bicycle</th>
135 | <th valign="bottom">mIoU</th>
136 | <!-- TABLE BODY -->
137 | <!-- ROW: 1 -->
138 | <tr>
139 | <td align="center">SERNet-Former</td>
140 | <td align="center">Efficient-ResNet</td>
141 | <td align="center">98.2</td>
142 | <td align="center">90.2</td>
143 | <td align="center">94.0</td>
144 | <td align="center">67.6</td>
145 | <td align="center">68.2</td>
146 | <td align="center">73.6</td>
147 | <td align="center">78.2</td>
148 | <td align="center">82.1</td>
149 | <td align="center">94.6</td>
150 | <td align="center">75.9</td>
151 | <td align="center">96.9</td>
152 | <td align="center">90.0</td>
153 | <td align="center">77.7</td>
154 | <td align="center">96.9</td>
155 | <td align="center">86.1</td>
156 | <td align="center">93.9</td>
157 | <td align="center">91.7</td>
158 | <td align="center">70.0</td>
159 | <td align="center">82.9</td>
160 | <td align="center">84.83</td>
161 | </tr>
162 | </tbody></table>
163 | 
164 | The experiment outcomes on Cityscapes dataset
165 | 
166 | ![cityscapes_output](https://github.com/serdarch/SERNet-Former/assets/61043858/9a613193-6761-422c-bb7c-d2a3499548c5)
167 | 
168 | ## Installation Support
169 | 
170 | You can simply download this repository into your environment by running
171 | ```bash
172 | git clone https://github.com/serdarch/SERNet-Former.git
173 | ```
174 | 
175 | ## Citations
176 | 
177 | ```bibtex
178 | @article{Erisen2024SERNetFormer,
179 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
180 |   author={Erişen, Serdar},
181 |   journal={arXiv preprint arXiv:2401.15741},
182 |   year={2024}
183 | }
184 | 
185 | @inproceedings{Erisen2024CVPRW,
186 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
187 |   author={Erişen, Serdar},
188 |   booktitle={CVPRW},
189 |   year={2024},
190 | }
191 | ```
192 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | absl-py==1.4.0
  2 | accelerate==0.34.2
  3 | addict==2.4.0
  4 | aiohappyeyeballs==2.4.0
  5 | aiohttp==3.10.5
  6 | aiosignal==1.3.1
  7 | alabaster==0.7.16
  8 | albucore==0.0.16
  9 | albumentations==1.4.15
 10 | aliyun-python-sdk-core==2.15.2
 11 | aliyun-python-sdk-kms==2.16.5
 12 | altair==4.2.2
 13 | annotated-types==0.7.0
 14 | anyio==3.7.1
 15 | argon2-cffi==23.1.0
 16 | argon2-cffi-bindings==21.2.0
 17 | array_record==0.5.1
 18 | arviz==0.19.0
 19 | astropy==6.1.3
 20 | astropy-iers-data==0.2024.9.16.0.32.21
 21 | astunparse==1.6.3
 22 | async-timeout==4.0.3
 23 | atpublic==4.1.0
 24 | attrs==24.2.0
 25 | audioread==3.0.1
 26 | autograd==1.7.0
 27 | babel==2.16.0
 28 | backcall==0.2.0
 29 | beautifulsoup4==4.12.3
 30 | bigframes==1.18.0
 31 | bigquery-magics==0.2.0
 32 | bleach==6.1.0
 33 | blinker==1.4
 34 | blis==0.7.11
 35 | blosc2==2.0.0
 36 | bokeh==3.4.3
 37 | bqplot==0.12.43
 38 | branca==0.7.2
 39 | build==1.2.2
 40 | CacheControl==0.14.0
 41 | cachetools==5.5.0
 42 | catalogue==2.0.10
 43 | certifi==2024.8.30
 44 | cffi==1.17.1
 45 | chardet==5.2.0
 46 | charset-normalizer==3.3.2
 47 | chex==0.1.86
 48 | clarabel==0.9.0
 49 | click==8.1.7
 50 | cloudpathlib==0.19.0
 51 | cloudpickle==2.2.1
 52 | cmake==3.30.3
 53 | cmdstanpy==1.2.4
 54 | colorama==0.4.6
 55 | colorcet==3.1.0
 56 | colorlover==0.3.0
 57 | colour==0.1.5
 58 | community==1.0.0b1
 59 | confection==0.1.5
 60 | cons==0.4.6
 61 | contextlib2==21.6.0
 62 | contourpy==1.3.0
 63 | crcmod==1.7
 64 | cryptography==43.0.1
 65 | cuda-python==12.2.1
 66 | cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.4.1-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=57366e7ef09dc63e0b389aff20df6c37d91e2790065861ee31a4720149f5b694
 67 | cufflinks==0.17.3
 68 | cupy-cuda12x==12.2.0
 69 | cvxopt==1.3.2
 70 | cvxpy==1.5.3
 71 | cycler==0.12.1
 72 | cymem==2.0.8
 73 | Cython==3.0.11
 74 | dask==2024.8.0
 75 | datascience==0.17.6
 76 | db-dtypes==1.3.0
 77 | dbus-python==1.2.18
 78 | debugpy==1.6.6
 79 | decorator==4.4.2
 80 | defusedxml==0.7.1
 81 | distributed==2024.8.0
 82 | distro==1.7.0
 83 | dlib==19.24.2
 84 | dm-tree==0.1.8
 85 | docstring_parser==0.16
 86 | docutils==0.18.1
 87 | dopamine_rl==4.0.9
 88 | duckdb==1.1.0
 89 | earthengine-api==1.0.0
 90 | easydict==1.13
 91 | ecos==2.0.14
 92 | editdistance==0.8.1
 93 | eerepr==0.0.4
 94 | einops==0.8.0
 95 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
 96 | entrypoints==0.4
 97 | et-xmlfile==1.1.0
 98 | etils==1.9.4
 99 | etuples==0.3.9
100 | eval_type_backport==0.2.0
101 | exceptiongroup==1.2.2
102 | fastai==2.7.17
103 | fastcore==1.7.8
104 | fastdownload==0.0.7
105 | fastjsonschema==2.20.0
106 | fastprogress==1.0.3
107 | fastrlock==0.8.2
108 | filelock==3.14.0
109 | firebase-admin==6.5.0
110 | Flask==2.2.5
111 | flatbuffers==24.3.25
112 | flax==0.8.5
113 | folium==0.17.0
114 | fonttools==4.53.1
115 | frozendict==2.4.4
116 | frozenlist==1.4.1
117 | fsspec==2024.6.1
118 | ftfy==6.2.3
119 | future==1.0.0
120 | gast==0.6.0
121 | gcsfs==2024.6.1
122 | GDAL==3.6.4
123 | gdown==5.2.0
124 | geemap==0.34.3
125 | gensim==4.3.3
126 | geocoder==1.38.1
127 | geographiclib==2.0
128 | geopandas==1.0.1
129 | geopy==2.4.1
130 | gin-config==0.5.0
131 | glob2==0.7
132 | google==2.0.3
133 | google-ai-generativelanguage==0.6.6
134 | google-api-core==2.19.2
135 | google-api-python-client==2.137.0
136 | google-auth==2.27.0
137 | google-auth-httplib2==0.2.0
138 | google-auth-oauthlib==1.2.1
139 | google-cloud-aiplatform==1.67.1
140 | google-cloud-bigquery==3.25.0
141 | google-cloud-bigquery-connection==1.15.5
142 | google-cloud-bigquery-storage==2.26.0
143 | google-cloud-bigtable==2.26.0
144 | google-cloud-core==2.4.1
145 | google-cloud-datastore==2.19.0
146 | google-cloud-firestore==2.16.1
147 | google-cloud-functions==1.16.5
148 | google-cloud-iam==2.15.2
149 | google-cloud-language==2.13.4
150 | google-cloud-pubsub==2.23.1
151 | google-cloud-resource-manager==1.12.5
152 | google-cloud-storage==2.8.0
153 | google-cloud-translate==3.15.5
154 | google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz#sha256=07bb3e866a2fb3dc3072920a4722b4a4c9c2fc953a97253597f3e5391c3dd17c
155 | google-crc32c==1.6.0
156 | google-generativeai==0.7.2
157 | google-pasta==0.2.0
158 | google-resumable-media==2.7.2
159 | googleapis-common-protos==1.65.0
160 | googledrivedownloader==0.4
161 | graphviz==0.20.3
162 | greenlet==3.1.1
163 | grpc-google-iam-v1==0.13.1
164 | grpcio==1.64.1
165 | grpcio-status==1.48.2
166 | gspread==6.0.2
167 | gspread-dataframe==3.3.1
168 | gym==0.25.2
169 | gym-notices==0.0.8
170 | h5netcdf==1.3.0
171 | h5py==3.11.0
172 | holidays==0.57
173 | holoviews==1.19.1
174 | html5lib==1.1
175 | httpimport==1.4.0
176 | httplib2==0.22.0
177 | huggingface-hub==0.24.7
178 | humanize==4.10.0
179 | hyperopt==0.2.7
180 | ibis-framework==9.2.0
181 | idna==3.10
182 | imageio==2.35.1
183 | imageio-ffmpeg==0.5.1
184 | imagesize==1.4.1
185 | imbalanced-learn==0.12.3
186 | imgaug==0.4.0
187 | immutabledict==4.2.0
188 | importlib_metadata==8.5.0
189 | importlib_resources==6.4.5
190 | imutils==0.5.4
191 | inflect==7.4.0
192 | iniconfig==2.0.0
193 | intel-cmplr-lib-ur==2024.2.1
194 | intel-openmp==2024.2.1
195 | ipyevents==2.0.2
196 | ipyfilechooser==0.6.0
197 | ipykernel==5.5.6
198 | ipyleaflet==0.19.2
199 | ipyparallel==8.8.0
200 | ipython==7.34.0
201 | ipython-genutils==0.2.0
202 | ipython-sql==0.5.0
203 | ipytree==0.2.2
204 | ipywidgets==7.7.1
205 | itsdangerous==2.2.0
206 | jax==0.4.33
207 | jax-cuda12-pjrt==0.4.33
208 | jax-cuda12-plugin==0.4.33
209 | jaxlib==0.4.33
210 | jeepney==0.7.1
211 | jellyfish==1.1.0
212 | jieba==0.42.1
213 | Jinja2==3.1.4
214 | jmespath==0.10.0
215 | joblib==1.4.2
216 | jsonpickle==3.3.0
217 | jsonschema==4.23.0
218 | jsonschema-specifications==2023.12.1
219 | jupyter-client==6.1.12
220 | jupyter-console==6.1.0
221 | jupyter-leaflet==0.19.2
222 | jupyter-server==1.24.0
223 | jupyter_core==5.7.2
224 | jupyterlab_pygments==0.3.0
225 | jupyterlab_widgets==3.0.13
226 | kaggle==1.6.17
227 | kagglehub==0.3.0
228 | keras==3.4.1
229 | keyring==23.5.0
230 | kiwisolver==1.4.7
231 | langcodes==3.4.0
232 | language_data==1.2.0
233 | launchpadlib==1.10.16
234 | lazr.restfulclient==0.14.4
235 | lazr.uri==1.0.6
236 | lazy_loader==0.4
237 | libclang==18.1.1
238 | librosa==0.10.2.post1
239 | lightgbm==4.5.0
240 | linkify-it-py==2.0.3
241 | llvmlite==0.43.0
242 | locket==1.0.0
243 | logical-unification==0.4.6
244 | lxml==4.9.4
245 | marisa-trie==1.2.0
246 | Markdown==3.7
247 | markdown-it-py==3.0.0
248 | MarkupSafe==2.1.5
249 | matplotlib==3.7.1
250 | matplotlib-inline==0.1.7
251 | matplotlib-venn==1.1.1
252 | mdit-py-plugins==0.4.2
253 | mdurl==0.1.2
254 | miniKanren==1.0.3
255 | missingno==0.5.2
256 | mistune==0.8.4
257 | mizani==0.11.4
258 | mkl==2024.2.2
259 | ml-dtypes==0.4.1
260 | mlxtend==0.23.1
261 | model-index==0.1.11
262 | more-itertools==10.5.0
263 | moviepy==1.0.3
264 | mpmath==1.3.0
265 | msgpack==1.0.8
266 | multidict==6.1.0
267 | multipledispatch==1.0.0
268 | multitasking==0.0.11
269 | murmurhash==1.0.10
270 | music21==9.1.0
271 | namex==0.0.8
272 | natsort==8.4.0
273 | nbclassic==1.1.0
274 | nbclient==0.10.0
275 | nbconvert==6.5.4
276 | nbformat==5.10.4
277 | nest-asyncio==1.6.0
278 | networkx==3.3
279 | nibabel==5.2.1
280 | nltk==3.8.1
281 | notebook==6.5.5
282 | notebook_shim==0.2.4
283 | numba==0.60.0
284 | numexpr==2.10.1
285 | numpy==1.26.4
286 | nvidia-cublas-cu12==12.6.1.4
287 | nvidia-cuda-cupti-cu12==12.6.68
288 | nvidia-cuda-nvcc-cu12==12.6.68
289 | nvidia-cuda-runtime-cu12==12.6.68
290 | nvidia-cudnn-cu12==9.4.0.58
291 | nvidia-cufft-cu12==11.2.6.59
292 | nvidia-cusolver-cu12==11.6.4.69
293 | nvidia-cusparse-cu12==12.5.3.3
294 | nvidia-nccl-cu12==2.23.4
295 | nvidia-nvjitlink-cu12==12.6.68
296 | nvtx==0.2.10
297 | oauth2client==4.1.3
298 | oauthlib==3.2.2
299 | opencv-contrib-python==4.10.0.84
300 | opencv-python==4.10.0.84
301 | opencv-python-headless==4.10.0.84
302 | opendatalab==0.0.10
303 | openmim==0.3.9
304 | openpyxl==3.1.5
305 | openxlab==0.1.1
306 | opt-einsum==3.3.0
307 | optax==0.2.3
308 | optree==0.12.1
309 | orbax-checkpoint==0.6.4
310 | ordered-set==4.1.0
311 | osqp==0.6.7.post0
312 | oss2==2.17.0
313 | packaging==24.1
314 | pandas==2.1.4
315 | pandas-datareader==0.10.0
316 | pandas-gbq==0.23.1
317 | pandas-stubs==2.1.4.231227
318 | pandocfilters==1.5.1
319 | panel==1.4.5
320 | panopticapi @ git+https://github.com/cocodataset/panopticapi.git@7bb4655548f98f3fedc07bf37e9040a992b054b0
321 | param==2.1.1
322 | parso==0.8.4
323 | parsy==2.1
324 | partd==1.4.2
325 | pathlib==1.0.1
326 | patsy==0.5.6
327 | peewee==3.17.6
328 | pexpect==4.9.0
329 | pickleshare==0.7.5
330 | pillow==10.4.0
331 | pip-tools==7.4.1
332 | platformdirs==4.3.6
333 | plotly==5.24.1
334 | plotnine==0.13.6
335 | pluggy==1.5.0
336 | polars==1.6.0
337 | pooch==1.8.2
338 | portpicker==1.5.2
339 | prefetch_generator==1.0.3
340 | preshed==3.0.9
341 | prettytable==3.11.0
342 | proglog==0.1.10
343 | progressbar2==4.5.0
344 | prometheus_client==0.21.0
345 | promise==2.3
346 | prompt_toolkit==3.0.47
347 | prophet==1.1.5
348 | proto-plus==1.24.0
349 | protobuf==3.20.3
350 | psutil==5.9.5
351 | psycopg2==2.9.9
352 | ptyprocess==0.7.0
353 | py-cpuinfo==9.0.0
354 | py4j==0.10.9.7
355 | pyarrow==14.0.2
356 | pyarrow-hotfix==0.6
357 | pyasn1==0.6.1
358 | pyasn1_modules==0.4.1
359 | pycocotools==2.0.8
360 | pycparser==2.22
361 | pycryptodome==3.20.0
362 | pydantic==2.9.2
363 | pydantic_core==2.23.4
364 | pydata-google-auth==1.8.2
365 | pydot==3.0.1
366 | pydot-ng==2.0.0
367 | pydotplus==2.0.2
368 | PyDrive==1.3.1
369 | PyDrive2==1.20.0
370 | pyerfa==2.0.1.4
371 | pygame==2.6.0
372 | Pygments==2.18.0
373 | PyGObject==3.42.1
374 | PyJWT==2.9.0
375 | pymc==5.16.2
376 | pymystem3==0.2.0
377 | pynvjitlink-cu12==0.3.0
378 | pyogrio==0.9.0
379 | PyOpenGL==3.1.7
380 | pyOpenSSL==24.2.1
381 | pyparsing==3.1.4
382 | pyperclip==1.9.0
383 | pyproj==3.6.1
384 | pyproject_hooks==1.1.0
385 | pyshp==2.3.1
386 | PySocks==1.7.1
387 | pytensor==2.25.4
388 | pytest==7.4.4
389 | python-apt==2.4.0
390 | python-box==7.2.0
391 | python-dateutil==2.8.2
392 | python-louvain==0.16
393 | python-slugify==8.0.4
394 | python-utils==3.8.2
395 | pytz==2023.4
396 | pyviz_comms==3.0.3
397 | PyYAML==6.0.2
398 | pyzmq==24.0.1
399 | qdldl==0.1.7.post4
400 | ratelim==0.1.6
401 | referencing==0.35.1
402 | regex==2024.9.11
403 | requests==2.28.2
404 | requests-oauthlib==1.3.1
405 | requirements-parser==0.9.0
406 | rich==13.4.2
407 | rmm-cu12==24.4.0
408 | rpds-py==0.20.0
409 | rpy2==3.4.2
410 | rsa==4.9
411 | safetensors==0.4.5
412 | scikit-image==0.24.0
413 | scikit-learn==1.5.2
414 | scipy==1.13.1
415 | scooby==0.10.0
416 | scs==3.2.7
417 | seaborn==0.13.1
418 | SecretStorage==3.3.1
419 | Send2Trash==1.8.3
420 | sentencepiece==0.2.0
421 | shapely==2.0.6
422 | shellingham==1.5.4
423 | simple-parsing==0.1.6
424 | six==1.16.0
425 | sklearn-pandas==2.2.0
426 | smart-open==7.0.4
427 | sniffio==1.3.1
428 | snowballstemmer==2.2.0
429 | sortedcontainers==2.4.0
430 | soundfile==0.12.1
431 | soupsieve==2.6
432 | soxr==0.5.0.post1
433 | spacy==3.7.6
434 | spacy-legacy==3.0.12
435 | spacy-loggers==1.0.5
436 | Sphinx==5.0.2
437 | sphinxcontrib-applehelp==2.0.0
438 | sphinxcontrib-devhelp==2.0.0
439 | sphinxcontrib-htmlhelp==2.1.0
440 | sphinxcontrib-jsmath==1.0.1
441 | sphinxcontrib-qthelp==2.0.0
442 | sphinxcontrib-serializinghtml==2.0.0
443 | SQLAlchemy==2.0.35
444 | sqlglot==25.1.0
445 | sqlparse==0.5.1
446 | srsly==2.4.8
447 | stanio==0.5.1
448 | statsmodels==0.14.3
449 | StrEnum==0.4.15
450 | sympy==1.13.3
451 | tables==3.8.0
452 | tabulate==0.9.0
453 | tbb==2021.13.1
454 | tblib==3.0.0
455 | tenacity==9.0.0
456 | tensorboard==2.17.0
457 | tensorboard-data-server==0.7.2
458 | tensorflow==2.17.0
459 | tensorflow-datasets==4.9.6
460 | tensorflow-hub==0.16.1
461 | tensorflow-io-gcs-filesystem==0.37.1
462 | tensorflow-metadata==1.15.0
463 | tensorflow-probability==0.24.0
464 | tensorstore==0.1.65
465 | termcolor==2.4.0
466 | terminado==0.18.1
467 | terminaltables==3.1.10
468 | text-unidecode==1.3
469 | textblob==0.17.1
470 | tf-slim==1.1.0
471 | tf_keras==2.17.0
472 | thinc==8.2.5
473 | threadpoolctl==3.5.0
474 | tifffile==2024.9.20
475 | timm==1.0.9
476 | tinycss2==1.3.0
477 | tokenizers==0.19.1
478 | toml==0.10.2
479 | tomli==2.0.1
480 | toolz==0.12.1
481 | torch @ https://download.pytorch.org/whl/cu121_full/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=f3ed9a2b7f8671b2b32a2f036d1b81055eb3ad9b18ba43b705aa34bae4289e1a
482 | torchaudio @ https://download.pytorch.org/whl/cu121_full/torchaudio-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=da8c87c80a1c1376a48dc33eef30b03bbdf1df25a05bd2b1c620b8811c7b19be
483 | torchsummary==1.5.1
484 | torchvision @ https://download.pytorch.org/whl/cu121_full/torchvision-0.19.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=b8cc4bf381b75522995b601e07a1b433b5fd925dc3e34a7fa6cd22f449d65379
485 | tornado==6.3.3
486 | tqdm==4.65.2
487 | traitlets==5.7.1
488 | traittypes==0.2.1
489 | transformers==4.44.2
490 | tweepy==4.14.0
491 | typeguard==4.3.0
492 | typer==0.12.5
493 | types-pytz==2024.2.0.20240913
494 | types-setuptools==75.1.0.20240917
495 | typing_extensions==4.12.2
496 | tzdata==2024.1
497 | tzlocal==5.2
498 | uc-micro-py==1.0.3
499 | uritemplate==4.1.1
500 | urllib3==1.26.20
501 | vega-datasets==0.9.0
502 | wadllib==1.3.6
503 | wasabi==1.1.3
504 | wcwidth==0.2.13
505 | weasel==0.4.1
506 | webcolors==24.8.0
507 | webencodings==0.5.1
508 | websocket-client==1.8.0
509 | Werkzeug==3.0.4
510 | widgetsnbextension==3.6.9
511 | wordcloud==1.9.3
512 | wrapt==1.16.0
513 | xarray==2024.9.0
514 | xarray-einstats==0.8.0
515 | xgboost==2.1.1
516 | xlrd==2.0.1
517 | xyzservices==2024.9.0
518 | yapf==0.40.2
519 | yarl==1.11.1
520 | yellowbrick==1.5
521 | yfinance==0.2.43
522 | zict==3.0.0
523 | zipp==3.20.2
524 | 


--------------------------------------------------------------------------------
/ImageSegmentation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": [],
  7 |       "authorship_tag": "ABX9TyNhlaPsPB9W8n1aX35ZbIiR",
  8 |       "include_colab_link": true
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "markdown",
 21 |       "metadata": {
 22 |         "id": "view-in-github",
 23 |         "colab_type": "text"
 24 |       },
 25 |       "source": [
 26 |         "<a href=\"https://colab.research.google.com/github/serdarch/SERNet-Former/blob/master/ImageSegmentation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 27 |       ]
 28 |     },
 29 |     {
 30 |       "cell_type": "markdown",
 31 |       "source": [
 32 |         "# **Image Segmentation**"
 33 |       ],
 34 |       "metadata": {
 35 |         "id": "GlHJjHksu6hh"
 36 |       }
 37 |     },
 38 |     {
 39 |       "cell_type": "markdown",
 40 |       "source": [
 41 |         "Please save a copy of this file into your own drive/environment instead of asking for the shared version.\n",
 42 |         "\n",
 43 |         "请将此文件保存到您自己的 Google Drive 或操作系统环境中，而不是请求共享版本\n",
 44 |         "\n",
 45 |         "このファイルを共有バージョンを要求するのではなく、自分の Google ドライブや OS 環境にコピーを保存してください"
 46 |       ],
 47 |       "metadata": {
 48 |         "id": "V-XSpU5MoEgs"
 49 |       }
 50 |     },
 51 |     {
 52 |       "cell_type": "code",
 53 |       "execution_count": null,
 54 |       "metadata": {
 55 |         "id": "0txetkFtI9Bv"
 56 |       },
 57 |       "outputs": [],
 58 |       "source": [
 59 |         "import torch\n",
 60 |         "import torch.nn as nn\n",
 61 |         "import torchvision.models.segmentation as segmentation\n",
 62 |         "from torchvision.transforms import Compose, Resize, ToTensor, Normalize\n",
 63 |         "from PIL import Image\n",
 64 |         "import numpy as np"
 65 |       ]
 66 |     },
 67 |     {
 68 |       "cell_type": "code",
 69 |       "source": [
 70 |         "from torchvision.io.image import read_image\n",
 71 |         "from torchvision.models.segmentation import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights\n",
 72 |         "from torchvision.transforms.functional import to_pil_image"
 73 |       ],
 74 |       "metadata": {
 75 |         "id": "EvI-1SbZMhQU"
 76 |       },
 77 |       "execution_count": null,
 78 |       "outputs": []
 79 |     },
 80 |     {
 81 |       "cell_type": "code",
 82 |       "source": [
 83 |         "# Step 1: Initialize model with the best available weights\n",
 84 |         "weights = DeepLabV3_ResNet101_Weights.DEFAULT\n",
 85 |         "model = deeplabv3_resnet101(weights=weights)\n",
 86 |         "model.eval()"
 87 |       ],
 88 |       "metadata": {
 89 |         "colab": {
 90 |           "base_uri": "https://localhost:8080/"
 91 |         },
 92 |         "id": "5cTuZ2c7Mkxq",
 93 |         "outputId": "6ec742fa-cecd-4070-a268-62501aa3e696"
 94 |       },
 95 |       "execution_count": null,
 96 |       "outputs": [
 97 |         {
 98 |           "output_type": "stream",
 99 |           "name": "stderr",
100 |           "text": [
101 |             "Downloading: \"https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth\" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth\n",
102 |             "100%|██████████| 233M/233M [00:02<00:00, 83.8MB/s]\n"
103 |           ]
104 |         },
105 |         {
106 |           "output_type": "execute_result",
107 |           "data": {
108 |             "text/plain": [
109 |               "DeepLabV3(\n",
110 |               "  (backbone): IntermediateLayerGetter(\n",
111 |               "    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
112 |               "    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
113 |               "    (relu): ReLU(inplace=True)\n",
114 |               "    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
115 |               "    (layer1): Sequential(\n",
116 |               "      (0): Bottleneck(\n",
117 |               "        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
118 |               "        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
119 |               "        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
120 |               "        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
121 |               "        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
122 |               "        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
123 |               "        (relu): ReLU(inplace=True)\n",
124 |               "        (downsample): Sequential(\n",
125 |               "          (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
126 |               "          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
127 |               "        )\n",
128 |               "      )\n",
129 |               "      (1): Bottleneck(\n",
130 |               "        (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
131 |               "        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
132 |               "        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
133 |               "        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
134 |               "        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
135 |               "        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
136 |               "        (relu): ReLU(inplace=True)\n",
137 |               "      )\n",
138 |               "      (2): Bottleneck(\n",
139 |               "        (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
140 |               "        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
141 |               "        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
142 |               "        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
143 |               "        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
144 |               "        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
145 |               "        (relu): ReLU(inplace=True)\n",
146 |               "      )\n",
147 |               "    )\n",
148 |               "    (layer2): Sequential(\n",
149 |               "      (0): Bottleneck(\n",
150 |               "        (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
151 |               "        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
152 |               "        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
153 |               "        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
154 |               "        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
155 |               "        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
156 |               "        (relu): ReLU(inplace=True)\n",
157 |               "        (downsample): Sequential(\n",
158 |               "          (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
159 |               "          (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
160 |               "        )\n",
161 |               "      )\n",
162 |               "      (1): Bottleneck(\n",
163 |               "        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
164 |               "        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
165 |               "        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
166 |               "        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
167 |               "        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
168 |               "        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
169 |               "        (relu): ReLU(inplace=True)\n",
170 |               "      )\n",
171 |               "      (2): Bottleneck(\n",
172 |               "        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
173 |               "        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
174 |               "        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
175 |               "        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
176 |               "        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
177 |               "        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
178 |               "        (relu): ReLU(inplace=True)\n",
179 |               "      )\n",
180 |               "      (3): Bottleneck(\n",
181 |               "        (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
182 |               "        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
183 |               "        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
184 |               "        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
185 |               "        (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
186 |               "        (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
187 |               "        (relu): ReLU(inplace=True)\n",
188 |               "      )\n",
189 |               "    )\n",
190 |               "    (layer3): Sequential(\n",
191 |               "      (0): Bottleneck(\n",
192 |               "        (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
193 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
194 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
195 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
196 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
197 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
198 |               "        (relu): ReLU(inplace=True)\n",
199 |               "        (downsample): Sequential(\n",
200 |               "          (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
201 |               "          (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
202 |               "        )\n",
203 |               "      )\n",
204 |               "      (1): Bottleneck(\n",
205 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
206 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
207 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
208 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
209 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
210 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
211 |               "        (relu): ReLU(inplace=True)\n",
212 |               "      )\n",
213 |               "      (2): Bottleneck(\n",
214 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
215 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
216 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
217 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
218 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
219 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
220 |               "        (relu): ReLU(inplace=True)\n",
221 |               "      )\n",
222 |               "      (3): Bottleneck(\n",
223 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
224 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
225 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
226 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
227 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
228 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
229 |               "        (relu): ReLU(inplace=True)\n",
230 |               "      )\n",
231 |               "      (4): Bottleneck(\n",
232 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
233 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
234 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
235 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
236 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
237 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
238 |               "        (relu): ReLU(inplace=True)\n",
239 |               "      )\n",
240 |               "      (5): Bottleneck(\n",
241 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
242 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
243 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
244 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
245 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
246 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
247 |               "        (relu): ReLU(inplace=True)\n",
248 |               "      )\n",
249 |               "      (6): Bottleneck(\n",
250 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
251 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
252 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
253 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
254 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
255 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
256 |               "        (relu): ReLU(inplace=True)\n",
257 |               "      )\n",
258 |               "      (7): Bottleneck(\n",
259 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
260 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
261 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
262 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
263 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
264 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
265 |               "        (relu): ReLU(inplace=True)\n",
266 |               "      )\n",
267 |               "      (8): Bottleneck(\n",
268 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
269 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
270 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
271 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
272 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
273 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
274 |               "        (relu): ReLU(inplace=True)\n",
275 |               "      )\n",
276 |               "      (9): Bottleneck(\n",
277 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
278 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
279 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
280 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
281 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
282 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
283 |               "        (relu): ReLU(inplace=True)\n",
284 |               "      )\n",
285 |               "      (10): Bottleneck(\n",
286 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
287 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
288 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
289 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
290 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
291 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
292 |               "        (relu): ReLU(inplace=True)\n",
293 |               "      )\n",
294 |               "      (11): Bottleneck(\n",
295 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
296 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
297 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
298 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
299 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
300 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
301 |               "        (relu): ReLU(inplace=True)\n",
302 |               "      )\n",
303 |               "      (12): Bottleneck(\n",
304 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
305 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
306 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
307 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
308 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
309 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
310 |               "        (relu): ReLU(inplace=True)\n",
311 |               "      )\n",
312 |               "      (13): Bottleneck(\n",
313 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
314 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
315 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
316 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
317 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
318 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
319 |               "        (relu): ReLU(inplace=True)\n",
320 |               "      )\n",
321 |               "      (14): Bottleneck(\n",
322 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
323 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
324 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
325 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
326 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
327 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
328 |               "        (relu): ReLU(inplace=True)\n",
329 |               "      )\n",
330 |               "      (15): Bottleneck(\n",
331 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
332 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
333 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
334 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
335 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
336 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
337 |               "        (relu): ReLU(inplace=True)\n",
338 |               "      )\n",
339 |               "      (16): Bottleneck(\n",
340 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
341 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
342 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
343 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
344 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
345 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
346 |               "        (relu): ReLU(inplace=True)\n",
347 |               "      )\n",
348 |               "      (17): Bottleneck(\n",
349 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
350 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
351 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
352 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
353 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
354 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
355 |               "        (relu): ReLU(inplace=True)\n",
356 |               "      )\n",
357 |               "      (18): Bottleneck(\n",
358 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
359 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
360 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
361 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
362 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
363 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
364 |               "        (relu): ReLU(inplace=True)\n",
365 |               "      )\n",
366 |               "      (19): Bottleneck(\n",
367 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
368 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
369 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
370 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
371 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
372 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
373 |               "        (relu): ReLU(inplace=True)\n",
374 |               "      )\n",
375 |               "      (20): Bottleneck(\n",
376 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
377 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
378 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
379 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
380 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
381 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
382 |               "        (relu): ReLU(inplace=True)\n",
383 |               "      )\n",
384 |               "      (21): Bottleneck(\n",
385 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
386 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
387 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
388 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
389 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
390 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
391 |               "        (relu): ReLU(inplace=True)\n",
392 |               "      )\n",
393 |               "      (22): Bottleneck(\n",
394 |               "        (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
395 |               "        (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
396 |               "        (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
397 |               "        (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
398 |               "        (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
399 |               "        (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
400 |               "        (relu): ReLU(inplace=True)\n",
401 |               "      )\n",
402 |               "    )\n",
403 |               "    (layer4): Sequential(\n",
404 |               "      (0): Bottleneck(\n",
405 |               "        (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
406 |               "        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
407 |               "        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
408 |               "        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
409 |               "        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
410 |               "        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
411 |               "        (relu): ReLU(inplace=True)\n",
412 |               "        (downsample): Sequential(\n",
413 |               "          (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
414 |               "          (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
415 |               "        )\n",
416 |               "      )\n",
417 |               "      (1): Bottleneck(\n",
418 |               "        (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
419 |               "        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
420 |               "        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n",
421 |               "        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
422 |               "        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
423 |               "        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
424 |               "        (relu): ReLU(inplace=True)\n",
425 |               "      )\n",
426 |               "      (2): Bottleneck(\n",
427 |               "        (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
428 |               "        (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
429 |               "        (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n",
430 |               "        (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
431 |               "        (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
432 |               "        (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
433 |               "        (relu): ReLU(inplace=True)\n",
434 |               "      )\n",
435 |               "    )\n",
436 |               "  )\n",
437 |               "  (classifier): DeepLabHead(\n",
438 |               "    (0): ASPP(\n",
439 |               "      (convs): ModuleList(\n",
440 |               "        (0): Sequential(\n",
441 |               "          (0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
442 |               "          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
443 |               "          (2): ReLU()\n",
444 |               "        )\n",
445 |               "        (1): ASPPConv(\n",
446 |               "          (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(12, 12), dilation=(12, 12), bias=False)\n",
447 |               "          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
448 |               "          (2): ReLU()\n",
449 |               "        )\n",
450 |               "        (2): ASPPConv(\n",
451 |               "          (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(24, 24), dilation=(24, 24), bias=False)\n",
452 |               "          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
453 |               "          (2): ReLU()\n",
454 |               "        )\n",
455 |               "        (3): ASPPConv(\n",
456 |               "          (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(36, 36), dilation=(36, 36), bias=False)\n",
457 |               "          (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
458 |               "          (2): ReLU()\n",
459 |               "        )\n",
460 |               "        (4): ASPPPooling(\n",
461 |               "          (0): AdaptiveAvgPool2d(output_size=1)\n",
462 |               "          (1): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
463 |               "          (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
464 |               "          (3): ReLU()\n",
465 |               "        )\n",
466 |               "      )\n",
467 |               "      (project): Sequential(\n",
468 |               "        (0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
469 |               "        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
470 |               "        (2): ReLU()\n",
471 |               "        (3): Dropout(p=0.5, inplace=False)\n",
472 |               "      )\n",
473 |               "    )\n",
474 |               "    (1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
475 |               "    (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
476 |               "    (3): ReLU()\n",
477 |               "    (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n",
478 |               "  )\n",
479 |               "  (aux_classifier): FCNHead(\n",
480 |               "    (0): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
481 |               "    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
482 |               "    (2): ReLU()\n",
483 |               "    (3): Dropout(p=0.1, inplace=False)\n",
484 |               "    (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n",
485 |               "  )\n",
486 |               ")"
487 |             ]
488 |           },
489 |           "metadata": {},
490 |           "execution_count": 3
491 |         }
492 |       ]
493 |     },
494 |     {
495 |       "cell_type": "code",
496 |       "source": [
497 |         "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg\n",
498 |         "im1= read_image(\"input.jpg\")"
499 |       ],
500 |       "metadata": {
501 |         "id": "bwizfLVML1pL"
502 |       },
503 |       "execution_count": null,
504 |       "outputs": []
505 |     },
506 |     {
507 |       "cell_type": "code",
508 |       "source": [
509 |         "# Step 2: Initialize the inference transforms\n",
510 |         "preprocess = weights.transforms()\n",
511 |         "\n",
512 |         "# Step 3: Apply inference preprocessing transforms\n",
513 |         "batch = preprocess(im1).unsqueeze(0)\n",
514 |         "\n",
515 |         "# Step 4: Use the model and visualize the prediction\n",
516 |         "prediction = model(batch)[\"out\"]\n",
517 |         "normalized_masks = prediction.softmax(dim=1)\n",
518 |         "class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta[\"categories\"])}"
519 |       ],
520 |       "metadata": {
521 |         "id": "XuZL4yWRS8Gx"
522 |       },
523 |       "execution_count": null,
524 |       "outputs": []
525 |     },
526 |     {
527 |       "cell_type": "code",
528 |       "source": [
529 |         "print(weights.meta[\"categories\"])"
530 |       ],
531 |       "metadata": {
532 |         "colab": {
533 |           "base_uri": "https://localhost:8080/"
534 |         },
535 |         "id": "mJgEIsYxPbix",
536 |         "outputId": "410ab130-581c-4551-ae8a-cdb6e9626d83"
537 |       },
538 |       "execution_count": null,
539 |       "outputs": [
540 |         {
541 |           "output_type": "stream",
542 |           "name": "stdout",
543 |           "text": [
544 |             "['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']\n"
545 |           ]
546 |         }
547 |       ]
548 |     },
549 |     {
550 |       "cell_type": "code",
551 |       "source": [
552 |         "mask = normalized_masks[0, class_to_idx[\"aeroplane\"]]"
553 |       ],
554 |       "metadata": {
555 |         "id": "f1R-TRZuPnnM"
556 |       },
557 |       "execution_count": null,
558 |       "outputs": []
559 |     },
560 |     {
561 |       "cell_type": "code",
562 |       "source": [
563 |         "to_pil_image(mask).show()"
564 |       ],
565 |       "metadata": {
566 |         "id": "IsO6WXQdytTG"
567 |       },
568 |       "execution_count": null,
569 |       "outputs": []
570 |     },
571 |     {
572 |       "cell_type": "code",
573 |       "source": [
574 |         "from google.colab import drive\n",
575 |         "drive.mount('/content/gdrive')"
576 |       ],
577 |       "metadata": {
578 |         "colab": {
579 |           "base_uri": "https://localhost:8080/"
580 |         },
581 |         "id": "-Gei9D03SWmM",
582 |         "outputId": "51672b3c-5bd8-414a-d2f4-d78e9b94e576"
583 |       },
584 |       "execution_count": null,
585 |       "outputs": [
586 |         {
587 |           "output_type": "stream",
588 |           "name": "stdout",
589 |           "text": [
590 |             "Mounted at /content/gdrive\n"
591 |           ]
592 |         }
593 |       ]
594 |     },
595 |     {
596 |       "cell_type": "code",
597 |       "source": [
598 |         "torch.save(mask, '***/MyDrive/mask.pt')"
599 |       ],
600 |       "metadata": {
601 |         "id": "ODJZDuIBSbmm"
602 |       },
603 |       "execution_count": null,
604 |       "outputs": []
605 |     }
606 |   ]
607 | }


--------------------------------------------------------------------------------