├── modules
│   ├── gitignore
│   ├── AbG
│   │   └── requirements.txt
│   ├── AbM
│   │   └── requirements.txt
│   ├── AfN
│   │   └── requirements.txt
│   ├── DfN
│   │   └── requirements.txt
│   └── Efficient-ResNet
│       └── gitClone
├── tools
│   ├── gitignore
│   └── requirements.txt
├── data
│   ├── ADE20K
│   │   └── ADE20K_val.mlx
│   ├── BDD100K
│   │   └── bdd100k_val.mlx
│   ├── CamVid
│   │   ├── CamVid_NetworkMetrics.mat
│   │   ├── camVidConfig.py
│   │   └── readMe.md
│   └── Cityscapes
│       ├── cityscapesConfig.py
│       ├── LICENSE
│       └── readMe.md
├── models
│   ├── segmentation
│   │   ├── getStarted
│   │   ├── importSegmentation.py
│   │   └── readMe.md
│   └── classification
│       ├── getStarted
│       ├── lstm.m
│       ├── fcn.m
│       ├── cnn.m
│       ├── importClassification.py
│       ├── lstm.py
│       ├── fcn.py
│       ├── cnn.py
│       ├── cnn_d3_v2.m
│       ├── cnn_d3_v2.py
│       └── readMe.md
├── libs
│   ├── requirements.txt
│   └── readMe.md
├── LICENSE
├── ImageClassification.ipynb
├── README.md
├── requirements.txt
└── ImageSegmentation.ipynb
/modules/gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/tools/gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/data/ADE20K/ADE20K_val.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/ADE20K/ADE20K_val.mlx
--------------------------------------------------------------------------------
/data/BDD100K/bdd100k_val.mlx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/BDD100K/bdd100k_val.mlx
--------------------------------------------------------------------------------
/data/CamVid/CamVid_NetworkMetrics.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/CamVid/CamVid_NetworkMetrics.mat
--------------------------------------------------------------------------------
/models/segmentation/getStarted:
--------------------------------------------------------------------------------
1 | #SERNet-Former uses versions of Efficient-ResNet as the baseline architectures in semantic segmentation tasks.
2 |
--------------------------------------------------------------------------------
/models/classification/getStarted:
--------------------------------------------------------------------------------
1 | #SERNet-Former applies versions of Efficient-ResNet as the baseline architectures
2 | #for classification tasks.
3 |
--------------------------------------------------------------------------------
/libs/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch
2 | torchvision
3 | matplotlib
4 | numpy
5 | packaging
6 | prettytable
7 | scipy
8 | codecov
9 | flake8
10 | ftfy
11 | interrogate
12 | pytest
13 | regex
14 | yapf
15 |
--------------------------------------------------------------------------------
/modules/AbG/requirements.txt:
--------------------------------------------------------------------------------
1 | Serdar Erişen, 2024.
2 | All rights reserved.
3 |
4 | Details are TBD.
5 |
6 | The code runs in MATLAB.
7 |
8 | Versions in other programming languages are being developed.
9 |
--------------------------------------------------------------------------------
/tools/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch
2 | torchvision
3 | matplotlib
4 | numpy
5 | packaging
6 | prettytable
7 | scipy
8 | codecov
9 | flake8
10 | ftfy
11 | interrogate
12 | pytest
13 | regex
14 | yapf
15 |
--------------------------------------------------------------------------------
/models/segmentation/importSegmentation.py:
--------------------------------------------------------------------------------
1 | import torchvision.models.segmentation as segmentation
2 |
3 | # COCO-pretrained DeepLabV3 baselines; "DEFAULT" selects the best available weights
4 | deeplabv3_resnet50 = segmentation.deeplabv3_resnet50(weights="DEFAULT")
5 | deeplabv3_resnet101 = segmentation.deeplabv3_resnet101(weights="DEFAULT")
--------------------------------------------------------------------------------
/modules/AbM/requirements.txt:
--------------------------------------------------------------------------------
1 | Serdar Erişen, 2024.
2 | All rights reserved.
3 |
4 | Details are TBD.
5 |
6 | The code runs in MATLAB.
7 |
8 | Versions in other programming languages are being developed.
9 |
10 |
--------------------------------------------------------------------------------
/modules/AfN/requirements.txt:
--------------------------------------------------------------------------------
1 | Serdar Erişen, 2024.
2 | All rights reserved.
3 |
4 | Details are TBD.
5 |
6 | The code runs in MATLAB.
7 |
8 | Versions in other programming languages are being developed.
9 |
10 |
11 |
--------------------------------------------------------------------------------
/modules/DfN/requirements.txt:
--------------------------------------------------------------------------------
1 | Serdar Erişen, 2024.
2 | All rights reserved.
3 |
4 | Details are TBD.
5 |
6 | The code runs in MATLAB.
7 |
8 | Versions in other programming languages are being developed.
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/modules/Efficient-ResNet/gitClone:
--------------------------------------------------------------------------------
1 | #You can clone the repository of Efficient-ResNet https://github.com/serdarch/Efficient-ResNet.git
2 | #into your environment.
3 |
4 | git clone https://github.com/serdarch/Efficient-ResNet.git
5 |
--------------------------------------------------------------------------------
/models/classification/lstm.m:
--------------------------------------------------------------------------------
1 | % LSTM sequence classifier; numFeatures, hiddenSize, and numClasses must be defined beforehand
2 | layers = [
3 |     sequenceInputLayer(numFeatures)
4 |     lstmLayer(hiddenSize, 'OutputMode', 'last')
5 |     fullyConnectedLayer(numClasses)
6 |     softmaxLayer
7 |     classificationLayer
8 | ];
9 |
--------------------------------------------------------------------------------
/models/classification/fcn.m:
--------------------------------------------------------------------------------
1 | % Fully connected classifier for 28x28x1 images; numClasses must be defined beforehand
2 | hiddenSize = 100;
3 | layers = [
4 |     imageInputLayer([28 28 1])
5 |     fullyConnectedLayer(hiddenSize)
6 |     reluLayer
7 |     fullyConnectedLayer(numClasses)
8 |     softmaxLayer
9 |     classificationLayer
10 | ];
11 |
--------------------------------------------------------------------------------
/models/classification/cnn.m:
--------------------------------------------------------------------------------
1 | % Simple CNN classifier for 28x28x1 images; numClasses must be defined beforehand
2 | layers = [
3 |     imageInputLayer([28 28 1])
4 |     convolution2dLayer(5, 20)
5 |     reluLayer
6 |     maxPooling2dLayer(2, 'Stride', 2)
7 |     fullyConnectedLayer(numClasses)
8 |     softmaxLayer
9 |     classificationLayer
10 | ];
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | All rights reserved.
2 |
3 | Copyright (c) 2024 Serdar Erişen
4 |
5 | The copyright holder reserves all the rights provided by the copyright law,
6 | such as distribution, performance, and the creation of derivative works.
7 |
8 | This repository is being developed only to back up and augment the different language options
9 | of SERNet-Former and to increase research capacity through
10 | the open-source code and datasets;
11 | it does not directly share the original details of the network.
12 |
--------------------------------------------------------------------------------
/models/classification/importClassification.py:
--------------------------------------------------------------------------------
1 | import torchvision.models as models
2 |
3 | # ImageNet-pretrained classification baselines; "DEFAULT" selects the best available weights,
4 | # and the SWAG variants are selected explicitly through their weight enums
5 | resnet50 = models.resnet50(weights="DEFAULT")
6 | resnet101 = models.resnet101(weights="DEFAULT")
7 | resnext101 = models.resnext101_64x4d(weights="DEFAULT")
8 | efficientnet_b6 = models.efficientnet_b6(weights="DEFAULT")
9 | regnet_y_128gf = models.regnet_y_128gf(weights=models.RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1)
10 | vit_b_16 = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1)
11 | vit_b_32 = models.vit_b_32(weights="DEFAULT")
12 | vit_l_16 = models.vit_l_16(weights="DEFAULT")
13 |
--------------------------------------------------------------------------------
/models/classification/lstm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class YourModel(nn.Module):
5 | def __init__(self, num_features, hidden_size, num_classes):
6 | super(YourModel, self).__init__()
7 | self.lstm = nn.LSTM(num_features, hidden_size, batch_first=True)
8 | self.fc = nn.Linear(hidden_size, num_classes)
9 | self.softmax = nn.Softmax(dim=1) # Softmax across classes
10 |
11 | def forward(self, x):
12 | _, (h_n, _) = self.lstm(x)
13 | x = self.fc(h_n.squeeze(0))
14 | x = self.softmax(x)
15 | return x
16 |
17 | # Example usage:
18 | # model = YourModel(num_features=numFeatures, hidden_size=hiddenSize, num_classes=numClasses)
19 |
--------------------------------------------------------------------------------
/data/CamVid/camVidConfig.py:
--------------------------------------------------------------------------------
1 | # @package training
2 | # These arguments define the training hyper-parameters
3 | epochs: 80
4 | num_workers: 1
5 | batch_size: 3
6 | shuffle: True
7 | cuda: 0
8 | precompute_multi_scale: False # Precompute multi-scale features on CPU for faster training / inference
9 | optim:
10 | base_lr: 0.001
11 | optimizer:
12 | class: SGD
13 | params:
14 | lr: ${training.optim.base_lr}
15 | lr_scheduler: ${lr_scheduler_v}
16 | bn_scheduler:
17 | bn_policy: "step_decay"
18 | params:
19 | bn_momentum: 0.9
20 | bn_decay: 0.95
21 | decay_step: 10
22 | bn_clip: 1
23 | weight_name: "latest" # can be named/changed according to the shared model weights
24 | enable_cudnn: False
25 | checkpoint_dir: "..."
26 |
--------------------------------------------------------------------------------
/data/Cityscapes/cityscapesConfig.py:
--------------------------------------------------------------------------------
1 | # @package training
2 | # These arguments define the training hyper-parameters
3 | epochs: 80
4 | num_workers: 1
5 | batch_size: 1
6 | shuffle: True
7 | cuda: 0
8 | precompute_multi_scale: False # Precompute multi-scale features on CPU for faster training / inference
9 | optim:
10 | base_lr: 0.0005
11 | optimizer:
12 | class: SGD
13 | params:
14 | lr: ${training.optim.base_lr}
15 | lr_scheduler: ${lr_scheduler}
16 | bn_scheduler:
17 | bn_policy: "step_decay"
18 | params:
19 | bn_momentum: 0.9
20 | bn_decay: 0.95
21 | decay_step: 10
22 | bn_clip: 1
23 | weight_name: "latest" # can be named/changed according to the shared model weights
24 | enable_cudnn: False
25 | checkpoint_dir: "..."
26 |
--------------------------------------------------------------------------------
/models/classification/fcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class YourModel(nn.Module):
5 | def __init__(self, hidden_size, num_classes):
6 | super(YourModel, self).__init__()
7 | self.fc1 = nn.Linear(28 * 28 * 1, hidden_size)
8 | self.relu = nn.ReLU()
9 | self.fc2 = nn.Linear(hidden_size, num_classes)
10 | self.softmax = nn.Softmax(dim=1) # Softmax across classes
11 |
12 | def forward(self, x):
13 | x = x.view(-1, 28 * 28 * 1) # Flatten the input images
14 | x = self.fc1(x)
15 | x = self.relu(x)
16 | x = self.fc2(x)
17 | x = self.softmax(x)
18 | return x
19 |
20 | # Example usage:
21 | # model = YourModel(hidden_size=100, num_classes=numClasses)
22 |
--------------------------------------------------------------------------------
/models/classification/cnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class YourModel(nn.Module):
5 | def __init__(self, num_classes):
6 | super(YourModel, self).__init__()
7 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
8 | self.relu = nn.ReLU()
9 | self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
10 | self.fc = nn.Linear(20 * 12 * 12, num_classes) # Assuming input image size is 28x28
11 |
12 | def forward(self, x):
13 | x = self.conv1(x)
14 | x = self.relu(x)
15 | x = self.pool(x)
16 | x = x.view(-1, 20 * 12 * 12) # Flatten the tensor for fully connected layer
17 | x = self.fc(x)
18 | return x
19 |
20 | # Example usage:
21 | # model = YourModel(num_classes=numClasses)
22 |
--------------------------------------------------------------------------------
/models/segmentation/readMe.md:
--------------------------------------------------------------------------------
1 | # Model Zoo
2 |
3 | The models are the open-source segmentation baseline checkpoints
4 |
5 | | Baseline | Download |
6 | | --- | --- |
7 | | DeepLab_v3 ResNet-50 | model |
8 | | DeepLab_v3 ResNet-101 | model |
9 | | HR-Net W48 | model |
10 |
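11 | A minimal loading sketch (not part of the original repository) for the two DeepLab_v3 baselines via torchvision; the HR-Net W48 checkpoint comes from its own codebase and is not covered here.
12 |
13 | ```python
14 | from torchvision.models.segmentation import deeplabv3_resnet50, deeplabv3_resnet101
15 |
16 | # "DEFAULT" selects the COCO-pretrained weights for each baseline
17 | dlv3_r50 = deeplabv3_resnet50(weights="DEFAULT").eval()
18 | dlv3_r101 = deeplabv3_resnet101(weights="DEFAULT").eval()
19 | ```
20 |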
--------------------------------------------------------------------------------
/data/Cityscapes/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Serdar Erişen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/models/classification/cnn_d3_v2.m:
--------------------------------------------------------------------------------
1 | CNN_D3_layers = [
2 | imageInputLayer([1 6 1], "Name", "imageinput")
3 | convolution2dLayer([1 1], 4, "Name", "conv", "Padding", "same")
4 | batchNormalizationLayer("Name", "batchnorm")
5 | reluLayer("Name", "relu")
6 | globalMaxPooling2dLayer("Name", "gmpool")
7 | convolution2dLayer([1 1], 8, "Name", "conv_1", "Padding", "same")
8 | convolution2dLayer([1 1], 8, "Name", "conv_2", "Padding", "same")
9 | batchNormalizationLayer("Name", "batchnorm_1")
10 | reluLayer("Name", "relu_1")
11 | globalMaxPooling2dLayer("Name", "gmpool_1")
12 | convolution2dLayer([1 1], 16, "Name", "conv_3", "Padding", "same")
13 | convolution2dLayer([1 1], 16, "Name", "conv_4", "Padding", "same")
14 | batchNormalizationLayer("Name", "batchnorm_2")
15 | convolution2dLayer([1 1], 32, "Name", "conv_5", "Padding", "same")
16 | convolution2dLayer([1 1], 32, "Name", "conv_6", "Padding", "same")
17 | batchNormalizationLayer("Name", "batchnorm_3")
18 | reluLayer("Name", "relu_2")
19 | fullyConnectedLayer(4, "Name", "fc")
20 | softmaxLayer("Name", "softmax")
21 | classificationLayer("Name", "classoutput")
22 | ];
23 |
24 |
--------------------------------------------------------------------------------
/models/classification/cnn_d3_v2.py:
--------------------------------------------------------------------------------
1 | # Python code for CNN_D3_v2 architecture
2 | # Serdar Erisen, 2024
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | class CNN_D3_v2(nn.Module):
8 | def __init__(self):
9 | super(CNN_D3_v2, self).__init__()
10 | self.conv = nn.Conv2d(1, 4, kernel_size=1, padding='same')
11 | self.batchnorm = nn.BatchNorm2d(4)
12 | self.relu = nn.ReLU()
13 | self.gmpool = nn.AdaptiveMaxPool2d((1, 1))
14 |
15 | self.conv_1 = nn.Conv2d(4, 8, kernel_size=1, padding='same')
16 | self.conv_2 = nn.Conv2d(8, 8, kernel_size=1, padding='same')
17 | self.batchnorm_1 = nn.BatchNorm2d(8)
18 | self.relu_1 = nn.ReLU()
19 | self.gmpool_1 = nn.AdaptiveMaxPool2d((1, 1))
20 |
21 | self.conv_3 = nn.Conv2d(8, 16, kernel_size=1, padding='same')
22 | self.conv_4 = nn.Conv2d(16, 16, kernel_size=1, padding='same')
23 | self.batchnorm_2 = nn.BatchNorm2d(16)
24 |
25 | self.conv_5 = nn.Conv2d(16, 32, kernel_size=1, padding='same')
26 | self.conv_6 = nn.Conv2d(32, 32, kernel_size=1, padding='same')
27 | self.batchnorm_3 = nn.BatchNorm2d(32)
28 | self.relu_2 = nn.ReLU()
29 |
30 | self.fc = nn.Linear(32, 4)
31 | self.softmax = nn.Softmax(dim=1)
32 |
33 | def forward(self, x):
34 | x = self.conv(x)
35 | x = self.batchnorm(x)
36 | x = self.relu(x)
37 | x = self.gmpool(x)
38 |
39 | x = self.conv_1(x)
40 | x = self.conv_2(x)
41 | x = self.batchnorm_1(x)
42 | x = self.relu_1(x)
43 | x = self.gmpool_1(x)
44 |
45 | x = self.conv_3(x)
46 | x = self.conv_4(x)
47 | x = self.batchnorm_2(x)
48 |
49 | x = self.conv_5(x)
50 | x = self.conv_6(x)
51 | x = self.batchnorm_3(x)
52 | x = self.relu_2(x)
53 |
54 | x = torch.flatten(x, 1)
55 |
56 | x = self.fc(x)
57 | x = self.softmax(x)
58 |
59 | return x
60 |
61 | # Example usage:
62 | # model = CNN_D3_v2()
63 | # # Print model architecture
64 | # print(model)
65 |
66 |
67 |
--------------------------------------------------------------------------------
/data/Cityscapes/readMe.md:
--------------------------------------------------------------------------------
1 | # Cityscapes dataset
2 |
3 | Cityscapes is one of the most challenging datasets for the semantic segmentation of urban street scenes.
4 |
5 | It contains high-quality pixel-level annotations for 5000 images, as well as 20000 coarsely annotated images.
6 |
7 | The dataset contains diverse stereo video sequences with sizes of 1024 by 2048 pixels,
8 | recorded during the daytime in 50 European cities over several months (spring, summer, and fall)
9 | under good or average weather conditions.
10 |
11 | The 5000 finely annotated images are divided into three sets: 2975 for training, 500 for validation, and 1525 for testing.
12 |
13 | The dataset includes semantic, instance-wise, and dense pixel annotations of 30 classes grouped into eight categories.
14 |
15 | However, most of the literature uses 20 classes: 19 semantic labels covering objects and stuff, plus one void class for do-not-care regions, as listed in the snippet below.
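16 |
17 | For reference, a minimal sketch (not part of the original dataset files) listing the 19 evaluation classes mentioned above; the void label is ignored during scoring:
18 |
19 | ```python
20 | # The 19 Cityscapes evaluation classes used by most of the literature
21 | CITYSCAPES_EVAL_CLASSES = [
22 |     "road", "sidewalk", "building", "wall", "fence", "pole",
23 |     "traffic light", "traffic sign", "vegetation", "terrain", "sky",
24 |     "person", "rider", "car", "truck", "bus", "train", "motorcycle", "bicycle",
25 | ]
26 | assert len(CITYSCAPES_EVAL_CLASSES) == 19
27 | ```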
28 |
29 | ## Models
30 |
31 | | Model / Method | Baseline | mIoU |
32 | | --- | --- | --- |
33 | | SERNet-Former | ResNet-50 | 73.31 |
34 | | SERNet-Former | Efficient-ResNet_R101 | 77.04 |
35 | | SERNet-Former | Efficient-ResNet [final] | 84.83 |
36 |
41 | ## Please cite
42 |
43 | ```bibtex
44 | @inproceedings{Cordts2016CVPR,
45 | title={The cityscapes dataset for semantic urban scene understanding},
46 | author={M. Cordts and M. Omran and S. Ramos and T. Rehfeld and M. Enzweiler and R. Benenson and U. Franke and S. Roth and B. Schiele},
47 | booktitle={CVPR},
48 | year={2016},
49 | }
50 |
51 | @article{Erisen2024SERNetFormer,
52 | title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
53 | author={Erişen, Serdar},
54 | journal={arXiv preprint arXiv:2401.15741},
55 | year={2024}
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/models/classification/readMe.md:
--------------------------------------------------------------------------------
1 | # Model Zoo
2 |
3 | The models are the open-source checkpoints pretrained on ImageNet dataset
4 |
5 | | Baseline | Download |
6 | | --- | --- |
7 | | ResNet-50 | model |
8 | | ResNet-101 | model |
9 | | Swin_V2_S | model |
10 | | Swin_V2_B | model |
11 | | ViT_B_16_SWAG_E2E_V1 | model |
12 | | ViT_H_14_SWAG_E2E_V1 | model |
13 | | EfficientNet_B6 | model |
14 | | EfficientNet_V2_L | model |
15 | | RegNet_Y_128GF_SWAG_E2E_V1 | model |
16 | | ResNeXt101_64X4D | model |
17 | | CNN-D3 | model |
18 |
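19 | A minimal loading sketch (not part of the original repository): the SWAG entries above map onto explicit torchvision weight enums, while the CNN-D3 entry appears to correspond to the cnn_d3_v2 model in this folder rather than a torchvision checkpoint.
20 |
21 | ```python
22 | import torchvision.models as models
23 |
24 | # "DEFAULT" selects the best available ImageNet weights for a baseline
25 | swin_v2_s = models.swin_v2_s(weights="DEFAULT")
26 | # SWAG variants are selected through their weight enums (large downloads)
27 | vit_h_14 = models.vit_h_14(weights=models.ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1)
28 | regnet_y_128gf = models.regnet_y_128gf(weights=models.RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1)
29 | ```
30 |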
--------------------------------------------------------------------------------
/libs/readMe.md:
--------------------------------------------------------------------------------
1 | # Model Zoo
2 |
3 | ## Classification Networks
4 | The models are the open-source checkpoints pretrained on ImageNet dataset
5 |
6 | | Baseline | Download |
7 | | --- | --- |
8 | | ResNet-50 | model |
9 | | ResNet-101 | model |
10 | | Swin_V2_S | model |
11 | | Swin_V2_B | model |
12 | | ViT_B_16_SWAG_E2E_V1 | model |
13 | | ViT_H_14_SWAG_E2E_V1 | model |
14 | | EfficientNet_B6 | model |
15 | | EfficientNet_V2_L | model |
16 | | RegNet_Y_128GF_SWAG_E2E_V1 | model |
17 | | ResNeXt101_64X4D | model |
18 |
19 | ## Segmentation Networks
20 | The models are the open-source segmentation baseline checkpoints pretrained on COCO dataset
21 |
22 | | Baseline | Download |
23 | | --- | --- |
24 | | DeepLab_v3 ResNet-50 | model |
25 | | DeepLab_v3 ResNet-101 | model |
26 |
--------------------------------------------------------------------------------
/data/CamVid/readMe.md:
--------------------------------------------------------------------------------
1 | # CamVid Dataset
2 |
3 | The Cambridge-driving Labelled Video Database (CamVid) is one of the first scene-understanding databases,
4 | based on motion-based video collections of driving scenes recorded for the semantic segmentation of object classes.
5 |
6 | The database contains 701 frames with sizes of 720 by 960 pixels, captured from five video sequences
7 | shot with fixed-position CCTV-style cameras mounted on a car. The densely annotated images were manually
8 | labelled with 32 classes, which were later merged into 11 classes.
9 |
10 | The original dataset is divided into 367 training, 101 validation, and 233 test images, following common practice in the literature.
11 |
12 | ## Model metrics
13 |
14 | | Model | Metrics File |
15 | | --- | --- |
16 | | SERNet-Former [checkpoint] | download |
17 | | SERNet-Former [final] | download |
18 |
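19 | A minimal sketch (not part of the original repository) for inspecting the bundled CamVid_NetworkMetrics.mat from Python; the variable names inside the file are not documented here, so the snippet only lists them, and it assumes the file was saved in a MATLAB format that scipy can read.
20 |
21 | ```python
22 | from scipy.io import loadmat
23 |
24 | # Load the metrics file and list the user variables it contains
25 | metrics = loadmat("CamVid_NetworkMetrics.mat")
26 | print([key for key in metrics if not key.startswith("__")])
27 | ```
28 |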
29 | ## Ablation works
30 |
31 | | AbM | DbN | AfN1 | AfN2 | mIoU % |
32 | | --- | --- | --- | --- | --- |
33 | | N | Y | Y | Y | 81.22 |
34 | | Y | N | Y | Y | 80.56 |
35 | | Y | Y | N | Y | 78.99 |
36 | | Y | Y | Y | N | 75.37 |
37 | | Y | Y | Y | Y | 82.88 |
38 |
39 | ## Please cite
40 |
41 | ```bibtex
42 | @article{Brostow2019,
43 |   title={Semantic object classes in video: A high-definition ground truth database},
44 |   author={G. J. Brostow and J. Fauqueur and R. Cipolla},
45 |   journal={Pattern Recognition Letters},
46 |   volume={90},
47 |   pages={119-133},
48 |   year={2019}
49 | }
50 |
51 | @article{Erisen2024SERNetFormer,
52 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
53 |   author={Erişen, Serdar},
54 |   journal={arXiv preprint arXiv:2401.15741},
55 |   year={2024}
56 | }
57 | ```
58 |
--------------------------------------------------------------------------------
/ImageClassification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyN57eNgsltG+a1X/fNILjb8",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 |         ""
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "**Image classification**\n",
33 | "\n",
34 | "Image classification tutorial based on the pretrained PyTorch baselines.\n",
35 | "Used model: ViT_h_14 with Weights IMAGENET1K_SWAG_E2E_V1\n",
36 | "\n",
37 | "Please save a copy of this tutorial into your own environment/drive folder."
38 | ],
39 | "metadata": {
40 | "id": "7kG9ZBiN238_"
41 | }
42 | },
43 | {
44 | "cell_type": "code",
45 | "source": [
46 | "from torchvision.io import read_image"
47 | ],
48 | "metadata": {
49 | "id": "UB3maNwt0roV"
50 | },
51 | "execution_count": 2,
52 | "outputs": []
53 | },
54 | {
55 | "cell_type": "code",
56 | "source": [
57 | "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg"
58 | ],
59 | "metadata": {
60 | "id": "5_vXR0dT0r9L"
61 | },
62 | "execution_count": 1,
63 | "outputs": []
64 | },
65 | {
66 | "cell_type": "code",
67 | "source": [
68 | "im1=read_image(\"input.jpg\")"
69 | ],
70 | "metadata": {
71 | "id": "-hGZui_z0w97"
72 | },
73 | "execution_count": 3,
74 | "outputs": []
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "metadata": {
80 | "colab": {
81 | "base_uri": "https://localhost:8080/"
82 | },
83 | "id": "th3lBuOHzu6e",
84 | "outputId": "229ec5c7-86b0-4eb4-81fd-cf126d1d9c36"
85 | },
86 | "outputs": [
87 | {
88 | "output_type": "stream",
89 | "name": "stderr",
90 | "text": [
91 | "Downloading: \"https://download.pytorch.org/models/vit_h_14_swag-80465313.pth\" to /root/.cache/torch/hub/checkpoints/vit_h_14_swag-80465313.pth\n",
92 | "100%|██████████| 2.36G/2.36G [01:57<00:00, 21.6MB/s]\n"
93 | ]
94 | },
95 | {
96 | "output_type": "stream",
97 | "name": "stdout",
98 | "text": [
99 | "airliner: 91.4%\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "from torchvision.models import vit_h_14, ViT_H_14_Weights\n",
105 | "\n",
106 | "# Step 1: Initialize model with the best available weights\n",
107 | "weights = ViT_H_14_Weights.DEFAULT\n",
108 | "model = vit_h_14(weights='DEFAULT')\n",
109 | "model.eval()\n",
110 | "\n",
111 | "# Step 2: Initialize the inference transforms\n",
112 | "preprocess = weights.transforms()\n",
113 | "\n",
114 | "# Step 3: Apply inference preprocessing transforms\n",
115 | "batch = preprocess(im1).unsqueeze(0)\n",
116 | "\n",
117 | "# Step 4: Use the model and print the predicted category\n",
118 | "prediction = model(batch).squeeze(0).softmax(0)\n",
119 | "class_id = prediction.argmax().item()\n",
120 | "score = prediction[class_id].item()\n",
121 | "category_name = weights.meta[\"categories\"][class_id]\n",
122 | "print(f\"{category_name}: {100 * score:.1f}%\") #airliner: ~91.4%"
123 | ]
124 | }
125 | ]
126 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SERNet-Former
2 |
3 |
4 | [![[CVPR 2024 Workshops] YouTube Video](https://img.shields.io/badge/CVPRW'24-YouTube-blue)](https://youtu.be/XXzMkotcdb4?feature=shared)
5 | [CVPR 2024 Workshop: Equivariant Vision](https://equivision.github.io/index.html#papers)
6 | [arXiv:2401.15741](https://doi.org/10.48550/arXiv.2401.15741)
7 | [CVMI 2024 Accepted Papers](https://cvmi2024.iiita.ac.in/AcceptedPapers.php)
8 |
9 |
10 |
11 | [CVPR 2024 Workshops] SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks
12 |
13 | ## Tutorials
14 | Various implementations of SERNet-Former with different baselines for multi-tasking (without our additional methods) are now online.
15 |
16 | The example deploys the ViT_h_14 baseline with the IMAGENET1K_SWAG_E2E_V1 weights and a simple U-Net decoder architecture.
17 | [Open in Colab](https://colab.research.google.com/drive/185TZK796f425vsduhpm9NcCrtMS4rIkb#scrollTo=q3e5V2NephbJ&forceEdit=true&sandboxMode=true)
18 |
19 |
20 | Please also see the tutorials for
21 |
22 | Image Segmentation based on the DeepLabV3+_ResNet101 baseline
23 | [Open in Colab](https://colab.research.google.com/drive/1sMRRcUsFaUwHSvIch9Koqxb4ogtgfVFs#scrollTo=-Gei9D03SWmM&forceEdit=true&sandboxMode=true)
24 |
25 |
26 | Image Classification based on the ViT_h_14 baseline
27 | [Open in Colab](https://colab.research.google.com/drive/1Nj82jyovcQcuZotx-pRSBzd1uEXCbOp4#scrollTo=7kG9ZBiN238)
28 |
29 |
30 | ## News
31 | - `16 May 2024` [CVPR 2024 Workshops] The article "SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks" has been accepted to the CVPR 2024 Workshop Equivariant Vision: From Theory to Practice
32 | - `January 2024` SERNet-Former set a state-of-the-art result on the Cityscapes validation dataset for pixel-level segmentation: 87.35 % mIoU
33 | - `January 2024` SERNet-Former set a state-of-the-art result on the CamVid dataset: 84.62 % mIoU
34 | - `January 2024` SERNet-Former ranked seventh on the Cityscapes test dataset for pixel-level segmentation according to PapersWithCode.com: 84.83 % mIoU
35 |
36 |
37 | ## GitHub Badges
38 | [Semantic Segmentation on ADE20K val](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k-val?p=sernet-former-semantic-segmentation-by)
39 |
40 | [Semantic Segmentation on BDD100K val](https://paperswithcode.com/sota/semantic-segmentation-on-bdd100k-val?p=sernet-former-semantic-segmentation-by)
41 |
42 | [Semantic Segmentation on CamVid](https://paperswithcode.com/sota/semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by)
43 |
44 | [2D Semantic Segmentation on CamVid](https://paperswithcode.com/sota/2d-semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by)
45 |
46 | [Semantic Segmentation on Cityscapes val](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by)
47 |
48 | [2D Semantic Segmentation on Cityscapes val](https://paperswithcode.com/sota/2d-semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by)
49 |
50 | [Semantic Segmentation on Cityscapes](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes?p=sernet-former-semantic-segmentation-by)
51 |
52 | ## SERNet-Former Conceptual
53 |
54 | [Efficient-ResNet baseline repository](https://github.com/serdarch/Efficient-ResNet)
55 |
56 | 
57 |
58 | (a) Attention-boosting Gate (AbG) and Attention-boosting Module (AbM) are fused into the encoder part.
59 |
60 | (b) Attention-fusion Network (AfN) is introduced into the decoder.
61 |
62 | ## Experiment Results
63 |
64 | ### CamVid Dataset
65 |
66 | The breakdown of class accuracies on the CamVid dataset:
67 |
68 | | Model | Baseline Architecture | Building | Tree | Sky | Car | Sign | Road | Pedestrian | Fence | Pole | Sidewalk | Bicycle | mIoU |
69 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
70 | | SERNet-Former | Efficient-ResNet | 93.0 | 88.8 | 95.1 | 91.9 | 73.9 | 97.7 | 76.4 | 83.4 | 57.3 | 90.3 | 83.1 | 84.62 |
71 |
72 | The experiment outcomes on the CamVid dataset
73 |
74 | ### Cityscapes
75 |
76 | | Model | Baseline Architecture | road | sidewalk | building | wall | fence | pole | traffic light | traffic sign | vegetation | terrain | sky | person | rider | car | truck | bus | train | motorcycle | bicycle | mIoU |
77 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
78 | | SERNet-Former | Efficient-ResNet | 98.2 | 90.2 | 94.0 | 67.6 | 68.2 | 73.6 | 78.2 | 82.1 | 94.6 | 75.9 | 96.9 | 90.0 | 77.7 | 96.9 | 86.1 | 93.9 | 91.7 | 70.0 | 82.9 | 84.83 |
79 |
80 | The experiment outcomes on the Cityscapes dataset
81 |
82 | ## Installation Support
83 |
84 | You can simply download this repository into your environment by running
85 | ```bash
86 | git clone https://github.com/serdarch/SERNet-Former.git
87 | ```
88 |
89 | ## Citations
90 |
91 | ```bibtex
92 | @article{Erisen2024SERNetFormer,
93 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
94 |   author={Erişen, Serdar},
95 |   journal={arXiv preprint arXiv:2401.15741},
96 |   year={2024}
97 | }
98 |
99 | @inproceedings{Erisen2024CVPRW,
100 |   title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks},
101 |   author={Erişen, Serdar},
102 |   booktitle={CVPRW},
103 |   year={2024},
104 | }
105 | ```
106 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==1.4.0
2 | accelerate==0.34.2
3 | addict==2.4.0
4 | aiohappyeyeballs==2.4.0
5 | aiohttp==3.10.5
6 | aiosignal==1.3.1
7 | alabaster==0.7.16
8 | albucore==0.0.16
9 | albumentations==1.4.15
10 | aliyun-python-sdk-core==2.15.2
11 | aliyun-python-sdk-kms==2.16.5
12 | altair==4.2.2
13 | annotated-types==0.7.0
14 | anyio==3.7.1
15 | argon2-cffi==23.1.0
16 | argon2-cffi-bindings==21.2.0
17 | array_record==0.5.1
18 | arviz==0.19.0
19 | astropy==6.1.3
20 | astropy-iers-data==0.2024.9.16.0.32.21
21 | astunparse==1.6.3
22 | async-timeout==4.0.3
23 | atpublic==4.1.0
24 | attrs==24.2.0
25 | audioread==3.0.1
26 | autograd==1.7.0
27 | babel==2.16.0
28 | backcall==0.2.0
29 | beautifulsoup4==4.12.3
30 | bigframes==1.18.0
31 | bigquery-magics==0.2.0
32 | bleach==6.1.0
33 | blinker==1.4
34 | blis==0.7.11
35 | blosc2==2.0.0
36 | bokeh==3.4.3
37 | bqplot==0.12.43
38 | branca==0.7.2
39 | build==1.2.2
40 | CacheControl==0.14.0
41 | cachetools==5.5.0
42 | catalogue==2.0.10
43 | certifi==2024.8.30
44 | cffi==1.17.1
45 | chardet==5.2.0
46 | charset-normalizer==3.3.2
47 | chex==0.1.86
48 | clarabel==0.9.0
49 | click==8.1.7
50 | cloudpathlib==0.19.0
51 | cloudpickle==2.2.1
52 | cmake==3.30.3
53 | cmdstanpy==1.2.4
54 | colorama==0.4.6
55 | colorcet==3.1.0
56 | colorlover==0.3.0
57 | colour==0.1.5
58 | community==1.0.0b1
59 | confection==0.1.5
60 | cons==0.4.6
61 | contextlib2==21.6.0
62 | contourpy==1.3.0
63 | crcmod==1.7
64 | cryptography==43.0.1
65 | cuda-python==12.2.1
66 | cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.4.1-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=57366e7ef09dc63e0b389aff20df6c37d91e2790065861ee31a4720149f5b694
67 | cufflinks==0.17.3
68 | cupy-cuda12x==12.2.0
69 | cvxopt==1.3.2
70 | cvxpy==1.5.3
71 | cycler==0.12.1
72 | cymem==2.0.8
73 | Cython==3.0.11
74 | dask==2024.8.0
75 | datascience==0.17.6
76 | db-dtypes==1.3.0
77 | dbus-python==1.2.18
78 | debugpy==1.6.6
79 | decorator==4.4.2
80 | defusedxml==0.7.1
81 | distributed==2024.8.0
82 | distro==1.7.0
83 | dlib==19.24.2
84 | dm-tree==0.1.8
85 | docstring_parser==0.16
86 | docutils==0.18.1
87 | dopamine_rl==4.0.9
88 | duckdb==1.1.0
89 | earthengine-api==1.0.0
90 | easydict==1.13
91 | ecos==2.0.14
92 | editdistance==0.8.1
93 | eerepr==0.0.4
94 | einops==0.8.0
95 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
96 | entrypoints==0.4
97 | et-xmlfile==1.1.0
98 | etils==1.9.4
99 | etuples==0.3.9
100 | eval_type_backport==0.2.0
101 | exceptiongroup==1.2.2
102 | fastai==2.7.17
103 | fastcore==1.7.8
104 | fastdownload==0.0.7
105 | fastjsonschema==2.20.0
106 | fastprogress==1.0.3
107 | fastrlock==0.8.2
108 | filelock==3.14.0
109 | firebase-admin==6.5.0
110 | Flask==2.2.5
111 | flatbuffers==24.3.25
112 | flax==0.8.5
113 | folium==0.17.0
114 | fonttools==4.53.1
115 | frozendict==2.4.4
116 | frozenlist==1.4.1
117 | fsspec==2024.6.1
118 | ftfy==6.2.3
119 | future==1.0.0
120 | gast==0.6.0
121 | gcsfs==2024.6.1
122 | GDAL==3.6.4
123 | gdown==5.2.0
124 | geemap==0.34.3
125 | gensim==4.3.3
126 | geocoder==1.38.1
127 | geographiclib==2.0
128 | geopandas==1.0.1
129 | geopy==2.4.1
130 | gin-config==0.5.0
131 | glob2==0.7
132 | google==2.0.3
133 | google-ai-generativelanguage==0.6.6
134 | google-api-core==2.19.2
135 | google-api-python-client==2.137.0
136 | google-auth==2.27.0
137 | google-auth-httplib2==0.2.0
138 | google-auth-oauthlib==1.2.1
139 | google-cloud-aiplatform==1.67.1
140 | google-cloud-bigquery==3.25.0
141 | google-cloud-bigquery-connection==1.15.5
142 | google-cloud-bigquery-storage==2.26.0
143 | google-cloud-bigtable==2.26.0
144 | google-cloud-core==2.4.1
145 | google-cloud-datastore==2.19.0
146 | google-cloud-firestore==2.16.1
147 | google-cloud-functions==1.16.5
148 | google-cloud-iam==2.15.2
149 | google-cloud-language==2.13.4
150 | google-cloud-pubsub==2.23.1
151 | google-cloud-resource-manager==1.12.5
152 | google-cloud-storage==2.8.0
153 | google-cloud-translate==3.15.5
154 | google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz#sha256=07bb3e866a2fb3dc3072920a4722b4a4c9c2fc953a97253597f3e5391c3dd17c
155 | google-crc32c==1.6.0
156 | google-generativeai==0.7.2
157 | google-pasta==0.2.0
158 | google-resumable-media==2.7.2
159 | googleapis-common-protos==1.65.0
160 | googledrivedownloader==0.4
161 | graphviz==0.20.3
162 | greenlet==3.1.1
163 | grpc-google-iam-v1==0.13.1
164 | grpcio==1.64.1
165 | grpcio-status==1.48.2
166 | gspread==6.0.2
167 | gspread-dataframe==3.3.1
168 | gym==0.25.2
169 | gym-notices==0.0.8
170 | h5netcdf==1.3.0
171 | h5py==3.11.0
172 | holidays==0.57
173 | holoviews==1.19.1
174 | html5lib==1.1
175 | httpimport==1.4.0
176 | httplib2==0.22.0
177 | huggingface-hub==0.24.7
178 | humanize==4.10.0
179 | hyperopt==0.2.7
180 | ibis-framework==9.2.0
181 | idna==3.10
182 | imageio==2.35.1
183 | imageio-ffmpeg==0.5.1
184 | imagesize==1.4.1
185 | imbalanced-learn==0.12.3
186 | imgaug==0.4.0
187 | immutabledict==4.2.0
188 | importlib_metadata==8.5.0
189 | importlib_resources==6.4.5
190 | imutils==0.5.4
191 | inflect==7.4.0
192 | iniconfig==2.0.0
193 | intel-cmplr-lib-ur==2024.2.1
194 | intel-openmp==2024.2.1
195 | ipyevents==2.0.2
196 | ipyfilechooser==0.6.0
197 | ipykernel==5.5.6
198 | ipyleaflet==0.19.2
199 | ipyparallel==8.8.0
200 | ipython==7.34.0
201 | ipython-genutils==0.2.0
202 | ipython-sql==0.5.0
203 | ipytree==0.2.2
204 | ipywidgets==7.7.1
205 | itsdangerous==2.2.0
206 | jax==0.4.33
207 | jax-cuda12-pjrt==0.4.33
208 | jax-cuda12-plugin==0.4.33
209 | jaxlib==0.4.33
210 | jeepney==0.7.1
211 | jellyfish==1.1.0
212 | jieba==0.42.1
213 | Jinja2==3.1.4
214 | jmespath==0.10.0
215 | joblib==1.4.2
216 | jsonpickle==3.3.0
217 | jsonschema==4.23.0
218 | jsonschema-specifications==2023.12.1
219 | jupyter-client==6.1.12
220 | jupyter-console==6.1.0
221 | jupyter-leaflet==0.19.2
222 | jupyter-server==1.24.0
223 | jupyter_core==5.7.2
224 | jupyterlab_pygments==0.3.0
225 | jupyterlab_widgets==3.0.13
226 | kaggle==1.6.17
227 | kagglehub==0.3.0
228 | keras==3.4.1
229 | keyring==23.5.0
230 | kiwisolver==1.4.7
231 | langcodes==3.4.0
232 | language_data==1.2.0
233 | launchpadlib==1.10.16
234 | lazr.restfulclient==0.14.4
235 | lazr.uri==1.0.6
236 | lazy_loader==0.4
237 | libclang==18.1.1
238 | librosa==0.10.2.post1
239 | lightgbm==4.5.0
240 | linkify-it-py==2.0.3
241 | llvmlite==0.43.0
242 | locket==1.0.0
243 | logical-unification==0.4.6
244 | lxml==4.9.4
245 | marisa-trie==1.2.0
246 | Markdown==3.7
247 | markdown-it-py==3.0.0
248 | MarkupSafe==2.1.5
249 | matplotlib==3.7.1
250 | matplotlib-inline==0.1.7
251 | matplotlib-venn==1.1.1
252 | mdit-py-plugins==0.4.2
253 | mdurl==0.1.2
254 | miniKanren==1.0.3
255 | missingno==0.5.2
256 | mistune==0.8.4
257 | mizani==0.11.4
258 | mkl==2024.2.2
259 | ml-dtypes==0.4.1
260 | mlxtend==0.23.1
261 | model-index==0.1.11
262 | more-itertools==10.5.0
263 | moviepy==1.0.3
264 | mpmath==1.3.0
265 | msgpack==1.0.8
266 | multidict==6.1.0
267 | multipledispatch==1.0.0
268 | multitasking==0.0.11
269 | murmurhash==1.0.10
270 | music21==9.1.0
271 | namex==0.0.8
272 | natsort==8.4.0
273 | nbclassic==1.1.0
274 | nbclient==0.10.0
275 | nbconvert==6.5.4
276 | nbformat==5.10.4
277 | nest-asyncio==1.6.0
278 | networkx==3.3
279 | nibabel==5.2.1
280 | nltk==3.8.1
281 | notebook==6.5.5
282 | notebook_shim==0.2.4
283 | numba==0.60.0
284 | numexpr==2.10.1
285 | numpy==1.26.4
286 | nvidia-cublas-cu12==12.6.1.4
287 | nvidia-cuda-cupti-cu12==12.6.68
288 | nvidia-cuda-nvcc-cu12==12.6.68
289 | nvidia-cuda-runtime-cu12==12.6.68
290 | nvidia-cudnn-cu12==9.4.0.58
291 | nvidia-cufft-cu12==11.2.6.59
292 | nvidia-cusolver-cu12==11.6.4.69
293 | nvidia-cusparse-cu12==12.5.3.3
294 | nvidia-nccl-cu12==2.23.4
295 | nvidia-nvjitlink-cu12==12.6.68
296 | nvtx==0.2.10
297 | oauth2client==4.1.3
298 | oauthlib==3.2.2
299 | opencv-contrib-python==4.10.0.84
300 | opencv-python==4.10.0.84
301 | opencv-python-headless==4.10.0.84
302 | opendatalab==0.0.10
303 | openmim==0.3.9
304 | openpyxl==3.1.5
305 | openxlab==0.1.1
306 | opt-einsum==3.3.0
307 | optax==0.2.3
308 | optree==0.12.1
309 | orbax-checkpoint==0.6.4
310 | ordered-set==4.1.0
311 | osqp==0.6.7.post0
312 | oss2==2.17.0
313 | packaging==24.1
314 | pandas==2.1.4
315 | pandas-datareader==0.10.0
316 | pandas-gbq==0.23.1
317 | pandas-stubs==2.1.4.231227
318 | pandocfilters==1.5.1
319 | panel==1.4.5
320 | panopticapi @ git+https://github.com/cocodataset/panopticapi.git@7bb4655548f98f3fedc07bf37e9040a992b054b0
321 | param==2.1.1
322 | parso==0.8.4
323 | parsy==2.1
324 | partd==1.4.2
325 | pathlib==1.0.1
326 | patsy==0.5.6
327 | peewee==3.17.6
328 | pexpect==4.9.0
329 | pickleshare==0.7.5
330 | pillow==10.4.0
331 | pip-tools==7.4.1
332 | platformdirs==4.3.6
333 | plotly==5.24.1
334 | plotnine==0.13.6
335 | pluggy==1.5.0
336 | polars==1.6.0
337 | pooch==1.8.2
338 | portpicker==1.5.2
339 | prefetch_generator==1.0.3
340 | preshed==3.0.9
341 | prettytable==3.11.0
342 | proglog==0.1.10
343 | progressbar2==4.5.0
344 | prometheus_client==0.21.0
345 | promise==2.3
346 | prompt_toolkit==3.0.47
347 | prophet==1.1.5
348 | proto-plus==1.24.0
349 | protobuf==3.20.3
350 | psutil==5.9.5
351 | psycopg2==2.9.9
352 | ptyprocess==0.7.0
353 | py-cpuinfo==9.0.0
354 | py4j==0.10.9.7
355 | pyarrow==14.0.2
356 | pyarrow-hotfix==0.6
357 | pyasn1==0.6.1
358 | pyasn1_modules==0.4.1
359 | pycocotools==2.0.8
360 | pycparser==2.22
361 | pycryptodome==3.20.0
362 | pydantic==2.9.2
363 | pydantic_core==2.23.4
364 | pydata-google-auth==1.8.2
365 | pydot==3.0.1
366 | pydot-ng==2.0.0
367 | pydotplus==2.0.2
368 | PyDrive==1.3.1
369 | PyDrive2==1.20.0
370 | pyerfa==2.0.1.4
371 | pygame==2.6.0
372 | Pygments==2.18.0
373 | PyGObject==3.42.1
374 | PyJWT==2.9.0
375 | pymc==5.16.2
376 | pymystem3==0.2.0
377 | pynvjitlink-cu12==0.3.0
378 | pyogrio==0.9.0
379 | PyOpenGL==3.1.7
380 | pyOpenSSL==24.2.1
381 | pyparsing==3.1.4
382 | pyperclip==1.9.0
383 | pyproj==3.6.1
384 | pyproject_hooks==1.1.0
385 | pyshp==2.3.1
386 | PySocks==1.7.1
387 | pytensor==2.25.4
388 | pytest==7.4.4
389 | python-apt==2.4.0
390 | python-box==7.2.0
391 | python-dateutil==2.8.2
392 | python-louvain==0.16
393 | python-slugify==8.0.4
394 | python-utils==3.8.2
395 | pytz==2023.4
396 | pyviz_comms==3.0.3
397 | PyYAML==6.0.2
398 | pyzmq==24.0.1
399 | qdldl==0.1.7.post4
400 | ratelim==0.1.6
401 | referencing==0.35.1
402 | regex==2024.9.11
403 | requests==2.28.2
404 | requests-oauthlib==1.3.1
405 | requirements-parser==0.9.0
406 | rich==13.4.2
407 | rmm-cu12==24.4.0
408 | rpds-py==0.20.0
409 | rpy2==3.4.2
410 | rsa==4.9
411 | safetensors==0.4.5
412 | scikit-image==0.24.0
413 | scikit-learn==1.5.2
414 | scipy==1.13.1
415 | scooby==0.10.0
416 | scs==3.2.7
417 | seaborn==0.13.1
418 | SecretStorage==3.3.1
419 | Send2Trash==1.8.3
420 | sentencepiece==0.2.0
421 | shapely==2.0.6
422 | shellingham==1.5.4
423 | simple-parsing==0.1.6
424 | six==1.16.0
425 | sklearn-pandas==2.2.0
426 | smart-open==7.0.4
427 | sniffio==1.3.1
428 | snowballstemmer==2.2.0
429 | sortedcontainers==2.4.0
430 | soundfile==0.12.1
431 | soupsieve==2.6
432 | soxr==0.5.0.post1
433 | spacy==3.7.6
434 | spacy-legacy==3.0.12
435 | spacy-loggers==1.0.5
436 | Sphinx==5.0.2
437 | sphinxcontrib-applehelp==2.0.0
438 | sphinxcontrib-devhelp==2.0.0
439 | sphinxcontrib-htmlhelp==2.1.0
440 | sphinxcontrib-jsmath==1.0.1
441 | sphinxcontrib-qthelp==2.0.0
442 | sphinxcontrib-serializinghtml==2.0.0
443 | SQLAlchemy==2.0.35
444 | sqlglot==25.1.0
445 | sqlparse==0.5.1
446 | srsly==2.4.8
447 | stanio==0.5.1
448 | statsmodels==0.14.3
449 | StrEnum==0.4.15
450 | sympy==1.13.3
451 | tables==3.8.0
452 | tabulate==0.9.0
453 | tbb==2021.13.1
454 | tblib==3.0.0
455 | tenacity==9.0.0
456 | tensorboard==2.17.0
457 | tensorboard-data-server==0.7.2
458 | tensorflow==2.17.0
459 | tensorflow-datasets==4.9.6
460 | tensorflow-hub==0.16.1
461 | tensorflow-io-gcs-filesystem==0.37.1
462 | tensorflow-metadata==1.15.0
463 | tensorflow-probability==0.24.0
464 | tensorstore==0.1.65
465 | termcolor==2.4.0
466 | terminado==0.18.1
467 | terminaltables==3.1.10
468 | text-unidecode==1.3
469 | textblob==0.17.1
470 | tf-slim==1.1.0
471 | tf_keras==2.17.0
472 | thinc==8.2.5
473 | threadpoolctl==3.5.0
474 | tifffile==2024.9.20
475 | timm==1.0.9
476 | tinycss2==1.3.0
477 | tokenizers==0.19.1
478 | toml==0.10.2
479 | tomli==2.0.1
480 | toolz==0.12.1
481 | torch @ https://download.pytorch.org/whl/cu121_full/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=f3ed9a2b7f8671b2b32a2f036d1b81055eb3ad9b18ba43b705aa34bae4289e1a
482 | torchaudio @ https://download.pytorch.org/whl/cu121_full/torchaudio-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=da8c87c80a1c1376a48dc33eef30b03bbdf1df25a05bd2b1c620b8811c7b19be
483 | torchsummary==1.5.1
484 | torchvision @ https://download.pytorch.org/whl/cu121_full/torchvision-0.19.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=b8cc4bf381b75522995b601e07a1b433b5fd925dc3e34a7fa6cd22f449d65379
485 | tornado==6.3.3
486 | tqdm==4.65.2
487 | traitlets==5.7.1
488 | traittypes==0.2.1
489 | transformers==4.44.2
490 | tweepy==4.14.0
491 | typeguard==4.3.0
492 | typer==0.12.5
493 | types-pytz==2024.2.0.20240913
494 | types-setuptools==75.1.0.20240917
495 | typing_extensions==4.12.2
496 | tzdata==2024.1
497 | tzlocal==5.2
498 | uc-micro-py==1.0.3
499 | uritemplate==4.1.1
500 | urllib3==1.26.20
501 | vega-datasets==0.9.0
502 | wadllib==1.3.6
503 | wasabi==1.1.3
504 | wcwidth==0.2.13
505 | weasel==0.4.1
506 | webcolors==24.8.0
507 | webencodings==0.5.1
508 | websocket-client==1.8.0
509 | Werkzeug==3.0.4
510 | widgetsnbextension==3.6.9
511 | wordcloud==1.9.3
512 | wrapt==1.16.0
513 | xarray==2024.9.0
514 | xarray-einstats==0.8.0
515 | xgboost==2.1.1
516 | xlrd==2.0.1
517 | xyzservices==2024.9.0
518 | yapf==0.40.2
519 | yarl==1.11.1
520 | yellowbrick==1.5
521 | yfinance==0.2.43
522 | zict==3.0.0
523 | zipp==3.20.2
524 |
--------------------------------------------------------------------------------
/ImageSegmentation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNhlaPsPB9W8n1aX35ZbIiR",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 |         ""
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "# **Image Segmentation**"
33 | ],
34 | "metadata": {
35 | "id": "GlHJjHksu6hh"
36 | }
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "source": [
41 | "Please save a copy of this file into your own drive/environment instead of asking for the shared version.\n",
42 | "\n",
43 | "请将此文件保存到您自己的 Google Drive 或操作系统环境中,而不是请求共享版本\n",
44 | "\n",
45 | "このファイルを共有バージョンを要求するのではなく、自分の Google ドライブや OS 環境にコピーを保存してください"
46 | ],
47 | "metadata": {
48 | "id": "V-XSpU5MoEgs"
49 | }
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "id": "0txetkFtI9Bv"
56 | },
57 | "outputs": [],
58 | "source": [
59 | "import torch\n",
60 | "import torch.nn as nn\n",
61 | "import torchvision.models.segmentation as segmentation\n",
62 | "from torchvision.transforms import Compose, Resize, ToTensor, Normalize\n",
63 | "from PIL import Image\n",
64 | "import numpy as np"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "source": [
70 | "from torchvision.io.image import read_image\n",
71 | "from torchvision.models.segmentation import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights\n",
72 | "from torchvision.transforms.functional import to_pil_image"
73 | ],
74 | "metadata": {
75 | "id": "EvI-1SbZMhQU"
76 | },
77 | "execution_count": null,
78 | "outputs": []
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "# Step 1: Initialize model with the best available weights\n",
84 | "weights = DeepLabV3_ResNet101_Weights.DEFAULT\n",
85 | "model = deeplabv3_resnet101(weights=weights)\n",
86 | "model.eval()"
87 | ],
88 | "metadata": {
89 | "colab": {
90 | "base_uri": "https://localhost:8080/"
91 | },
92 | "id": "5cTuZ2c7Mkxq",
93 | "outputId": "6ec742fa-cecd-4070-a268-62501aa3e696"
94 | },
95 | "execution_count": null,
96 | "outputs": [
97 | {
98 | "output_type": "stream",
99 | "name": "stderr",
100 | "text": [
101 | "Downloading: \"https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth\" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth\n",
102 | "100%|██████████| 233M/233M [00:02<00:00, 83.8MB/s]\n"
103 | ]
104 | },
105 | {
106 | "output_type": "execute_result",
107 | "data": {
108 | "text/plain": [
109 | "DeepLabV3(\n",
110 | " (backbone): IntermediateLayerGetter(\n",
111 | " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n",
112 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
113 | " (relu): ReLU(inplace=True)\n",
114 | " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n",
115 | " (layer1): Sequential(\n",
116 | " (0): Bottleneck(\n",
117 | " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
118 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
119 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
120 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
121 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
122 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
123 | " (relu): ReLU(inplace=True)\n",
124 | " (downsample): Sequential(\n",
125 | " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
126 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
127 | " )\n",
128 | " )\n",
129 | " (1): Bottleneck(\n",
130 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
131 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
132 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
133 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
134 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
135 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
136 | " (relu): ReLU(inplace=True)\n",
137 | " )\n",
138 | " (2): Bottleneck(\n",
139 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
140 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
141 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
142 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
143 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
144 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
145 | " (relu): ReLU(inplace=True)\n",
146 | " )\n",
147 | " )\n",
148 | " (layer2): Sequential(\n",
149 | " (0): Bottleneck(\n",
150 | " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
151 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
152 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n",
153 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
154 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
155 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
156 | " (relu): ReLU(inplace=True)\n",
157 | " (downsample): Sequential(\n",
158 | " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n",
159 | " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
160 | " )\n",
161 | " )\n",
162 | " (1): Bottleneck(\n",
163 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
164 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
165 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
166 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
167 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
168 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
169 | " (relu): ReLU(inplace=True)\n",
170 | " )\n",
171 | " (2): Bottleneck(\n",
172 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
173 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
174 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
175 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
176 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
177 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
178 | " (relu): ReLU(inplace=True)\n",
179 | " )\n",
180 | " (3): Bottleneck(\n",
181 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
182 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
183 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
184 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
185 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
186 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
187 | " (relu): ReLU(inplace=True)\n",
188 | " )\n",
189 | " )\n",
190 | " (layer3): Sequential(\n",
191 | " (0): Bottleneck(\n",
192 | " (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
193 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
194 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
195 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
196 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
197 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
198 | " (relu): ReLU(inplace=True)\n",
199 | " (downsample): Sequential(\n",
200 | " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
201 | " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
202 | " )\n",
203 | " )\n",
204 | " (1): Bottleneck(\n",
205 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
206 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
207 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
208 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
209 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
210 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
211 | " (relu): ReLU(inplace=True)\n",
212 | " )\n",
213 | " (2): Bottleneck(\n",
214 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
215 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
216 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
217 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
218 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
219 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
220 | " (relu): ReLU(inplace=True)\n",
221 | " )\n",
222 | " (3): Bottleneck(\n",
223 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
224 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
225 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
226 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
227 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
228 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
229 | " (relu): ReLU(inplace=True)\n",
230 | " )\n",
231 | " (4): Bottleneck(\n",
232 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
233 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
234 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
235 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
236 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
237 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
238 | " (relu): ReLU(inplace=True)\n",
239 | " )\n",
240 | " (5): Bottleneck(\n",
241 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
242 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
243 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
244 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
245 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
246 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
247 | " (relu): ReLU(inplace=True)\n",
248 | " )\n",
249 | " (6): Bottleneck(\n",
250 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
251 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
252 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
253 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
254 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
255 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
256 | " (relu): ReLU(inplace=True)\n",
257 | " )\n",
258 | " (7): Bottleneck(\n",
259 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
260 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
261 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
262 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
263 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
264 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
265 | " (relu): ReLU(inplace=True)\n",
266 | " )\n",
267 | " (8): Bottleneck(\n",
268 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
269 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
270 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
271 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
272 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
273 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
274 | " (relu): ReLU(inplace=True)\n",
275 | " )\n",
276 | " (9): Bottleneck(\n",
277 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
278 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
279 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
280 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
281 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
282 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
283 | " (relu): ReLU(inplace=True)\n",
284 | " )\n",
285 | " (10): Bottleneck(\n",
286 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
287 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
288 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
289 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
290 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
291 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
292 | " (relu): ReLU(inplace=True)\n",
293 | " )\n",
294 | " (11): Bottleneck(\n",
295 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
296 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
297 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
298 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
299 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
300 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
301 | " (relu): ReLU(inplace=True)\n",
302 | " )\n",
303 | " (12): Bottleneck(\n",
304 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
305 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
306 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
307 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
308 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
309 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
310 | " (relu): ReLU(inplace=True)\n",
311 | " )\n",
312 | " (13): Bottleneck(\n",
313 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
314 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
315 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
316 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
317 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
318 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
319 | " (relu): ReLU(inplace=True)\n",
320 | " )\n",
321 | " (14): Bottleneck(\n",
322 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
323 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
324 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
325 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
326 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
327 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
328 | " (relu): ReLU(inplace=True)\n",
329 | " )\n",
330 | " (15): Bottleneck(\n",
331 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
332 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
333 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
334 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
335 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
336 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
337 | " (relu): ReLU(inplace=True)\n",
338 | " )\n",
339 | " (16): Bottleneck(\n",
340 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
341 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
342 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
343 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
344 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
345 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
346 | " (relu): ReLU(inplace=True)\n",
347 | " )\n",
348 | " (17): Bottleneck(\n",
349 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
350 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
351 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
352 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
353 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
354 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
355 | " (relu): ReLU(inplace=True)\n",
356 | " )\n",
357 | " (18): Bottleneck(\n",
358 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
359 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
360 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
361 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
362 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
363 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
364 | " (relu): ReLU(inplace=True)\n",
365 | " )\n",
366 | " (19): Bottleneck(\n",
367 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
368 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
369 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
370 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
371 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
372 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
373 | " (relu): ReLU(inplace=True)\n",
374 | " )\n",
375 | " (20): Bottleneck(\n",
376 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
377 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
378 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
379 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
380 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
381 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
382 | " (relu): ReLU(inplace=True)\n",
383 | " )\n",
384 | " (21): Bottleneck(\n",
385 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
386 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
387 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
388 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
389 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
390 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
391 | " (relu): ReLU(inplace=True)\n",
392 | " )\n",
393 | " (22): Bottleneck(\n",
394 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
395 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
396 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
397 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
398 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
399 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
400 | " (relu): ReLU(inplace=True)\n",
401 | " )\n",
402 | " )\n",
403 | " (layer4): Sequential(\n",
404 | " (0): Bottleneck(\n",
405 | " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
406 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
407 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n",
408 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
409 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
410 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
411 | " (relu): ReLU(inplace=True)\n",
412 | " (downsample): Sequential(\n",
413 | " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
414 | " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
415 | " )\n",
416 | " )\n",
417 | " (1): Bottleneck(\n",
418 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
419 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
420 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n",
421 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
422 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
423 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
424 | " (relu): ReLU(inplace=True)\n",
425 | " )\n",
426 | " (2): Bottleneck(\n",
427 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
428 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
429 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n",
430 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
431 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
432 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
433 | " (relu): ReLU(inplace=True)\n",
434 | " )\n",
435 | " )\n",
436 | " )\n",
437 | " (classifier): DeepLabHead(\n",
438 | " (0): ASPP(\n",
439 | " (convs): ModuleList(\n",
440 | " (0): Sequential(\n",
441 | " (0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
442 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
443 | " (2): ReLU()\n",
444 | " )\n",
445 | " (1): ASPPConv(\n",
446 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(12, 12), dilation=(12, 12), bias=False)\n",
447 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
448 | " (2): ReLU()\n",
449 | " )\n",
450 | " (2): ASPPConv(\n",
451 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(24, 24), dilation=(24, 24), bias=False)\n",
452 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
453 | " (2): ReLU()\n",
454 | " )\n",
455 | " (3): ASPPConv(\n",
456 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(36, 36), dilation=(36, 36), bias=False)\n",
457 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
458 | " (2): ReLU()\n",
459 | " )\n",
460 | " (4): ASPPPooling(\n",
461 | " (0): AdaptiveAvgPool2d(output_size=1)\n",
462 | " (1): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
463 | " (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
464 | " (3): ReLU()\n",
465 | " )\n",
466 | " )\n",
467 | " (project): Sequential(\n",
468 | " (0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
469 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
470 | " (2): ReLU()\n",
471 | " (3): Dropout(p=0.5, inplace=False)\n",
472 | " )\n",
473 | " )\n",
474 | " (1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
475 | " (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
476 | " (3): ReLU()\n",
477 | " (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n",
478 | " )\n",
479 | " (aux_classifier): FCNHead(\n",
480 | " (0): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n",
481 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
482 | " (2): ReLU()\n",
483 | " (3): Dropout(p=0.1, inplace=False)\n",
484 | " (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n",
485 | " )\n",
486 | ")"
487 | ]
488 | },
489 | "metadata": {},
490 | "execution_count": 3
491 | }
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "source": [
497 | "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg\n",
498 | "im1= read_image(\"input.jpg\")"
499 | ],
500 | "metadata": {
501 | "id": "bwizfLVML1pL"
502 | },
503 | "execution_count": null,
504 | "outputs": []
505 | },
506 | {
507 | "cell_type": "code",
508 | "source": [
509 | "# Step 2: Initialize the inference transforms\n",
510 | "preprocess = weights.transforms()\n",
511 | "\n",
512 | "# Step 3: Apply inference preprocessing transforms\n",
513 | "batch = preprocess(im1).unsqueeze(0)\n",
514 | "\n",
515 | "# Step 4: Use the model and visualize the prediction\n",
516 | "prediction = model(batch)[\"out\"]\n",
517 | "normalized_masks = prediction.softmax(dim=1)\n",
518 | "class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta[\"categories\"])}"
519 | ],
520 | "metadata": {
521 | "id": "XuZL4yWRS8Gx"
522 | },
523 | "execution_count": null,
524 | "outputs": []
525 | },
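{
"cell_type": "code",
"source": [
"# Optional check, assuming normalized_masks and weights from the cells above are in scope:\n",
"# take the per-pixel argmax over the 21 class channels and list the categories the model\n",
"# predicts somewhere in this image.\n",
"class_map = normalized_masks.argmax(dim=1)[0]  # HxW tensor of class indices\n",
"predicted = [weights.meta[\"categories\"][i] for i in class_map.unique().tolist()]\n",
"print(predicted)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},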
526 | {
527 | "cell_type": "code",
528 | "source": [
529 | "print(weights.meta[\"categories\"])"
530 | ],
531 | "metadata": {
532 | "colab": {
533 | "base_uri": "https://localhost:8080/"
534 | },
535 | "id": "mJgEIsYxPbix",
536 | "outputId": "410ab130-581c-4551-ae8a-cdb6e9626d83"
537 | },
538 | "execution_count": null,
539 | "outputs": [
540 | {
541 | "output_type": "stream",
542 | "name": "stdout",
543 | "text": [
544 | "['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']\n"
545 | ]
546 | }
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "source": [
552 | "mask = normalized_masks[0, class_to_idx[\"aeroplane\"]]"
553 | ],
554 | "metadata": {
555 | "id": "f1R-TRZuPnnM"
556 | },
557 | "execution_count": null,
558 | "outputs": []
559 | },
560 | {
561 | "cell_type": "code",
562 | "source": [
563 | "to_pil_image(mask).show()"
564 | ],
565 | "metadata": {
566 | "id": "IsO6WXQdytTG"
567 | },
568 | "execution_count": null,
569 | "outputs": []
570 | },
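{
"cell_type": "code",
"source": [
"# Optional overlay sketch, assuming im1 (uint8 image tensor from read_image) and mask\n",
"# (soft aeroplane probabilities) from the cells above: threshold the probabilities into\n",
"# a boolean mask and draw it over the input image with torchvision's draw_segmentation_masks.\n",
"from torchvision.utils import draw_segmentation_masks\n",
"\n",
"bool_mask = mask > 0.5\n",
"overlay = draw_segmentation_masks(im1, masks=bool_mask.unsqueeze(0), alpha=0.7)\n",
"to_pil_image(overlay).show()"
],
"metadata": {},
"execution_count": null,
"outputs": []
},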
571 | {
572 | "cell_type": "code",
573 | "source": [
574 | "from google.colab import drive\n",
575 | "drive.mount('/content/gdrive')"
576 | ],
577 | "metadata": {
578 | "colab": {
579 | "base_uri": "https://localhost:8080/"
580 | },
581 | "id": "-Gei9D03SWmM",
582 | "outputId": "51672b3c-5bd8-414a-d2f4-d78e9b94e576"
583 | },
584 | "execution_count": null,
585 | "outputs": [
586 | {
587 | "output_type": "stream",
588 | "name": "stdout",
589 | "text": [
590 | "Mounted at /content/gdrive\n"
591 | ]
592 | }
593 | ]
594 | },
595 | {
596 | "cell_type": "code",
597 | "source": [
598 | "torch.save(mask, '***/MyDrive/mask.pt')"
599 | ],
600 | "metadata": {
601 | "id": "ODJZDuIBSbmm"
602 | },
603 | "execution_count": null,
604 | "outputs": []
605 | }
606 | ]
607 | }
--------------------------------------------------------------------------------