├── modules ├── gitignore ├── AbG │ └── requirements.txt ├── AbM │ └── requirements.txt ├── AfN │ └── requirements.txt ├── DfN │ └── requirements.txt └── Efficient-ResNet │ └── gitClone ├── tools ├── gitignore └── requirements.txt ├── data ├── ADE20K │ └── ADE20K_val.mlx ├── BDD100K │ └── bdd100k_val.mlx ├── CamVid │ ├── CamVid_NetworkMetrics.mat │ ├── camVidConfig.py │ └── readMe.md └── Cityscapes │ ├── cityscapesConfig.py │ ├── LICENSE │ └── readMe.md ├── models ├── segmentation │ ├── getStarted │ ├── importSegmentation.py │ └── readMe.md └── classification │ ├── getStarted │ ├── lstm.m │ ├── fcn.m │ ├── cnn.m │ ├── importClassification.py │ ├── lstm.py │ ├── fcn.py │ ├── cnn.py │ ├── cnn_d3_v2.m │ ├── cnn_d3_v2.py │ └── readMe.md ├── libs ├── requirements.txt └── readMe.md ├── LICENSE ├── ImageClassification.ipynb ├── README.md ├── requirements.txt └── ImageSegmentation.ipynb /modules/gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tools/gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/ADE20K/ADE20K_val.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/ADE20K/ADE20K_val.mlx -------------------------------------------------------------------------------- /data/BDD100K/bdd100k_val.mlx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/BDD100K/bdd100k_val.mlx -------------------------------------------------------------------------------- /data/CamVid/CamVid_NetworkMetrics.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/serdarch/SERNet-Former/HEAD/data/CamVid/CamVid_NetworkMetrics.mat -------------------------------------------------------------------------------- /models/segmentation/getStarted: -------------------------------------------------------------------------------- 1 | #SERNet-Former uses versions of Efficient-ResNet as the baseline architectures in semantic segmentation tasks. 2 | -------------------------------------------------------------------------------- /models/classification/getStarted: -------------------------------------------------------------------------------- 1 | #SERNet-Former applies versions of Efficient-ResNet as the baseline architectures 2 | #for classification tasks. 3 | -------------------------------------------------------------------------------- /libs/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch 2 | torchvision 3 | matplotlib 4 | numpy 5 | packaging 6 | prettytable 7 | scipy 8 | codecov 9 | flake8 10 | ftfy 11 | interrogate 12 | pytest 13 | regex 14 | yapf 15 | -------------------------------------------------------------------------------- /modules/AbG/requirements.txt: -------------------------------------------------------------------------------- 1 | Serdar Erişen, 2024. 2 | All rights are reserved! 3 | 4 | Details are TBD 5 | 6 | The code runs with MATLAB 7 | 8 | Versions for different languages are being developed. 
9 | -------------------------------------------------------------------------------- /tools/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch 2 | torchvision 3 | matplotlib 4 | numpy 5 | packaging 6 | prettytable 7 | scipy 8 | codecov 9 | flake8 10 | ftfy 11 | interrogate 12 | pytest 13 | regex 14 | yapf 15 | -------------------------------------------------------------------------------- /models/segmentation/importSegmentation.py: -------------------------------------------------------------------------------- 1 | import torchvision.models as models 2 | deeplabv3_resnet50 = models.deeplabv3_resnet50(pretrained=True) 3 | deeplabv3_resnet101 = models.deeplabv3_resnet101(pretrained=True) -------------------------------------------------------------------------------- /modules/AbM/requirements.txt: -------------------------------------------------------------------------------- 1 | Serdar Erişen, 2024. 2 | All rights are reserved! 3 | 4 | Details are TBD 5 | 6 | The code runs with MATLAB 7 | 8 | Versions for different languages are being developed. 9 | 10 | -------------------------------------------------------------------------------- /modules/AfN/requirements.txt: -------------------------------------------------------------------------------- 1 | Serdar Erişen, 2024. 2 | All rights are reserved! 3 | 4 | Details are TBD 5 | 6 | The code runs with MATLAB 7 | 8 | Versions for different languages are being developed. 9 | 10 | 11 | -------------------------------------------------------------------------------- /modules/DfN/requirements.txt: -------------------------------------------------------------------------------- 1 | Serdar Erişen, 2024. 2 | All rights are reserved! 3 | 4 | Details are TBD 5 | 6 | The code runs with MATLAB 7 | 8 | Versions for different languages are being developed. 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /modules/Efficient-ResNet/gitClone: -------------------------------------------------------------------------------- 1 | #You can clone the repository of Efficient-ResNet https://github.com/serdarch/Efficient-ResNet.git 2 | #into your environment. 
3 | 4 | git clone https://github.com/serdarch/Efficient-ResNet.git 5 | -------------------------------------------------------------------------------- /models/classification/lstm.m: -------------------------------------------------------------------------------- 1 | layers = [ 2 | sequenceInputLayer(numFeatures) 3 | lstmLayer(hiddenSize, 'OutputMode', 'last') 4 | fullyConnectedLayer(numClasses) 5 | softmaxLayer 6 | classificationLayer 7 | ]; 8 | -------------------------------------------------------------------------------- /models/classification/fcn.m: -------------------------------------------------------------------------------- 1 | hiddenSize = 100; 2 | layers = [ 3 | imageInputLayer([28 28 1]) 4 | fullyConnectedLayer(hiddenSize) 5 | reluLayer 6 | fullyConnectedLayer(numClasses) 7 | softmaxLayer 8 | classificationLayer 9 | ]; 10 | -------------------------------------------------------------------------------- /models/classification/cnn.m: -------------------------------------------------------------------------------- 1 | layers = [ 2 | imageInputLayer([28 28 1]) 3 | convolution2dLayer(5, 20) 4 | reluLayer 5 | maxPooling2dLayer(2, 'Stride', 2) 6 | fullyConnectedLayer(numClasses) 7 | softmaxLayer 8 | classificationLayer 9 | ]; 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | All rights reserved. 2 | 3 | Copyright (c) 2024 Serdar Erişen 4 | 5 | The copyright holder reserves all the rights provided by the copyright law, 6 | such as distribution, performance, and the creation of derivative works. 7 | 8 | This repository is being developed only to backup and augment the different language options 9 | of the SERNet-Former and increase the research capacities throughout 10 | the open-source codes and datasets, 11 | but not directly shares the original details of the network. 
12 | -------------------------------------------------------------------------------- /models/classification/importClassification.py: -------------------------------------------------------------------------------- 1 | import torchvision.models as models 2 | resnet50 = models.resnet50(pretrained=True) 3 | resnet101 = models.resnet101(pretrained=True) 4 | resnext101 = models.resnext101_64X4D1(pretrained=True) 5 | efficientnet_b6 = models.efficientnet_b6(pretrained=True) 6 | regnet_y_128gg = models.regnet_Y_128GF_SWAG_E2E_V1(pretrained=True) 7 | vit_b_16 = models.vit_b_16_SWAG_E2E_V16(pretrained=True) 8 | vit_b_32 = models.vit_b_32(pretrained=True) 9 | vit_l_16 = models.vit_l_16(pretrained=True) 10 | -------------------------------------------------------------------------------- /models/classification/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class YourModel(nn.Module): 5 | def __init__(self, num_features, hidden_size, num_classes): 6 | super(YourModel, self).__init__() 7 | self.lstm = nn.LSTM(num_features, hidden_size, batch_first=True) 8 | self.fc = nn.Linear(hidden_size, num_classes) 9 | self.softmax = nn.Softmax(dim=1) # Softmax across classes 10 | 11 | def forward(self, x): 12 | _, (h_n, _) = self.lstm(x) 13 | x = self.fc(h_n.squeeze(0)) 14 | x = self.softmax(x) 15 | return x 16 | 17 | # Example usage: 18 | # model = YourModel(num_features=numFeatures, hidden_size=hiddenSize, num_classes=numClasses) 19 | -------------------------------------------------------------------------------- /data/CamVid/camVidConfig.py: -------------------------------------------------------------------------------- 1 | # @package training 2 | # Those arguments defines the training hyper-parameters 3 | epochs: 80 4 | num_workers: 1 5 | batch_size: 3 6 | shuffle: True 7 | cuda: 0 8 | precompute_multi_scale: False # Compute multiscate features on cpu for faster training / inference 9 | optim: 10 | base_lr: 0.001 11 | optimizer: 12 | class: SGD 13 | params: 14 | lr: ${training.optim.base_lr} 15 | lr_scheduler: ${lr_scheduler_v} 16 | bn_scheduler: 17 | bn_policy: "step_decay" 18 | params: 19 | bn_momentum: 0.9 20 | bn_decay: 0.95 21 | decay_step: 10 22 | bn_clip: 1 23 | weight_name: "latest" # can be named/changed according to the shared model weights 24 | enable_cudnn: False 25 | checkpoint_dir: "..." 26 | -------------------------------------------------------------------------------- /data/Cityscapes/cityscapesConfig.py: -------------------------------------------------------------------------------- 1 | # @package training 2 | # Those arguments defines the training hyper-parameters 3 | epochs: 80 4 | num_workers: 1 5 | batch_size: 1 6 | shuffle: True 7 | cuda: 0 8 | precompute_multi_scale: False # Compute multiscate features on cpu for faster training / inference 9 | optim: 10 | base_lr: 0.0005 11 | optimizer: 12 | class: SGD 13 | params: 14 | lr: ${training.optim.base_lr} 15 | lr_scheduler: ${lr_scheduler} 16 | bn_scheduler: 17 | bn_policy: "step_decay" 18 | params: 19 | bn_momentum: 0.9 20 | bn_decay: 0.95 21 | decay_step: 10 22 | bn_clip: 1 23 | weight_name: "latest" # can be named/changed according to the shared model weights 24 | enable_cudnn: False 25 | checkpoint_dir: "..." 
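The dataset config files (`camVidConfig.py`, `cityscapesConfig.py`) are written as YAML-style key/value blocks, and the `${...}` entries (e.g. `${training.optim.base_lr}`) are Hydra/OmegaConf-style interpolations left to the training framework to resolve. A minimal sketch of inspecting such a config with PyYAML (listed in the requirements), assuming the file parses as plain YAML:

```python
import yaml  # PyYAML

# Path assumed from the repository layout; the file is YAML-style despite the .py extension.
with open("data/Cityscapes/cityscapesConfig.py") as f:
    cfg = yaml.safe_load(f)

print(cfg["epochs"], cfg["batch_size"])  # 80, 1
# Nested values such as the optimizer lr keep their "${...}" placeholders here;
# resolving them is up to the training framework.
```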
26 | -------------------------------------------------------------------------------- /models/classification/fcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class YourModel(nn.Module): 5 | def __init__(self, hidden_size, num_classes): 6 | super(YourModel, self).__init__() 7 | self.fc1 = nn.Linear(28 * 28 * 1, hidden_size) 8 | self.relu = nn.ReLU() 9 | self.fc2 = nn.Linear(hidden_size, num_classes) 10 | self.softmax = nn.Softmax(dim=1) # Softmax across classes 11 | 12 | def forward(self, x): 13 | x = x.view(-1, 28 * 28 * 1) # Flatten the input images 14 | x = self.fc1(x) 15 | x = self.relu(x) 16 | x = self.fc2(x) 17 | x = self.softmax(x) 18 | return x 19 | 20 | # Example usage: 21 | # model = YourModel(hidden_size=100, num_classes=numClasses) 22 | -------------------------------------------------------------------------------- /models/classification/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class YourModel(nn.Module): 5 | def __init__(self, num_classes): 6 | super(YourModel, self).__init__() 7 | self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5) 8 | self.relu = nn.ReLU() 9 | self.pool = nn.MaxPool2d(kernel_size=2, stride=2) 10 | self.fc = nn.Linear(20 * 12 * 12, num_classes) # Assuming input image size is 28x28 11 | 12 | def forward(self, x): 13 | x = self.conv1(x) 14 | x = self.relu(x) 15 | x = self.pool(x) 16 | x = x.view(-1, 20 * 12 * 12) # Flatten the tensor for fully connected layer 17 | x = self.fc(x) 18 | return x 19 | 20 | # Example usage: 21 | # model = YourModel(num_classes=numClasses) 22 | -------------------------------------------------------------------------------- /models/segmentation/readMe.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | The models are the open-source segmentation baseline checkpoints 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
| Baseline | Download |
| --- | --- |
| DeepLab_v3 ResNet-50 | model |
| DeepLab_v3 ResNet-101 | model |
| HR-Net W48 | model |
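The two DeepLab_v3 entries correspond to torchvision's segmentation baselines, so they can be loaded directly through torchvision's weights API (the newer equivalent of the `pretrained=True` calls in `importSegmentation.py`); the HR-Net W48 checkpoint is not part of torchvision and needs its own loader. A minimal sketch:

```python
# Minimal sketch: load the DeepLab_v3 baselines listed above via torchvision's weights API.
from torchvision.models.segmentation import (
    deeplabv3_resnet50, DeepLabV3_ResNet50_Weights,
    deeplabv3_resnet101, DeepLabV3_ResNet101_Weights,
)

deeplabv3_r50 = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.DEFAULT)
deeplabv3_r101 = deeplabv3_resnet101(weights=DeepLabV3_ResNet101_Weights.DEFAULT)
deeplabv3_r50.eval()
deeplabv3_r101.eval()
```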
24 | -------------------------------------------------------------------------------- /data/Cityscapes/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Serdar Erişen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /models/classification/cnn_d3_v2.m: -------------------------------------------------------------------------------- 1 | CNN_D3_layers = [ 2 | imageInputLayer([1 6 1], "Name", "imageinput") 3 | convolution2dLayer([1 1], 4, "Name", "conv", "Padding", "same") 4 | batchNormalizationLayer("Name", "batchnorm") 5 | reluLayer("Name", "relu") 6 | globalMaxPooling2dLayer("Name", "gmpool") 7 | convolution2dLayer([1 1], 8, "Name", "conv_1", "Padding", "same") 8 | convolution2dLayer([1 1], 8, "Name", "conv_2", "Padding", "same") 9 | batchNormalizationLayer("Name", "batchnorm_1") 10 | reluLayer("Name", "relu_1") 11 | globalMaxPooling2dLayer("Name", "gmpool_1") 12 | convolution2dLayer([1 1], 16, "Name", "conv_3", "Padding", "same") 13 | convolution2dLayer([1 1], 16, "Name", "conv_4", "Padding", "same") 14 | batchNormalizationLayer("Name", "batchnorm_2") 15 | convolution2dLayer([1 1], 32, "Name", "conv_5", "Padding", "same") 16 | convolution2dLayer([1 1], 32, "Name", "conv_6", "Padding", "same") 17 | batchNormalizationLayer("Name", "batchnorm_3") 18 | reluLayer("Name", "relu_2") 19 | fullyConnectedLayer(4, "Name", "fc") 20 | softmaxLayer("Name", "softmax") 21 | classificationLayer("Name", "classoutput") 22 | ]; 23 | 24 | -------------------------------------------------------------------------------- /models/classification/cnn_d3_v2.py: -------------------------------------------------------------------------------- 1 | # Python code for CNN_D3_v2 architecture 2 | # Serdar Erisen, 2024 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | class CNN_D3_v2(nn.Module): 8 | def __init__(self): 9 | super(CNN_D3_v2, self).__init__() 10 | self.conv = nn.Conv2d(1, 4, kernel_size=1, padding='same') 11 | self.batchnorm = nn.BatchNorm2d(4) 12 | self.relu = nn.ReLU() 13 | self.gmpool = nn.AdaptiveMaxPool2d((1, 1)) 14 | 15 | self.conv_1 = nn.Conv2d(4, 8, kernel_size=1, padding='same') 16 | self.conv_2 = nn.Conv2d(8, 8, kernel_size=1, padding='same') 17 | self.batchnorm_1 = nn.BatchNorm2d(8) 18 | self.relu_1 = nn.ReLU() 19 | self.gmpool_1 = nn.AdaptiveMaxPool2d((1, 1)) 20 
| 21 | self.conv_3 = nn.Conv2d(8, 16, kernel_size=1, padding='same') 22 | self.conv_4 = nn.Conv2d(16, 16, kernel_size=1, padding='same') 23 | self.batchnorm_2 = nn.BatchNorm2d(16) 24 | 25 | self.conv_5 = nn.Conv2d(16, 32, kernel_size=1, padding='same') 26 | self.conv_6 = nn.Conv2d(32, 32, kernel_size=1, padding='same') 27 | self.batchnorm_3 = nn.BatchNorm2d(32) 28 | self.relu_2 = nn.ReLU() 29 | 30 | self.fc = nn.Linear(32, 4) 31 | self.softmax = nn.Softmax(dim=1) 32 | 33 | def forward(self, x): 34 | x = self.conv(x) 35 | x = self.batchnorm(x) 36 | x = self.relu(x) 37 | x = self.gmpool(x) 38 | 39 | x = self.conv_1(x) 40 | x = self.conv_2(x) 41 | x = self.batchnorm_1(x) 42 | x = self.relu_1(x) 43 | x = self.gmpool_1(x) 44 | 45 | x = self.conv_3(x) 46 | x = self.conv_4(x) 47 | x = self.batchnorm_2(x) 48 | 49 | x = self.conv_5(x) 50 | x = self.conv_6(x) 51 | x = self.batchnorm_3(x) 52 | x = self.relu_2(x) 53 | 54 | x = torch.flatten(x, 1) 55 | 56 | x = self.fc(x) 57 | x = self.softmax(x) 58 | 59 | return x 60 | 61 | # Example usage: 62 | # model = CNN_D3_v2() 63 | # # Print model architecture 64 | # print(model) 65 | 66 | 67 | -------------------------------------------------------------------------------- /data/Cityscapes/readMe.md: -------------------------------------------------------------------------------- 1 | ⁰# Cityscapes dataset 2 | 3 | Cityscapes is one of the most challenging datasets for the semantic segmentation of urban street scenes. 4 | 5 | It contains high-quality pixel-level annotations for 5000 images, as well as coarsely annotated 20000 images. 6 | 7 | The dataset contains diverse stereo video sequences with the sizes of 1024 by 2048 pixels, 8 | recorded during the daytime of 50 European cities visited in several months (spring, summer, and fall) 9 | with good or average weather conditions. 10 | 11 | The dataset of 5000 fine annotations is divided into three sets: 2975 for training, 500 for validation, and 1525 for testing. 12 | 13 | The dataset includes semantic, instance-wise, and dense pixel annotations of 30 classes grouped into eight categories. 14 | 15 | However, most literature uses annotations with 20 classes, 19 of which are semantic labels containing objects and stuff, 16 | in addition to one additional void class for do-not-care regions. 17 | # Models 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
| Model / Method | Baseline | mIoU |
| --- | --- | --- |
| SERNet-Former | ResNet-50 | 73.31 |
| SERNet-Former | Efficient-ResNet_R101 | 77.04 |
| SERNet-Former | Efficient-ResNet [final] | 84.83 |
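The mIoU values above are the mean of the per-class intersection-over-union scores over the 19 evaluated classes, with the void class ignored. A minimal sketch of that metric, assuming integer label maps with 255 as the ignore index (the evaluation loop and variable names are illustrative, not the original evaluation code):

```python
import numpy as np

NUM_CLASSES = 19
IGNORE_INDEX = 255  # void / do-not-care label

def update_confusion(conf, pred, gt):
    """Accumulate a (gt x pred) confusion matrix from two HxW integer arrays."""
    valid = gt != IGNORE_INDEX
    idx = NUM_CLASSES * gt[valid].astype(np.int64) + pred[valid].astype(np.int64)
    conf += np.bincount(idx, minlength=NUM_CLASSES ** 2).reshape(NUM_CLASSES, NUM_CLASSES)
    return conf

def mean_iou(conf):
    tp = np.diag(conf).astype(np.float64)
    union = conf.sum(axis=0) + conf.sum(axis=1) - tp
    iou = np.where(union > 0, tp / np.maximum(union, 1), np.nan)
    return np.nanmean(iou)

# conf = np.zeros((NUM_CLASSES, NUM_CLASSES), dtype=np.int64)
# for pred, gt in val_pairs:  # hypothetical iterable of (prediction, label) pairs
#     conf = update_confusion(conf, pred, gt)
# print(f"mIoU: {100 * mean_iou(conf):.2f} %")
```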
40 | 41 | ## Please cite 42 | 43 | ```bibtex 44 | @inproceedings{Cordts2016CVPR, 45 | title={The cityscapes dataset for semantic urban scene understanding}, 46 | author={M. Cordts and M. Omran and S. Ramos and T. Rehfeld and M. Enzweiler and R. Benenson and U. Franke and S. Roth, and B. Schiele}, 47 | booktitle={CVPR}, 48 | year={2016}, 49 | } 50 | 51 | @article{Erisen2024SERNetFormer, 52 | title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks}, 53 | author={Erişen, Serdar}, 54 | journal={arXiv preprint arXiv:2401.15741}, 55 | year={2024} 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /models/classification/readMe.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | The models are the open-source checkpoints pretrained on ImageNet dataset 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |
| Baseline | Download |
| --- | --- |
| ResNet-50 | model |
| ResNet-101 | model |
| Swin_V2_S | model |
| Swin_V2_B | model |
| ViT_B_16_SWAG_E2E_V1 | model |
| ViT_H_14_SWAG_E2E_V1 | model |
| EfficientNet_B6 | model |
| EfficientNet_V2_L | model |
| RegNet_Y_128GF_SWAG_E2E_V1 | model |
| ResNeXt101_64X4D | model |
| CNN-D3 | model |
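All entries above except CNN-D3 are standard torchvision ImageNet checkpoints; the corresponding torchvision constructors use lower-case names (e.g. `resnet50`, `resnext101_64x4d`, `regnet_y_128gf`, `vit_b_16`). A minimal sketch of loading one of them with the weights API and classifying a single image, along the lines of `ImageClassification.ipynb` (the image path is a placeholder; swap in any other model/weights pair from the table):

```python
import torch
from torchvision.io import read_image
from torchvision.models import resnet50, ResNet50_Weights

weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
model.eval()

preprocess = weights.transforms()  # preprocessing the checkpoint was trained with
batch = preprocess(read_image("input.jpg")).unsqueeze(0)  # placeholder image path

with torch.no_grad():
    prediction = model(batch).squeeze(0).softmax(0)
class_id = int(prediction.argmax())
score = prediction[class_id].item()
print(f"{weights.meta['categories'][class_id]}: {100 * score:.1f}%")
```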
56 | -------------------------------------------------------------------------------- /libs/readMe.md: -------------------------------------------------------------------------------- 1 | # Model Zoo 2 | 3 | ## Classification Networks 4 | The models are the open-source checkpoints pretrained on ImageNet dataset 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 |
| Baseline | Download |
| --- | --- |
| ResNet-50 | model |
| ResNet-101 | model |
| Swin_V2_S | model |
| Swin_V2_B | model |
| ViT_B_16_SWAG_E2E_V1 | model |
| ViT_H_14_SWAG_E2E_V1 | model |
| EfficientNet_B6 | model |
| EfficientNet_V2_L | model |
| RegNet_Y_128GF_SWAG_E2E_V1 | model |
| ResNeXt101_64X4D | model |
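Each `model` entry above originally links to a `.pth` checkpoint file. A minimal sketch of fetching such a checkpoint into the local torch cache and loading it into the matching architecture; the URL below is a placeholder, not one of the actual links:

```python
import torch
from torchvision.models import resnet50

CHECKPOINT_URL = "https://example.com/path/to/checkpoint.pth"  # placeholder URL

state_dict = torch.hub.load_state_dict_from_url(CHECKPOINT_URL, map_location="cpu")
model = resnet50()  # the architecture must match the checkpoint being loaded
model.load_state_dict(state_dict)
model.eval()
```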
53 | 54 | ## Segmentation Networks 55 | The models are the open-source segmentation baseline checkpoints pretrained on COCO dataset 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 |
| Baseline | Download |
| --- | --- |
| DeepLab_v3 ResNet-50 | model |
| DeepLab_v3 ResNet-101 | model |
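For a quick end-to-end check of these baselines, the sketch below runs the COCO-pretrained DeepLabV3-ResNet101 on a single image and extracts the per-pixel class map, following the same steps as `ImageSegmentation.ipynb` (the image path is a placeholder):

```python
import torch
from torchvision.io import read_image
from torchvision.models.segmentation import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights

weights = DeepLabV3_ResNet101_Weights.DEFAULT
model = deeplabv3_resnet101(weights=weights)
model.eval()

preprocess = weights.transforms()
batch = preprocess(read_image("input.jpg")).unsqueeze(0)  # placeholder image path

with torch.no_grad():
    logits = model(batch)["out"]  # (1, num_classes, H, W)
class_map = logits.argmax(dim=1).squeeze(0)  # per-pixel class indices
print(class_map.shape)
```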
71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /data/CamVid/readMe.md: -------------------------------------------------------------------------------- 1 | # CamVid Dataset 2 | 3 | The Cambridge-driving Labelled Video Database (CamVid) is one of the first scene understanding databases, 4 | and it is based on the motion-based video collections of driving scenes recorded for semantic segmentation of object classes. 5 | 6 | This database contains 701 frames with sizes of 720 by 960 pixels that were captured in five video sequences, 7 | shot via the fixed-position CCTV-style cameras mounted on a car. The densely annotated images were manually 8 | generated through 32 classes and merged into 11 classes later. 9 | 10 | The original dataset is divided into 367 training, 101 validation, and 233 test images, as most literature practiced. 11 | 12 | ## Model metrics 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 |
| Model | Metrics File |
| --- | --- |
| SERNet-Former [checkpoint] | download |
| SERNet-Former [final] | download |
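`CamVid_NetworkMetrics.mat` is a MATLAB file, so it can also be inspected from Python. A minimal sketch using `scipy.io.loadmat` (SciPy is in the requirements), assuming the file was saved in a pre-v7.3 MAT format; the stored variable names are not documented here, so the sketch just lists whatever it finds:

```python
from scipy.io import loadmat

metrics = loadmat("data/CamVid/CamVid_NetworkMetrics.mat")  # path assumed from the repo layout

for key, value in metrics.items():
    if not key.startswith("__"):  # skip the MAT-file header entries
        print(key, getattr(value, "shape", type(value)))
```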
29 | 30 | ## Ablation works 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 |
| AbM | DbN | AfN1 | AfN2 | mIoU % |
| --- | --- | --- | --- | --- |
| N | Y | Y | Y | 81.22 |
| Y | N | Y | Y | 80.56 |
| Y | Y | N | Y | 78.99 |
| Y | Y | Y | N | 75.37 |
| Y | Y | Y | Y | 82.88 |
77 | 78 | 79 | ## Please cite 80 | 81 | ```bibtex 82 | @article{Brostow2019, 83 | title={Semantic object classes in video: A high-definition ground truth database}, 84 | author={G. J. Brostow and J. Fauqueur and R. Cipolla}, 85 | journal={Pattern Recognition Letters}, 86 | volume=90 87 | pages=119-133 88 | year={2019} 89 | } 90 | 91 | @article{Erisen2024SERNetFormer, 92 | title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks}, 93 | author={Erişen, Serdar}, 94 | journal={arXiv preprint arXiv:2401.15741}, 95 | year={2024} 96 | } 97 | ``` 98 | -------------------------------------------------------------------------------- /ImageClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyN57eNgsltG+a1X/fNILjb8", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "**Image classification**\n", 33 | "\n", 34 | "Image classification tutorial based on the pretrained PyTorch baselines.\n", 35 | "Used model: ViT_h_14 with Weights IMAGENET1K_SWAG_E2E_V1\n", 36 | "\n", 37 | "Please save a copy of this tutorial into your own environment/drive folder." 38 | ], 39 | "metadata": { 40 | "id": "7kG9ZBiN238_" 41 | } 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "from torchvision.io import read_image" 47 | ], 48 | "metadata": { 49 | "id": "UB3maNwt0roV" 50 | }, 51 | "execution_count": 2, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "source": [ 57 | "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg" 58 | ], 59 | "metadata": { 60 | "id": "5_vXR0dT0r9L" 61 | }, 62 | "execution_count": 1, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "im1=read_image(\"input.jpg\")" 69 | ], 70 | "metadata": { 71 | "id": "-hGZui_z0w97" 72 | }, 73 | "execution_count": 3, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/" 82 | }, 83 | "id": "th3lBuOHzu6e", 84 | "outputId": "229ec5c7-86b0-4eb4-81fd-cf126d1d9c36" 85 | }, 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stderr", 90 | "text": [ 91 | "Downloading: \"https://download.pytorch.org/models/vit_h_14_swag-80465313.pth\" to /root/.cache/torch/hub/checkpoints/vit_h_14_swag-80465313.pth\n", 92 | "100%|██████████| 2.36G/2.36G [01:57<00:00, 21.6MB/s]\n" 93 | ] 94 | }, 95 | { 96 | "output_type": "stream", 97 | "name": "stdout", 98 | "text": [ 99 | "airliner: 91.4%\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "from torchvision.models import vit_h_14, ViT_H_14_Weights\n", 105 | "\n", 106 | "# Step 1: Initialize model with the best available weights\n", 107 | "weights = ViT_H_14_Weights.DEFAULT\n", 108 | "model = vit_h_14(weights='DEFAULT')\n", 109 | "model.eval()\n", 110 | "\n", 111 | "# Step 2: Initialize the inference transforms\n", 112 | "preprocess = weights.transforms()\n", 113 | "\n", 114 | "# Step 3: Apply inference 
preprocessing transforms\n", 115 | "batch = preprocess(im1).unsqueeze(0)\n", 116 | "\n", 117 | "# Step 4: Use the model and print the predicted category\n", 118 | "prediction = model(batch).squeeze(0).softmax(0)\n", 119 | "class_id = prediction.argmax().item()\n", 120 | "score = prediction[class_id].item()\n", 121 | "category_name = weights.meta[\"categories\"][class_id]\n", 122 | "print(f\"{category_name}: {100 * score:.1f}%\") #airliner: ~91.4%" 123 | ] 124 | } 125 | ] 126 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SERNet-Former 2 |
3 | 4 | [![[CVPR 2024 Workshops] YouTube Video](https://img.shields.io/badge/CVPRW'24-YouTube-blue)](https://youtu.be/XXzMkotcdb4?feature=shared) 5 | [![CVPR 2024 Workshop](https://img.shields.io/badge/CVPR'24-Workshop-yellow)](https://equivision.github.io/index.html#papers) 6 | [![ArXiv paper](https://img.shields.io/badge/SERNetFormer-ArXiv-red)](https://doi.org/10.48550/arXiv.2401.15741) 7 | [![CVMI 2024](https://img.shields.io/badge/CVMI-2024-blue)](https://cvmi2024.iiita.ac.in/AcceptedPapers.php) 8 | 9 |
10 | 11 | [CVPR 2024 Workshops] SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks 12 | 13 | ## Tutorials 14 | Various implementations of SERNet-Former with different baselines for Multi-tasking (without our additional methods) is now online. 15 | 16 | The example deploys ViT_h_14 baseline with 'Weights' 'IMAGENET1K_SWAG_E2E_V1' and simple U-Net decoder architecture. 17 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/185TZK796f425vsduhpm9NcCrtMS4rIkb#scrollTo=q3e5V2NephbJ&forceEdit=true&sandboxMode=true) 18 | 19 | 20 | Please also see the tutorials for 21 | 22 | Image Segmentation based on DeepLabV3+_ResNet101 baseline 23 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sMRRcUsFaUwHSvIch9Koqxb4ogtgfVFs#scrollTo=-Gei9D03SWmM&forceEdit=true&sandboxMode=true) 24 | 25 | 26 | Image Classification based on ViT_h_14 baseline 27 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Nj82jyovcQcuZotx-pRSBzd1uEXCbOp4#scrollTo=7kG9ZBiN238) 28 | 29 | 30 | ## News 31 | - `16 May 2024` [CVPR 2024 Workshops] The article "SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks" is now accepted to CVPR 2024 Workshops. Equivariant Vision: From Theory to Practice 32 | - `January 2024` SERNet-Former set state-of-the-art result on Cityscapes validation dataset for pixel-level segmentation: 87.35 % mIoU 33 | - `January 2024` SERNet-Former set state-of-the-art result on CamVid dataset: 84.62 % mIoU 34 | - `January 2024` SERNet-Former ranked as the seventh on Cityscapes test dataset for pixel-level segmentation according to PapersWithCode.com: 84.83 % mIoU 35 | 36 | 37 | ## GitHub Badges 38 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-ade20k-val)](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k-val?p=sernet-former-semantic-segmentation-by) 39 | 40 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-bdd100k-val)](https://paperswithcode.com/sota/semantic-segmentation-on-bdd100k-val?p=sernet-former-semantic-segmentation-by) 41 | 42 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-camvid)](https://paperswithcode.com/sota/semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by) 43 | 44 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/2d-semantic-segmentation-on-camvid)](https://paperswithcode.com/sota/2d-semantic-segmentation-on-camvid?p=sernet-former-semantic-segmentation-by) 45 | 46 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by) 47 | 48 | 
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/2d-semantic-segmentation-on-cityscapes-val)](https://paperswithcode.com/sota/2d-semantic-segmentation-on-cityscapes-val?p=sernet-former-semantic-segmentation-by) 49 | 50 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sernet-former-semantic-segmentation-by/semantic-segmentation-on-cityscapes)](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes?p=sernet-former-semantic-segmentation-by) 51 | 52 | ## SERNet-Former Conceptual 53 | 54 | [![Efficient-ResNet](https://img.shields.io/badge/github-EfficientResNet-black)](https://github.com/serdarch/Efficient-ResNet) 55 | 56 | ![Figure1](https://github.com/serdarch/SERNet-Former/assets/61043858/084416d7-f982-4f46-b1bf-871aed81557b) 57 | 58 | (a) Attention-boosting Gate (AbG) and Attention-boosting Module (AbM) are fused into the encoder part. 59 | 60 | (b) Attention-fusion Network (AfN), introduced into the decoder 61 | 62 | ## Experiment Results 63 | 64 | ### CamVid Dataset 65 | 66 | The breakdown of class accuracies on CamVid dataset 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 |
| Model | Baseline Architecture | Building | Tree | Sky | Car | Sign | Road | Pedestrian | Fence | Pole | Sidewalk | Bicycle | mIoU |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SERNet-Former | Efficient-ResNet | 93.0 | 88.8 | 95.1 | 91.9 | 73.9 | 97.7 | 76.4 | 83.4 | 57.3 | 90.3 | 83.1 | 84.62 |
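As a quick sanity check, the reported mIoU is consistent with the per-class scores above:

```python
# Per-class scores from the table (Building ... Bicycle).
scores = [93.0, 88.8, 95.1, 91.9, 73.9, 97.7, 76.4, 83.4, 57.3, 90.3, 83.1]
print(sum(scores) / len(scores))  # ≈ 84.63, matching the reported 84.62 up to rounding
```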
103 | 104 | 105 | The experiment outcomes on CamVid dataset 106 | 107 | ![camvid_output](https://github.com/serdarch/SERNet-Former/assets/61043858/f11f44a6-b245-43f1-b323-2f107f0b330e) 108 | 109 | ### Cityscapes 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 |
| Model | Baseline Architecture | road | sidewalk | building | wall | fence | pole | traffic light | traffic sign | vegetation | terrain | sky | person | rider | car | truck | bus | train | motorcycle | bicycle | mIoU |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SERNet-Former | Efficient-ResNet | 98.2 | 90.2 | 94.0 | 67.6 | 68.2 | 73.6 | 78.2 | 82.1 | 94.6 | 75.9 | 96.9 | 90.0 | 77.7 | 96.9 | 86.1 | 93.9 | 91.7 | 70.0 | 82.9 | 84.83 |
163 | 164 | The experiment outcomes on Cityscapes dataset 165 | 166 | ![cityscapes_output](https://github.com/serdarch/SERNet-Former/assets/61043858/9a613193-6761-422c-bb7c-d2a3499548c5) 167 | 168 | ## Installation Support 169 | 170 | You can simply download this repository into your environment by running 171 | ```bash 172 | git clone https://github.com/serdarch/SERNet-Former.git 173 | ``` 174 | 175 | ## Citations 176 | 177 | ```bibtex 178 | @article{Erisen2024SERNetFormer, 179 | title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks}, 180 | author={Erişen, Serdar}, 181 | journal={arXiv preprint arXiv:2401.15741}, 182 | year={2024} 183 | } 184 | 185 | @inproceedings{Erisen2024CVPRW, 186 | title={SERNet-Former: Semantic Segmentation by Efficient Residual Network with Attention-Boosting Gates and Attention-Fusion Networks}, 187 | author={Erişen, Serdar}, 188 | booktitle={CVPRW}, 189 | year={2024}, 190 | } 191 | ``` 192 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | accelerate==0.34.2 3 | addict==2.4.0 4 | aiohappyeyeballs==2.4.0 5 | aiohttp==3.10.5 6 | aiosignal==1.3.1 7 | alabaster==0.7.16 8 | albucore==0.0.16 9 | albumentations==1.4.15 10 | aliyun-python-sdk-core==2.15.2 11 | aliyun-python-sdk-kms==2.16.5 12 | altair==4.2.2 13 | annotated-types==0.7.0 14 | anyio==3.7.1 15 | argon2-cffi==23.1.0 16 | argon2-cffi-bindings==21.2.0 17 | array_record==0.5.1 18 | arviz==0.19.0 19 | astropy==6.1.3 20 | astropy-iers-data==0.2024.9.16.0.32.21 21 | astunparse==1.6.3 22 | async-timeout==4.0.3 23 | atpublic==4.1.0 24 | attrs==24.2.0 25 | audioread==3.0.1 26 | autograd==1.7.0 27 | babel==2.16.0 28 | backcall==0.2.0 29 | beautifulsoup4==4.12.3 30 | bigframes==1.18.0 31 | bigquery-magics==0.2.0 32 | bleach==6.1.0 33 | blinker==1.4 34 | blis==0.7.11 35 | blosc2==2.0.0 36 | bokeh==3.4.3 37 | bqplot==0.12.43 38 | branca==0.7.2 39 | build==1.2.2 40 | CacheControl==0.14.0 41 | cachetools==5.5.0 42 | catalogue==2.0.10 43 | certifi==2024.8.30 44 | cffi==1.17.1 45 | chardet==5.2.0 46 | charset-normalizer==3.3.2 47 | chex==0.1.86 48 | clarabel==0.9.0 49 | click==8.1.7 50 | cloudpathlib==0.19.0 51 | cloudpickle==2.2.1 52 | cmake==3.30.3 53 | cmdstanpy==1.2.4 54 | colorama==0.4.6 55 | colorcet==3.1.0 56 | colorlover==0.3.0 57 | colour==0.1.5 58 | community==1.0.0b1 59 | confection==0.1.5 60 | cons==0.4.6 61 | contextlib2==21.6.0 62 | contourpy==1.3.0 63 | crcmod==1.7 64 | cryptography==43.0.1 65 | cuda-python==12.2.1 66 | cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-24.4.1-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=57366e7ef09dc63e0b389aff20df6c37d91e2790065861ee31a4720149f5b694 67 | cufflinks==0.17.3 68 | cupy-cuda12x==12.2.0 69 | cvxopt==1.3.2 70 | cvxpy==1.5.3 71 | cycler==0.12.1 72 | cymem==2.0.8 73 | Cython==3.0.11 74 | dask==2024.8.0 75 | datascience==0.17.6 76 | db-dtypes==1.3.0 77 | dbus-python==1.2.18 78 | debugpy==1.6.6 79 | decorator==4.4.2 80 | defusedxml==0.7.1 81 | distributed==2024.8.0 82 | distro==1.7.0 83 | dlib==19.24.2 84 | dm-tree==0.1.8 85 | docstring_parser==0.16 86 | docutils==0.18.1 87 | dopamine_rl==4.0.9 88 | duckdb==1.1.0 89 | earthengine-api==1.0.0 90 | easydict==1.13 91 | ecos==2.0.14 92 | editdistance==0.8.1 93 | eerepr==0.0.4 94 | einops==0.8.0 95 | en-core-web-sm @ 
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 96 | entrypoints==0.4 97 | et-xmlfile==1.1.0 98 | etils==1.9.4 99 | etuples==0.3.9 100 | eval_type_backport==0.2.0 101 | exceptiongroup==1.2.2 102 | fastai==2.7.17 103 | fastcore==1.7.8 104 | fastdownload==0.0.7 105 | fastjsonschema==2.20.0 106 | fastprogress==1.0.3 107 | fastrlock==0.8.2 108 | filelock==3.14.0 109 | firebase-admin==6.5.0 110 | Flask==2.2.5 111 | flatbuffers==24.3.25 112 | flax==0.8.5 113 | folium==0.17.0 114 | fonttools==4.53.1 115 | frozendict==2.4.4 116 | frozenlist==1.4.1 117 | fsspec==2024.6.1 118 | ftfy==6.2.3 119 | future==1.0.0 120 | gast==0.6.0 121 | gcsfs==2024.6.1 122 | GDAL==3.6.4 123 | gdown==5.2.0 124 | geemap==0.34.3 125 | gensim==4.3.3 126 | geocoder==1.38.1 127 | geographiclib==2.0 128 | geopandas==1.0.1 129 | geopy==2.4.1 130 | gin-config==0.5.0 131 | glob2==0.7 132 | google==2.0.3 133 | google-ai-generativelanguage==0.6.6 134 | google-api-core==2.19.2 135 | google-api-python-client==2.137.0 136 | google-auth==2.27.0 137 | google-auth-httplib2==0.2.0 138 | google-auth-oauthlib==1.2.1 139 | google-cloud-aiplatform==1.67.1 140 | google-cloud-bigquery==3.25.0 141 | google-cloud-bigquery-connection==1.15.5 142 | google-cloud-bigquery-storage==2.26.0 143 | google-cloud-bigtable==2.26.0 144 | google-cloud-core==2.4.1 145 | google-cloud-datastore==2.19.0 146 | google-cloud-firestore==2.16.1 147 | google-cloud-functions==1.16.5 148 | google-cloud-iam==2.15.2 149 | google-cloud-language==2.13.4 150 | google-cloud-pubsub==2.23.1 151 | google-cloud-resource-manager==1.12.5 152 | google-cloud-storage==2.8.0 153 | google-cloud-translate==3.15.5 154 | google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz#sha256=07bb3e866a2fb3dc3072920a4722b4a4c9c2fc953a97253597f3e5391c3dd17c 155 | google-crc32c==1.6.0 156 | google-generativeai==0.7.2 157 | google-pasta==0.2.0 158 | google-resumable-media==2.7.2 159 | googleapis-common-protos==1.65.0 160 | googledrivedownloader==0.4 161 | graphviz==0.20.3 162 | greenlet==3.1.1 163 | grpc-google-iam-v1==0.13.1 164 | grpcio==1.64.1 165 | grpcio-status==1.48.2 166 | gspread==6.0.2 167 | gspread-dataframe==3.3.1 168 | gym==0.25.2 169 | gym-notices==0.0.8 170 | h5netcdf==1.3.0 171 | h5py==3.11.0 172 | holidays==0.57 173 | holoviews==1.19.1 174 | html5lib==1.1 175 | httpimport==1.4.0 176 | httplib2==0.22.0 177 | huggingface-hub==0.24.7 178 | humanize==4.10.0 179 | hyperopt==0.2.7 180 | ibis-framework==9.2.0 181 | idna==3.10 182 | imageio==2.35.1 183 | imageio-ffmpeg==0.5.1 184 | imagesize==1.4.1 185 | imbalanced-learn==0.12.3 186 | imgaug==0.4.0 187 | immutabledict==4.2.0 188 | importlib_metadata==8.5.0 189 | importlib_resources==6.4.5 190 | imutils==0.5.4 191 | inflect==7.4.0 192 | iniconfig==2.0.0 193 | intel-cmplr-lib-ur==2024.2.1 194 | intel-openmp==2024.2.1 195 | ipyevents==2.0.2 196 | ipyfilechooser==0.6.0 197 | ipykernel==5.5.6 198 | ipyleaflet==0.19.2 199 | ipyparallel==8.8.0 200 | ipython==7.34.0 201 | ipython-genutils==0.2.0 202 | ipython-sql==0.5.0 203 | ipytree==0.2.2 204 | ipywidgets==7.7.1 205 | itsdangerous==2.2.0 206 | jax==0.4.33 207 | jax-cuda12-pjrt==0.4.33 208 | jax-cuda12-plugin==0.4.33 209 | jaxlib==0.4.33 210 | jeepney==0.7.1 211 | jellyfish==1.1.0 212 | jieba==0.42.1 213 | Jinja2==3.1.4 214 | jmespath==0.10.0 215 | joblib==1.4.2 216 | jsonpickle==3.3.0 217 | jsonschema==4.23.0 218 | jsonschema-specifications==2023.12.1 
219 | jupyter-client==6.1.12 220 | jupyter-console==6.1.0 221 | jupyter-leaflet==0.19.2 222 | jupyter-server==1.24.0 223 | jupyter_core==5.7.2 224 | jupyterlab_pygments==0.3.0 225 | jupyterlab_widgets==3.0.13 226 | kaggle==1.6.17 227 | kagglehub==0.3.0 228 | keras==3.4.1 229 | keyring==23.5.0 230 | kiwisolver==1.4.7 231 | langcodes==3.4.0 232 | language_data==1.2.0 233 | launchpadlib==1.10.16 234 | lazr.restfulclient==0.14.4 235 | lazr.uri==1.0.6 236 | lazy_loader==0.4 237 | libclang==18.1.1 238 | librosa==0.10.2.post1 239 | lightgbm==4.5.0 240 | linkify-it-py==2.0.3 241 | llvmlite==0.43.0 242 | locket==1.0.0 243 | logical-unification==0.4.6 244 | lxml==4.9.4 245 | marisa-trie==1.2.0 246 | Markdown==3.7 247 | markdown-it-py==3.0.0 248 | MarkupSafe==2.1.5 249 | matplotlib==3.7.1 250 | matplotlib-inline==0.1.7 251 | matplotlib-venn==1.1.1 252 | mdit-py-plugins==0.4.2 253 | mdurl==0.1.2 254 | miniKanren==1.0.3 255 | missingno==0.5.2 256 | mistune==0.8.4 257 | mizani==0.11.4 258 | mkl==2024.2.2 259 | ml-dtypes==0.4.1 260 | mlxtend==0.23.1 261 | model-index==0.1.11 262 | more-itertools==10.5.0 263 | moviepy==1.0.3 264 | mpmath==1.3.0 265 | msgpack==1.0.8 266 | multidict==6.1.0 267 | multipledispatch==1.0.0 268 | multitasking==0.0.11 269 | murmurhash==1.0.10 270 | music21==9.1.0 271 | namex==0.0.8 272 | natsort==8.4.0 273 | nbclassic==1.1.0 274 | nbclient==0.10.0 275 | nbconvert==6.5.4 276 | nbformat==5.10.4 277 | nest-asyncio==1.6.0 278 | networkx==3.3 279 | nibabel==5.2.1 280 | nltk==3.8.1 281 | notebook==6.5.5 282 | notebook_shim==0.2.4 283 | numba==0.60.0 284 | numexpr==2.10.1 285 | numpy==1.26.4 286 | nvidia-cublas-cu12==12.6.1.4 287 | nvidia-cuda-cupti-cu12==12.6.68 288 | nvidia-cuda-nvcc-cu12==12.6.68 289 | nvidia-cuda-runtime-cu12==12.6.68 290 | nvidia-cudnn-cu12==9.4.0.58 291 | nvidia-cufft-cu12==11.2.6.59 292 | nvidia-cusolver-cu12==11.6.4.69 293 | nvidia-cusparse-cu12==12.5.3.3 294 | nvidia-nccl-cu12==2.23.4 295 | nvidia-nvjitlink-cu12==12.6.68 296 | nvtx==0.2.10 297 | oauth2client==4.1.3 298 | oauthlib==3.2.2 299 | opencv-contrib-python==4.10.0.84 300 | opencv-python==4.10.0.84 301 | opencv-python-headless==4.10.0.84 302 | opendatalab==0.0.10 303 | openmim==0.3.9 304 | openpyxl==3.1.5 305 | openxlab==0.1.1 306 | opt-einsum==3.3.0 307 | optax==0.2.3 308 | optree==0.12.1 309 | orbax-checkpoint==0.6.4 310 | ordered-set==4.1.0 311 | osqp==0.6.7.post0 312 | oss2==2.17.0 313 | packaging==24.1 314 | pandas==2.1.4 315 | pandas-datareader==0.10.0 316 | pandas-gbq==0.23.1 317 | pandas-stubs==2.1.4.231227 318 | pandocfilters==1.5.1 319 | panel==1.4.5 320 | panopticapi @ git+https://github.com/cocodataset/panopticapi.git@7bb4655548f98f3fedc07bf37e9040a992b054b0 321 | param==2.1.1 322 | parso==0.8.4 323 | parsy==2.1 324 | partd==1.4.2 325 | pathlib==1.0.1 326 | patsy==0.5.6 327 | peewee==3.17.6 328 | pexpect==4.9.0 329 | pickleshare==0.7.5 330 | pillow==10.4.0 331 | pip-tools==7.4.1 332 | platformdirs==4.3.6 333 | plotly==5.24.1 334 | plotnine==0.13.6 335 | pluggy==1.5.0 336 | polars==1.6.0 337 | pooch==1.8.2 338 | portpicker==1.5.2 339 | prefetch_generator==1.0.3 340 | preshed==3.0.9 341 | prettytable==3.11.0 342 | proglog==0.1.10 343 | progressbar2==4.5.0 344 | prometheus_client==0.21.0 345 | promise==2.3 346 | prompt_toolkit==3.0.47 347 | prophet==1.1.5 348 | proto-plus==1.24.0 349 | protobuf==3.20.3 350 | psutil==5.9.5 351 | psycopg2==2.9.9 352 | ptyprocess==0.7.0 353 | py-cpuinfo==9.0.0 354 | py4j==0.10.9.7 355 | pyarrow==14.0.2 356 | pyarrow-hotfix==0.6 357 | pyasn1==0.6.1 358 | 
pyasn1_modules==0.4.1 359 | pycocotools==2.0.8 360 | pycparser==2.22 361 | pycryptodome==3.20.0 362 | pydantic==2.9.2 363 | pydantic_core==2.23.4 364 | pydata-google-auth==1.8.2 365 | pydot==3.0.1 366 | pydot-ng==2.0.0 367 | pydotplus==2.0.2 368 | PyDrive==1.3.1 369 | PyDrive2==1.20.0 370 | pyerfa==2.0.1.4 371 | pygame==2.6.0 372 | Pygments==2.18.0 373 | PyGObject==3.42.1 374 | PyJWT==2.9.0 375 | pymc==5.16.2 376 | pymystem3==0.2.0 377 | pynvjitlink-cu12==0.3.0 378 | pyogrio==0.9.0 379 | PyOpenGL==3.1.7 380 | pyOpenSSL==24.2.1 381 | pyparsing==3.1.4 382 | pyperclip==1.9.0 383 | pyproj==3.6.1 384 | pyproject_hooks==1.1.0 385 | pyshp==2.3.1 386 | PySocks==1.7.1 387 | pytensor==2.25.4 388 | pytest==7.4.4 389 | python-apt==2.4.0 390 | python-box==7.2.0 391 | python-dateutil==2.8.2 392 | python-louvain==0.16 393 | python-slugify==8.0.4 394 | python-utils==3.8.2 395 | pytz==2023.4 396 | pyviz_comms==3.0.3 397 | PyYAML==6.0.2 398 | pyzmq==24.0.1 399 | qdldl==0.1.7.post4 400 | ratelim==0.1.6 401 | referencing==0.35.1 402 | regex==2024.9.11 403 | requests==2.28.2 404 | requests-oauthlib==1.3.1 405 | requirements-parser==0.9.0 406 | rich==13.4.2 407 | rmm-cu12==24.4.0 408 | rpds-py==0.20.0 409 | rpy2==3.4.2 410 | rsa==4.9 411 | safetensors==0.4.5 412 | scikit-image==0.24.0 413 | scikit-learn==1.5.2 414 | scipy==1.13.1 415 | scooby==0.10.0 416 | scs==3.2.7 417 | seaborn==0.13.1 418 | SecretStorage==3.3.1 419 | Send2Trash==1.8.3 420 | sentencepiece==0.2.0 421 | shapely==2.0.6 422 | shellingham==1.5.4 423 | simple-parsing==0.1.6 424 | six==1.16.0 425 | sklearn-pandas==2.2.0 426 | smart-open==7.0.4 427 | sniffio==1.3.1 428 | snowballstemmer==2.2.0 429 | sortedcontainers==2.4.0 430 | soundfile==0.12.1 431 | soupsieve==2.6 432 | soxr==0.5.0.post1 433 | spacy==3.7.6 434 | spacy-legacy==3.0.12 435 | spacy-loggers==1.0.5 436 | Sphinx==5.0.2 437 | sphinxcontrib-applehelp==2.0.0 438 | sphinxcontrib-devhelp==2.0.0 439 | sphinxcontrib-htmlhelp==2.1.0 440 | sphinxcontrib-jsmath==1.0.1 441 | sphinxcontrib-qthelp==2.0.0 442 | sphinxcontrib-serializinghtml==2.0.0 443 | SQLAlchemy==2.0.35 444 | sqlglot==25.1.0 445 | sqlparse==0.5.1 446 | srsly==2.4.8 447 | stanio==0.5.1 448 | statsmodels==0.14.3 449 | StrEnum==0.4.15 450 | sympy==1.13.3 451 | tables==3.8.0 452 | tabulate==0.9.0 453 | tbb==2021.13.1 454 | tblib==3.0.0 455 | tenacity==9.0.0 456 | tensorboard==2.17.0 457 | tensorboard-data-server==0.7.2 458 | tensorflow==2.17.0 459 | tensorflow-datasets==4.9.6 460 | tensorflow-hub==0.16.1 461 | tensorflow-io-gcs-filesystem==0.37.1 462 | tensorflow-metadata==1.15.0 463 | tensorflow-probability==0.24.0 464 | tensorstore==0.1.65 465 | termcolor==2.4.0 466 | terminado==0.18.1 467 | terminaltables==3.1.10 468 | text-unidecode==1.3 469 | textblob==0.17.1 470 | tf-slim==1.1.0 471 | tf_keras==2.17.0 472 | thinc==8.2.5 473 | threadpoolctl==3.5.0 474 | tifffile==2024.9.20 475 | timm==1.0.9 476 | tinycss2==1.3.0 477 | tokenizers==0.19.1 478 | toml==0.10.2 479 | tomli==2.0.1 480 | toolz==0.12.1 481 | torch @ https://download.pytorch.org/whl/cu121_full/torch-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=f3ed9a2b7f8671b2b32a2f036d1b81055eb3ad9b18ba43b705aa34bae4289e1a 482 | torchaudio @ https://download.pytorch.org/whl/cu121_full/torchaudio-2.4.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=da8c87c80a1c1376a48dc33eef30b03bbdf1df25a05bd2b1c620b8811c7b19be 483 | torchsummary==1.5.1 484 | torchvision @ 
https://download.pytorch.org/whl/cu121_full/torchvision-0.19.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=b8cc4bf381b75522995b601e07a1b433b5fd925dc3e34a7fa6cd22f449d65379 485 | tornado==6.3.3 486 | tqdm==4.65.2 487 | traitlets==5.7.1 488 | traittypes==0.2.1 489 | transformers==4.44.2 490 | tweepy==4.14.0 491 | typeguard==4.3.0 492 | typer==0.12.5 493 | types-pytz==2024.2.0.20240913 494 | types-setuptools==75.1.0.20240917 495 | typing_extensions==4.12.2 496 | tzdata==2024.1 497 | tzlocal==5.2 498 | uc-micro-py==1.0.3 499 | uritemplate==4.1.1 500 | urllib3==1.26.20 501 | vega-datasets==0.9.0 502 | wadllib==1.3.6 503 | wasabi==1.1.3 504 | wcwidth==0.2.13 505 | weasel==0.4.1 506 | webcolors==24.8.0 507 | webencodings==0.5.1 508 | websocket-client==1.8.0 509 | Werkzeug==3.0.4 510 | widgetsnbextension==3.6.9 511 | wordcloud==1.9.3 512 | wrapt==1.16.0 513 | xarray==2024.9.0 514 | xarray-einstats==0.8.0 515 | xgboost==2.1.1 516 | xlrd==2.0.1 517 | xyzservices==2024.9.0 518 | yapf==0.40.2 519 | yarl==1.11.1 520 | yellowbrick==1.5 521 | yfinance==0.2.43 522 | zict==3.0.0 523 | zipp==3.20.2 524 | -------------------------------------------------------------------------------- /ImageSegmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNhlaPsPB9W8n1aX35ZbIiR", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "# **Image Segmentation**" 33 | ], 34 | "metadata": { 35 | "id": "GlHJjHksu6hh" 36 | } 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "source": [ 41 | "Please save a copy of this file into your own drive/environment instead of asking for the shared version.\n", 42 | "\n", 43 | "请将此文件保存到您自己的 Google Drive 或操作系统环境中,而不是请求共享版本\n", 44 | "\n", 45 | "このファイルを共有バージョンを要求するのではなく、自分の Google ドライブや OS 環境にコピーを保存してください" 46 | ], 47 | "metadata": { 48 | "id": "V-XSpU5MoEgs" 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "id": "0txetkFtI9Bv" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import torch\n", 60 | "import torch.nn as nn\n", 61 | "import torchvision.models.segmentation as segmentation\n", 62 | "from torchvision.transforms import Compose, Resize, ToTensor, Normalize\n", 63 | "from PIL import Image\n", 64 | "import numpy as np" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "source": [ 70 | "from torchvision.io.image import read_image\n", 71 | "from torchvision.models.segmentation import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights\n", 72 | "from torchvision.transforms.functional import to_pil_image" 73 | ], 74 | "metadata": { 75 | "id": "EvI-1SbZMhQU" 76 | }, 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "# Step 1: Initialize model with the best available weights\n", 84 | "weights = DeepLabV3_ResNet101_Weights.DEFAULT\n", 85 | "model = deeplabv3_resnet101(weights=weights)\n", 86 | "model.eval()" 87 | ], 88 | "metadata": { 89 | "colab": { 90 | "base_uri": "https://localhost:8080/" 91 | }, 92 | "id": "5cTuZ2c7Mkxq", 93 | 
"outputId": "6ec742fa-cecd-4070-a268-62501aa3e696" 94 | }, 95 | "execution_count": null, 96 | "outputs": [ 97 | { 98 | "output_type": "stream", 99 | "name": "stderr", 100 | "text": [ 101 | "Downloading: \"https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth\" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth\n", 102 | "100%|██████████| 233M/233M [00:02<00:00, 83.8MB/s]\n" 103 | ] 104 | }, 105 | { 106 | "output_type": "execute_result", 107 | "data": { 108 | "text/plain": [ 109 | "DeepLabV3(\n", 110 | " (backbone): IntermediateLayerGetter(\n", 111 | " (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)\n", 112 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 113 | " (relu): ReLU(inplace=True)\n", 114 | " (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)\n", 115 | " (layer1): Sequential(\n", 116 | " (0): Bottleneck(\n", 117 | " (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 118 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 119 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 120 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 121 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 122 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 123 | " (relu): ReLU(inplace=True)\n", 124 | " (downsample): Sequential(\n", 125 | " (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 126 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 127 | " )\n", 128 | " )\n", 129 | " (1): Bottleneck(\n", 130 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 131 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 132 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 133 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 134 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 135 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 136 | " (relu): ReLU(inplace=True)\n", 137 | " )\n", 138 | " (2): Bottleneck(\n", 139 | " (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 140 | " (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 141 | " (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 142 | " (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 143 | " (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 144 | " (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 145 | " (relu): ReLU(inplace=True)\n", 146 | " )\n", 147 | " )\n", 148 | " (layer2): Sequential(\n", 149 | " (0): Bottleneck(\n", 150 | " (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 151 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 152 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)\n", 153 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True)\n", 154 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 155 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 156 | " (relu): ReLU(inplace=True)\n", 157 | " (downsample): Sequential(\n", 158 | " (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)\n", 159 | " (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 160 | " )\n", 161 | " )\n", 162 | " (1): Bottleneck(\n", 163 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 164 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 165 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 166 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 167 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 168 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 169 | " (relu): ReLU(inplace=True)\n", 170 | " )\n", 171 | " (2): Bottleneck(\n", 172 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 173 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 174 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 175 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 176 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 177 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 178 | " (relu): ReLU(inplace=True)\n", 179 | " )\n", 180 | " (3): Bottleneck(\n", 181 | " (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 182 | " (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 183 | " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 184 | " (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 185 | " (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 186 | " (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 187 | " (relu): ReLU(inplace=True)\n", 188 | " )\n", 189 | " )\n", 190 | " (layer3): Sequential(\n", 191 | " (0): Bottleneck(\n", 192 | " (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 193 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 194 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 195 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 196 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 197 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 198 | " (relu): ReLU(inplace=True)\n", 199 | " (downsample): Sequential(\n", 200 | " (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 201 | " (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 202 | " )\n", 203 | " )\n", 204 | " (1): Bottleneck(\n", 205 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 206 | " (bn1): BatchNorm2d(256, 
eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 207 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 208 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 209 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 210 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 211 | " (relu): ReLU(inplace=True)\n", 212 | " )\n", 213 | " (2): Bottleneck(\n", 214 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 215 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 216 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 217 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 218 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 219 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 220 | " (relu): ReLU(inplace=True)\n", 221 | " )\n", 222 | " (3): Bottleneck(\n", 223 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 224 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 225 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 226 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 227 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 228 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 229 | " (relu): ReLU(inplace=True)\n", 230 | " )\n", 231 | " (4): Bottleneck(\n", 232 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 233 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 234 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 235 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 236 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 237 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 238 | " (relu): ReLU(inplace=True)\n", 239 | " )\n", 240 | " (5): Bottleneck(\n", 241 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 242 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 243 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 244 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 245 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 246 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 247 | " (relu): ReLU(inplace=True)\n", 248 | " )\n", 249 | " (6): Bottleneck(\n", 250 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 251 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 252 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 253 
| " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 254 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 255 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 256 | " (relu): ReLU(inplace=True)\n", 257 | " )\n", 258 | " (7): Bottleneck(\n", 259 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 260 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 261 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 262 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 263 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 264 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 265 | " (relu): ReLU(inplace=True)\n", 266 | " )\n", 267 | " (8): Bottleneck(\n", 268 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 269 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 270 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 271 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 272 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 273 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 274 | " (relu): ReLU(inplace=True)\n", 275 | " )\n", 276 | " (9): Bottleneck(\n", 277 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 278 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 279 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 280 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 281 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 282 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 283 | " (relu): ReLU(inplace=True)\n", 284 | " )\n", 285 | " (10): Bottleneck(\n", 286 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 287 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 288 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 289 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 290 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 291 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 292 | " (relu): ReLU(inplace=True)\n", 293 | " )\n", 294 | " (11): Bottleneck(\n", 295 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 296 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 297 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 298 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 299 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 300 | 
" (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 301 | " (relu): ReLU(inplace=True)\n", 302 | " )\n", 303 | " (12): Bottleneck(\n", 304 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 305 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 306 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 307 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 308 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 309 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 310 | " (relu): ReLU(inplace=True)\n", 311 | " )\n", 312 | " (13): Bottleneck(\n", 313 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 314 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 315 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 316 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 317 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 318 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 319 | " (relu): ReLU(inplace=True)\n", 320 | " )\n", 321 | " (14): Bottleneck(\n", 322 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 323 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 324 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 325 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 326 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 327 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 328 | " (relu): ReLU(inplace=True)\n", 329 | " )\n", 330 | " (15): Bottleneck(\n", 331 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 332 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 333 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 334 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 335 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 336 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 337 | " (relu): ReLU(inplace=True)\n", 338 | " )\n", 339 | " (16): Bottleneck(\n", 340 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 341 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 342 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 343 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 344 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 345 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 346 | " (relu): ReLU(inplace=True)\n", 347 | " )\n", 348 | " (17): Bottleneck(\n", 349 | 
" (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 350 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 351 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 352 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 353 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 354 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 355 | " (relu): ReLU(inplace=True)\n", 356 | " )\n", 357 | " (18): Bottleneck(\n", 358 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 359 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 360 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 361 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 362 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 363 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 364 | " (relu): ReLU(inplace=True)\n", 365 | " )\n", 366 | " (19): Bottleneck(\n", 367 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 368 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 369 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 370 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 371 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 372 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 373 | " (relu): ReLU(inplace=True)\n", 374 | " )\n", 375 | " (20): Bottleneck(\n", 376 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 377 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 378 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 379 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 380 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 381 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 382 | " (relu): ReLU(inplace=True)\n", 383 | " )\n", 384 | " (21): Bottleneck(\n", 385 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 386 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 387 | " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 388 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 389 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 390 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 391 | " (relu): ReLU(inplace=True)\n", 392 | " )\n", 393 | " (22): Bottleneck(\n", 394 | " (conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 395 | " (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 396 
| " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 397 | " (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 398 | " (conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 399 | " (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 400 | " (relu): ReLU(inplace=True)\n", 401 | " )\n", 402 | " )\n", 403 | " (layer4): Sequential(\n", 404 | " (0): Bottleneck(\n", 405 | " (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 406 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 407 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)\n", 408 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 409 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 410 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 411 | " (relu): ReLU(inplace=True)\n", 412 | " (downsample): Sequential(\n", 413 | " (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 414 | " (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 415 | " )\n", 416 | " )\n", 417 | " (1): Bottleneck(\n", 418 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 419 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 420 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n", 421 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 422 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 423 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 424 | " (relu): ReLU(inplace=True)\n", 425 | " )\n", 426 | " (2): Bottleneck(\n", 427 | " (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 428 | " (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 429 | " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(4, 4), dilation=(4, 4), bias=False)\n", 430 | " (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 431 | " (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 432 | " (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 433 | " (relu): ReLU(inplace=True)\n", 434 | " )\n", 435 | " )\n", 436 | " )\n", 437 | " (classifier): DeepLabHead(\n", 438 | " (0): ASPP(\n", 439 | " (convs): ModuleList(\n", 440 | " (0): Sequential(\n", 441 | " (0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 442 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 443 | " (2): ReLU()\n", 444 | " )\n", 445 | " (1): ASPPConv(\n", 446 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(12, 12), dilation=(12, 12), bias=False)\n", 447 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 448 | " (2): ReLU()\n", 449 | " )\n", 450 | " (2): ASPPConv(\n", 451 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(24, 24), dilation=(24, 24), bias=False)\n", 
452 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 453 | " (2): ReLU()\n", 454 | " )\n", 455 | " (3): ASPPConv(\n", 456 | " (0): Conv2d(2048, 256, kernel_size=(3, 3), stride=(1, 1), padding=(36, 36), dilation=(36, 36), bias=False)\n", 457 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 458 | " (2): ReLU()\n", 459 | " )\n", 460 | " (4): ASPPPooling(\n", 461 | " (0): AdaptiveAvgPool2d(output_size=1)\n", 462 | " (1): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 463 | " (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 464 | " (3): ReLU()\n", 465 | " )\n", 466 | " )\n", 467 | " (project): Sequential(\n", 468 | " (0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)\n", 469 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 470 | " (2): ReLU()\n", 471 | " (3): Dropout(p=0.5, inplace=False)\n", 472 | " )\n", 473 | " )\n", 474 | " (1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 475 | " (2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 476 | " (3): ReLU()\n", 477 | " (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n", 478 | " )\n", 479 | " (aux_classifier): FCNHead(\n", 480 | " (0): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)\n", 481 | " (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 482 | " (2): ReLU()\n", 483 | " (3): Dropout(p=0.1, inplace=False)\n", 484 | " (4): Conv2d(256, 21, kernel_size=(1, 1), stride=(1, 1))\n", 485 | " )\n", 486 | ")" 487 | ] 488 | }, 489 | "metadata": {}, 490 | "execution_count": 3 491 | } 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "source": [ 497 | "!wget http://images.cocodataset.org/val2017/000000005477.jpg -q -O input.jpg\n", 498 | "im1= read_image(\"input.jpg\")" 499 | ], 500 | "metadata": { 501 | "id": "bwizfLVML1pL" 502 | }, 503 | "execution_count": null, 504 | "outputs": [] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "source": [ 509 | "# Step 2: Initialize the inference transforms\n", 510 | "preprocess = weights.transforms()\n", 511 | "\n", 512 | "# Step 3: Apply inference preprocessing transforms\n", 513 | "batch = preprocess(im1).unsqueeze(0)\n", 514 | "\n", 515 | "# Step 4: Use the model and visualize the prediction\n", 516 | "prediction = model(batch)[\"out\"]\n", 517 | "normalized_masks = prediction.softmax(dim=1)\n", 518 | "class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta[\"categories\"])}" 519 | ], 520 | "metadata": { 521 | "id": "XuZL4yWRS8Gx" 522 | }, 523 | "execution_count": null, 524 | "outputs": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "source": [ 529 | "print(weights.meta[\"categories\"])" 530 | ], 531 | "metadata": { 532 | "colab": { 533 | "base_uri": "https://localhost:8080/" 534 | }, 535 | "id": "mJgEIsYxPbix", 536 | "outputId": "410ab130-581c-4551-ae8a-cdb6e9626d83" 537 | }, 538 | "execution_count": null, 539 | "outputs": [ 540 | { 541 | "output_type": "stream", 542 | "name": "stdout", 543 | "text": [ 544 | "['__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']\n" 545 | ] 546 | } 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "source": [ 552 | "mask = 
normalized_masks[0, class_to_idx[\"aeroplane\"]]" 553 | ], 554 | "metadata": { 555 | "id": "f1R-TRZuPnnM" 556 | }, 557 | "execution_count": null, 558 | "outputs": [] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "source": [ 563 | "to_pil_image(mask).show()" 564 | ], 565 | "metadata": { 566 | "id": "IsO6WXQdytTG" 567 | }, 568 | "execution_count": null, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "source": [ 574 | "from google.colab import drive\n", 575 | "drive.mount('/content/gdrive')" 576 | ], 577 | "metadata": { 578 | "colab": { 579 | "base_uri": "https://localhost:8080/" 580 | }, 581 | "id": "-Gei9D03SWmM", 582 | "outputId": "51672b3c-5bd8-414a-d2f4-d78e9b94e576" 583 | }, 584 | "execution_count": null, 585 | "outputs": [ 586 | { 587 | "output_type": "stream", 588 | "name": "stdout", 589 | "text": [ 590 | "Mounted at /content/gdrive\n" 591 | ] 592 | } 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "source": [ 598 | "torch.save(mask, '***/MyDrive/mask.pt')" 599 | ], 600 | "metadata": { 601 | "id": "ODJZDuIBSbmm" 602 | }, 603 | "execution_count": null, 604 | "outputs": [] 605 | } 606 | ] 607 | } --------------------------------------------------------------------------------
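Note on the segmentation cells above: the notebook stops at showing one softmax channel (the "aeroplane" probability map) and saving it to Drive. A minimal follow-up sketch, shown below, would overlay the predicted region on the downloaded image instead. It assumes the variables from the cells above (`im1`, `normalized_masks`, `class_to_idx`) are still in scope and that the inference transforms may have rescaled the input, so the mask is resized back to the original resolution; the helper names (`pred_classes`, `aeroplane`, `overlay`) are illustrative only and not part of the notebook.

    # Sketch only; relies on `im1`, `normalized_masks`, `class_to_idx` from the notebook cells above.
    import torch
    import torch.nn.functional as F
    from torchvision.utils import draw_segmentation_masks
    from torchvision.transforms.functional import to_pil_image

    # Per-pixel class decision: argmax over the 21 class channels -> shape (1, H', W')
    pred_classes = normalized_masks.argmax(dim=1)

    # Boolean mask for one class, resized to the original image size with nearest-neighbour
    # interpolation so it can be drawn on top of the uint8 image returned by read_image().
    aeroplane = (pred_classes == class_to_idx["aeroplane"]).float()
    aeroplane = F.interpolate(aeroplane.unsqueeze(1), size=im1.shape[-2:], mode="nearest")
    aeroplane = aeroplane.squeeze(1).bool()                      # shape (1, H, W)

    # Overlay the mask on the original image and display the result.
    overlay = draw_segmentation_masks(im1, masks=aeroplane[0], alpha=0.6, colors="red")
    to_pil_image(overlay).show()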