├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── new-architecture-request.md ├── Architectures ├── AlexNet │ ├── README.md │ ├── alexnet.py │ ├── requirements.txt │ └── src │ │ └── AlexNet.png ├── DenseNet │ ├── README.md │ ├── densenet.py │ ├── densenet_blocks.py │ ├── requirements.txt │ └── src │ │ ├── DenseNet_Transition_block.png │ │ └── Dense_Block.png ├── EfficientNet │ ├── EfficientNet_Module.py │ ├── README.md │ ├── efficientnetb0.py │ └── requirements.txt ├── GoogLeNet │ ├── README.md │ ├── googlenet.py │ ├── requirements.txt │ └── src │ │ ├── GoogLeNet.png │ │ └── Inception Module Optimized.png ├── MobileNet │ ├── README.md │ ├── mobilenet.py │ ├── mobilenets_module.py │ ├── requirements.txt │ └── src │ │ ├── MobileNet_architecture.png │ │ └── MobileNets_DepthwiseSeparableConv.png ├── MobileNetV2 │ ├── README.md │ ├── mobilenets_module.py │ ├── mobilenetv2.py │ └── requirements.txt ├── ResNeXt │ ├── README.md │ ├── requirements.txt │ ├── resnext50.py │ ├── resnext_module.py │ └── src │ │ └── ResNext_Module.png ├── ResNet │ ├── README.md │ ├── requirements.txt │ ├── residual_block.py │ ├── resnet101.py │ ├── resnet152.py │ ├── resnet18.py │ ├── resnet34.py │ ├── resnet50.py │ └── src │ │ └── ResNet_building_block.png ├── ResidualAttentionNetwork │ ├── README.md │ ├── ResNetAttention92.py │ ├── ran_modules.py │ ├── requirements.txt │ └── src │ │ ├── RAN_Soft_Mask_Brach.png │ │ ├── RAN_attention_module.png │ │ └── RAN_residual_unit.png ├── Rethinked Inception │ ├── README.md │ ├── inceptionV3.py │ ├── inception_blocks.py │ ├── requirements.txt │ └── src │ │ ├── Improved Inception Module with more factorization.png │ │ ├── Improved Inception Module.png │ │ ├── Inception Module with expanded filter bank.png │ │ └── InceptionV3.png ├── SENet │ ├── README.md │ ├── requirements.txt │ ├── senet.py │ ├── senet_block.py │ └── src │ │ └── SENet_Block.png ├── ShuffleNet │ ├── README.md │ ├── requirements.txt │ ├── shuffle_unit.py │ ├── shufflenet.py │ └── src │ │ ├── shuffleblock_no_stride.png │ │ └── shuffleblock_with_stride.png ├── SqueezeNet │ ├── README.md │ ├── requirements.txt │ ├── squeezenet.py │ ├── squeezenet_blocks.py │ ├── squeezenet_with_bypass.py │ └── src │ │ ├── SqueezeNet_Fire_module.png │ │ └── SqueezeNet_architecture.png ├── VGG16 │ ├── README.md │ ├── requirements.txt │ ├── src │ │ └── VGG16.png │ └── vgg16.py ├── VisionTransformer │ ├── README.md │ ├── Vanilla_ViT.py │ ├── requirements.txt │ ├── transformer_module.py │ └── transformer_utils.py ├── Xception │ ├── README.md │ ├── requirements.txt │ ├── src │ │ ├── Xception_Entry_Flow.png │ │ ├── Xception_Exit_Flow.png │ │ └── Xception_middle_flow.png │ ├── xception.py │ └── xception_blocks.py └── ZFNet │ ├── README.md │ ├── requirements.txt │ ├── src │ └── ZFNet.png │ └── zfnet.py ├── Attention Mechanism └── CBAM │ ├── README.md │ └── cbam_module.py ├── CODE_OF_CONDUCT.md ├── Generative Adversarial Networks └── GAN (2014) │ ├── README.md │ ├── discriminator.py │ ├── generator.py │ └── requirements.txt ├── LICENSE ├── README.md └── Semantic Segmentation ├── AttentionUnet ├── README.md ├── UNet2D.py ├── UNet3D.py ├── requirements.txt ├── unet2D_utils.py └── unet3D_utils.py ├── UNet++ ├── README.md ├── UNetPlus.py ├── UNetPlusPlus.py ├── UnetPlusPlus_utils.py └── requirements.txt └── UNet ├── README.md ├── UNet.py ├── requirements.txt └── unet_utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug 
report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new-architecture-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New Architecture Request 3 | about: Suggest a new computer vision deep learning architecture to be implemented 4 | title: '[NEW ARCHITECTURE] ' 5 | labels: '' 6 | assignees: '' 7 | 8 | 9 | --- 10 | 11 | ## Architecture Name 12 | [Provide the full name of the architecture] 13 | 14 | ## Paper Reference 15 | [Provide a link to the paper or full citation] 16 | 17 | ## Brief Description 18 | [Provide a 2-3 sentence description of the architecture and its key features] 19 | 20 | ## Significance 21 | [Explain why this architecture is important and should be added to the repository] 22 | 23 | ## Implementation Considerations 24 | [List any specific considerations for implementing this architecture, e.g., computational requirements, dependencies, etc.]
25 | 26 | ## Potential Use Cases 27 | [Describe potential applications or use cases for this architecture] 28 | 29 | ## Additional Resources 30 | [Provide links to any existing implementations, blog posts, or other relevant resources] 31 | 32 | ## Checklist 33 | - [ ] I have searched the existing issues to make sure this is not a duplicate request 34 | - [ ] I have provided all the necessary information about the architecture 35 | - [ ] I understand that the implementation timeline depends on the project maintainers' capacity 36 | 37 | ## Additional Comments 38 | [Add any other information that might be relevant to this request] 39 | -------------------------------------------------------------------------------- /Architectures/AlexNet/README.md: -------------------------------------------------------------------------------- 1 | # [AlexNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of AlexNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/AlexNet-caa1c8a968a54179a1da454ff764cb5e?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - **Depth and Complexity:** AlexNet demonstrated that deeper networks with many layers could achieve significantly better performance on complex image classification tasks. 12 | - **ReLU Activation:** Replaced the tanh activation function with the **Re**ctified **L**inear **U**nit (ReLU), which considerably reduced training time. 13 | - **Dropout:** Utilized dropout as a regularization technique to prevent overfitting. 14 | - **GPU Implementation:** Leveraged the power of GPUs to accelerate the training process, making it feasible to train large networks on large datasets. 15 | 16 | 17 | ## Architecture Scheme 18 | Below is a schematic representation of the architecture: 19 | 20 | ![Architecture Scheme](./src/AlexNet.png) 21 | 22 | ## Reproduced Results (TBD) 23 | The following results were reproduced as per the methodology described in the paper: 24 | - Result 1: [Description and value] 25 | - Result 2: [Description and value] 26 | - Result 3: [Description and value] 27 | - ...
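## Usage Example
A minimal usage sketch of the implementation in this folder, assuming the `alexnet.py` script alongside this README is importable from the working directory:

```python
import torch
from alexnet import AlexNet

# Build the model and push one dummy 224x224 RGB image through it
model = AlexNet(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
logits = model(x)
print(logits.shape)  # torch.Size([1, 1000])
```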
28 | 29 | ## References 30 | - [Original Paper](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) 31 | - [Detailed Blog Post](https://gvdmnni.notion.site/AlexNet-caa1c8a968a54179a1da454ff764cb5e?pvs=4) 32 | -------------------------------------------------------------------------------- /Architectures/AlexNet/alexnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | 6 | class AlexNet(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(AlexNet, self).__init__() 9 | 10 | # First Convolutional Layer 11 | self.conv1 = nn.Sequential( 12 | nn.Conv2d(in_channels=3, out_channels=48, kernel_size=11, stride=4), 13 | nn.ReLU(), 14 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 15 | nn.MaxPool2d(kernel_size=3, stride=2) 16 | ) 17 | # Second Convolutional Layer 18 | self.conv2 = nn.Sequential( 19 | nn.Conv2d(in_channels=48, out_channels=128, kernel_size=5, stride=2, padding=2), 20 | nn.ReLU(), 21 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 22 | nn.MaxPool2d(kernel_size=3, stride=2) 23 | ) 24 | # Third Convolutional Layer 25 | self.conv3 = nn.Sequential( 26 | nn.Conv2d(in_channels=128, out_channels=192, kernel_size=3, stride=1, padding=1), 27 | nn.ReLU() 28 | ) 29 | # Fourth Convolutional Layer 30 | self.conv4 = nn.Sequential( 31 | nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, stride=1, padding=1), 32 | nn.ReLU() 33 | ) 34 | # Fifth Convolutional Layer 35 | self.conv5 = nn.Sequential( 36 | nn.Conv2d(in_channels=192, out_channels=128, kernel_size=3, stride=1, padding=1), 37 | nn.ReLU(), 38 | nn.MaxPool2d(kernel_size=3, stride=2) 39 | ) 40 | 41 | # Dense Layer 42 | self.output = nn.Sequential( 43 | nn.Flatten(), 44 | nn.Linear(128 * 2 * 2, 2048), 45 | nn.ReLU(), 46 | nn.Dropout(0.5), 47 | nn.Linear(2048, 2048), 48 | nn.ReLU(), 49 | nn.Dropout(0.5), 50 | nn.Linear(2048, num_classes) 51 | ) 52 | 53 | def forward(self, x): 54 | x = self.conv1(x) 55 | x = self.conv2(x) 56 | x = self.conv3(x) 57 | x = self.conv4(x) 58 | x = self.conv5(x) 59 | x = self.output(x) 60 | return x 61 | 62 | if __name__ == '__main__': 63 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 64 | n_classes = 1000 65 | model = AlexNet(n_classes).to(device) 66 | x = torch.randn(1, 3, 224, 224, device=device) 67 | summary(model, (3, 224, 224)) 68 | print(model(x).shape) 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Architectures/AlexNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/AlexNet/src/AlexNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/AlexNet/src/AlexNet.png -------------------------------------------------------------------------------- /Architectures/DenseNet/README.md: -------------------------------------------------------------------------------- 1 | # [DenseNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of DenseNet. Below you will find detailed information and resources related to this architecture. 
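As a quick orientation to the code, the key quantity to track is the channel count: every `DenseBlock` concatenates its input with `growth_rate` new feature maps, and the transition blocks in this implementation keep the width unchanged. A small sketch of that bookkeeping, using the defaults from `densenet.py` (growth rate 12, stage sizes [6, 12, 24, 16]); it only reproduces the arithmetic, not the layers:

```python
# Channel bookkeeping mirroring densenet.py (no compression in the transitions)
growth_rate = 12
channels = 2 * growth_rate              # 24 channels after the stem convolution
for n_blocks in [6, 12, 24, 16]:        # dense blocks per stage (DenseNet-121 layout)
    channels += n_blocks * growth_rate  # each DenseBlock appends growth_rate channels
print(channels)  # 720 -> the input width of the final nn.Linear in densenet.py
```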
5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/DenseNet-bd578f8d65b0490abfec9c5491ec1de4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - Dense connectivity: Direct connections between any two layers in a feed-forward fashion, enabling feature reuse throughout the network and reducing redundancy. 12 | - Alleviating vanishing gradients: Each layer has direct access to the gradients from the loss function and original input, providing implicit deep supervision. 13 | - Reduced parameter count: DenseNets require fewer parameters than traditional CNNs as there is no need to re-learn redundant feature maps. 14 | - Improved information and gradient flow: Enables training of very deep networks in an efficient and easy-to-optimize manner. 15 | 16 | 17 | ## Architecture Scheme 18 | Below is a schematic representation of the modules used in the architecture: 19 | ![Dense Block](./src/Dense_Block.png)*Dense Block with bottleneck* 20 | 21 | ![Transition Block](./src/DenseNet_Transition_block.png) 22 | *Transition Block* 23 | 24 | 25 | 26 | 27 | ## Reproduced Results (TBD) 28 | The following results were reproduced as per the methodology described in the paper: 29 | - Result 1: [Description and value] 30 | - Result 2: [Description and value] 31 | - Result 3: [Description and value] 32 | - ... 33 | 34 | 35 | ## References 36 | - [Original Paper](https://arxiv.org/abs/1608.06993) 37 | - [Detailed Blog Post](https://www.notion.so/gvdmnni/DenseNet-bd578f8d65b0490abfec9c5491ec1de4?pm=c) 38 | -------------------------------------------------------------------------------- /Architectures/DenseNet/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from densenet_blocks import * 6 | 7 | 8 | class DenseNet121(nn.Module): 9 | def __init__(self, growthRate = 12, num_classes = 1000, n_dense_block = [6, 12, 24, 16]): 10 | super(DenseNet121, self).__init__() 11 | 12 | nChannels = 2 * growthRate 13 | 14 | # Input Layers 15 | self.input_layers = nn.Sequential( 16 | nn.Conv2d(in_channels=3, out_channels=nChannels, kernel_size=7, stride=2, padding=3), 17 | nn.BatchNorm2d(nChannels), 18 | nn.ReLU(), 19 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 20 | ) 21 | 22 | 23 | # Dense Block 1 24 | first_dense_layer = [] 25 | for i in range(n_dense_block[0]): 26 | first_dense_layer.append(DenseBlock(in_channels=nChannels, growth_rate=growthRate)) 27 | nChannels += growthRate 28 | self.dense_block_1 = nn.Sequential(*first_dense_layer) 29 | 30 | # Transition Block 1 31 | self.transition_block_1 = TransitionBlock(in_channels=nChannels, out_1x1=nChannels) 32 | 33 | # Dense Block 2 34 | second_dense_layer = [] 35 | for i in range(n_dense_block[1]): 36 | second_dense_layer.append(DenseBlock(in_channels=nChannels, growth_rate=growthRate)) 37 | nChannels += growthRate 38 | self.dense_block_2 = nn.Sequential(*second_dense_layer) 39 | 40 | # Transition Block 2 41 | self.transition_block_2 = TransitionBlock(in_channels=nChannels, out_1x1=nChannels) 42 | 43 | # Dense Block 3 44 | third_dense_layer = [] 45 | for i in range(n_dense_block[2]): 46 | third_dense_layer.append(DenseBlock(in_channels=nChannels, growth_rate=growthRate)) 47 | nChannels += growthRate 48 | self.dense_block_3 = nn.Sequential(*third_dense_layer) 49 | 50 | # 
Transition Block 3 51 | self.transition_block_3 = TransitionBlock(in_channels=nChannels, out_1x1=nChannels) 52 | 53 | # Dense Block 4 54 | fourth_dense_layer = [] 55 | for i in range(n_dense_block[3]): 56 | fourth_dense_layer.append(DenseBlock(in_channels=nChannels, growth_rate=growthRate)) 57 | nChannels += growthRate 58 | self.dense_block_4 = nn.Sequential(*fourth_dense_layer) 59 | 60 | # Output Layer 61 | self.output_layer = nn.Sequential( 62 | nn.AdaptiveAvgPool2d((7, 7)), 63 | nn.Flatten(), 64 | nn.Linear((720 * 7 * 7), num_classes) 65 | ) 66 | 67 | def forward(self, x): 68 | x = self.input_layers(x) 69 | x = self.dense_block_1(x) 70 | x = self.transition_block_1(x) 71 | x = self.dense_block_2(x) 72 | x = self.transition_block_2(x) 73 | x = self.dense_block_3(x) 74 | x = self.transition_block_3(x) 75 | x = self.dense_block_4(x) 76 | x = self.output_layer(x) 77 | return x 78 | 79 | if __name__ == '__main__': 80 | n_dense_block = [6, 12, 24, 16] # DenseNet-121 81 | # n_dense_block = [6, 12, 32, 32] # DenseNet-169 82 | # n_dense_block = [6, 12, 48, 32] # DenseNet-201 83 | # n_dense_block = [6, 12, 64, 48] # DenseNet-264 84 | model = DenseNet121(n_dense_block=n_dense_block) 85 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/DenseNet/densenet_blocks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConvBlock(nn.Module): 6 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0): 7 | super(ConvBlock, self).__init__() 8 | 9 | self.conv_block = nn.Sequential( 10 | nn.BatchNorm2d(in_channels), 11 | nn.ReLU(), 12 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding) 13 | ) 14 | 15 | def forward(self, x): 16 | return self.conv_block(x) 17 | 18 | 19 | class DenseBlock(nn.Module): 20 | ''' 21 | This is the implementation of the variant with the bottleneck 22 | ''' 23 | def __init__(self, in_channels, growth_rate): 24 | super(DenseBlock, self).__init__() 25 | out_1x1 = 4 * growth_rate 26 | 27 | self.conv_block1x1 = ConvBlock(in_channels=in_channels, out_channels=out_1x1, kernel_size=1) 28 | self.conv_block3x3 = ConvBlock(in_channels=out_1x1, out_channels=growth_rate, kernel_size=3, padding=1) 29 | 30 | 31 | def forward(self, x): 32 | out = self.conv_block1x1(x) 33 | out = self.conv_block3x3(out) 34 | out = torch.cat([x, out], 1) 35 | return out 36 | 37 | 38 | 39 | class TransitionBlock(nn.Module): 40 | def __init__(self, in_channels, out_1x1): 41 | super(TransitionBlock, self).__init__() 42 | 43 | self.transition_block = nn.Sequential( 44 | nn.BatchNorm2d(in_channels), 45 | nn.Conv2d(in_channels, out_1x1, kernel_size=1), 46 | nn.AvgPool2d(kernel_size=2, ) 47 | ) 48 | 49 | def forward(self, x): 50 | return self.transition_block(x) 51 | 52 | -------------------------------------------------------------------------------- /Architectures/DenseNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/DenseNet/src/DenseNet_Transition_block.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/DenseNet/src/DenseNet_Transition_block.png -------------------------------------------------------------------------------- /Architectures/DenseNet/src/Dense_Block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/DenseNet/src/Dense_Block.png -------------------------------------------------------------------------------- /Architectures/EfficientNet/EfficientNet_Module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class DepthwiseSeparableConv2d(nn.Module): 5 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0): 6 | super(DepthwiseSeparableConv2d, self).__init__() 7 | self.depthwise = nn.Conv2d( 8 | in_channels, in_channels, kernel_size, stride, padding, groups=in_channels, bias=False 9 | ) 10 | self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, bias=False) 11 | self.bn = nn.BatchNorm2d(out_channels) 12 | self.act = nn.SiLU() # Changed to SiLU (Swish) as per EfficientNet 13 | 14 | def forward(self, x): 15 | x = self.depthwise(x) 16 | x = self.pointwise(x) 17 | x = self.bn(x) 18 | x = self.act(x) 19 | return x 20 | 21 | 22 | class ConvBNAct(nn.Module): 23 | def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1): 24 | super().__init__() 25 | padding = (kernel_size - 1) // 2 26 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, groups=groups, bias=False) 27 | self.bn = nn.BatchNorm2d(out_channels) 28 | self.act = nn.SiLU() 29 | 30 | def forward(self, x): 31 | return self.act(self.bn(self.conv(x))) 32 | 33 | 34 | class SqueezeExcitation(nn.Module): 35 | def __init__(self, in_channels, reduced_dim): 36 | super().__init__() 37 | self.se = nn.Sequential( 38 | nn.AdaptiveAvgPool2d(1), 39 | nn.Conv2d(in_channels, reduced_dim, 1), 40 | nn.SiLU(), 41 | nn.Conv2d(reduced_dim, in_channels, 1), 42 | nn.Sigmoid() 43 | ) 44 | 45 | def forward(self, x): 46 | return x * self.se(x) 47 | 48 | 49 | class MBConvBlock(nn.Module): 50 | def __init__(self, in_channels, out_channels, expand_ratio, kernel_size, stride, reduction_ratio=4): 51 | super().__init__() 52 | self.use_residual = in_channels == out_channels and stride == 1 53 | hidden_dim = in_channels * expand_ratio 54 | reduced_dim = max(1, int(in_channels / reduction_ratio)) 55 | 56 | layers = [] 57 | if expand_ratio != 1: 58 | layers.append(ConvBNAct(in_channels, hidden_dim, kernel_size=1)) 59 | 60 | layers.extend([ 61 | ConvBNAct(hidden_dim, hidden_dim, kernel_size, stride, groups=hidden_dim), 62 | SqueezeExcitation(hidden_dim, reduced_dim), 63 | nn.Conv2d(hidden_dim, out_channels, 1, bias=False), 64 | nn.BatchNorm2d(out_channels) 65 | ]) 66 | 67 | self.conv = nn.Sequential(*layers) 68 | 69 | def forward(self, x): 70 | if self.use_residual: 71 | return x + self.conv(x) 72 | else: 73 | return self.conv(x) -------------------------------------------------------------------------------- /Architectures/EfficientNet/README.md: -------------------------------------------------------------------------------- 1 | # [EfficientNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of EfficientNet. Below you will find detailed information and resources related to this architecture. 
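While the detailed write-up is still to be added, here is a minimal usage sketch of the B0 variant implemented in `efficientnetb0.py` (class name and expected input size taken from that script):

```python
import torch
from efficientnetb0 import EfficientNetB0

# Instantiate EfficientNet-B0 and classify one dummy 224x224 RGB image
model = EfficientNetB0(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # torch.Size([1, 1000])
```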
5 | 6 | # TODO: Add more information about EfficientNet 7 | # TODO: Add Resources 8 | -------------------------------------------------------------------------------- /Architectures/EfficientNet/efficientnetb0.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchsummary import summary 3 | from EfficientNet_Module import * 4 | 5 | 6 | class EfficientNetB0(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super().__init__() 9 | 10 | self.stem = ConvBNAct(3, 32, stride=2) 11 | 12 | self.blocks = nn.Sequential( 13 | MBConvBlock(32, 16, 1, 3, 1), 14 | MBConvBlock(16, 24, 6, 3, 2), 15 | MBConvBlock(24, 24, 6, 3, 1), 16 | MBConvBlock(24, 40, 6, 5, 2), 17 | MBConvBlock(40, 40, 6, 5, 1), 18 | MBConvBlock(40, 80, 6, 3, 2), 19 | MBConvBlock(80, 80, 6, 3, 1), 20 | MBConvBlock(80, 80, 6, 3, 1), 21 | MBConvBlock(80, 112, 6, 5, 1), 22 | MBConvBlock(112, 112, 6, 5, 1), 23 | MBConvBlock(112, 112, 6, 5, 1), 24 | MBConvBlock(112, 192, 6, 5, 2), 25 | MBConvBlock(192, 192, 6, 5, 1), 26 | MBConvBlock(192, 192, 6, 5, 1), 27 | MBConvBlock(192, 192, 6, 5, 1), 28 | MBConvBlock(192, 320, 6, 3, 1) 29 | ) 30 | 31 | self.head = nn.Sequential( 32 | ConvBNAct(320, 1280, kernel_size=1), 33 | nn.AdaptiveAvgPool2d(1), 34 | nn.Dropout(0.2), 35 | nn.Flatten(), 36 | nn.Linear(1280, num_classes) 37 | ) 38 | 39 | def forward(self, x): 40 | x = self.stem(x) 41 | x = self.blocks(x) 42 | x = self.head(x) 43 | return x 44 | 45 | 46 | from torchvision.models import efficientnet_b0 47 | 48 | if __name__ == "__main__": 49 | model = EfficientNetB0(num_classes=1000) 50 | summary(model, (3, 224, 224)) 51 | 52 | model1 = efficientnet_b0() 53 | summary(model1, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/EfficientNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/GoogLeNet/README.md: -------------------------------------------------------------------------------- 1 | # [GoogLeNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of GoogLeNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/GoogLeNet-f32d46b07e564b7da8a45a899556fe5a?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - **Development of the Inception architecture**: A novel deep neural network architecture that balances depth and width to optimize computational efficiency. 12 | - **Hebbian principle and multi-scale processing**: The architectural decisions were informed by these principles to enhance the network's performance. 13 | - **GoogLeNet implementation**: A 22-layer deep network that significantly outperforms previous models on the ILSVRC 2014 classification and detection tasks. 14 | - **Efficiency considerations**: The network design emphasizes computational efficiency, making it suitable for real-world applications with limited computational resources. 
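Because the implementation in `googlenet.py` returns the main logits together with the two auxiliary-classifier outputs, a training step has to combine three losses. A minimal sketch of that combination (the 0.3 weighting for the auxiliary losses follows the original paper; the batch and labels here are placeholders):

```python
import torch
import torch.nn as nn
from googlenet import GoogLeNet

model = GoogLeNet(in_channels=3, num_classes=1000)
criterion = nn.CrossEntropyLoss()

images = torch.randn(2, 3, 224, 224)    # placeholder batch
targets = torch.randint(0, 1000, (2,))  # placeholder labels

# The auxiliary classifiers are only used at training time and are down-weighted
logits, aux1, aux2 = model(images)
loss = criterion(logits, targets) + 0.3 * (criterion(aux1, targets) + criterion(aux2, targets))
loss.backward()
```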
15 | 16 | 17 | ## Architecture Scheme 18 | Below is a schematic representation of the architecture: 19 | 20 | ![Architecture Scheme](https://github.com/GuidoManni/DeepLearningImplementation/blob/main/Architectures/GoogLeNet/src/GoogLeNet.png) 21 | *Figure: Inception Architecture from Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., ... & Rabinovich, A. (2015). Going deeper with convolutions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1-9).* 22 | 23 | And a schematic representation of the Inception module: 24 | ![Inception Module](https://github.com/GuidoManni/DeepLearningImplementation/blob/main/Architectures/GoogLeNet/src/Inception%20Module%20Optimized.png) 25 | 26 | ## Reproduced Results (TBD) 27 | The following results were reproduced as per the methodology described in the paper: 28 | - Result 1: [Description and value] 29 | - Result 2: [Description and value] 30 | - Result 3: [Description and value] 31 | - ... 32 | 33 | 34 | ## References 35 | - [Original Paper](https://arxiv.org/abs/1409.4842) 36 | - [Detailed Blog Post](https://gvdmnni.notion.site/GoogLeNet-f32d46b07e564b7da8a45a899556fe5a?pvs=4) 37 | -------------------------------------------------------------------------------- /Architectures/GoogLeNet/googlenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | class InceptionModule(nn.Module): 6 | def __init__(self, in_channels, out_1x1, out_reduce_3x3, out_3x3, out_reduce_5x5, out_5x5, out_pool): 7 | super(InceptionModule, self).__init__() 8 | 9 | # 1x1 Convolution 10 | self.branch1 = nn.Conv2d(in_channels, out_1x1, kernel_size=1, stride=1) 11 | 12 | # 1x1 Convolution -> 3x3 Convolution 13 | self.branch2 = nn.Sequential( 14 | nn.Conv2d(in_channels, out_reduce_3x3, kernel_size=1, stride=1), 15 | nn.ReLU(), 16 | nn.Conv2d(out_reduce_3x3, out_3x3, kernel_size=3, stride=1, padding=1) 17 | ) 18 | 19 | # 1x1 Convolution -> 5x5 Convolution 20 | self.branch3 = nn.Sequential( 21 | nn.Conv2d(in_channels, out_reduce_5x5, kernel_size=1, stride=1), 22 | nn.ReLU(), 23 | nn.Conv2d(out_reduce_5x5, out_5x5, kernel_size=5, stride=1, padding=2) 24 | ) 25 | 26 | # 3x3 Max Pooling -> 1x1 Convolution 27 | self.branch4 = nn.Sequential( 28 | nn.MaxPool2d(kernel_size=3, stride=1, padding=1), 29 | nn.Conv2d(in_channels, out_pool, kernel_size=1, stride=1) 30 | ) 31 | 32 | def forward(self, x): 33 | branch1 = self.branch1(x) 34 | branch2 = self.branch2(x) 35 | branch3 = self.branch3(x) 36 | branch4 = self.branch4(x) 37 | 38 | return torch.cat([branch1, branch2, branch3, branch4], 1) 39 | 40 | class GoogLeNet(nn.Module): 41 | def __init__(self, in_channels, num_classes): 42 | super(GoogLeNet, self).__init__() 43 | 44 | # First Convolutional Layer 45 | self.conv1 = nn.Sequential( 46 | nn.Conv2d(in_channels, out_channels=64, kernel_size=7, stride=2, padding=3), 47 | nn.ReLU(), 48 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1), 49 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 50 | ) 51 | 52 | # Second Convolutional Layer 53 | self.conv2 = nn.Sequential( 54 | nn.Conv2d(64, 192, kernel_size=3, stride=1, padding=1), 55 | nn.ReLU(), 56 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1), 57 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 58 | ) 59 | 60 | # Inception Module 1 61 | self.inception1 = InceptionModule(in_channels=192, out_1x1=64, out_reduce_3x3=96, out_3x3=128, out_reduce_5x5=16, 
out_5x5=32, out_pool=32) 62 | 63 | # Inception Module 2 64 | self.inception2 = InceptionModule(in_channels=256, out_1x1=128, out_reduce_3x3=128, out_3x3=192, out_reduce_5x5=32, out_5x5=96, out_pool=64) 65 | 66 | # Max Pooling Layer 67 | self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 68 | 69 | # Inception Module 3 70 | self.inception3 = InceptionModule(in_channels=480, out_1x1=192, out_reduce_3x3=96, out_3x3=208, out_reduce_5x5=16, out_5x5=48, out_pool=64) 71 | 72 | # Inception Module 4 73 | self.inception4 = InceptionModule(in_channels=512, out_1x1=160, out_reduce_3x3=112, out_3x3=224, out_reduce_5x5=24, out_5x5=64, out_pool=64) 74 | 75 | # Inception Module 5 76 | self.inception5 = InceptionModule(in_channels=512, out_1x1=128, out_reduce_3x3=128, out_3x3=256, out_reduce_5x5=24, out_5x5=64, out_pool=64) 77 | 78 | # Inception Module 6 79 | self.inception6 = InceptionModule(in_channels=512, out_1x1=112, out_reduce_3x3=144, out_3x3=288, out_reduce_5x5=32, out_5x5=64, out_pool=64) 80 | 81 | # Inception Module 7 82 | self.inception7 = InceptionModule(in_channels=528, out_1x1=256, out_reduce_3x3=160, out_3x3=320, out_reduce_5x5=32, out_5x5=128, out_pool=128) 83 | 84 | # Max Pooling Layer 85 | self.max_pool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 86 | 87 | # Inception Module 8 88 | self.inception8 = InceptionModule(in_channels=832, out_1x1=256, out_reduce_3x3=160, out_3x3=320, out_reduce_5x5=32, out_5x5=128, out_pool=128) 89 | 90 | # Inception Module 9 91 | self.inception9 = InceptionModule(in_channels=832, out_1x1=384, out_reduce_3x3=192, out_3x3=384, out_reduce_5x5=48, out_5x5=128, out_pool=128) 92 | 93 | # output layer 94 | self.output_layer = nn.Sequential( 95 | nn.AvgPool2d(kernel_size=7, stride=1), 96 | nn.Flatten(), 97 | nn.Dropout(p=0.4), 98 | nn.Linear(1024, num_classes) 99 | ) 100 | 101 | # auxilary classifier 1 -> connect to inception 4 102 | self.aux1 = nn.Sequential( 103 | nn.AvgPool2d(kernel_size=5, stride=3), 104 | nn.Conv2d(512, 128, kernel_size=1, stride=1), 105 | nn.ReLU(), 106 | nn.Flatten(), 107 | nn.Linear(2048, num_classes) 108 | ) 109 | 110 | # auxilary classifier 2 -> connected to inception 6 111 | self.aux2 = nn.Sequential( 112 | nn.AvgPool2d(kernel_size=5, stride=3), 113 | nn.Conv2d(528, 128, kernel_size=1, stride=1), 114 | nn.ReLU(), 115 | nn.Flatten(), 116 | nn.Linear(2048, num_classes) 117 | ) 118 | 119 | def forward(self, x): 120 | x = self.conv1(x) 121 | x = self.conv2(x) 122 | x = self.inception1(x) 123 | x = self.inception2(x) 124 | x = self.max_pool(x) 125 | x = self.inception3(x) 126 | x4 = self.inception4(x) 127 | x = self.inception5(x4) 128 | x6 = self.inception6(x) 129 | x = self.inception7(x6) 130 | x = self.max_pool2(x) 131 | x = self.inception8(x) 132 | x = self.inception9(x) 133 | print(x.size()) 134 | 135 | x = self.output_layer(x) 136 | 137 | aux1 = self.aux1(x4) 138 | aux2 = self.aux2(x6) 139 | 140 | return x, aux1, aux2 141 | 142 | 143 | if __name__ == "__main__": 144 | model = GoogLeNet(in_channels=3, num_classes=1000) 145 | x = torch.randn(1, 3, 224, 224) 146 | y, aux1, aux2 = model(x) 147 | print(y.size()) 148 | print(aux1.size()) 149 | print(aux2.size()) 150 | 151 | print(summary(model, (3, 224, 224))) 152 | 153 | -------------------------------------------------------------------------------- /Architectures/GoogLeNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary 
-------------------------------------------------------------------------------- /Architectures/GoogLeNet/src/GoogLeNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/GoogLeNet/src/GoogLeNet.png -------------------------------------------------------------------------------- /Architectures/GoogLeNet/src/Inception Module Optimized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/GoogLeNet/src/Inception Module Optimized.png -------------------------------------------------------------------------------- /Architectures/MobileNet/README.md: -------------------------------------------------------------------------------- 1 | # [MobileNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of MobileNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/MobileNets-387efa72839b4b0fa980cfc858c5052f). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. Introduction of the MobileNet architecture, which uses depthwise separable convolutions as its core building block. 12 | 2. Proposal of two hyperparameters - width multiplier and resolution multiplier - that allow for easy adjustment of the model size and computational requirements. 13 | 3. Extensive experiments demonstrating the effectiveness of MobileNets across various tasks and applications, including image classification, object detection, and face attribute detection. 14 | 4. Comparison with other popular models, showing that MobileNets can achieve comparable accuracy with significantly reduced computational cost and model size. 15 | 16 | ## Architecture Scheme 17 | Below a schematic representation of the MobileNet architecture: 18 | ![Image](./src/MobileNet_architecture.png) 19 | 20 | Below a schematic representation of the DepthWise Convolution modules that are used in the architecture: 21 | ![Image](./src/MobileNets_DepthwiseSeparableConv.png) 22 | 23 | 24 | 25 | 26 | ## Reproduced Results (TBD) 27 | The following results were reproduced as per the methodology described in the paper: 28 | - Result 1: [Description and value] 29 | - Result 2: [Description and value] 30 | - Result 3: [Description and value] 31 | - ... 
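To make the cost savings of the depthwise separable convolution described above concrete, here is a small sketch comparing parameter counts against a standard 3x3 convolution, using the block from `mobilenets_module.py` in this folder (the separable count includes the BatchNorm parameters that the block adds, and the exact numbers depend on the chosen channel widths):

```python
import torch.nn as nn
from mobilenets_module import DepthwiseSeprableConv2d  # class name as spelled in this repository

def n_params(module):
    return sum(p.numel() for p in module.parameters())

standard = nn.Conv2d(64, 128, kernel_size=3, padding=1)
separable = DepthwiseSeprableConv2d(in_channels=64, out_channels=128, kernel_size=3)

print(n_params(standard))   # 73,856
print(n_params(separable))  # 9,344 -> roughly an 8x reduction, in line with the 1/N + 1/9 factor from the paper
```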
32 | 33 | 34 | ## References 35 | - [Original Paper](https://arxiv.org/abs/1704.04861) 36 | - [Detailed Blog Post](https://gvdmnni.notion.site/MobileNets-387efa72839b4b0fa980cfc858c5052f) 37 | -------------------------------------------------------------------------------- /Architectures/MobileNet/mobilenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from mobilenets_module import DepthwiseSeprableConv2d, ConvBlock 6 | 7 | class MobileNet(nn.Module): 8 | def __init__(self, num_classes = 1000, alpha = 1.0): 9 | super(MobileNet, self).__init__() 10 | output_channel = int(alpha * 32) 11 | 12 | self.input_block = ConvBlock(in_channels = 3, out_channels = output_channel, kernel_size = 3, stride = 2) 13 | 14 | self.intermediate_block1 = nn.Sequential( 15 | DepthwiseSeprableConv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=3, stride=1), 16 | ConvBlock(in_channels=output_channel, out_channels=2*output_channel, kernel_size=1, stride=1), # This is similar to the expanding layer of the SqueezeNet 17 | DepthwiseSeprableConv2d(in_channels=2*output_channel, out_channels=2*output_channel, kernel_size=3, stride=2), 18 | ConvBlock(in_channels=2*output_channel, out_channels = 4*output_channel, kernel_size=1, stride=1), 19 | DepthwiseSeprableConv2d(in_channels=4*output_channel, out_channels=4*output_channel, kernel_size=3, stride=1), 20 | ConvBlock(in_channels=4 * output_channel, out_channels=4 * output_channel, kernel_size=1, stride=1), 21 | DepthwiseSeprableConv2d(in_channels=4 * output_channel, out_channels=4 * output_channel, kernel_size=3, stride=2), 22 | ConvBlock(in_channels=4*output_channel, out_channels=8 * output_channel, kernel_size=1, stride=1), # This is similar to the expanding layer of the SqueezeNet 23 | DepthwiseSeprableConv2d(in_channels=8 * output_channel, out_channels=8 * output_channel, kernel_size=3, stride=1), 24 | ConvBlock(in_channels=8 * output_channel, out_channels=8 * output_channel, kernel_size=1, stride=1), 25 | DepthwiseSeprableConv2d(in_channels=8 * output_channel, out_channels=8 * output_channel, kernel_size=3, stride=2), 26 | ConvBlock(in_channels=8 * output_channel, out_channels=16 * output_channel, kernel_size=1, stride=1),# This is similar to the expanding layer of the SqueezeNet 27 | ) 28 | 29 | intermediate_block = [] 30 | for i in range(5): 31 | intermediate_block.append(DepthwiseSeprableConv2d(in_channels=16 * output_channel, out_channels=16 * output_channel, kernel_size=3, stride=1)) 32 | intermediate_block.append(ConvBlock(in_channels=16 * output_channel, out_channels=16 * output_channel, kernel_size=1, stride=1)) 33 | self.intermediate_block2 = nn.Sequential(*intermediate_block) 34 | 35 | self.final_block = nn.Sequential( 36 | DepthwiseSeprableConv2d(in_channels=16 * output_channel, out_channels=16 * output_channel, kernel_size=3,stride=2), 37 | ConvBlock(in_channels=16 * output_channel, out_channels=32 * output_channel, kernel_size=1, stride=1), # This is similar to the expanding layer of the SqueezeNet 38 | DepthwiseSeprableConv2d(in_channels=32 * output_channel, out_channels=32 * output_channel, kernel_size=3, stride=2), 39 | ConvBlock(in_channels=32 * output_channel, out_channels=32 * output_channel, kernel_size=1, stride=1), 40 | ) 41 | 42 | self.output_layer = nn.Sequential( 43 | nn.AdaptiveAvgPool2d((1,1)), 44 | nn.Flatten(), 45 | nn.Linear(in_features=32*output_channel, out_features=1000) 46 | ) 47 | 48 | 
def forward(self, x): 49 | x = self.input_block(x) 50 | x = self.intermediate_block1(x) 51 | x = self.intermediate_block2(x) 52 | x = self.final_block(x) 53 | x = self.output_layer(x) 54 | return x 55 | 56 | if __name__ == '__main__': 57 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 58 | model = MobileNet().to(device) 59 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/MobileNet/mobilenets_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DepthwiseSeprableConv2d(nn.Module): 6 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1): 7 | ''' 8 | A separable convolution block is a combination of depthwise convolution and pointwise convolution. This means 9 | that the input tensor is convolved with a kernel of size (kernel_size, kernel_size) and then the output of this 10 | operation is convolved with a 1x1 kernel. This is done to reduce the number of parameters in the model. 11 | ''' 12 | super(DepthwiseSeprableConv2d, self).__init__() 13 | 14 | self.depthwise = nn.Sequential( 15 | nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=in_channels), 16 | nn.BatchNorm2d(in_channels), 17 | nn.ReLU()) 18 | self.pointwise = nn.Sequential( 19 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), 20 | nn.BatchNorm2d(out_channels), 21 | nn.ReLU()) 22 | 23 | def forward(self, x): 24 | x = self.depthwise(x) 25 | x = self.pointwise(x) 26 | return x 27 | 28 | 29 | class ConvBlock(nn.Module): 30 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1): 31 | super(ConvBlock, self).__init__() 32 | 33 | self.conv_block = nn.Sequential( 34 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding), 35 | nn.BatchNorm2d(out_channels), 36 | nn.ReLU() 37 | ) 38 | 39 | def forward(self, x): 40 | return self.conv_block(x) -------------------------------------------------------------------------------- /Architectures/MobileNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/MobileNet/src/MobileNet_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/MobileNet/src/MobileNet_architecture.png -------------------------------------------------------------------------------- /Architectures/MobileNet/src/MobileNets_DepthwiseSeparableConv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/MobileNet/src/MobileNets_DepthwiseSeparableConv.png -------------------------------------------------------------------------------- /Architectures/MobileNetV2/README.md: -------------------------------------------------------------------------------- 1 | # [MobileNetV2] 2 | 3 | ## Overview 4 | This repository contains the implementation of MobileNetV2. 
Below you will find detailed information and resources related to this architecture. 5 | 6 | # TODO: Add more information about MobileNetV2 7 | # TODO: Add Resources 8 | -------------------------------------------------------------------------------- /Architectures/MobileNetV2/mobilenets_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DepthwiseSeprableConv2d(nn.Module): 6 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding='same'): 7 | ''' 8 | A separable convolution block is a combination of depthwise convolution and pointwise convolution. This means 9 | that the input tensor is convolved with a kernel of size (kernel_size, kernel_size) and then the output of this 10 | operation is convolved with a 1x1 kernel. This is done to reduce the number of parameters in the model. 11 | ''' 12 | super(DepthwiseSeprableConv2d, self).__init__() 13 | 14 | self.depthwise = nn.Sequential( 15 | nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=in_channels), 16 | nn.BatchNorm2d(in_channels), 17 | nn.ReLU()) 18 | self.pointwise = nn.Sequential( 19 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), 20 | nn.BatchNorm2d(out_channels), 21 | nn.ReLU()) 22 | 23 | def forward(self, x): 24 | x = self.depthwise(x) 25 | x = self.pointwise(x) 26 | return x 27 | 28 | 29 | class BottleneckResidualBlock(nn.Module): 30 | def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, expansion_factor=6): 31 | super(BottleneckResidualBlock, self).__init__() 32 | self.stride = stride 33 | expanded_channels = in_channels * expansion_factor 34 | 35 | layers = [] 36 | # Expansion phase 37 | if expansion_factor != 1: 38 | layers.extend([ 39 | nn.Conv2d(in_channels, expanded_channels, 1, bias=False), 40 | nn.BatchNorm2d(expanded_channels), 41 | nn.ReLU6(inplace=True) 42 | ]) 43 | 44 | # Depthwise phase 45 | layers.extend([ 46 | nn.Conv2d(expanded_channels, expanded_channels, kernel_size, stride=stride, 47 | padding=kernel_size // 2, groups=expanded_channels, bias=False), 48 | nn.BatchNorm2d(expanded_channels), 49 | nn.ReLU6(inplace=True) 50 | ]) 51 | 52 | # Projection phase 53 | layers.extend([ 54 | nn.Conv2d(expanded_channels, out_channels, 1, bias=False), 55 | nn.BatchNorm2d(out_channels) 56 | ]) 57 | 58 | self.conv = nn.Sequential(*layers) 59 | self.use_residual = in_channels == out_channels and stride == 1 60 | 61 | def forward(self, x): 62 | if self.use_residual: 63 | return x + self.conv(x) 64 | else: 65 | return self.conv(x) 66 | 67 | 68 | class ConvBlock(nn.Module): 69 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0): 70 | super(ConvBlock, self).__init__() 71 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False) 72 | self.bn = nn.BatchNorm2d(out_channels) 73 | self.relu = nn.ReLU6(inplace=True) 74 | 75 | def forward(self, x): 76 | x = self.conv(x) 77 | x = self.bn(x) 78 | x = self.relu(x) 79 | return x 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /Architectures/MobileNetV2/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mobilenets_module import * 3 | from torchsummary import summary 4 | 5 | class MobileNetV2(nn.Module): 6 | def __init__(self, in_channels, num_classes, 
expansion_factor_first_layer=1, expansion_factor=6): 7 | super(MobileNetV2, self).__init__() 8 | 9 | self.input_layer = ConvBlock(in_channels=in_channels, out_channels=32, kernel_size=3, stride=2, padding=1) 10 | 11 | self.bottleneck_1 = BottleneckResidualBlock(in_channels=32, out_channels=16, stride=1, expansion_factor=expansion_factor_first_layer) 12 | self.bottleneck_2 = nn.Sequential( 13 | BottleneckResidualBlock(in_channels=16, out_channels=24, stride=2, expansion_factor=expansion_factor), 14 | BottleneckResidualBlock(in_channels=24, out_channels=24, stride=1, expansion_factor=expansion_factor) 15 | ) 16 | self.bottleneck_3 = nn.Sequential( 17 | BottleneckResidualBlock(in_channels=24, out_channels=32, stride=2, expansion_factor=expansion_factor), 18 | BottleneckResidualBlock(in_channels=32, out_channels=32, stride=1, expansion_factor=expansion_factor), 19 | BottleneckResidualBlock(in_channels=32, out_channels=32, stride=1, expansion_factor=expansion_factor) 20 | ) 21 | self.bottleneck_4 = nn.Sequential( 22 | BottleneckResidualBlock(in_channels=32, out_channels=64, stride=2, expansion_factor=expansion_factor), 23 | BottleneckResidualBlock(in_channels=64, out_channels=64, stride=1, expansion_factor=expansion_factor), 24 | BottleneckResidualBlock(in_channels=64, out_channels=64, stride=1, expansion_factor=expansion_factor), 25 | BottleneckResidualBlock(in_channels=64, out_channels=64, stride=1, expansion_factor=expansion_factor) 26 | ) 27 | self.bottleneck_5 = nn.Sequential( 28 | BottleneckResidualBlock(in_channels=64, out_channels=96, stride=1, expansion_factor=expansion_factor), 29 | BottleneckResidualBlock(in_channels=96, out_channels=96, stride=1, expansion_factor=expansion_factor), 30 | BottleneckResidualBlock(in_channels=96, out_channels=96, stride=1, expansion_factor=expansion_factor) 31 | ) 32 | self.bottleneck_6 = nn.Sequential( 33 | BottleneckResidualBlock(in_channels=96, out_channels=160, stride=2, expansion_factor=expansion_factor), 34 | BottleneckResidualBlock(in_channels=160, out_channels=160, stride=1, expansion_factor=expansion_factor), 35 | BottleneckResidualBlock(in_channels=160, out_channels=160, stride=1, expansion_factor=expansion_factor) 36 | ) 37 | 38 | self.bottleneck_7 = nn.Sequential( 39 | BottleneckResidualBlock(in_channels=160, out_channels=320, stride=1, expansion_factor=expansion_factor) 40 | ) 41 | 42 | 43 | # This does not follow the original implementation but it does not really matter 44 | self.output_layer = nn.Sequential( 45 | ConvBlock(in_channels=320, out_channels=1280, kernel_size=1, stride=1), 46 | nn.AdaptiveAvgPool2d(1), 47 | nn.Flatten(), 48 | nn.Linear(1280, num_classes) 49 | ) 50 | 51 | def forward(self, x): 52 | x = self.input_layer(x) 53 | print(x.shape) 54 | x = self.bottleneck_1(x) 55 | print(x.shape) 56 | x = self.bottleneck_2(x) 57 | x = self.bottleneck_3(x) 58 | x = self.bottleneck_4(x) 59 | x = self.bottleneck_5(x) 60 | x = self.bottleneck_6(x) 61 | x = self.bottleneck_7(x) 62 | x = self.output_layer(x) 63 | 64 | return x 65 | 66 | 67 | from torchvision.models import mobilenet_v2 68 | if __name__ == '__main__': 69 | model = MobileNetV2(in_channels=3, num_classes=1000) 70 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/MobileNetV2/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary 
-------------------------------------------------------------------------------- /Architectures/ResNeXt/README.md: -------------------------------------------------------------------------------- 1 | # [ResNeXt] 2 | 3 | ## Overview 4 | This repository contains the implementation of ResNeXt. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/ResNext-05bd5cb9ae0a40fb811746f5a4331f65?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. ResNeXt: A homogeneous, multi-branch architecture that aggregates a set of transformations with the same topology. This design is easy to implement and scale up. 12 | 2. Cardinality: A new dimension that controls the number of transformations in ResNeXt. Experiments show that increasing cardinality is more effective than going deeper or wider for improving accuracy while maintaining model complexity. 13 | 3. Improved accuracy: ResNeXt achieves better accuracy than previous state-of-the-art models on the ImageNet-1K dataset while having lower complexity. It also performs well on the CIFAR datasets and COCO object detection task. 14 | 15 | ## Architecture Scheme 16 | Below a schematic representation of the ResNext modules that are used in the architecture: 17 | ![Image](./src/ResNext_Module.png) 18 | 19 | 20 | 21 | 22 | ## Reproduced Results (TBD) 23 | The following results were reproduced as per the methodology described in the paper: 24 | - Result 1: [Description and value] 25 | - Result 2: [Description and value] 26 | - Result 3: [Description and value] 27 | - ... 28 | 29 | 30 | ## References 31 | - [Original Paper](https://arxiv.org/abs/1611.05431) 32 | - [Detailed Blog Post](https://gvdmnni.notion.site/ResNext-05bd5cb9ae0a40fb811746f5a4331f65?pvs=4) 33 | -------------------------------------------------------------------------------- /Architectures/ResNeXt/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/ResNeXt/resnext50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from resnext_module import * 6 | 7 | 8 | class ResNext50(nn.Module): 9 | def __init__(self, num_classes = 1000, cardinality = 32): 10 | super(ResNext50, self).__init__() 11 | 12 | self.input_layers = nn.Sequential( 13 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 14 | nn.BatchNorm2d(64), 15 | nn.ReLU(inplace=True), 16 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 17 | ) 18 | 19 | conv_2_list = [] 20 | for i in range(3): 21 | if i == 0: 22 | # The first blok of each stage perform downsampling using stride = 2 on the grouped conv 3x3 23 | conv_2_list.append(ResNextBlock(in_channels=64, out_channels_bottleneck=128, out_channels=256, stride=1, groups=cardinality)) 24 | else: 25 | conv_2_list.append(ResNextBlock(in_channels=256, out_channels_bottleneck=128, out_channels=256, stride=1, groups=cardinality)) 26 | 27 | self.conv2 = nn.Sequential(*conv_2_list) 28 | 29 | conv_3_list = [] 30 | for i in range(4): 31 | if i == 0: 32 | conv_3_list.append(ResNextBlock(in_channels=256, 
out_channels_bottleneck=256, out_channels=512, stride=2, groups=cardinality)) 33 | else: 34 | conv_3_list.append(ResNextBlock(in_channels=512, out_channels_bottleneck=256, out_channels=512, stride=1, groups=cardinality)) 35 | self.conv3 = nn.Sequential(*conv_3_list) 36 | 37 | conv_4_list = [] 38 | for i in range(6): 39 | if i == 0: 40 | conv_4_list.append(ResNextBlock(in_channels=512, out_channels_bottleneck=512, out_channels=1024, stride=2, groups=cardinality)) 41 | else: 42 | conv_4_list.append(ResNextBlock(in_channels=1024, out_channels_bottleneck=512, out_channels=1024, stride=1, groups=cardinality)) 43 | self.conv4 = nn.Sequential(*conv_4_list) 44 | 45 | conv_5_list = [] 46 | for i in range(3): 47 | if i == 0: 48 | conv_5_list.append(ResNextBlock(in_channels=1024, out_channels_bottleneck=1024, out_channels=2048, stride=2, groups=cardinality)) 49 | else: 50 | conv_5_list.append(ResNextBlock(in_channels=2048, out_channels_bottleneck=1024, out_channels=2048, stride=1, groups=cardinality)) 51 | self.conv5 = nn.Sequential(*conv_5_list) 52 | 53 | self.output_layers = nn.Sequential( 54 | nn.AdaptiveAvgPool2d((1, 1)), 55 | nn.Flatten(), 56 | nn.Linear(2048, num_classes) 57 | ) 58 | def forward(self, x): 59 | x = self.input_layers(x) 60 | x = self.conv2(x) 61 | x = self.conv3(x) 62 | x = self.conv4(x) 63 | x = self.conv5(x) 64 | x = self.output_layers(x) 65 | return x 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | model = ResNext50(1000) 71 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/ResNeXt/resnext_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConvBlock(nn.Module): 6 | ''' 7 | This is a simple Convolutional block that consists of Conv2d, BatchNorm2d and ReLU 8 | ''' 9 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1): 10 | super(ConvBlock, self).__init__() 11 | 12 | self.conv_block = nn.Sequential( 13 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups), 14 | nn.BatchNorm2d(out_channels), 15 | nn.ReLU() 16 | ) 17 | def forward(self, x): 18 | return self.conv_block(x) 19 | 20 | 21 | class ResNextBlock(nn.Module): 22 | ''' 23 | This is the ResNext block that consists of 3 ConvBlocks, the first one is 1x1, the second one is 3x3 grouped and the last one is 1x1. 
24 | The shortcut is projection shortcut (option B from the paper) 25 | ''' 26 | def __init__(self, in_channels, out_channels_bottleneck, out_channels, groups, stride = 1): 27 | super(ResNextBlock, self).__init__() 28 | 29 | # first conv is a 1x1 (squeezing the channel) 30 | self.squeezeconv1x1 = ConvBlock(in_channels=in_channels, out_channels=out_channels_bottleneck, kernel_size=1, stride=1, padding=0) 31 | 32 | # second conv is 3x3 and it is also grouped 33 | self.grouped_conv3x3 = ConvBlock(in_channels=out_channels_bottleneck, out_channels=out_channels_bottleneck, kernel_size=3, stride=stride, padding=1, groups=groups) 34 | 35 | # third conv is again a 1x1 (expanding the channel) 36 | self.expandconv1x1 = ConvBlock(in_channels=out_channels_bottleneck, out_channels=out_channels, kernel_size=1, stride=1, padding=0) 37 | 38 | # as a shortcut we are using the option B from the paper (projection shortcut) 39 | self.shortcut = nn.Identity() 40 | if stride != 1 or in_channels != out_channels: 41 | self.shortcut = nn.Sequential( 42 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride), 43 | nn.BatchNorm2d(out_channels), 44 | # we do not use the ReLU activation function in the shortcut because we want to preserve the information flow 45 | ) 46 | 47 | self.relu = nn.ReLU(inplace=True) 48 | 49 | def forward(self, x): 50 | residual = x 51 | x = self.squeezeconv1x1(x) 52 | x = self.grouped_conv3x3(x) 53 | x = self.expandconv1x1(x) 54 | x += self.shortcut(residual) 55 | x = self.relu(x) 56 | return x 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Architectures/ResNeXt/src/ResNext_Module.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ResNeXt/src/ResNext_Module.png -------------------------------------------------------------------------------- /Architectures/ResNet/README.md: -------------------------------------------------------------------------------- 1 | # [ResNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of ResNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/ResNet-f34ba2f4ad474fbdbda365b881f8b1cc?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - Introduction of the residual learning framework for training very deep neural networks. 12 | - Demonstration that degradation of training accuracy in very deep networks is addressed by residual learning. 13 | - Comprehensive empirical evidence showing that residual networks are easier to optimize and can gain accuracy from substantially increased depth. 14 | - State-of-the-art results on ImageNet classification, with architectures exceeding 100 layers. 15 | - Generalization of the residual learning concept to object detection tasks. 
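In equation form, a building block learns a residual mapping rather than a direct one:

$$y = \mathcal{F}(x, \{W_i\}) + x$$

and when the spatial resolution or channel count changes between input and output, the identity shortcut is replaced by a learned projection (option B in the paper, which is what the blocks in `residual_block.py` use):

$$y = \mathcal{F}(x, \{W_i\}) + W_s x$$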
16 | 17 | ## Architecture Scheme 18 | Below is a schematic representation of the architecture: 19 | 20 | ![Architecture Scheme](https://github.com/GuidoManni/DeepLearningImplementation/blob/main/Architectures/ResNet/src/ResNet_building_block.png) 21 | 22 | ## Reproduced Results (TBD) 23 | The following results were reproduced as per the methodology described in the paper: 24 | - Result 1: [Description and value] 25 | - Result 2: [Description and value] 26 | - Result 3: [Description and value] 27 | - ... 28 | 29 | ## References 30 | - [Original Paper](https://arxiv.org/pdf/1512.03385) 31 | - [Detailed Blog Post](https://gvdmnni.notion.site/ResNet-f34ba2f4ad474fbdbda365b881f8b1cc?pvs=4) 32 | -------------------------------------------------------------------------------- /Architectures/ResNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/ResNet/residual_block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | class ResidualBlock1(nn.Module): 4 | def __init__(self, in_channels, out_channels, stride=1): 5 | super(ResidualBlock1, self).__init__() 6 | self.sub_block1 = nn.Sequential( 7 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), 8 | nn.BatchNorm2d(out_channels), 9 | nn.ReLU(inplace=True), 10 | ) 11 | self.sub_block2 = nn.Sequential( 12 | nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False), 13 | nn.BatchNorm2d(out_channels), 14 | ) 15 | 16 | # as a shortcut we are using the option B from the paper (projection shortcut) 17 | self.shortcut = nn.Identity() 18 | if stride != 1 or in_channels != out_channels: 19 | self.shortcut = nn.Sequential( 20 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride), 21 | nn.BatchNorm2d(out_channels), 22 | ) 23 | 24 | self.relu = nn.ReLU(inplace=True) 25 | 26 | def forward(self, x): 27 | residual = x 28 | x = self.sub_block1(x) 29 | x = self.sub_block2(x) 30 | residual = self.shortcut(residual) 31 | x += residual 32 | x = self.relu(x) 33 | return x 34 | 35 | class ResidualBlock2(nn.Module): 36 | def __init__(self, in_channels, out_channels1, out_channels2, stride=1): 37 | super(ResidualBlock2, self).__init__() 38 | self.sub_block1 = nn.Sequential( 39 | nn.Conv2d(in_channels, out_channels1, kernel_size=1, stride=1, padding=0, bias=False), 40 | nn.BatchNorm2d(out_channels1), 41 | nn.ReLU(inplace=True), 42 | ) 43 | self.sub_block2 = nn.Sequential( 44 | nn.Conv2d(out_channels1, out_channels1, kernel_size=3, stride=stride, padding=1, bias=False), 45 | nn.BatchNorm2d(out_channels1), 46 | ) 47 | 48 | self.sub_block3 = nn.Sequential( 49 | nn.Conv2d(out_channels1, out_channels2, kernel_size=1, stride=1, padding=0, bias=False), 50 | nn.BatchNorm2d(out_channels2), 51 | ) 52 | 53 | # as a shortcut we are using the option B from the paper (projection shortcut) 54 | self.shortcut = nn.Identity() 55 | if stride != 1 or in_channels != out_channels2: 56 | self.shortcut = nn.Sequential( 57 | nn.Conv2d(in_channels, out_channels2, kernel_size=1, stride=stride), 58 | nn.BatchNorm2d(out_channels2), 59 | # we do not use the ReLU activation function in the shortcut because we want to preserve the information flow 60 | ) 61 | 62 | self.relu = nn.ReLU(inplace=True) 63 | def forward(self, x): 64 | 
residual = x 65 | x = self.sub_block1(x) 66 | x = self.sub_block2(x) 67 | x = self.sub_block3(x) 68 | residual = self.shortcut(residual) 69 | x += residual 70 | x = self.relu(x) 71 | return x -------------------------------------------------------------------------------- /Architectures/ResNet/resnet101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from residual_block import ResidualBlock2 4 | from torchsummary import summary 5 | 6 | class ResNet101(nn.Module): 7 | def __init__(self, num_classes): 8 | super(ResNet101, self).__init__() 9 | self.input_layer = nn.Sequential( 10 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 11 | nn.BatchNorm2d(64), 12 | nn.ReLU(inplace=True), 13 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 14 | ) 15 | 16 | self.conv2_x = nn.Sequential( 17 | ResidualBlock2(in_channels=64, out_channels1=64, out_channels2=256, stride=1), 18 | ResidualBlock2(in_channels=256, out_channels1=64, out_channels2=256, stride=1), 19 | ResidualBlock2(in_channels=256, out_channels1=64, out_channels2=256, stride=1), 20 | ) 21 | 22 | self.conv3_x = nn.Sequential( 23 | ResidualBlock2(in_channels=256, out_channels1=128, out_channels2=512, stride=2), 24 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 25 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 26 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 27 | ) 28 | 29 | layers = [] 30 | for i in range(23): 31 | if i == 0: 32 | layers.append(ResidualBlock2(in_channels=512, out_channels1=256, out_channels2=1024, stride=2)) 33 | else: 34 | layers.append(ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1)) 35 | self.conv4_x = nn.Sequential(*layers) 36 | 37 | self.conv5_x = nn.Sequential( 38 | ResidualBlock2(in_channels=1024, out_channels1=512, out_channels2=2048, stride=2), 39 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 40 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 41 | ) 42 | 43 | self.output_layer = nn.Sequential( 44 | nn.AdaptiveAvgPool2d((1, 1)), 45 | nn.Flatten(), 46 | nn.Linear((2048), num_classes) 47 | ) 48 | 49 | def forward(self, x): 50 | x = self.input_layer(x) 51 | x = self.conv2_x(x) 52 | x = self.conv3_x(x) 53 | x = self.conv4_x(x) 54 | x = self.conv5_x(x) 55 | x = self.output_layer(x) 56 | return x 57 | 58 | if __name__ == '__main__': 59 | model = ResNet101(1000) 60 | summary(model, (3, 224, 224)) 61 | -------------------------------------------------------------------------------- /Architectures/ResNet/resnet152.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from residual_block import ResidualBlock2 4 | from torchsummary import summary 5 | 6 | class ResNet152(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(ResNet152, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 12 | nn.BatchNorm2d(64), 13 | nn.ReLU(inplace=True), 14 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 15 | ) 16 | 17 | self.conv2_x = nn.Sequential( 18 | ResidualBlock2(in_channels=64, out_channels1=64, out_channels2=256, stride=1), 19 | ResidualBlock2(in_channels=256, out_channels1=64, out_channels2=256, stride=1), 20 | 
ResidualBlock2(in_channels=256, out_channels1=64, out_channels2=256, stride=1), 21 | ) 22 | 23 | self.conv3_x = nn.Sequential( 24 | ResidualBlock2(in_channels=256, out_channels1=128, out_channels2=512, stride=2), 25 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 26 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 27 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 28 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 29 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 30 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 31 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 32 | ) 33 | 34 | layers = [] 35 | for i in range(36): 36 | if i == 0: 37 | layers.append(ResidualBlock2(in_channels=512, out_channels1=256, out_channels2=1024, stride=2)) 38 | else: 39 | layers.append(ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1)) 40 | self.conv4_x = nn.Sequential(*layers) 41 | 42 | self.conv5_x = nn.Sequential( 43 | ResidualBlock2(in_channels=1024, out_channels1=512, out_channels2=2048, stride=2), 44 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 45 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 46 | ) 47 | 48 | self.output_layer = nn.Sequential( 49 | nn.AdaptiveAvgPool2d((1, 1)), 50 | nn.Flatten(), 51 | nn.Linear((2048), num_classes) 52 | ) 53 | 54 | def forward(self, x): 55 | x = self.input_layer(x) 56 | x = self.conv2_x(x) 57 | x = self.conv3_x(x) 58 | x = self.conv4_x(x) 59 | x = self.conv5_x(x) 60 | x = self.output_layer(x) 61 | return x 62 | 63 | if __name__ == '__main__': 64 | model = ResNet152(1000) 65 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/ResNet/resnet18.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from residual_block import ResidualBlock1 4 | from torchsummary import summary 5 | 6 | 7 | 8 | 9 | 10 | class ResNet18(nn.Module): 11 | def __init__(self, num_classes=1000): 12 | super(ResNet18, self).__init__() 13 | 14 | self.input_layer = nn.Sequential( 15 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 16 | nn.BatchNorm2d(64), 17 | nn.ReLU(inplace=True), 18 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 19 | ) 20 | 21 | self.conv2_x = nn.Sequential( 22 | ResidualBlock1(in_channels=64, out_channels=64, stride=1), 23 | ResidualBlock1(in_channels=64, out_channels=128, stride=1), 24 | ) 25 | 26 | self.conv3_x = nn.Sequential( 27 | ResidualBlock1(in_channels=128, out_channels=128, stride=2), 28 | ResidualBlock1(in_channels=128, out_channels=256, stride=1), 29 | ) 30 | 31 | self.conv4_x = nn.Sequential( 32 | ResidualBlock1(in_channels=256, out_channels=256, stride=2), 33 | ResidualBlock1(in_channels=256, out_channels=512, stride=1), 34 | ) 35 | 36 | self.conv5_x = nn.Sequential( 37 | ResidualBlock1(in_channels=512, out_channels=512, stride=2), 38 | ResidualBlock1(in_channels=512, out_channels=512, stride=1), 39 | ) 40 | 41 | self.output_layer = nn.Sequential( 42 | nn.AdaptiveAvgPool2d((1,1)), 43 | nn.Flatten(), 44 | nn.Linear((512), num_classes) 45 | ) 46 | 47 | def forward(self, x): 48 | x = self.input_layer(x) 49 | x = self.conv2_x(x) 50 | x = 
self.conv3_x(x) 51 | x = self.conv4_x(x) 52 | x = self.conv5_x(x) 53 | x = self.output_layer(x) 54 | return x 55 | 56 | 57 | if __name__ == '__main__': 58 | model = ResNet18(10) 59 | summary(model, (3, 224, 224)) 60 | 61 | -------------------------------------------------------------------------------- /Architectures/ResNet/resnet34.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from residual_block import ResidualBlock1 4 | from torchsummary import summary 5 | 6 | class ResNet34(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(ResNet34, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 12 | nn.BatchNorm2d(64), 13 | nn.ReLU(inplace=True), 14 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 15 | ) 16 | 17 | # repeat 3 times 18 | self.conv2_x = nn.Sequential( 19 | ResidualBlock1(in_channels=64, out_channels=64, stride=1), 20 | ResidualBlock1(in_channels=64, out_channels=64, stride=1), 21 | ResidualBlock1(in_channels=64, out_channels=128, stride=1), 22 | ) 23 | 24 | # repeat 4 times 25 | self.conv3_x = nn.Sequential( 26 | ResidualBlock1(in_channels=128, out_channels=128, stride=2), 27 | ResidualBlock1(in_channels=128, out_channels=128, stride=1), 28 | ResidualBlock1(in_channels=128, out_channels=128, stride=1), 29 | ResidualBlock1(in_channels=128, out_channels=256, stride=1), 30 | ) 31 | 32 | # repeat 6 times 33 | self.conv4_x = nn.Sequential( 34 | ResidualBlock1(in_channels=256, out_channels=256, stride=2), 35 | ResidualBlock1(in_channels=256, out_channels=256, stride=1), 36 | ResidualBlock1(in_channels=256, out_channels=256, stride=1), 37 | ResidualBlock1(in_channels=256, out_channels=256, stride=1), 38 | ResidualBlock1(in_channels=256, out_channels=256, stride=1), 39 | ResidualBlock1(in_channels=256, out_channels=512, stride=1), 40 | ) 41 | 42 | # repeat 3 times 43 | self.conv5_x = nn.Sequential( 44 | ResidualBlock1(in_channels=512, out_channels=512, stride=2), 45 | ResidualBlock1(in_channels=512, out_channels=512, stride=1), 46 | ResidualBlock1(in_channels=512, out_channels=512, stride=1), 47 | ) 48 | 49 | self.output_layer = nn.Sequential( 50 | nn.AdaptiveAvgPool2d((1, 1)), 51 | nn.Flatten(), 52 | nn.Linear((512), num_classes) 53 | ) 54 | 55 | def forward(self, x): 56 | x = self.input_layer(x) 57 | print(x.shape) 58 | x = self.conv2_x(x) 59 | x = self.conv3_x(x) 60 | x = self.conv4_x(x) 61 | x = self.conv5_x(x) 62 | print(x.shape) 63 | x = self.output_layer(x) 64 | return x 65 | 66 | if __name__ == '__main__': 67 | model = ResNet34(1000) 68 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/ResNet/resnet50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from residual_block import ResidualBlock2 4 | from torchsummary import summary 5 | 6 | class ResNet50(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(ResNet50, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 12 | nn.BatchNorm2d(64), 13 | nn.ReLU(inplace=True), 14 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 15 | ) 16 | 17 | self.conv2_x = nn.Sequential( 18 | ResidualBlock2(in_channels=64, out_channels1=64, out_channels2=256, stride=1), 19 | ResidualBlock2(in_channels=256, 
out_channels1=64, out_channels2=256, stride=1), 20 | ResidualBlock2(in_channels=256, out_channels1=64, out_channels2=256, stride=1), 21 | ) 22 | 23 | self.conv3_x = nn.Sequential( 24 | ResidualBlock2(in_channels=256, out_channels1=128, out_channels2=512, stride=2), 25 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 26 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 27 | ResidualBlock2(in_channels=512, out_channels1=128, out_channels2=512, stride=1), 28 | ) 29 | 30 | self.conv4_x = nn.Sequential( 31 | ResidualBlock2(in_channels=512, out_channels1=256, out_channels2=1024, stride=2), 32 | ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1), 33 | ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1), 34 | ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1), 35 | ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1), 36 | ResidualBlock2(in_channels=1024, out_channels1=256, out_channels2=1024, stride=1), 37 | ) 38 | 39 | self.conv5_x = nn.Sequential( 40 | ResidualBlock2(in_channels=1024, out_channels1=512, out_channels2=2048, stride=2), 41 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 42 | ResidualBlock2(in_channels=2048, out_channels1=512, out_channels2=2048, stride=1), 43 | ) 44 | 45 | self.output_layer = nn.Sequential( 46 | nn.AdaptiveAvgPool2d((1, 1)), 47 | nn.Flatten(), 48 | nn.Linear((2048), num_classes) 49 | ) 50 | 51 | def forward(self, x): 52 | x = self.input_layer(x) 53 | x = self.conv2_x(x) 54 | x = self.conv3_x(x) 55 | x = self.conv4_x(x) 56 | x = self.conv5_x(x) 57 | x = self.output_layer(x) 58 | return x 59 | 60 | if __name__ == '__main__': 61 | model = ResNet50(1000) 62 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/ResNet/src/ResNet_building_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ResNet/src/ResNet_building_block.png -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/README.md: -------------------------------------------------------------------------------- 1 | # [Residual Attention Network] 2 | 3 | ## Overview 4 | This repository contains the implementation of Residual Attention Network. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/Residual-Attention-Network-54828e44946a4306b774610ceb251b26?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. Proposing a novel attention mechanism that can be integrated into state-of-the-art feedforward architectures 12 | 2. Introducing the Attention Module, which generates attention-aware features via a bottom-up top-down structure 13 | 3. Developing attention residual learning to handle the optimization difficulties caused by stacking Attention Modules 14 | 4. 
Demonstrating state-of-the-art performance on CIFAR-10, CIFAR-100, and ImageNet while being computationally efficient 15 | 16 | 17 | ## Architecture Scheme 18 | Below a schematic representation of the Residual Attention Network units that are used in the architecture: 19 | ![Image](./src/RAN_attention_module.png)**Residual Attention Module** 20 | ![Image](./src/RAN_residual_unit.png)**Residual Unit** 21 | ![Image](./src/RAN_Soft_Mask_Brach.png)**Soft Masking Branch** 22 | 23 | 24 | 25 | 26 | ## Reproduced Results (TBD) 27 | The following results were reproduced as per the methodology described in the paper: 28 | - Result 1: [Description and value] 29 | - Result 2: [Description and value] 30 | - Result 3: [Description and value] 31 | - ... 32 | 33 | 34 | ## References 35 | - [Original Paper](https://arxiv.org/abs/1704.06904) 36 | - [Detailed Blog Post](https://gvdmnni.notion.site/Residual-Attention-Network-54828e44946a4306b774610ceb251b26?pvs=4) 37 | -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/ResNetAttention92.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ran_modules import * 3 | import torch.nn as nn 4 | from torchsummary import summary 5 | 6 | class ResidualAttentionModule_92(nn.Module): 7 | def __init__(self, num_classes, in_channels, out_channels, num_of_updown, t=2, r=1): 8 | super(ResidualAttentionModule_92, self).__init__() 9 | 10 | self.input_layer = nn.Sequential( 11 | nn.Conv2d(in_channels, out_channels, kernel_size=7, stride=2, padding=3), 12 | nn.BatchNorm2d(out_channels), 13 | nn.ReLU(inplace=True), 14 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 15 | ) 16 | 17 | self.residual_unit_1 = ResidualUnit(in_channels=64, out_channels=128) 18 | 19 | self.attention_module_1 = ResidualAttentionModule(in_channels=128, out_channels=128, num_of_updown=num_of_updown, t=t, r=r) 20 | 21 | self.residual_unit_2 = ResidualUnit(in_channels=128, out_channels=256, stride=2) 22 | 23 | self.attention_module_2 = ResidualAttentionModule(in_channels=256, out_channels=256, num_of_updown=num_of_updown, t=t, r=r) 24 | 25 | self.attention_module_3 = ResidualAttentionModule(in_channels=256, out_channels=256, num_of_updown=num_of_updown, t=t, r=r) 26 | 27 | self.residual_unit_3 = ResidualUnit(in_channels=256, out_channels=512, stride=2) 28 | 29 | self.attention_module_4 = ResidualAttentionModule(in_channels=512, out_channels=512, num_of_updown=num_of_updown, t=t, r=r) 30 | 31 | self.attention_module_5 = ResidualAttentionModule(in_channels=512, out_channels=512, num_of_updown=num_of_updown, t=t, r=r) 32 | 33 | self.residual_unit_4 = ResidualUnit(in_channels=512, out_channels=1024, stride=2) 34 | 35 | 36 | self.output_layer = nn.Sequential( 37 | nn.AdaptiveAvgPool2d((1, 1)), 38 | nn.Flatten(), 39 | nn.Linear(1024, num_classes) 40 | ) 41 | 42 | 43 | def forward(self, x): 44 | x = self.input_layer(x) 45 | x = self.residual_unit_1(x) 46 | x = self.attention_module_1(x) 47 | x = self.residual_unit_2(x) 48 | x = self.attention_module_2(x) 49 | x = self.attention_module_3(x) 50 | x = self.residual_unit_3(x) 51 | x = self.attention_module_4(x) 52 | x = self.attention_module_5(x) 53 | x = self.residual_unit_4(x) 54 | 55 | 56 | return x 57 | 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | model = ResidualAttentionModule_92(num_classes=1000, in_channels=3, out_channels=64, num_of_updown=1, t=2, r=1) 63 | x = torch.randn(1, 3, 224, 224) 64 | summary(model, (3, 224, 224)) 
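# Note: each ResidualAttentionModule applied above fuses its trunk branch T(x)
# and soft-mask branch M(x) with attention residual learning,
# output = T(x) * (1 + M(x)), plus an additional skip path through a residual
# unit in this implementation; see ran_modules.py for the details.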
-------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/ran_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ResidualUnit(nn.Module): 6 | def __init__(self, in_channels, out_channels, stride=1): 7 | super(ResidualUnit, self).__init__() 8 | self.sub_block1 = nn.Sequential( 9 | nn.BatchNorm2d(in_channels), 10 | nn.ReLU(inplace=True), 11 | nn.Conv2d(in_channels, (4*in_channels), kernel_size=3, stride=stride, padding=1, bias=False), 12 | ) 13 | self.sub_block2 = nn.Sequential( 14 | nn.BatchNorm2d(4*in_channels), 15 | nn.ReLU(inplace=True), 16 | nn.Conv2d((4*in_channels), out_channels, kernel_size=1, stride=1, padding=0, bias=False), 17 | ) 18 | 19 | # as a shortcut we are using the option B from the paper (projection shortcut) 20 | self.shortcut = nn.Identity() 21 | if stride != 1 or in_channels != out_channels: 22 | self.shortcut = nn.Sequential( 23 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride), 24 | nn.BatchNorm2d(out_channels), 25 | ) 26 | 27 | def forward(self, x): 28 | residual = x 29 | x = self.sub_block1(x) 30 | x = self.sub_block2(x) 31 | residual = self.shortcut(residual) 32 | x += residual 33 | return x 34 | 35 | 36 | 37 | class ResidualAttentionModule(nn.Module): 38 | def __init__(self, in_channels, out_channels, num_of_updown, t=2, r=1): 39 | super(ResidualAttentionModule, self).__init__() 40 | 41 | # Trunk branch 42 | residual_unit_trunck_list = [] 43 | for _ in range(t): 44 | residual_unit_trunck_list.append(ResidualUnit(in_channels, out_channels)) 45 | self.trunk_branch = nn.Sequential(*residual_unit_trunck_list) 46 | 47 | # Initial downsampling 48 | list_of_initial_downsampling = [] 49 | for i in range(num_of_updown - 1): 50 | list_of_initial_downsampling.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) 51 | for _ in range(r): 52 | list_of_initial_downsampling.append(ResidualUnit(in_channels, out_channels)) 53 | self.initial_downsampling = nn.Sequential(*list_of_initial_downsampling) 54 | 55 | # Ending downsampling 56 | list_of_ending_downsampling = [] 57 | list_of_ending_downsampling.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) 58 | for _ in range(2 * r): 59 | list_of_ending_downsampling.append(ResidualUnit(in_channels, out_channels)) 60 | self.ending_downsampling = nn.Sequential(*list_of_ending_downsampling) 61 | 62 | # Upsampling 63 | list_of_upsampling = [] 64 | list_of_upsampling.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)) 65 | for _ in range(r): 66 | list_of_upsampling.append(ResidualUnit(in_channels, out_channels)) 67 | 68 | for _ in range(num_of_updown - 1): 69 | list_of_upsampling.append(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)) 70 | for _ in range(r): 71 | list_of_upsampling.append(ResidualUnit(in_channels, out_channels)) 72 | self.upsampling = nn.Sequential(*list_of_upsampling) 73 | 74 | # Skip connection 75 | self.skip_connection = ResidualUnit(in_channels, out_channels) 76 | self.sigmoid = nn.Sigmoid() 77 | 78 | 79 | 80 | def forward(self, x): 81 | # Trunk branch 82 | trunk_output = self.trunk_branch(x) 83 | 84 | # Soft mask branch 85 | downsampled = self.initial_downsampling(x) 86 | downsampled = self.ending_downsampling(downsampled) 87 | upsampled = self.upsampling(downsampled) 88 | soft_mask = self.sigmoid(upsampled) 89 | print(soft_mask.shape) 90 | print(trunk_output.shape) 91 | 92 | # Skip 
connection 93 | skip_output = self.skip_connection(x) 94 | 95 | # Combining trunk and soft mask branches 96 | output = trunk_output * (1 + soft_mask) + skip_output 97 | return output 98 | -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/src/RAN_Soft_Mask_Brach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ResidualAttentionNetwork/src/RAN_Soft_Mask_Brach.png -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/src/RAN_attention_module.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ResidualAttentionNetwork/src/RAN_attention_module.png -------------------------------------------------------------------------------- /Architectures/ResidualAttentionNetwork/src/RAN_residual_unit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ResidualAttentionNetwork/src/RAN_residual_unit.png -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/README.md: -------------------------------------------------------------------------------- 1 | # [InceptionV2/V3] 2 | 3 | ## Overview 4 | This repository contains the implementation of InceptionV2/V3. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/InceptionV2-V3-a5fa66c1e34c495aabd5b8950e4389f5?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - Introduction of design principles for efficiently scaling up CNNs, focusing on factorized convolutions and dimension reduction. 12 | - Development of Inception-v2 and Inception-v3 architectures, which significantly improve upon the original GoogLeNet design. 13 | - Proposal of label smoothing as a regularization technique to prevent the network from becoming too confident in its predictions. 14 | - Investigation of the impact of input resolution on network performance, showing that lower resolution inputs can still achieve competitive results when the network is properly adapted. 15 | - Achievement of state-of-the-art performance on the ILSVRC 2012 classification benchmark, with a substantial reduction in computational cost compared to other top-performing networks. 16 | 17 | 18 | ## Architecture Scheme 19 | Below a schematic representation of the Inception modules that are used in the architecture: 20 | ![Inception Module](./src/InceptionV3.png) 21 | *Figure: Inception modules with factorized convolutions. 
The original 5x5 convolutional filter has been replaced with two consecutive 3x3 convolutional filters, reducing computational cost while maintaining the effective receptive field.* 22 | 23 | ![Inception Module](https://github.com/GuidoManni/DeepLearningImplementation/blob/main/Architectures/Rethinked%20Inception/src/Improved%20Inception%20Module%20with%20more%20factorization.png) 24 | *Figure: Inception modules after the factorization of the nxn convolutions.* 25 | 26 | ![Inception Module](https://github.com/GuidoManni/DeepLearningImplementation/blob/main/Architectures/Rethinked%20Inception/src/Inception%20Module%20with%20expanded%20filter%20bank.png) 27 | *Figure: Inception modules with expanded filter bank outputs to promote high dimensional representations.* 28 | 29 | 30 | ## Reproduced Results (TBD) 31 | The following results were reproduced as per the methodology described in the paper: 32 | - Result 1: [Description and value] 33 | - Result 2: [Description and value] 34 | - Result 3: [Description and value] 35 | - ... 36 | 37 | 38 | ## References 39 | - [Original Paper](https://arxiv.org/abs/1512.00567) 40 | - [Detailed Blog Post](https://gvdmnni.notion.site/InceptionV2-V3-a5fa66c1e34c495aabd5b8950e4389f5?pvs=44) 41 | -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/inceptionV3.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Inception-v3 includes all the improvements of Inception-v2, plus the following additional enhancements: 3 | - RMSProp optimizer 4 | - BatchNorm in the auxiliary classifiers 5 | - Label smoothing regularization 6 | Since these changes are more about the training process than the architecture itself, we will directly use the Inception-v3. 
7 | ''' 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torchsummary import summary 12 | 13 | from inception_blocks import * 14 | 15 | class InceptionV3(nn.Module): 16 | def __init__(self, num_classes=1000): 17 | super(InceptionV3, self).__init__() 18 | 19 | # Input layers 20 | self.input_layers = nn.Sequential( 21 | nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2), 22 | nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1), 23 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1), 24 | nn.MaxPool2d(kernel_size=3, stride=2), 25 | nn.Conv2d(in_channels=64, out_channels=80, kernel_size=1, stride=1), 26 | nn.Conv2d(in_channels=80, out_channels=192, kernel_size=3), 27 | nn.MaxPool2d(kernel_size=3, stride=2) 28 | ) 29 | 30 | # Inception Blocks5 (from Figure 5 of the original paper) 31 | self.inception_block5_1 = InceptionBlock5(in_channels=192, pool_features=32) 32 | self.inception_block5_2 = InceptionBlock5(in_channels=256, pool_features=64) 33 | self.inception_block5_3 = InceptionBlock5(in_channels=288, pool_features=64) 34 | 35 | # Before the next inception block, we need to add a reduction block 36 | self.reduction_block1 = ReductionBlock(in_channels=288, out_channels1=[384, 384], out_channels2=[64, 96, 96], is_second=False) 37 | 38 | 39 | # Inception Blocks6 (from Figure 6 of the original paper) 40 | self.inception_block6_1 = InceptionBlock6(in_channels=768, out_1x1=192, red_7x7=160, out_7x7=192, red_1x7_7x1=160, out_1x7_7x1=192, out_pool=192) 41 | self.inception_block6_2 = InceptionBlock6(in_channels=768, out_1x1=192, red_7x7=160, out_7x7=192, red_1x7_7x1=160, out_1x7_7x1=192, out_pool=192) 42 | self.inception_block6_3 = InceptionBlock6(in_channels=768, out_1x1=192, red_7x7=160, out_7x7=192, red_1x7_7x1=160, out_1x7_7x1=192, out_pool=192) 43 | self.inception_block6_4 = InceptionBlock6(in_channels=768, out_1x1=192, red_7x7=160, out_7x7=192, red_1x7_7x1=160, out_1x7_7x1=192, out_pool=192) 44 | self.inception_block6_5 = InceptionBlock6(in_channels=768, out_1x1=192, red_7x7=160, out_7x7=192, red_1x7_7x1=160, out_1x7_7x1=192, out_pool=192) 45 | 46 | # Before the next inception block, we need to add a reduction block 47 | self.reduction_block2 = ReductionBlock(in_channels=768, out_channels1=[192, 320], out_channels2=[192, 192, 192, 192], is_second=True) 48 | 49 | # Inception Blocks7 (from Figure 7 of the original paper) 50 | self.inception_block7_1 = InceptionBlock7(in_channels=1280) 51 | self.inception_block7_2 = InceptionBlock7(in_channels=2048) 52 | 53 | # Output layers 54 | self.output_layers = nn.Sequential( 55 | nn.AdaptiveAvgPool2d((1, 1)), 56 | nn.Flatten(), 57 | nn.Linear(2048, num_classes) 58 | ) 59 | 60 | 61 | def forward(self, x): 62 | x = self.input_layers(x) 63 | x = self.inception_block5_1(x) 64 | x = self.inception_block5_2(x) 65 | x = self.inception_block5_3(x) 66 | x = self.reduction_block1(x) 67 | x = self.inception_block6_1(x) 68 | x = self.inception_block6_2(x) 69 | x = self.inception_block6_3(x) 70 | x = self.inception_block6_4(x) 71 | x = self.inception_block6_5(x) 72 | x = self.reduction_block2(x) 73 | x = self.inception_block7_1(x) 74 | x = self.inception_block7_2(x) 75 | x = self.output_layers(x) 76 | 77 | return x 78 | 79 | if __name__ == '__main__': 80 | model = InceptionV3() 81 | summary(model, (3, 299, 299)) -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/requirements.txt: 
-------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/src/Improved Inception Module with more factorization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Rethinked Inception/src/Improved Inception Module with more factorization.png -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/src/Improved Inception Module.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Rethinked Inception/src/Improved Inception Module.png -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/src/Inception Module with expanded filter bank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Rethinked Inception/src/Inception Module with expanded filter bank.png -------------------------------------------------------------------------------- /Architectures/Rethinked Inception/src/InceptionV3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Rethinked Inception/src/InceptionV3.png -------------------------------------------------------------------------------- /Architectures/SENet/README.md: -------------------------------------------------------------------------------- 1 | # [SENet] 2 | 3 | ## Overview 4 | This repository contains the implementation of SENet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/SENet-4af63658ec1f48649378e864c2232231?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. Introduction of the Squeeze-and-Excitation (SE) block, a lightweight yet effective module for modeling channel-wise relationships in CNN features. 12 | 2. Development of the SENet architecture, which can be created by stacking multiple SE blocks. 13 | 3. Demonstration of significant performance improvements when integrating SE blocks into existing state-of-the-art architectures like ResNet, Inception, and MobileNet. 14 | 4. Extensive experimentation showing the effectiveness of SENets across various datasets and tasks, including image classification, scene classification, and object detection. 15 | 5. Winning the ILSVRC 2017 classification competition, showcasing the practical impact of the proposed method. 
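To make the recalibration step concrete, here is a minimal, self-contained sketch of the squeeze-and-excitation operation (the function name and shapes are made up for this README, and the weights are randomly initialized purely for illustration); the block used by the model in this folder is `SEBlock` in `senet_block.py`.

```python
import torch
import torch.nn as nn

def se_sketch(x: torch.Tensor, reduction: int = 16) -> torch.Tensor:
    b, c, _, _ = x.shape
    # Squeeze: global average pooling gives one descriptor per channel.
    z = x.mean(dim=(2, 3))                              # (b, c)
    # Excitation: a two-layer bottleneck maps descriptors to weights in [0, 1].
    excitation = nn.Sequential(
        nn.Linear(c, c // reduction), nn.ReLU(),
        nn.Linear(c // reduction, c), nn.Sigmoid(),
    )
    s = excitation(z).view(b, c, 1, 1)                  # (b, c, 1, 1)
    # Scale: reweight the original feature map channel by channel.
    return x * s

print(se_sketch(torch.randn(2, 256, 14, 14)).shape)  # torch.Size([2, 256, 14, 14])
```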
16 | 17 | ## Architecture Scheme 18 | Below a schematic representation of the SENet block that is the main contribution of this paper 19 | 20 | ![Image](./src/SENet_Block.png) 21 | 22 | 23 | 24 | 25 | ## Reproduced Results (TBD) 26 | The following results were reproduced as per the methodology described in the paper: 27 | - Result 1: [Description and value] 28 | - Result 2: [Description and value] 29 | - Result 3: [Description and value] 30 | - ... 31 | 32 | 33 | ## References 34 | - [Original Paper](https://arxiv.org/abs/1602.07360) 35 | - [Detailed Blog Post](https://gvdmnni.notion.site/SENet-4af63658ec1f48649378e864c2232231?pvs=4) 36 | -------------------------------------------------------------------------------- /Architectures/SENet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/SENet/senet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from senet_block import * 6 | 7 | 8 | class SENet(nn.Module): 9 | def __init__(self, num_classes = 1000, reduction = 16): 10 | super(SENet, self).__init__() 11 | 12 | self.input_layers = nn.Sequential( 13 | ConvBlock(in_channels=3, out_channels=64, kernel_size=7, stride=2, padding=3), 14 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 15 | ) 16 | 17 | self.conv2_x = nn.Sequential( 18 | ResidualBlock(in_channels=64, out_channels_bottleneck=64, out_channels=256, stride=1), 19 | SEBlock(in_feature=256, reduction=reduction), 20 | ResidualBlock(in_channels=256, out_channels_bottleneck=64, out_channels=256, stride=1), 21 | SEBlock(in_feature=256, reduction=reduction), 22 | ResidualBlock(in_channels=256, out_channels_bottleneck=64, out_channels=256, stride=1), 23 | SEBlock(in_feature=256, reduction=reduction), 24 | ) 25 | 26 | 27 | 28 | conv_3x_list = [] 29 | for i in range(4): 30 | if i == 0: 31 | conv_3x_list.append(ResidualBlock(in_channels=256, out_channels_bottleneck=128, out_channels=512, stride=2)) 32 | else: 33 | conv_3x_list.append(ResidualBlock(in_channels=512, out_channels_bottleneck=128, out_channels=512, stride=1)) 34 | 35 | conv_3x_list.append(SEBlock(in_feature=512, reduction=reduction)) 36 | self.conv3_x = nn.Sequential(*conv_3x_list) 37 | 38 | 39 | conv_4x_list = [] 40 | for i in range(6): 41 | if i == 0: 42 | conv_4x_list.append(ResidualBlock(in_channels=512, out_channels_bottleneck=256, out_channels=1024, stride=2)) 43 | else: 44 | conv_4x_list.append(ResidualBlock(in_channels=1024, out_channels_bottleneck=256, out_channels=1024, stride=1)) 45 | 46 | conv_4x_list.append(SEBlock(in_feature=1024, reduction=reduction)) 47 | self.conv4_x = nn.Sequential(*conv_4x_list) 48 | 49 | 50 | self.conv5_x = nn.Sequential( 51 | ResidualBlock(in_channels=1024, out_channels_bottleneck=512, out_channels=2048, stride=2), 52 | SEBlock(in_feature=2048, reduction=reduction), 53 | ResidualBlock(in_channels=2048, out_channels_bottleneck=512, out_channels=2048, stride=1), 54 | SEBlock(in_feature=2048, reduction=reduction), 55 | ResidualBlock(in_channels=2048, out_channels_bottleneck=512, out_channels=2048, stride=1), 56 | SEBlock(in_feature=2048, reduction=reduction), 57 | ) 58 | 59 | 60 | 61 | self.output_layer = nn.Sequential( 62 | nn.AdaptiveAvgPool2d((1, 1)), 63 | nn.Flatten(), 64 | nn.Linear(2048, num_classes) 65 | ) 66 
| 67 | def forward(self, x): 68 | x = self.input_layers(x) 69 | x = self.conv2_x(x) 70 | 71 | x = self.conv3_x(x) 72 | 73 | x = self.conv4_x(x) 74 | 75 | x = self.conv5_x(x) 76 | 77 | x = self.output_layer(x) 78 | return x 79 | 80 | 81 | if __name__ == '__main__': 82 | model = SENet(1000) 83 | summary(model, (3, 224, 224)) 84 | 85 | -------------------------------------------------------------------------------- /Architectures/SENet/senet_block.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class ConvBlock(nn.Module): 5 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0): 6 | super(ConvBlock, self).__init__() 7 | 8 | self.conv_block = nn.Sequential( 9 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding), 10 | nn.BatchNorm2d(out_channels), 11 | nn.ReLU() 12 | ) 13 | 14 | def forward(self, x): 15 | return self.conv_block(x) 16 | 17 | 18 | class SEBlock(nn.Module): 19 | def __init__(self, in_feature, reduction): 20 | super(SEBlock, self).__init__() 21 | bootleneck_feature = int(in_feature / reduction) 22 | self.se_block = nn.Sequential( 23 | nn.AdaptiveAvgPool2d(1), 24 | nn.Flatten(), 25 | nn.Linear(in_features= in_feature, out_features=bootleneck_feature), 26 | nn.ReLU(), 27 | nn.Linear(in_features=bootleneck_feature, out_features=in_feature), 28 | nn.Sigmoid(), 29 | ) 30 | 31 | def forward(self, x): 32 | initial_feature = x 33 | x = self.se_block(x) 34 | scaled_feature = initial_feature * torch.reshape(x, (x.shape[0], x.shape[1], 1, 1)) 35 | return scaled_feature 36 | 37 | 38 | class ResidualBlock(nn.Module): 39 | def __init__(self, in_channels, out_channels_bottleneck, out_channels, stride=1): 40 | super(ResidualBlock, self).__init__() 41 | 42 | self.bootleneck = nn.Sequential( 43 | ConvBlock(in_channels=in_channels, out_channels=out_channels_bottleneck, kernel_size=1), 44 | ConvBlock(in_channels=out_channels_bottleneck, out_channels=out_channels_bottleneck, kernel_size=3, stride=stride, padding=1), 45 | ConvBlock(in_channels=out_channels_bottleneck, out_channels=out_channels, kernel_size=1) 46 | ) 47 | self.shortcut = nn.Identity() 48 | if stride!=1 or in_channels != out_channels: 49 | self.shortcut = nn.Sequential( 50 | nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride), 51 | nn.BatchNorm2d(out_channels), 52 | # we do not use the ReLU activation function in the shortcut because we want to preserve the information flow 53 | ) 54 | 55 | self.relu = nn.ReLU() 56 | 57 | def forward(self, x): 58 | residual = x 59 | x = self.bootleneck(x) 60 | x += self.shortcut(residual) 61 | x = self.relu(x) 62 | return x 63 | -------------------------------------------------------------------------------- /Architectures/SENet/src/SENet_Block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/SENet/src/SENet_Block.png -------------------------------------------------------------------------------- /Architectures/ShuffleNet/README.md: -------------------------------------------------------------------------------- 1 | # [ShuffleNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of ShuffleNet. Below you will find detailed information and resources related to this architecture. 
5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/ShuffleNet-97075c040be24950b7d1ec244a00ed4e?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. Pointwise Group Convolutions: The authors introduce the use of grouped convolutions for 1x1 layers, significantly reducing computational cost. 12 | 2. Channel Shuffle Operation: To overcome the limitations of grouped convolutions, they propose a channel shuffle operation that enables information flow across feature channels from different groups. 13 | 3. ShuffleNet Unit: Building on these innovations, they design a new basic unit for CNN architectures that is both highly efficient and maintains strong performance. 14 | 4. Comprehensive Experiments: The paper presents extensive comparisons with other architectures across various computational complexities, demonstrating ShuffleNet's superior performance. 15 | 5. Real-world Performance: Unlike many papers that focus solely on theoretical complexity, the authors evaluate actual inference time on mobile devices, providing practical insights for deployment. 16 | ## Architecture Scheme 17 | Below a schematic representation of the ShuffleNet units that are used in the architecture: 18 | ![Image](./src/shuffleblock_no_stride.png)**ShuffleNet Unit without Stride** 19 | ![Image](./src/shuffleblock_with_stride.png)**ShuffleNet Unit with Stride** 20 | 21 | 22 | 23 | 24 | ## Reproduced Results (TBD) 25 | The following results were reproduced as per the methodology described in the paper: 26 | - Result 1: [Description and value] 27 | - Result 2: [Description and value] 28 | - Result 3: [Description and value] 29 | - ... 
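As a concrete illustration of the channel shuffle operation highlighted in the contributions above, the snippet below tags each channel with its index so the permutation is easy to read off (the helper is written only for this README); the module actually used by the implementation is `ChannelShuffle` in `shuffle_unit.py`.

```python
import torch

def channel_shuffle(x: torch.Tensor, groups: int) -> torch.Tensor:
    # Reshape (b, g*n, h, w) -> (b, g, n, h, w), swap the group and
    # per-group channel axes, then flatten back so channels coming from
    # different groups end up interleaved.
    b, c, h, w = x.size()
    x = x.view(b, groups, c // groups, h, w)
    x = x.transpose(1, 2).contiguous()
    return x.view(b, -1, h, w)

x = torch.arange(6, dtype=torch.float32).view(1, 6, 1, 1)
print(channel_shuffle(x, groups=3).flatten().tolist())  # [0.0, 2.0, 4.0, 1.0, 3.0, 5.0]
```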
30 | 31 | 32 | ## References 33 | - [Original Paper](https://arxiv.org/abs/1707.01083) 34 | - [Detailed Blog Post](https://gvdmnni.notion.site/ShuffleNet-97075c040be24950b7d1ec244a00ed4e?pvs=4) 35 | -------------------------------------------------------------------------------- /Architectures/ShuffleNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/ShuffleNet/shuffle_unit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ChannelShuffle(nn.Module): 6 | def __init__(self, groups): 7 | super(ChannelShuffle, self).__init__() 8 | self.groups = groups 9 | 10 | def forward(self, x): 11 | batch_size, num_channels, height, width = x.size() 12 | channels_per_group = num_channels // self.groups 13 | 14 | # reshape 15 | x = x.view(batch_size, self.groups, channels_per_group, height, width) 16 | # transpose 17 | x = torch.transpose(x, 1, 2).contiguous() 18 | # flatten 19 | x = x.view(batch_size, -1, height, width) 20 | return x 21 | 22 | 23 | 24 | class ShuffleUnitNoStride(nn.Module): 25 | def __init__(self, in_channels, out_channels, groups): 26 | super(ShuffleUnitNoStride, self).__init__() 27 | mid_channels = out_channels // 4 28 | 29 | self.bottleneck = nn.Sequential( 30 | nn.Conv2d(in_channels, mid_channels, kernel_size=1, groups=groups), 31 | nn.BatchNorm2d(mid_channels), 32 | nn.ReLU(inplace=True), 33 | ChannelShuffle(groups), 34 | nn.Conv2d(mid_channels, mid_channels, kernel_size=3, padding=1, groups=mid_channels), 35 | nn.BatchNorm2d(mid_channels), 36 | nn.Conv2d(mid_channels, out_channels, kernel_size=1,), 37 | nn.BatchNorm2d(out_channels) 38 | ) 39 | 40 | self.skip_connection = nn.Identity() 41 | 42 | def forward(self, x): 43 | residual = x 44 | x = self.bottleneck(x) 45 | residual = self.skip_connection(residual) 46 | x += residual 47 | return x 48 | 49 | class ShuffleUnitWithStride(nn.Module): 50 | def __init__(self, in_channels, out_channels, groups, stride=2): 51 | super(ShuffleUnitWithStride, self).__init__() 52 | out_channels = out_channels - in_channels 53 | mid_channels = out_channels // 4 54 | 55 | 56 | self.bottleneck = nn.Sequential( 57 | nn.Conv2d(in_channels, mid_channels, kernel_size=1, groups=groups, bias=False), 58 | nn.BatchNorm2d(mid_channels), 59 | nn.ReLU(inplace=True), 60 | ChannelShuffle(groups), 61 | nn.Conv2d(mid_channels, mid_channels, kernel_size=3, padding=1, groups=mid_channels, stride=stride, bias=False), 62 | nn.BatchNorm2d(mid_channels), 63 | nn.Conv2d(mid_channels, out_channels, kernel_size=1,bias=False), 64 | nn.BatchNorm2d(out_channels) 65 | ) 66 | 67 | self.skip_connection = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) 68 | 69 | def forward(self, x): 70 | residual = x 71 | x = self.bottleneck(x) 72 | residual = self.skip_connection(residual) 73 | x = torch.cat((residual, x), dim=1) 74 | return x 75 | 76 | 77 | -------------------------------------------------------------------------------- /Architectures/ShuffleNet/shufflenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchsummary import summary 3 | 4 | from shuffle_unit import * 5 | 6 | 7 | class ShuffleNet(nn.Module): 8 | def __init__(self, num_classes=100, s=1, groups=1): 9 | super(ShuffleNet, self).__init__() 10 | self.s = s 
11 | if groups == 1: 12 | out_channels = [144, 288, 576] 13 | elif groups == 2: 14 | out_channels = [200, 400, 800] 15 | elif groups == 3: 16 | out_channels = [240, 480, 960] 17 | elif groups == 4: 18 | out_channels = [272, 544, 1088] 19 | elif groups == 8: 20 | out_channels = [384, 768, 1536] 21 | 22 | out_channels = self._perform_channel_reduction(out_channels) 23 | 24 | self.initial_block = nn.Sequential( 25 | nn.Conv2d(3, 24, kernel_size=3, stride=2, padding=1), 26 | nn.BatchNorm2d(24), 27 | nn.ReLU(inplace=True), 28 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 29 | ) 30 | 31 | self.stage2 = nn.Sequential( 32 | ShuffleUnitWithStride(24, out_channels[0], groups, stride=2), 33 | ShuffleUnitNoStride(out_channels[0], out_channels[0], groups), 34 | ShuffleUnitNoStride(out_channels[0], out_channels[0], groups), 35 | ShuffleUnitNoStride(out_channels[0], out_channels[0], groups), 36 | ) 37 | 38 | self.stage3 = nn.Sequential( 39 | ShuffleUnitWithStride(out_channels[0], out_channels[1], groups), 40 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 41 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 42 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 43 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 44 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 45 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 46 | ShuffleUnitNoStride(out_channels[1], out_channels[1], groups), 47 | ) 48 | 49 | self.stage4 = nn.Sequential( 50 | ShuffleUnitWithStride(out_channels[1], out_channels[2], groups), 51 | ShuffleUnitNoStride(out_channels[2], out_channels[2], groups), 52 | ShuffleUnitNoStride(out_channels[2], out_channels[2], groups), 53 | ShuffleUnitNoStride(out_channels[2], out_channels[2], groups), 54 | ) 55 | 56 | self.output_layer = nn.Sequential( 57 | nn.AdaptiveAvgPool2d(1), 58 | nn.Flatten(), 59 | nn.Linear(out_channels[2], num_classes), 60 | ) 61 | 62 | def _perform_channel_reduction(self, out_channels): 63 | out_channels = torch.Tensor(out_channels) 64 | out_channels = out_channels * self.s 65 | return out_channels.int().tolist() 66 | 67 | 68 | 69 | def forward(self, x): 70 | x = self.initial_block(x) 71 | x = self.stage2(x) 72 | x = self.stage3(x) 73 | x = self.stage4(x) 74 | x = self.output_layer(x) 75 | return x 76 | 77 | if __name__ == '__main__': 78 | model = ShuffleNet() 79 | summary(model, (3, 224, 224)) -------------------------------------------------------------------------------- /Architectures/ShuffleNet/src/shuffleblock_no_stride.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ShuffleNet/src/shuffleblock_no_stride.png -------------------------------------------------------------------------------- /Architectures/ShuffleNet/src/shuffleblock_with_stride.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ShuffleNet/src/shuffleblock_with_stride.png -------------------------------------------------------------------------------- /Architectures/SqueezeNet/README.md: -------------------------------------------------------------------------------- 1 | # [SqueezeNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of SqueezeNet. 
Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/SqueezeNet-6872b7d0b1b849c5956de2927a880105?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | 1. Introduction of architectural design strategies for creating small CNNs 12 | 2. Presentation of the SqueezeNet architecture, which achieves AlexNet-level accuracy with 50x fewer parameters 13 | 3. Demonstration that SqueezeNet can be compressed to 510x smaller than AlexNet while maintaining accuracy 14 | 4. Exploration of the CNN microarchitecture design space, providing insights into the impact of various design choices on model size and accuracy 15 | 5. Investigation of different CNN macroarchitecture configurations, including the use of bypass connections 16 | 17 | ## Architecture Scheme 18 | Below a schematic representation of the SqueezeNet architecture: 19 | ![Image](./src/SqueezeNet_architecture.png) 20 | 21 | Below a schematic representation of the FIRE modules that are used in the architecture: 22 | ![Image](./src/SqueezeNet_Fire_module.png) 23 | 24 | 25 | 26 | 27 | ## Reproduced Results (TBD) 28 | The following results were reproduced as per the methodology described in the paper: 29 | - Result 1: [Description and value] 30 | - Result 2: [Description and value] 31 | - Result 3: [Description and value] 32 | - ... 33 | 34 | 35 | ## References 36 | - [Original Paper](https://arxiv.org/abs/1602.07360) 37 | - [Detailed Blog Post](https://gvdmnni.notion.site/SqueezeNet-6872b7d0b1b849c5956de2927a880105?pvs=4) 38 | -------------------------------------------------------------------------------- /Architectures/SqueezeNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/SqueezeNet/squeezenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from squeezenet_blocks import * 6 | 7 | class SqueezeNet(nn.Module): 8 | def __init__(self, num_classes=1000): 9 | super(SqueezeNet, self).__init__() 10 | 11 | self.input_layers = nn.Sequential( 12 | nn.Conv2d(in_channels=3, out_channels=96, kernel_size=7, stride=2), 13 | nn.ReLU(), 14 | nn.MaxPool2d(kernel_size=3, stride=2) 15 | ) 16 | 17 | self.fire_block2 = FireModule(in_channels=96, s1x1=16, e1x1=64, e3x3=64) 18 | self.fire_block3 = FireModule(in_channels=128, s1x1=16, e1x1=64, e3x3=64) 19 | self.fire_block4 = FireModule(in_channels=128, s1x1=32, e1x1=128, e3x3=128) 20 | 21 | self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2) 22 | 23 | self.fire_block5 = FireModule(in_channels=256, s1x1=32, e1x1=128, e3x3=128) 24 | self.fire_block6 = FireModule(in_channels=256, s1x1=48, e1x1=192, e3x3=192) 25 | self.fire_block7 = FireModule(in_channels=384, s1x1=48, e1x1=192, e3x3=192) 26 | self.fire_block8 = FireModule(in_channels=384, s1x1=64, e1x1=256, e3x3=256) 27 | 28 | self.maxpool8 = nn.MaxPool2d(kernel_size=3, stride=2) 29 | 30 | self.fire_block9 = FireModule(in_channels=512, s1x1=64, e1x1=256, e3x3=256) 31 | 32 | self.output_layers = nn.Sequential( 33 | nn.Dropout(p=0.5), 34 | nn.Conv2d(in_channels=512, out_channels=num_classes, 
kernel_size=1), 35 | nn.ReLU(), 36 | nn.AdaptiveAvgPool2d((1, 1)), 37 | nn.Flatten() 38 | ) 39 | 40 | def forward(self, x): 41 | x = self.input_layers(x) 42 | x = self.fire_block2(x) 43 | x = self.fire_block3(x) 44 | x = self.fire_block4(x) 45 | x = self.maxpool4(x) 46 | x = self.fire_block5(x) 47 | x = self.fire_block6(x) 48 | x = self.fire_block7(x) 49 | x = self.fire_block8(x) 50 | x = self.maxpool8(x) 51 | x = self.fire_block9(x) 52 | x = self.output_layers(x) 53 | return x 54 | 55 | if __name__ == '__main__': 56 | model = SqueezeNet(num_classes=1000) 57 | summary(model, (3, 224, 224)) 58 | -------------------------------------------------------------------------------- /Architectures/SqueezeNet/squeezenet_blocks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FireModule(nn.Module): 5 | def __init__(self, in_channels, s1x1, e1x1, e3x3): 6 | ''' 7 | :param in_channels: the input channel of the FireModule 8 | :param s1x1: the output channel of the squeeze layer 9 | :param e1x1: the output channel of the expand 1x1 layer 10 | :param e3x3: the output channel of the expand 3x3 layer 11 | ''' 12 | super(FireModule, self).__init__() 13 | 14 | self.squeeze = nn.Conv2d(in_channels=in_channels, out_channels=s1x1, kernel_size=1) 15 | self.expand1x1 = nn.Conv2d(in_channels=s1x1, out_channels=e1x1, kernel_size=1) 16 | self.expand3x3 = nn.Conv2d(in_channels=s1x1, out_channels=e3x3, kernel_size=3, padding=1) 17 | 18 | def forward(self, x): 19 | x = self.squeeze(x) 20 | x = torch.cat([self.expand1x1(x), self.expand3x3(x)], 1) 21 | return x 22 | 23 | -------------------------------------------------------------------------------- /Architectures/SqueezeNet/squeezenet_with_bypass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from squeezenet_blocks import * 6 | 7 | class SqueezeNet(nn.Module): 8 | def __init__(self, num_classes=1000): 9 | super(SqueezeNet, self).__init__() 10 | 11 | self.input_layers = nn.Sequential( 12 | nn.Conv2d(in_channels=3, out_channels=96, kernel_size=7, stride=2), 13 | nn.ReLU(), 14 | nn.MaxPool2d(kernel_size=3, stride=2) 15 | ) 16 | 17 | self.fire_block2 = FireModule(in_channels=96, s1x1=16, e1x1=64, e3x3=64) 18 | self.fire_block3 = FireModule(in_channels=128, s1x1=16, e1x1=64, e3x3=64) 19 | self.fire_block4 = FireModule(in_channels=128, s1x1=32, e1x1=128, e3x3=128) 20 | 21 | self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2) 22 | 23 | self.fire_block5 = FireModule(in_channels=256, s1x1=32, e1x1=128, e3x3=128) 24 | self.fire_block6 = FireModule(in_channels=256, s1x1=48, e1x1=192, e3x3=192) 25 | self.fire_block7 = FireModule(in_channels=384, s1x1=48, e1x1=192, e3x3=192) 26 | self.fire_block8 = FireModule(in_channels=384, s1x1=64, e1x1=256, e3x3=256) 27 | 28 | self.maxpool8 = nn.MaxPool2d(kernel_size=3, stride=2) 29 | 30 | self.fire_block9 = FireModule(in_channels=512, s1x1=64, e1x1=256, e3x3=256) 31 | 32 | self.output_layers = nn.Sequential( 33 | nn.Dropout(p=0.5), 34 | nn.Conv2d(in_channels=512, out_channels=num_classes, kernel_size=1), 35 | nn.ReLU(), 36 | nn.AdaptiveAvgPool2d((1, 1)), 37 | nn.Flatten() 38 | ) 39 | 40 | def forward(self, x): 41 | x = self.input_layers(x) 42 | x = self.fire_block2(x) 43 | bypass1 = x 44 | x = self.fire_block3(x) 45 | x = x + bypass1 46 | x = self.fire_block4(x) 47 | x = self.maxpool4(x) 48 | bypass2 = x 49 | x = 
self.fire_block5(x) 50 | x = x + bypass2 51 | x = self.fire_block6(x) 52 | bypass3 = x 53 | x = self.fire_block7(x) 54 | x = x + bypass3 55 | x = self.fire_block8(x) 56 | x = self.maxpool8(x) 57 | bypass4 = x 58 | x = self.fire_block9(x) 59 | x = x + bypass4 60 | x = self.output_layers(x) 61 | return x 62 | 63 | if __name__ == '__main__': 64 | model = SqueezeNet(num_classes=1000) 65 | summary(model, (3, 224, 224)) 66 | -------------------------------------------------------------------------------- /Architectures/SqueezeNet/src/SqueezeNet_Fire_module.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/SqueezeNet/src/SqueezeNet_Fire_module.png -------------------------------------------------------------------------------- /Architectures/SqueezeNet/src/SqueezeNet_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/SqueezeNet/src/SqueezeNet_architecture.png -------------------------------------------------------------------------------- /Architectures/VGG16/README.md: -------------------------------------------------------------------------------- 1 | # [VGG16] 2 | 3 | ## Overview 4 | This repository contains the implementation of VGG16. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/VGG16-f4a2d933dfc74fbca3add229a19b4f3a?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - **Increased depth with small filters**: Demonstrated that pushing network depth to 16-19 weight layers while using only small 3x3 convolution filters substantially improves accuracy over previous shallower designs. 12 | - **Simple, homogeneous architecture**: The network stacks identical 3x3 convolution and 2x2 max-pooling blocks, making it easy to understand, implement, and extend. 13 | - **Strong ILSVRC 2014 results**: The VGG models secured first place in the localization track and second place in the classification track of the ILSVRC 2014 challenge. 14 | - **Transferable representations**: The learned features generalize well to other datasets and tasks, which made VGG16 a widely used backbone for transfer learning. 15 | 16 | 17 | ## Architecture Scheme 18 | Below is a schematic representation of the architecture: 19 | 20 | ![Architecture Scheme](./src/VGG16.png) 21 | 22 | 23 | ## Reproduced Results (TBD) 24 | The following results were reproduced as per the methodology described in the paper: 25 | - Result 1: [Description and value] 26 | - Result 2: [Description and value] 27 | - Result 3: [Description and value] 28 | - ...
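## Usage Example
A minimal usage sketch for the `VGG16` class defined in `vgg16.py`; the 224x224 input resolution and `n_classes=1000` simply mirror the ILSVRC setting used in the test block of the script, so adapt them to your own data.

```python
import torch
from vgg16 import VGG16

model = VGG16(n_classes=1000)
x = torch.randn(1, 3, 224, 224)  # one 224x224 RGB image
logits = model(x)                # shape: (1, 1000), one raw score per class
```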
29 | 30 | 31 | ## References 32 | - [Original Paper](https://arxiv.org/pdf/1409.1556) 33 | - [Detailed Blog Post](https://gvdmnni.notion.site/VGG16-f4a2d933dfc74fbca3add229a19b4f3a?pvs=4) 34 | -------------------------------------------------------------------------------- /Architectures/VGG16/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/VGG16/src/VGG16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/VGG16/src/VGG16.png -------------------------------------------------------------------------------- /Architectures/VGG16/vgg16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | class VGG16Block1(nn.Module): 6 | def __init__(self, in_channels, out_conv): 7 | super(VGG16Block1, self).__init__() 8 | self.block = nn.Sequential( 9 | nn.Conv2d(in_channels = in_channels, out_channels=out_conv, kernel_size=3, stride=1, padding=1), 10 | nn.ReLU(), 11 | nn.Conv2d(in_channels = out_conv, out_channels=out_conv, kernel_size=3, stride=1, padding=1), 12 | nn.ReLU(), 13 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 14 | ) 15 | 16 | def forward(self, x): 17 | x = self.block(x) 18 | return x 19 | 20 | 21 | class VGG16Block2(nn.Module): 22 | def __init__(self, in_channels, out_conv): 23 | super(VGG16Block2, self).__init__() 24 | self.block = nn.Sequential( 25 | nn.Conv2d(in_channels=in_channels, out_channels=out_conv, kernel_size=3, stride=1, padding=1), 26 | nn.ReLU(), 27 | nn.Conv2d(in_channels=out_conv, out_channels=out_conv, kernel_size=3, stride=1, padding=1), 28 | nn.ReLU(), 29 | nn.Conv2d(in_channels=out_conv, out_channels=out_conv, kernel_size=3, stride=1, padding=1), 30 | nn.ReLU(), 31 | nn.MaxPool2d(kernel_size=2, stride=2, padding=0) 32 | ) 33 | 34 | def forward(self, x): 35 | x = self.block(x) 36 | return x 37 | 38 | class VGG16(nn.Module): 39 | def __init__(self, n_classes): 40 | super(VGG16, self).__init__() 41 | 42 | # First Block 43 | self.block1 = VGG16Block1(in_channels=3, out_conv=64) 44 | 45 | # Second Block 46 | self.block2 = VGG16Block1(in_channels=64, out_conv=128) 47 | 48 | # Third Block 49 | self.block3 = VGG16Block2(in_channels=128, out_conv=256) 50 | 51 | # Fourth Block 52 | self.block4 = VGG16Block2(in_channels=256, out_conv=512) 53 | 54 | # Fifth Block 55 | self.block5 = VGG16Block2(in_channels=512, out_conv=512) 56 | 57 | # Output Block 58 | self.output_layers = nn.Sequential( 59 | nn.Flatten(), 60 | nn.Linear(in_features=25088, out_features=4096), 61 | nn.Dropout(0.5), 62 | nn.Linear(in_features=4096, out_features=4096), 63 | nn.Dropout(0.5), 64 | nn.Linear(in_features=4096, out_features=n_classes) 65 | ) 66 | 67 | self.flatten = nn.Flatten() 68 | 69 | def forward(self, x): 70 | x = self.block1(x) 71 | x = self.block2(x) 72 | x = self.block3(x) 73 | x = self.block4(x) 74 | x = self.block5(x) 75 | x = self.output_layers(x) 76 | 77 | return x 78 | 79 | if __name__ == '__main__': 80 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 81 | model = VGG16(n_classes=1000).to(device) 82 | x = torch.randn(1, 3, 224, 224, device=device) 83 | summary(model, (3, 224, 224)) 84 | 
print(model(x).shape) 85 | 86 | -------------------------------------------------------------------------------- /Architectures/VisionTransformer/README.md: -------------------------------------------------------------------------------- 1 | # [ViT] 2 | 3 | ## Overview 4 | This repository contains the implementation of the Vision Transformer. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation (TBD) 7 | 8 | 9 | ## Major Contributions (TBD) 10 | 11 | 12 | ## Architecture Scheme 13 | Below is a schematic representation of the architecture: 14 | 15 | ![Architecture Scheme](./src/VGG16.png) 16 | 17 | 18 | ## Reproduced Results (TBD) 19 | The following results were reproduced as per the methodology described in the paper: 20 | - Result 1: [Description and value] 21 | - Result 2: [Description and value] 22 | - Result 3: [Description and value] 23 | - ... 24 | 25 | 26 | ## References 27 | - [Original Paper](https://arxiv.org/pdf/2010.11929) 28 | 29 | -------------------------------------------------------------------------------- /Architectures/VisionTransformer/Vanilla_ViT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from transformer_module import TransformerEncoder 6 | from transformer_utils import patchify, get_positional_embeddings 7 | 8 | 9 | class VanillaViT(nn.Module): 10 | def __init__(self, input_dim=(3, 128, 128), n_patches=8, embed_dim=512, num_classes=1000): 11 | super(VanillaViT, self).__init__() 12 | self.channels, self.height, self.width = input_dim 13 | self.n_patches = n_patches 14 | self.embed_dim = embed_dim 15 | self.num_classes = num_classes 16 | 17 | assert self.height % self.n_patches == 0, "Input height not entirely divisible by number of patches" 18 | assert self.width % self.n_patches == 0, "Input width not entirely divisible by number of patches" 19 | 20 | # The transformer has a linear mapper to embed the patches 21 | self.linear_mapper = nn.Linear( 22 | in_features=self.channels * (self.height // self.n_patches) * (self.width // self.n_patches), 23 | out_features=self.embed_dim 24 | ) 25 | 26 | # The learnable classification token 27 | self.class_token = nn.Parameter(torch.rand(1, 1, self.embed_dim)) 28 | 29 | # Positional embeddings 30 | self.positional_embeddings = nn.Parameter( 31 | get_positional_embeddings(self.n_patches * self.n_patches + 1, self.embed_dim) 32 | ) 33 | self.positional_embeddings.requires_grad = False 34 | 35 | # The transformer encoder 36 | self.transformer_encoder = TransformerEncoder( 37 | embed_dim=self.embed_dim, 38 | num_heads=8, 39 | ff_dim=2048, 40 | dropout=0.1 41 | ) 42 | 43 | # The MLP head for classification 44 | self.mlp_head = nn.Linear(self.embed_dim, self.num_classes) 45 | 46 | 47 | 48 | def forward(self, x): 49 | # Split the image into patches 50 | patches = patchify(x, self.n_patches) 51 | 52 | # Embed the patches 53 | embeddings = self.linear_mapper(patches) # values (v) 54 | 55 | # Add the class token 56 | class_token = self.class_token.expand(x.size(0), -1, -1) # keys (k) 57 | embeddings = torch.cat([class_token, embeddings], dim = 1) 58 | 59 | # Add the positional embeddings 60 | embeddings += self.positional_embeddings # queries (q) 61 | # Run the transformer encoder 62 | embeddings = self.transformer_encoder(embeddings) 63 | 64 | class_representation = embeddings[:, 0, :] # Get the class token representation 65 | 66 | output = 
self.mlp_head(class_representation) # Classify the class token representation 67 | 68 | return output 69 | 70 | if __name__ == '__main__': 71 | # Test the vanilla ViT 72 | model = VanillaViT(input_dim=(3, 128, 128), n_patches=8, embed_dim=512, num_classes=1000).to('cuda') 73 | x = torch.rand(16, 3, 128, 128).to('cuda') # 16 images of 3 channels with 128x128 resolution 74 | output = model(x) 75 | print(output.size()) 76 | 77 | summary(model, (3, 128, 128)) 78 | 79 | -------------------------------------------------------------------------------- /Architectures/VisionTransformer/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/VisionTransformer/transformer_module.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pytorch implementation of Vision Transformer Encoder 3 | ''' 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class TransformerEncoder(nn.Module): 11 | def __init__(self, embed_dim, num_heads, ff_dim, dropout = 0.1): 12 | ''' 13 | Transformer Encoder 14 | - embed_dim: the dimension of the embeddings 15 | - num_heads: the number of attention heads 16 | - ff_dim: the dimension of the feedforward network 17 | - dropout: the dropout rate 18 | ''' 19 | super(TransformerEncoder, self).__init__() 20 | 21 | # Layer normalization 22 | self.layer_norm_1 = nn.LayerNorm(normalized_shape = embed_dim) 23 | 24 | # Multi-head self-attention 25 | self.multi_head_self_attention = nn.MultiheadAttention(embed_dim = embed_dim, num_heads = num_heads, dropout = dropout) 26 | 27 | # Layer normalization 28 | self.layer_norm_2 = nn.LayerNorm(normalized_shape = embed_dim) 29 | 30 | # MLP 31 | self.mlp = nn.Sequential( 32 | nn.Linear(in_features = embed_dim, out_features = ff_dim), 33 | nn.ReLU(), 34 | nn.Linear(in_features = ff_dim, out_features = embed_dim) 35 | ) 36 | 37 | # Dropout 38 | self.dropout = nn.Dropout(dropout) 39 | 40 | def forward(self, sequences): 41 | ''' 42 | Forward pass 43 | - sequences: the input sequences 44 | ''' 45 | # Layer normalization 46 | sequences = self.layer_norm_1(sequences) 47 | 48 | # Multi-head self-attention 49 | attention_output, _ = self.multi_head_self_attention(sequences, sequences, sequences) 50 | 51 | # Dropout 52 | attention_output = self.dropout(attention_output) 53 | 54 | # Residual connection 55 | sequences = sequences + attention_output 56 | 57 | # Layer normalization 58 | sequences = self.layer_norm_2(sequences) 59 | 60 | # MLP 61 | mlp_output = self.mlp(sequences) 62 | 63 | # Dropout 64 | mlp_output = self.dropout(mlp_output) 65 | 66 | # Residual connection 67 | sequences = sequences + mlp_output 68 | 69 | return sequences 70 | 71 | if __name__ == '__main__': 72 | # Test the transformer encoder 73 | encoder = TransformerEncoder(embed_dim = 512, num_heads = 8, ff_dim = 2048) 74 | x = torch.rand(16, 49, 512) # 16 sequences of 49 positions with 512 dimensions 75 | print(encoder(x).shape) # torch.Size([16, 49, 512]) 76 | 77 | -------------------------------------------------------------------------------- /Architectures/VisionTransformer/transformer_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | def patchify(images, n_patches): 5 | ''' 6 | 
Split the image into patches 7 | - images: input images 8 | - n_patches: number of patches to split the image into 9 | ''' 10 | # Get the image size 11 | n, c, h, w = images.size() # n = batch size, c = number of channels, h = height, w = width 12 | 13 | assert h == w, 'Input image must be square' 14 | 15 | # Calculate patch size 16 | patch_size = h // n_patches 17 | 18 | # Use unfold to create patches 19 | patches = images.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size) 20 | patches = patches.contiguous().view(n, c, n_patches, n_patches, patch_size, patch_size) 21 | patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous().view(n, n_patches * n_patches, -1) 22 | 23 | return patches 24 | 25 | 26 | def get_positional_embeddings(sequence_length, d): 27 | ''' 28 | Generate sinusoidal positional embeddings. 29 | - sequence_length: the length of the sequence (number of positions) 30 | - d: the dimension of the embeddings 31 | ''' 32 | # Initialize a tensor to store the positional embeddings 33 | result = torch.ones(sequence_length, d) 34 | 35 | # Iterate over each position in the sequence 36 | for i in range(sequence_length): 37 | # Iterate over each dimension of the embedding 38 | for j in range(d): 39 | # Calculate the positional embedding value 40 | # Use sine for even indices and cosine for odd indices 41 | if j % 2 == 0: 42 | result[i][j] = np.sin(i / (10000 ** (j / d))) 43 | else: 44 | result[i][j] = np.cos(i / (10000 ** ((j - 1) / d))) 45 | 46 | return result -------------------------------------------------------------------------------- /Architectures/Xception/README.md: -------------------------------------------------------------------------------- 1 | # [Xception] 2 | 3 | ## Overview 4 | This repository contains the implementation of Xception. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://gvdmnni.notion.site/Xception-d96a577832534a4da9e87272edf25d48?pvs=4). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | ## 12 | 13 | - Xception is based on the hypothesis that cross-channel correlations and spatial correlations in convolutional networks can be entirely decoupled. 14 | - It maps all spatial correlations with a single convolutional layer and performs 1x1 depthwise convolutions to capture cross-channel correlations. 15 | - This is mathematically equivalent to performing standard convolutions but allows the network to separately map spatial and depthwise correlations. 16 | - Xception uses residual connections, making it easy to define and modify the architecture. 17 | - Experiments show that Xception outperforms Inception V3 on large datasets like JFT while using a similar number of parameters. 18 | 19 | ## Architecture Scheme 20 | Below a schematic representation of the modules that are used in the architecture: 21 | ![Module](./src/Xception_Entry_Flow.png)*Figure: Xception Entry Flow* 22 | 23 | ![Module](./src/Xception_middle_flow.png)*Figure: Xception Middle Flow* 24 | 25 | ![Module](./src/Xception_Exit_Flow.png)*Figure: Xception Exit Flow* 26 | 27 | 28 | 29 | 30 | ## Reproduced Results (TBD) 31 | The following results were reproduced as per the methodology described in the paper: 32 | - Result 1: [Description and value] 33 | - Result 2: [Description and value] 34 | - Result 3: [Description and value] 35 | - ... 
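## Depthwise Separable Convolutions at a Glance
To make the decoupling idea described in the contributions above concrete, the toy comparison below counts the parameters of a standard 3x3 convolution against the depthwise + pointwise pair used throughout this implementation (the `SeparableConv2d` block in `xception_blocks.py`). The channel sizes (128 -> 256) are arbitrary and chosen only for illustration.

```python
import torch.nn as nn

in_ch, out_ch = 128, 256  # illustrative channel sizes

# Depthwise separable: per-channel 3x3 spatial filtering, then 1x1 cross-channel mixing
depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3, padding=1, groups=in_ch)
pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1)

# Standard convolution: mixes space and channels in a single step
standard = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(depthwise) + count(pointwise))  # 34,304 parameters (including biases)
print(count(standard))                      # 295,168 parameters (including biases)
```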
36 | 37 | 38 | ## References 39 | - [Original Paper](https://arxiv.org/abs/1610.02357) 40 | - [Detailed Blog Post](https://gvdmnni.notion.site/Xception-d96a577832534a4da9e87272edf25d48?pvs=4) 41 | -------------------------------------------------------------------------------- /Architectures/Xception/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.3.1+cu118 2 | torchvision==0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/Xception/src/Xception_Entry_Flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Xception/src/Xception_Entry_Flow.png -------------------------------------------------------------------------------- /Architectures/Xception/src/Xception_Exit_Flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Xception/src/Xception_Exit_Flow.png -------------------------------------------------------------------------------- /Architectures/Xception/src/Xception_middle_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/Xception/src/Xception_middle_flow.png -------------------------------------------------------------------------------- /Architectures/Xception/xception.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | from xception_blocks import * 6 | 7 | class Xception(nn.Module): 8 | def __init__(self, num_classes=1000): 9 | super(Xception, self).__init__() 10 | 11 | # Entry Flow 12 | self.input_layers = nn.Sequential( 13 | nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2), 14 | nn.BatchNorm2d(32), 15 | nn.ReLU(), 16 | nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3), 17 | nn.BatchNorm2d(64), 18 | nn.ReLU() 19 | ) 20 | 21 | self.entry_flow = nn.Sequential( 22 | XceptionEFlowBlock(in_channels=64, out_channels=128), 23 | XceptionEFlowBlock(in_channels=128, out_channels=256), 24 | XceptionEFlowBlock(in_channels=256, out_channels=728) 25 | ) 26 | 27 | # Middle Flow 28 | middle_layers = [] 29 | for _ in range(8): 30 | middle_layers.append(XceptionMiddleFlowBlock(in_channels=728)) 31 | self.middle_flow = nn.Sequential(*middle_layers) 32 | 33 | # Exit Flow 34 | self.exit_flow = nn.Sequential( 35 | XceptionEFlowBlock(in_channels=728, out_channels=728), 36 | SeparableConv2d(in_channels=728, out_channels=1024, kernel_size=3, stride=1, padding=1), 37 | nn.BatchNorm2d(1024), 38 | nn.ReLU(), 39 | SeparableConv2d(in_channels=1024, out_channels=1536, kernel_size=3, stride=1, padding=1), 40 | nn.BatchNorm2d(1536), 41 | nn.ReLU(), 42 | nn.AdaptiveAvgPool2d((1, 1)), 43 | ) 44 | 45 | self.output_layers = nn.Sequential( 46 | nn.Flatten(), 47 | nn.Linear(1536, num_classes) 48 | ) 49 | 50 | def forward(self, x): 51 | x = self.input_layers(x) 52 | x = self.entry_flow(x) 53 | x = self.middle_flow(x) 54 | x = self.exit_flow(x) 55 | x = self.output_layers(x) 56 | return x 57 | 58 | 59 | if __name__ == '__main__': 60 | model = Xception() 61 |
summary(model, (3, 299, 299)) 62 | -------------------------------------------------------------------------------- /Architectures/Xception/xception_blocks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | 6 | class SeparableConv2d(nn.Module): 7 | def __init__(self, in_channels, out_channels, kernel_size, stride, padding): 8 | ''' 9 | A separable convolution block is a combination of depthwise convolution and pointwise convolution. This means 10 | that the input tensor is convolved with a kernel of size (kernel_size, kernel_size) and then the output of this 11 | operation is convolved with a 1x1 kernel. This is done to reduce the number of parameters in the model. 12 | ''' 13 | super(SeparableConv2d, self).__init__() 14 | 15 | self.depthwise = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=in_channels) 16 | self.pointwise = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) 17 | 18 | def forward(self, x): 19 | x = self.depthwise(x) 20 | x = self.pointwise(x) 21 | return x 22 | 23 | 24 | class XceptionEFlowBlock(nn.Module): 25 | def __init__(self, in_channels, out_channels): 26 | super(XceptionEFlowBlock, self).__init__() 27 | self.main_branch = nn.Sequential( 28 | SeparableConv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1), 29 | nn.BatchNorm2d(out_channels), 30 | nn.ReLU(), 31 | SeparableConv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1), 32 | nn.BatchNorm2d(out_channels), 33 | nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 34 | ) 35 | 36 | self.skip_connection = nn.Sequential( 37 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=2), 38 | nn.BatchNorm2d(out_channels), 39 | ) 40 | def forward(self, x): 41 | x = self.main_branch(x) + self.skip_connection(x) 42 | return x 43 | 44 | 45 | class XceptionMiddleFlowBlock(nn.Module): 46 | def __init__(self, in_channels): 47 | super(XceptionMiddleFlowBlock, self).__init__() 48 | self.main_branch = nn.Sequential( 49 | nn.ReLU(), 50 | SeparableConv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1), 51 | nn.BatchNorm2d(in_channels), 52 | nn.ReLU(), 53 | SeparableConv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1), 54 | nn.BatchNorm2d(in_channels), 55 | nn.ReLU(), 56 | SeparableConv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=3, stride=1, padding=1), 57 | nn.BatchNorm2d(in_channels) 58 | ) 59 | 60 | def forward(self, x): 61 | x = self.main_branch(x) + x 62 | return x 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /Architectures/ZFNet/README.md: -------------------------------------------------------------------------------- 1 | # [ZFNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of ZFNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](https://www.notion.so/gvdmnni/ZFNet-0eb644d269a3465cb8a900ebddbefb7c). 8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - Development of a novel visualization technique to analyze convolutional networks. 
12 | - Enhanced understanding of the inner workings and feature activations within CNNs. 13 | - Insights into the significance of deeper architectures for improved performance. 14 | - Implementation of smaller kernel sizes for capturing finer-grained patterns. 15 | - Pioneering use of transfer learning to generalize models across different datasets. 16 | 17 | 18 | ## Architecture Scheme 19 | Below is a schematic representation of the architecture: 20 | 21 | ![Architecture Scheme](./src/ZFNet.png) 22 | 23 | ## Reproduced Results (TBD) 24 | The following results were reproduced as per the methodology described in the paper: 25 | - Result 1: [Description and value] 26 | - Result 2: [Description and value] 27 | - Result 3: [Description and value] 28 | - ... 29 | 30 | 31 | ## References 32 | - [Original Paper](https://arxiv.org/pdf/1311.2901) 33 | - [Detailed Blog Post](https://www.notion.so/gvdmnni/ZFNet-0eb644d269a3465cb8a900ebddbefb7c) 34 | -------------------------------------------------------------------------------- /Architectures/ZFNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.3.1+cu118 2 | torchvision==0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Architectures/ZFNet/src/ZFNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GuidoManni/DeepLearningImplementation/12f458d40141d00752af5c25ef81c3a0c511f349/Architectures/ZFNet/src/ZFNet.png -------------------------------------------------------------------------------- /Architectures/ZFNet/zfnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | 6 | class ZFNet(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(ZFNet, self).__init__() 9 | # First Convolutional Layer 10 | self.conv1 = nn.Sequential( 11 | nn.Conv2d(in_channels=3, out_channels=96, kernel_size=7, stride=2), 12 | nn.ReLU(), 13 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 14 | nn.MaxPool2d(kernel_size=3, stride=2) 15 | ) 16 | # Second Convolutional Layer 17 | self.conv2 = nn.Sequential( 18 | nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=2, padding=2), 19 | nn.ReLU(), 20 | nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2), 21 | nn.MaxPool2d(kernel_size=3, stride=2) 22 | ) 23 | # Third Convolutional Layer 24 | self.conv3 = nn.Sequential( 25 | nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1), 26 | nn.ReLU() 27 | ) 28 | # Fourth Convolutional Layer 29 | self.conv4 = nn.Sequential( 30 | nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1), 31 | nn.ReLU() 32 | ) 33 | # Fifth Convolutional Layer 34 | self.conv5 = nn.Sequential( 35 | nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1), 36 | nn.ReLU(), 37 | nn.MaxPool2d(kernel_size=3, stride=2) 38 | ) 39 | 40 | # Dense Layer 41 | self.output = nn.Sequential( 42 | nn.Linear(256 * 6 * 6, 4096), 43 | nn.ReLU(), 44 | nn.Dropout(0.5), 45 | nn.Linear(4096, 4096), 46 | nn.ReLU(), 47 | nn.Dropout(0.5), 48 | nn.Linear(4096, num_classes) 49 | ) 50 | 51 | def forward(self, x): 52 | x = self.conv1(x) 53 | x = self.conv2(x) 54 | x = self.conv3(x) 55 | x = self.conv4(x) 56 | x = self.conv5(x) 57 | # x has shape (batch_size, 256, 6, 6) for a 224x224 input 58 | x = torch.reshape(x, (x.shape[0], 256 * 6 * 6)) # reshaping the tensor for
the dense layer (we are flattening) 59 | x = self.output(x) 60 | return x 61 | 62 | if __name__ == '__main__': 63 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 64 | n_classes = 1000 65 | model = ZFNet(n_classes).to(device) 66 | x = torch.randn(1, 3, 224, 224, device=device) 67 | summary(model, (3, 224, 224)) 68 | print(model(x).shape) -------------------------------------------------------------------------------- /Attention Mechanism/CBAM/README.md: -------------------------------------------------------------------------------- 1 | # [CBAM] 2 | 3 | ## Overview 4 | This repository contains the implementation of CBAM. Below you will find detailed information and resources related to this attention mechanism. 5 | 6 | ## Detailed Explanation (TBD) 7 | 8 | ## Major Contributions (TBD) 9 | 10 | ## Architecture Scheme (TBD) 11 | Below is a schematic representation of the architecture: 12 | 13 | ![Architecture Scheme]() 14 | 15 | ## Reproduced Results (TBD) 16 | The following results were reproduced as per the methodology described in the paper: 17 | - Result 1: [Description and value] 18 | - Result 2: [Description and value] 19 | - Result 3: [Description and value] 20 | - ... 21 | 22 | ## References 23 | - [Original Paper](https://arxiv.org/abs/1807.06521) 24 | - [Detailed Blog Post (TBD)]() 25 | -------------------------------------------------------------------------------- /Attention Mechanism/CBAM/cbam_module.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The Convolutional Block Attention Module (CBAM) is a module that performs attention mechanism on the feature maps. 3 | It consists of two sub-modules: Channel Attention Module and Spatial Attention Module. 4 | - The Channel Attention Module performs attention mechanism on the channel dimension of the feature maps. 5 | - The Spatial Attention Module performs attention mechanism on the spatial dimension of the feature maps. 
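In this implementation the two modules are applied sequentially (channel attention first, then spatial attention),
and the re-weighted features are added back to the input through a residual connection (see CBAMModule.forward below).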
6 | ''' 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class ChannelAttentionModule(nn.Module): 14 | def __init__(self, in_channels, reduction): 15 | super(ChannelAttentionModule, self).__init__() 16 | ''' 17 | The channel attention module is comprised of: 18 | - Global Average Pooling & Max Pooling (in parallel) 19 | - MLP with one hidden layer (shared) 20 | - The output of the MLP is merged (element-wise addition) 21 | - Then we apply the Sigmoid activation function 22 | ''' 23 | 24 | self.global_avg_pool = nn.AdaptiveAvgPool2d(1) 25 | self.global_max_pool = nn.AdaptiveMaxPool2d(1) 26 | 27 | self.mlp = nn.Sequential( 28 | nn.Linear(in_features=in_channels, out_features=in_channels // reduction), 29 | nn.ReLU(), 30 | nn.Linear(in_features=in_channels // reduction, out_features=in_channels) 31 | ) 32 | 33 | self.sigmoid = nn.Sigmoid() 34 | 35 | def forward(self, x): 36 | avg_pool = self.global_avg_pool(x) 37 | max_pool = self.global_max_pool(x) 38 | 39 | avg_pool = avg_pool.view(avg_pool.size(0), -1) 40 | max_pool = max_pool.view(max_pool.size(0), -1) 41 | 42 | avg_pool = self.mlp(avg_pool) 43 | max_pool = self.mlp(max_pool) 44 | 45 | channel_attention = self.sigmoid(avg_pool + max_pool).unsqueeze(2).unsqueeze(3) 46 | 47 | return channel_attention * x 48 | 49 | 50 | class SpatialAttentionModule(nn.Module): 51 | def __init__(self, kernel_size=7): 52 | super(SpatialAttentionModule, self).__init__() 53 | ''' 54 | The spatial attention module is comprised of: 55 | - average pooling & max pooling (in parallel) 56 | - concatenation of the two pooled features 57 | - 7x7 convolutional layer 58 | - Sigmoid activation function 59 | ''' 60 | 61 | # Make sure kernel size is odd for same padding 62 | padding = kernel_size // 2 63 | 64 | # Single conv layer after concatenation 65 | self.conv = nn.Conv2d( 66 | in_channels=2, # 2 because we concatenate avg_pool and max_pool 67 | out_channels=1, # Output one attention map 68 | kernel_size=kernel_size, 69 | padding=padding, 70 | bias=False 71 | ) 72 | 73 | self.sigmoid = nn.Sigmoid() 74 | 75 | def forward(self, x): 76 | # Average pooling along channel dimension 77 | avg_pool = torch.mean(x, dim=1, keepdim=True) # (B,1,H,W) 78 | 79 | # Max pooling along channel dimension 80 | max_pool, _ = torch.max(x, dim=1, keepdim=True) # (B,1,H,W) 81 | 82 | # Concatenate along the channel dimension 83 | concat = torch.cat([avg_pool, max_pool], dim=1) # (B,2,H,W) 84 | 85 | # Generate attention map 86 | attention_map = self.conv(concat) # (B,1,H,W) 87 | attention_map = self.sigmoid(attention_map) 88 | 89 | return attention_map * x 90 | 91 | 92 | class CBAMModule(nn.Module): 93 | def __init__(self, in_channels, reduction): 94 | super(CBAMModule, self).__init__() 95 | ''' 96 | The CBAM module is comprised of: 97 | - Channel Attention Module 98 | - Spatial Attention Module 99 | ''' 100 | 101 | self.channel_attention = ChannelAttentionModule(in_channels, reduction) 102 | self.spatial_attention = SpatialAttentionModule() 103 | 104 | def forward(self, x): 105 | out = self.channel_attention(x) 106 | out = self.spatial_attention(out) 107 | out = x + out 108 | return out -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free 
experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. 
Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Generative Adversarial Networks/GAN (2014)/README.md: -------------------------------------------------------------------------------- 1 | # [Original GAN Implementation] 2 | 3 | ## Overview 4 | This repository contains a PyTorch implementation of the original Generative Adversarial Network (GAN) as proposed by Goodfellow et al. in 2014. This implementation maintains the original architecture choices, using simple MLPs for both generator and discriminator. 5 | 6 | ## Detailed Explanation 7 | For a comprehensive understanding of GANs and their significance, please refer to the [original paper](https://arxiv.org/abs/1406.2661). 
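## Adversarial Training Sketch
The snippet below sketches one adversarial update using the `Generator` and `Discriminator` classes from this folder, with the non-saturating generator loss. The optimizer, learning rate, batch size, and the random tensor standing in for a batch of real images are illustrative assumptions, not the settings of the original paper.

```python
import torch
import torch.nn as nn
from generator import Generator
from discriminator import Discriminator

gen = Generator(z_dim=100, output_dim=28)
disc = Discriminator(input_dim=28)
opt_g = torch.optim.SGD(gen.parameters(), lr=0.01)  # illustrative hyperparameters
opt_d = torch.optim.SGD(disc.parameters(), lr=0.01)
bce = nn.BCELoss()

real = torch.rand(64, 1, 28, 28)  # stand-in for a real data batch (e.g. MNIST)
fake = gen(batch_size=64)         # the generator samples its own noise internally

# Discriminator step: push D(real) towards 1 and D(fake) towards 0
d_loss = bce(disc(real), torch.ones(64, 1)) + bce(disc(fake.detach()), torch.zeros(64, 1))
opt_d.zero_grad()
d_loss.backward()
opt_d.step()

# Generator step (non-saturating loss): push D(fake) towards 1
g_loss = bce(disc(fake), torch.ones(64, 1))
opt_g.zero_grad()
g_loss.backward()
opt_g.step()
```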
8 | 9 | ## Major Contributions 10 | The major contributions of the paper include: 11 | - **Novel Framework:** Introduced the adversarial framework where two networks compete against each other 12 | - **Simple Architecture:** Demonstrated that even simple MLPs could generate realistic images 13 | - **Versatile Approach:** Provided a framework that could be applied to various types of data generation 14 | - **Theoretical Foundation:** Established theoretical guarantees for the training process 15 | 16 | ## Architecture Details 17 | 18 | ### Generator Architecture 19 | - **Input:** 100-dimensional uniform random noise z ∈ [-1, 1] 20 | - **Hidden Layer:** 240 units with ReLU activation 21 | - **Output Layer:** 784 units (28×28) with Sigmoid activation 22 | - **Output Reshaping:** Flat vector reshaped to 28×28 image 23 | 24 | ### Discriminator Architecture 25 | - **Input:** 784-dimensional flattened image (28×28) 26 | - **Hidden Layer:** 240 units with ReLU activation 27 | - **Output Layer:** Single unit with Sigmoid activation 28 | 29 | ## Key Implementation Features 30 | 1. **Generator Features:** 31 | - Flexible batch size handling 32 | - Built-in noise sampling 33 | - Automatic reshaping to image dimensions 34 | 35 | 2. **Discriminator Features:** 36 | - Integrated flattening in the architecture 37 | - Single probability output for real/fake classification 38 | 39 | ## Usage Example 40 | ```python 41 | # Initialize models 42 | generator = Generator(z_dim=100, output_dim=28) 43 | discriminator = Discriminator(input_dim=28) 44 | 45 | # Generate fake images 46 | fake_images = generator(batch_size=64) # Shape: [64, 1, 28, 28] 47 | 48 | # Discriminate images 49 | predictions = discriminator(fake_images) # Shape: [64, 1] -------------------------------------------------------------------------------- /Generative Adversarial Networks/GAN (2014)/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | 6 | class Discriminator(nn.Module): 7 | def __init__(self, input_dim=28): 8 | super(Discriminator, self).__init__() 9 | # Calculate input features (e.g., 28*28=784 for MNIST) 10 | in_features = input_dim * input_dim 11 | 12 | # Build the MLP architecture from original GAN paper: 13 | # 784 -> 240 -> 1 14 | self.mlp = nn.Sequential( 15 | # Flatten 2D image to 1D vector 16 | nn.Flatten(), # (batch_size, 1, 28, 28) -> (batch_size, 784) 17 | 18 | # First layer transforms flattened image to hidden representation 19 | nn.Linear(in_features=in_features, out_features=240), 20 | nn.ReLU(), # ReLU activation as per original paper 21 | 22 | # Output layer produces single scalar 23 | nn.Linear(in_features=240, out_features=1), 24 | nn.Sigmoid() # Sigmoid to get probability in [0,1] 25 | ) 26 | 27 | def forward(self, x): 28 | # x shape: (batch_size, 1, 28, 28) 29 | # output shape: (batch_size, 1) 30 | return self.mlp(x) 31 | 32 | if __name__ == '__main__': 33 | disc = Discriminator(input_dim=28) 34 | summary(disc) 35 | -------------------------------------------------------------------------------- /Generative Adversarial Networks/GAN (2014)/generator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | 5 | 6 | class Generator(nn.Module): 7 | def __init__(self, z_dim=100, output_dim=28): 8 | super(Generator, self).__init__() 9 | # Calculate total dimensions for output 
(e.g., 28*28=784 for MNIST) 10 | out_features = output_dim * output_dim 11 | self.output_dim = output_dim 12 | self.z_dim = z_dim 13 | 14 | # Build the MLP architecture from the original GAN paper: 15 | # z_dim -> 240 -> 784 (reshaped to 28x28) 16 | self.mlp = nn.Sequential( 17 | # First layer transforms from latent space to hidden layer 18 | nn.Linear(in_features=z_dim, out_features=240), 19 | nn.ReLU(), # ReLU activation as per original paper 20 | 21 | # Output layer transforms to flattened image dimensions 22 | nn.Linear(in_features=240, out_features=out_features), 23 | nn.Sigmoid() # Sigmoid ensures output is in [0,1] range for images 24 | ) 25 | 26 | def sample_z(self, batch_size): 27 | # Sample from uniform distribution [-1, 1] 28 | # Shape: (batch_size, z_dim) 29 | z = 2 * torch.rand(batch_size, self.z_dim) - 1 30 | return z 31 | 32 | def to_image(self, out_flattened): 33 | # Reshape flat tensor to image format: 34 | # (batch_size, 784) -> (batch_size, 1, 28, 28) 35 | return out_flattened.view(out_flattened.size(0), 1, self.output_dim, self.output_dim) 36 | 37 | def forward(self, batch_size=None, z=None): 38 | # Allow both automatic sampling and manual z input 39 | if z is None: 40 | if batch_size is None: 41 | raise ValueError("Must provide either batch_size or z") 42 | # Sample random noise if z not provided 43 | z = self.sample_z(batch_size) 44 | 45 | # Generate flattened images 46 | out_flattened = self.mlp(z) 47 | 48 | # Reshape to proper image format 49 | out = self.to_image(out_flattened) 50 | return out 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | gen = Generator(z_dim=100, output_dim=28) 56 | summary(gen) 57 | 58 | -------------------------------------------------------------------------------- /Generative Adversarial Networks/GAN (2014)/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Guido Manni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepLearningImplementation 🧠 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) 5 | [![PyTorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?style=flat&logo=PyTorch&logoColor=white)](https://pytorch.org/) 6 | 7 | Welcome to the DeepLearningImplementation repository! This project provides clean, readable implementations of seminal deep learning architectures for computer vision. Whether you're a researcher, student, or practitioner, you'll find comprehensive implementations, training scripts, and documentation for some of the most influential models in the field. 8 | 9 | ## 🎯 Project Philosophy 10 | 11 | We prioritize clarity and understanding over optimization. Our implementations focus on: 12 | 13 | - **Simplicity**: Clean, straightforward code that's easy to follow 14 | - **Readability**: Clear variable names, thorough comments, and structured organization 15 | - **Learning-Oriented**: Focus on fundamental mechanisms for deeper understanding 16 | - **Minimal Dependencies**: Built primarily with PyTorch for simplified setup 17 | 18 | ## 📚 Available and Planned Implementations 19 | 20 | ### Computer Vision Architectures 21 | - [✅] [AlexNet (2012)](./Architectures/AlexNet) 22 | - [✅] [ZFNet (2013)](./Architectures/ZFNet) 23 | - [✅] [GoogLeNet (2014)](./Architectures/GoogLeNet) 24 | - [✅] [VGG16 (2015)](./Architectures/VGG16) 25 | - [✅] [ResNet (2015)](./Architectures/ResNet) 26 | - [✅] [Rethinked Inception (2015)](./Architectures/Rethinked%20Inception) 27 | - [✅] [DenseNet (2016)](./Architectures/DenseNet) 28 | - [✅] [Xception (2016)](./Architectures/Xception) 29 | - [✅] [SqueezeNet (2016)](./Architectures/SqueezeNet) 30 | - [✅] [ResNeXt (2016)](./Architectures/ResNeXt) 31 | - [✅] [SENet (2017)](./Architectures/SENet) 32 | - [✅] [MobileNet (2017)](./Architectures/MobileNet) 33 | - [✅] [ShuffleNet (2017)](./Architectures/ShuffleNet) 34 | - [✅] [Residual Attention Network (2017)](./Architectures/ResidualAttentionNetwork) 35 | - [✅] [MobileNetV2 (2018)](./Architectures/MobileNetV2) 36 | - [✅] [EfficientNet (2019)](./Architectures/EfficientNet) 37 | - [✅] [VisionTransformer (2020)](https://arxiv.org/pdf/2010.11929) 38 | - [ ] [DeepViT (2021)](https://arxiv.org/abs/2103.11886) 39 | - [ ] [Tokens-to-Token ViT (2021)](https://arxiv.org/abs/2103.11886) 40 | - [ ] [CCT (2021)](https://arxiv.org/abs/2103.11886) 41 | - [ ] [LeViT (2021)](https://arxiv.org/abs/2104.01136) 42 | - [ ] [SwinTransformer (2021)](https://arxiv.org/pdf/2103.14030) 43 | - [ ] [MobileVIT (2021)](https://arxiv.org/abs/2110.02178) 44 | - [ ] [Vision Transformer for Small-Size Datasets (2021)](https://arxiv.org/abs/2202.12015) 45 | - [ ] [SepViT (2022)](https://arxiv.org/abs/2203.15380) 46 | - [ ] [MaxViT (2022)](https://arxiv.org/pdf/2204.01697) 47 | - [ ] [Patch Merger (2022)](https://arxiv.org/abs/2202.12015) 48 | - [ ] [ConvNet (2022)](https://arxiv.org/abs/2201.03545) 49 | - [ ] [ConvNext V2 (2023)](https://arxiv.org/abs/2301.00808) 50 | - [ ] [RepVIT (2023)](https://arxiv.org/abs/2307.09283) 51 | - [ ] [VisionLSTM (2024)](https://arxiv.org/pdf/2406.04303) 52 | 53 | 54 | ### Semantic Segmentation 55 | - [ ] [FCN (2014)](https://arxiv.org/abs/1411.4038) 56 | - [ ] 
[SegNet (2015)](https://arxiv.org/abs/1511.00561) 57 | - [✅] [UNet (2015)](./Semantic%20Segmentation/UNet/) 58 | - [ ] [PSPNet (2016)](https://arxiv.org/abs/1612.01105) 59 | - [ ] [DeepLab (2016)](https://arxiv.org/abs/1606.00915) 60 | - [ ] [ENet (2016)](https://arxiv.org/abs/1606.02147) 61 | - [ ] [Mask R-CNN (2017)](https://arxiv.org/abs/1703.06870) 62 | - [ ] [DeepLabV3 (2017)](https://arxiv.org/abs/1706.05587) 63 | - [ ] [ICNet (2018)](https://arxiv.org/abs/1704.08545) 64 | - [✅] [Attention Unet (2018)](./Semantic%20Segmentation/AttentionUnet/) 65 | - [ ] [HRNet (2019)](https://arxiv.org/abs/1904.04514) 66 | - [ ] [OCRNet (2019)](https://arxiv.org/abs/1909.11065) 67 | - [✅] [U-Net++ (2019)](./Semantic%20Segmentation/UNet++/) 68 | - [ ] [SegFormer (2021)](https://arxiv.org/abs/2105.15203) 69 | - [ ] [Mask2Former (2022)](https://arxiv.org/abs/2204.01697) 70 | 71 | ### Object Detection 72 | - [ ] [RCNN (2014)](https://arxiv.org/abs/1311.2524) 73 | - [ ] [Fast-RCNN (2015)](https://arxiv.org/abs/1504.08083) 74 | - [ ] [Faster-RCNN (2015)](https://arxiv.org/abs/1506.01497) 75 | - [ ] [YOLO (2015)](https://arxiv.org/abs/1506.02640) 76 | - [ ] [SSD (2016)](https://arxiv.org/abs/1512.02325) 77 | - [ ] [YOLO9000 (2016)](https://arxiv.org/abs/1612.08242) 78 | - [ ] [RetinaNet (2017)](https://arxiv.org/abs/1708.02002) 79 | - [ ] [YOLOv3 (2018)](https://arxiv.org/abs/1804.02767) 80 | - [ ] [YOLOv4 (2020)](https://arxiv.org/abs/2004.10934) 81 | 82 | ### Generative Adversarial Networks 83 | - [✅] [GAN (2014)](./Generative%20Adversarial%20Networks/GAN%20(2014)/) 84 | - [ ] [DCGAN (2015)](https://arxiv.org/abs/1511.06434) 85 | - [ ] [InfoGAN (2016)](https://arxiv.org/abs/1606.03657) 86 | - [ ] [Pix2Pix (2016)](https://arxiv.org/abs/1611.07004) 87 | - [ ] [WGAN (2017)](https://arxiv.org/abs/1701.07875) 88 | - [ ] [CycleGAN (2017)](https://arxiv.org/abs/1703.10593) 89 | - [ ] [BigGAN (2018)](https://arxiv.org/abs/1809.11096) 90 | - [ ] [StyleGAN (2018)](https://arxiv.org/abs/1812.04948) 91 | - [ ] [StyleGAN2 (2019)](https://arxiv.org/abs/1912.04958) 92 | 93 | ### Diffusion Generative Models 94 | - [ ] [DDPM (2020)](https://arxiv.org/abs/2006.11239) 95 | 96 | ### Autoregressive Generative Networks 97 | - [ ] [PixelRNN (2016)](https://arxiv.org/pdf/1601.06759) 98 | - [ ] [PixelCNN (2016)](https://arxiv.org/abs/1606.05328) 99 | - [ ] [PixelSNAIL (2017)](https://arxiv.org/abs/1712.09763) 100 | 101 | ### 3D Reconstruction from 2D Images 102 | - [ ] [3D-R2N2 (2016)](https://arxiv.org/abs/1604.00449) 103 | - [ ] [3D-RecGAN (2017)](https://arxiv.org/abs/1708.07969) 104 | - [ ] [3D-GAN (2017)](https://arxiv.org/abs/1707.09557) 105 | - [ ] [3D-RecGAN++ (2018)](https://arxiv.org/abs/1802.00411) 106 | - [ ] [AtlasNet (2018)](https://arxiv.org/abs/1802.05384) 107 | - [ ] [Occupancy Networks (2018)](https://arxiv.org/abs/1812.03828) 108 | - [ ] [DeepSDF (2019)](https://arxiv.org/abs/1901.05103) 109 | - [ ] [NeRF (2020)](https://arxiv.org/abs/2003.08934) 110 | 111 | ### Attention Mechanism 112 | - [✅] [SENet (2017)](./Architectures/SENet) 113 | - [✅] [Residual Attention Network (2017)](./Architectures/ResidualAttentionNetwork) 114 | - [✅] [Attention Unet (2018)](./Semantic%20Segmentation/AttentionUnet/) 115 | - [✅] [CBAM (2018)](./Attention%20Mechanism/CBAM) 116 | 117 | ## 🚀 Getting Started 118 | 119 | ### Prerequisites 120 | - Python 3.8+ 121 | - PyTorch 1.8+ 122 | - CUDA-capable GPU (recommended) 123 | 124 | ### Installation 125 | 126 | 1. 
Clone the repository: 127 | ```bash 128 | git clone https://github.com/yourusername/DeepLearningImplementation.git 129 | cd DeepLearningImplementation 130 | ``` 131 | 132 | 2. Create a virtual environment (recommended): 133 | ```bash 134 | python -m venv venv 135 | source venv/bin/activate # On Windows: venv\Scripts\activate 136 | ``` 137 | 138 | 3. Install dependencies for specific architecture: 139 | ```bash 140 | cd Architectures/DesiredModel 141 | pip install -r requirements.txt 142 | ``` 143 | 144 | ## 📁 Project Structure 145 | 146 | ``` 147 | DeepLearningImplementation/ 148 | ├── Architectures/ # CNN architectures 149 | │ ├── AlexNet/ 150 | │ │ ├── README.md 151 | │ │ ├── alexnet.py 152 | │ │ └── requirements.txt 153 | │ └── ... 154 | ├── SemanticSegmentation/ 155 | ├── ObjectDetection/ 156 | ├── GANs/ 157 | ├── LICENSE 158 | └── README.md 159 | ``` 160 | 161 | ## 🛠️ Project Phases 162 | 163 | ### Phase 1: Implementation and Initial Documentation (Current) 164 | - Writing clear, understandable code for each model 165 | - Providing basic documentation 166 | - Setting foundation for further development 167 | 168 | ### Phase 2: Training and Performance Evaluation (Planned) 169 | - Training models on relevant datasets 170 | - Computing performance metrics 171 | - Comparing model strengths and weaknesses 172 | 173 | ### Phase 3: Code Refinement and Documentation Enhancement (Planned) 174 | - Refining code implementations 175 | - Enhancing documentation 176 | - Adding detailed explanations and best practices 177 | 178 | ## 👥 Contributing 179 | 180 | Contributions are welcome! Please feel free to submit issues or pull requests to help improve the implementations and documentation. 181 | 182 | ## 📄 License 183 | 184 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 185 | 186 | ## 📬 Contact 187 | 188 | For any questions, please open an issue or contact the repository maintainer. 189 | 190 | --- 191 | Made with ❤️ for the deep learning community 192 | -------------------------------------------------------------------------------- /Semantic Segmentation/AttentionUnet/README.md: -------------------------------------------------------------------------------- 1 | # [Attention UNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of the Attention UNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation - TBD 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](). 8 | 9 | ## Major Contributions - TBD 10 | The major contributions of the paper include: 11 | 12 | 13 | ## Architecture Scheme - TBD 14 | Below is a schematic representation of the architecture: 15 | 16 | ![Architecture Scheme]() 17 | 18 | ## Reproduced Results (TBD) 19 | The following results were reproduced as per the methodology described in the paper: 20 | - Result 1: [Description and value] 21 | - Result 2: [Description and value] 22 | - Result 3: [Description and value] 23 | - ... 
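## Attention Gate Sketch
Until the detailed write-up is ready, the sketch below illustrates the additive attention gate that this architecture applies to every skip connection, assuming the gating signal has already been resampled to the skip feature's resolution. The repository's actual module is the `AttentionGate` imported from `unet2D_utils.py` / `unet3D_utils.py` and may differ in details.

```python
import torch
import torch.nn as nn

class AttentionGateSketch(nn.Module):
    """Illustrative 2D additive attention gate: re-weights skip features x using gating signal g."""
    def __init__(self, in_channels, gate_channels, inter_channels):
        super().__init__()
        self.theta_x = nn.Conv2d(in_channels, inter_channels, kernel_size=1)  # project skip features
        self.phi_g = nn.Conv2d(gate_channels, inter_channels, kernel_size=1)  # project gating signal
        self.psi = nn.Conv2d(inter_channels, 1, kernel_size=1)                # one attention score per pixel

    def forward(self, g, x):
        # g and x are assumed to have the same spatial size here
        attention = torch.sigmoid(self.psi(torch.relu(self.theta_x(x) + self.phi_g(g))))
        return x * attention  # suppress irrelevant regions before the skip concatenation
```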
24 | 25 | ## References 26 | - [Original Paper](https://arxiv.org/abs/1804.03999) 27 | - [Detailed Blog Post]() 28 | -------------------------------------------------------------------------------- /Semantic Segmentation/AttentionUnet/UNet2D.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | from unet2D_utils import * 5 | 6 | ''' 7 | This implementation differs from the original paper "Attention U-Net: Learning Where to Look for the Pancreas" in the following ways: 8 | 9 | 1. Dimensionality: This is a 2D implementation while the original paper uses 3D convolutions for volumetric data 10 | (CT scans). All 3D operations (3D convolutions, trilinear interpolation) are replaced with their 2D counterparts. 11 | 12 | 2. Resampling Approach: The paper performs downsampling of input features to match gating signal resolution 13 | before the attention operations. While we maintain this concept, we use bilinear interpolation instead of 14 | trilinear interpolation due to the 2D nature of our implementation. 15 | 16 | 3. Deep Supervision: The original paper uses deep supervision (mentioned in Section 2) which is not implemented 17 | in this version for simplicity. 18 | ''' 19 | 20 | 21 | 22 | class ContractingPath(nn.Module): 23 | def __init__(self, in_channels = 3): 24 | super(ContractingPath, self).__init__() 25 | 26 | # In the original U-Net implementation, there are 4 contracting levels (also called downsampling steps). 27 | # Starting from the input, each level reduces the spatial dimensions by half and doubles the channel dimensions. 28 | 29 | self.first_level = ConvBlock(in_channels=in_channels, out_channels=64, maxpool=True) 30 | self.second_level = ConvBlock(in_channels=64, out_channels=128, maxpool=True) 31 | self.third_level = ConvBlock(in_channels=128, out_channels=256, maxpool=True) 32 | self.fourth_level = ConvBlock(in_channels=256, out_channels=512, maxpool=True) 33 | 34 | # Then we have the bridge of the Unet 35 | self.bridge = ConvBlock(in_channels=512, out_channels=1024, maxpool=True) 36 | 37 | 38 | 39 | 40 | def forward(self, x): 41 | out1, flv1 = self.first_level(x) # feature level 1 (flv1), out1 is flv1 after maxpooling 42 | out2, flv2 = self.second_level(out1) # feature level 2 (flv2), out2 is flv2 after maxpooling 43 | out3, flv3 = self.third_level(out2) # feature level 3 (flv3). out3 is flv3 after maxpooling 44 | out4, flv4 = self.fourth_level(out3) # feature level 4 (flv4), out4 is flv4 after maxpooling 45 | _, brdout = self.bridge(out4) # bridge output (brdout) 46 | 47 | return [flv1, flv2, flv3, flv4, brdout] 48 | 49 | 50 | class ExpandingPath(nn.Module): 51 | def __init__(self, n_classes): 52 | super(ExpandingPath, self).__init__() 53 | 54 | # Symmetrically, there are 4 expanding levels (also called upsampling steps). 55 | # Starting from the output of the bridge, each level increases the spatial dimensions by two and half the channel dimensions. 
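# Each AttentionGate below re-weights an encoder feature map (skip connection) using the corresponding
# decoder feature map as the gating signal, so irrelevant regions are suppressed before CropAndConc
# concatenates the two along the channel dimension.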
56 | 57 | self.cc = CropAndConc() 58 | 59 | # Add attention gates before each skip connection 60 | self.attention1 = AttentionGate(input_channels=512, gate_channels=512, intermediate_channels=512) 61 | self.attention2 = AttentionGate(input_channels=256, gate_channels=256, intermediate_channels=256) 62 | self.attention3 = AttentionGate(input_channels=128, gate_channels=128, intermediate_channels=128) 63 | self.attention4 = AttentionGate(input_channels=64, gate_channels=64, intermediate_channels=64) 64 | 65 | self.up1 = UP(in_channels=1024, out_channels=512) 66 | self.fourth_level = ConvBlock(in_channels=1024, out_channels=512, maxpool=False) 67 | self.up2 = UP(in_channels=512, out_channels=256) 68 | self.third_level = ConvBlock(in_channels=512, out_channels=256, maxpool=False) 69 | self.up3 = UP(in_channels=256, out_channels=128) 70 | self.second_level = ConvBlock(in_channels=256, out_channels=128, maxpool=False) 71 | self.up4 = UP(in_channels=128, out_channels=64) 72 | self.first_level = ConvBlock(in_channels=128, out_channels=64, maxpool=False) 73 | 74 | # output layer 75 | self.output_layer = nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1) 76 | 77 | 78 | 79 | def forward(self, output_features): 80 | [flv1, flv2, flv3, flv4, brdout] = output_features 81 | 82 | # First up-sampling + attention gate 83 | x = self.up1(brdout) 84 | gated_flv4 = self.attention1(x, flv4) # Note: gating signal is from decoder path 85 | x = self.cc(gated_flv4, x) 86 | x = self.fourth_level(x) 87 | 88 | # Second up-sampling + attention gate 89 | x = self.up2(x) 90 | gated_flv3 = self.attention2(x, flv3) 91 | x = self.cc(gated_flv3, x) 92 | x = self.third_level(x) 93 | 94 | # Third up-sampling + attention gate 95 | x = self.up3(x) 96 | gated_flv2 = self.attention3(x, flv2) 97 | x = self.cc(gated_flv2, x) 98 | x = self.second_level(x) 99 | 100 | # Fourth up-sampling + attention gate 101 | x = self.up4(x) 102 | gated_flv1 = self.attention4(x, flv1) 103 | x = self.cc(gated_flv1, x) 104 | x = self.first_level(x) 105 | x = self.output_layer(x) 106 | 107 | return x 108 | 109 | 110 | 111 | 112 | class UNet(nn.Module): 113 | def __init__(self, in_channels = 3, n_classes = 2): 114 | super(UNet, self).__init__() 115 | self.contracting_path = ContractingPath(in_channels=in_channels) 116 | self.expanding_path = ExpandingPath(n_classes=n_classes) 117 | 118 | def forward(self, x): 119 | output_feature = self.contracting_path(x) 120 | output = self.expanding_path(output_feature) 121 | return output 122 | 123 | 124 | 125 | if __name__ == "__main__": 126 | unet = UNet() 127 | summary(unet, (3, 572, 572)) 128 | -------------------------------------------------------------------------------- /Semantic Segmentation/AttentionUnet/UNet3D.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | from unet3D_utils import * 5 | 6 | ''' 7 | This implementation follows the original paper "Attention U-Net: Learning Where to Look for the Pancreas" 8 | which uses 3D convolutions for volumetric data (CT scans). 9 | 10 | Key differences from the original paper: 11 | 12 | 1. Deep Supervision: The original paper uses deep supervision (mentioned in Section 2) which is not implemented 13 | in this version for simplicity.
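2. Example Input: the torchsummary call at the bottom of this file uses a reduced (1, 64, 128, 128) volume (channels, depth, height, width) rather than full-resolution CT scans.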
14 | ''' 15 | 16 | class ContractingPath(nn.Module): 17 | def __init__(self, in_channels=1): # Changed default in_channels to 1 for medical images 18 | super(ContractingPath, self).__init__() 19 | 20 | self.first_level = ConvBlock(in_channels=in_channels, out_channels=64, maxpool=True) 21 | self.second_level = ConvBlock(in_channels=64, out_channels=128, maxpool=True) 22 | self.third_level = ConvBlock(in_channels=128, out_channels=256, maxpool=True) 23 | self.fourth_level = ConvBlock(in_channels=256, out_channels=512, maxpool=True) 24 | 25 | self.bridge = ConvBlock(in_channels=512, out_channels=1024, maxpool=True) 26 | 27 | def forward(self, x): 28 | out1, flv1 = self.first_level(x) 29 | out2, flv2 = self.second_level(out1) 30 | out3, flv3 = self.third_level(out2) 31 | out4, flv4 = self.fourth_level(out3) 32 | _, brdout = self.bridge(out4) 33 | 34 | return [flv1, flv2, flv3, flv4, brdout] 35 | 36 | class ExpandingPath(nn.Module): 37 | def __init__(self, n_classes): 38 | super(ExpandingPath, self).__init__() 39 | 40 | self.cc = CropAndConc() 41 | 42 | # Add attention gates before each skip connection 43 | self.attention1 = AttentionGate(input_channels=512, gate_channels=512, intermediate_channels=512) 44 | self.attention2 = AttentionGate(input_channels=256, gate_channels=256, intermediate_channels=256) 45 | self.attention3 = AttentionGate(input_channels=128, gate_channels=128, intermediate_channels=128) 46 | self.attention4 = AttentionGate(input_channels=64, gate_channels=64, intermediate_channels=64) 47 | 48 | self.up1 = UP(in_channels=1024, out_channels=512) 49 | self.fourth_level = ConvBlock(in_channels=1024, out_channels=512, maxpool=False) 50 | self.up2 = UP(in_channels=512, out_channels=256) 51 | self.third_level = ConvBlock(in_channels=512, out_channels=256, maxpool=False) 52 | self.up3 = UP(in_channels=256, out_channels=128) 53 | self.second_level = ConvBlock(in_channels=256, out_channels=128, maxpool=False) 54 | self.up4 = UP(in_channels=128, out_channels=64) 55 | self.first_level = ConvBlock(in_channels=128, out_channels=64, maxpool=False) 56 | 57 | self.output_layer = nn.Conv3d(in_channels=64, out_channels=n_classes, kernel_size=1) 58 | 59 | def forward(self, output_features): 60 | [flv1, flv2, flv3, flv4, brdout] = output_features 61 | 62 | x = self.up1(brdout) 63 | gated_flv4 = self.attention1(x, flv4) 64 | x = self.cc(gated_flv4, x) 65 | x = self.fourth_level(x) 66 | 67 | x = self.up2(x) 68 | gated_flv3 = self.attention2(x, flv3) 69 | x = self.cc(gated_flv3, x) 70 | x = self.third_level(x) 71 | 72 | x = self.up3(x) 73 | gated_flv2 = self.attention3(x, flv2) 74 | x = self.cc(gated_flv2, x) 75 | x = self.second_level(x) 76 | 77 | x = self.up4(x) 78 | gated_flv1 = self.attention4(x, flv1) 79 | x = self.cc(gated_flv1, x) 80 | x = self.first_level(x) 81 | x = self.output_layer(x) 82 | 83 | return x 84 | 85 | class UNet(nn.Module): 86 | def __init__(self, in_channels=1, n_classes=2): # Changed default in_channels to 1 87 | super(UNet, self).__init__() 88 | self.contracting_path = ContractingPath(in_channels=in_channels) 89 | self.expanding_path = ExpandingPath(n_classes=n_classes) 90 | 91 | def forward(self, x): 92 | output_feature = self.contracting_path(x) 93 | output = self.expanding_path(output_feature) 94 | return output 95 | 96 | if __name__ == "__main__": 97 | unet = UNet() 98 | # Modified input size for 3D (channels, depth, height, width) 99 | summary(unet, (1, 64, 128, 128)) -------------------------------------------------------------------------------- /Semantic 
Segmentation/AttentionUnet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Semantic Segmentation/AttentionUnet/unet2D_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def crop_tensor(tensor, target_tensor): 7 | target_size = target_tensor.size()[2] # Get height/width of target 8 | tensor_size = tensor.size()[2] # Get height/width of tensor 9 | delta = tensor_size - target_size 10 | delta = delta // 2 11 | return tensor[:, :, delta:tensor_size-delta, delta:tensor_size-delta] 12 | 13 | class ConvBlock(nn.Module): 14 | ''' 15 | In the original implementation of the UNet the Covolutional Block used in the contracting path consists of: 16 | - conv 3x3 17 | - ReLu 18 | - conv 3x3 19 | - ReLu 20 | - maxpool 2x2 21 | ''' 22 | def __init__(self, in_channels, out_channels, maxpool = False): 23 | super(ConvBlock, self).__init__() 24 | 25 | self.maxpool = maxpool 26 | 27 | self.convblock_1 = nn.Sequential( 28 | nn.Conv2d(in_channels, out_channels, kernel_size=3, bias=True), # ** 29 | nn.ReLU(), 30 | ) 31 | 32 | self.convblock_2 = nn.Sequential( 33 | nn.Conv2d(out_channels, out_channels, kernel_size=3, bias=True), # ** 34 | nn.ReLU(), 35 | ) 36 | 37 | self.maxpool2d = nn.MaxPool2d(kernel_size=2, stride=2) 38 | 39 | 40 | 41 | # ** In the original U-Net paper and implementation, the convolutional layers do include bias terms (bias=True). 42 | # However, it's worth noting that in modern deep learning practice, especially when using batch normalization, 43 | # it's common to set bias=False for the convolution layers that are immediately followed by batch normalization. 44 | # This is because the batch norm layer has its own learnable bias parameter, making the convolution's bias redundant. 45 | 46 | def forward(self, x): 47 | out_conv1 = self.convblock_1(x) 48 | out_conv2 = self.convblock_2(out_conv1) 49 | if self.maxpool: 50 | out = self.maxpool2d(out_conv2) 51 | return out, out_conv2 52 | else: 53 | return out_conv2 54 | 55 | 56 | class UP(nn.Module): 57 | def __init__(self, in_channels, out_channels, scale_factor = 2): 58 | super(UP, self).__init__() 59 | 60 | ''' 61 | The original U-Net paper used transposed convolution, but many modern implementations prefer 62 | the following approach for upsampling with bilinear upsampling because: 63 | 64 | - It helps avoid checkerboard artifacts that can occur with transposed convolutions 65 | - It's often more computationally efficient 66 | - It can lead to smoother outputs 67 | ''' 68 | 69 | self.up = nn.Sequential( 70 | nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=True), 71 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) 72 | ) 73 | 74 | def forward(self, x): 75 | return self.up(x) 76 | 77 | class CropAndConc(nn.Module): 78 | def __init__(self): 79 | super(CropAndConc, self).__init__() 80 | 81 | ''' 82 | Following the original implementation, this submodule perform two operations: 83 | - Crop the feature map from the contracting path 84 | - Concatenate the cropped feature map with the feature from the expanding path 85 | 86 | Modern U-Net implementations typically don't use cropping anymore. 87 | Instead, they use padding in the convolutions to maintain spatial dimensions. 
88 | This makes the architecture simpler and ensures that the feature maps from the contracting path match exactly with 89 | the expanding path. 90 | 91 | Benefits of using padding instead of cropping: 92 | - Simpler implementation 93 | - No loss of border information 94 | - Easier to predict output sizes 95 | - Better feature preservation at boundaries 96 | - More stable training in many cases 97 | ''' 98 | pass 99 | 100 | def forward(self, tensor, target_tensor): 101 | cropped_tensor = crop_tensor(tensor, target_tensor) 102 | x = torch.cat([cropped_tensor, target_tensor], 1) 103 | return x 104 | 105 | 106 | class AttentionGate(nn.Module): 107 | def __init__(self, input_channels, gate_channels, intermediate_channels): 108 | super(AttentionGate, self).__init__() 109 | 110 | # channel wise 2D conv for the gating signal 111 | self.wg = nn.Conv2d(in_channels= gate_channels, out_channels=intermediate_channels, kernel_size=1, bias=True) 112 | self.wl = nn.Conv2d(in_channels= input_channels, out_channels=intermediate_channels, kernel_size=1, bias=True) 113 | 114 | # ψ (psi) channel wise 115 | self.psi = nn.Sequential(nn.Conv2d(in_channels=intermediate_channels, out_channels=1, kernel_size=1, bias=True), 116 | nn.Sigmoid()) 117 | 118 | self.relu = nn.ReLU(inplace=True) 119 | 120 | 121 | 122 | 123 | def forward(self, gating_signal, input_features): 124 | ''' 125 | - gating_signal: it is the gating signal from the decoder path at coarser scale, which contains contextual information aggregated from multiple scales 126 | - input_features: features from the skip connection (encoder path) 127 | ''' 128 | 129 | # Downsample input_features to match gating signal size 130 | input_features_resized = F.interpolate(input_features, size=(gating_signal.shape[2], gating_signal.shape[3]), 131 | mode='bilinear', align_corners=False) 132 | 133 | # step 1: we perform two separate channel wise convolution 1x1 134 | wg_out = self.wg(gating_signal) 135 | wl_out = self.wl(input_features_resized) 136 | 137 | 138 | # step 2: addition of these transformation 139 | intermediate_output = self.relu(wg_out + wl_out) 140 | 141 | # step 3: psi linear transformation 142 | attention_coefficients = self.psi(intermediate_output) 143 | 144 | # step 4: Resampling if needed (if sizes don't match) 145 | if attention_coefficients.shape != input_features.shape: 146 | attention_coefficients = F.interpolate(attention_coefficients, 147 | size=input_features.shape[2:], 148 | mode="bilinear", 149 | align_corners=True) 150 | # step 5: multiply with input features 151 | output = attention_coefficients * input_features 152 | return output 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /Semantic Segmentation/AttentionUnet/unet3D_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def crop_tensor(tensor, target_tensor): 6 | target_size = target_tensor.size()[2:] # Get depth/height/width of target 7 | tensor_size = tensor.size()[2:] # Get depth/height/width of tensor 8 | delta = [t - p for t, p in zip(tensor_size, target_size)] 9 | delta = [d // 2 for d in delta] 10 | return tensor[:, :, 11 | delta[0]:tensor_size[0]-delta[0], 12 | delta[1]:tensor_size[1]-delta[1], 13 | delta[2]:tensor_size[2]-delta[2]] 14 | 15 | class ConvBlock(nn.Module): 16 | ''' 17 | 3D version of the Convolutional Block: 18 | - conv 3x3x3 with padding=1 19 
| - ReLu 20 | - conv 3x3x3 with padding=1 21 | - ReLu 22 | - maxpool 2x2x2 23 | ''' 24 | def __init__(self, in_channels, out_channels, maxpool=False): 25 | super(ConvBlock, self).__init__() 26 | 27 | self.maxpool = maxpool 28 | 29 | self.convblock_1 = nn.Sequential( 30 | # Added padding=1 to maintain spatial dimensions 31 | nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1, bias=True), 32 | nn.ReLU(), 33 | ) 34 | 35 | self.convblock_2 = nn.Sequential( 36 | # Added padding=1 to maintain spatial dimensions 37 | nn.Conv3d(out_channels, out_channels, kernel_size=3, padding=1, bias=True), 38 | nn.ReLU(), 39 | ) 40 | 41 | self.maxpool3d = nn.MaxPool3d(kernel_size=2, stride=2) 42 | 43 | def forward(self, x): 44 | out_conv1 = self.convblock_1(x) 45 | out_conv2 = self.convblock_2(out_conv1) 46 | if self.maxpool: 47 | out = self.maxpool3d(out_conv2) 48 | return out, out_conv2 49 | else: 50 | return out_conv2 51 | 52 | class UP(nn.Module): 53 | def __init__(self, in_channels, out_channels, scale_factor=2): 54 | super(UP, self).__init__() 55 | 56 | self.up = nn.Sequential( 57 | nn.Upsample(scale_factor=scale_factor, mode='trilinear', align_corners=True), 58 | nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) 59 | ) 60 | 61 | def forward(self, x): 62 | return self.up(x) 63 | 64 | class CropAndConc(nn.Module): 65 | def __init__(self): 66 | super(CropAndConc, self).__init__() 67 | pass 68 | 69 | def forward(self, tensor, target_tensor): 70 | cropped_tensor = crop_tensor(tensor, target_tensor) 71 | x = torch.cat([cropped_tensor, target_tensor], 1) 72 | return x 73 | 74 | class AttentionGate(nn.Module): 75 | def __init__(self, input_channels, gate_channels, intermediate_channels): 76 | super(AttentionGate, self).__init__() 77 | 78 | # channel wise 3D conv for the gating signal 79 | self.wg = nn.Conv3d(in_channels=gate_channels, out_channels=intermediate_channels, kernel_size=1, bias=True) 80 | self.wl = nn.Conv3d(in_channels=input_channels, out_channels=intermediate_channels, kernel_size=1, bias=True) 81 | 82 | # ψ (psi) channel wise 83 | self.psi = nn.Sequential( 84 | nn.Conv3d(in_channels=intermediate_channels, out_channels=1, kernel_size=1, bias=True), 85 | nn.Sigmoid() 86 | ) 87 | 88 | self.relu = nn.ReLU(inplace=True) 89 | 90 | def forward(self, gating_signal, input_features): 91 | ''' 92 | - gating_signal: it is the gating signal from the decoder path at coarser scale 93 | - input_features: features from the skip connection (encoder path) 94 | ''' 95 | 96 | # Downsample input_features to match gating signal size 97 | input_features_resized = F.interpolate(input_features, 98 | size=(gating_signal.shape[2], 99 | gating_signal.shape[3], 100 | gating_signal.shape[4]), 101 | mode='trilinear', 102 | align_corners=False) 103 | 104 | # step 1: we perform two separate channel wise convolution 1x1x1 105 | wg_out = self.wg(gating_signal) 106 | wl_out = self.wl(input_features_resized) 107 | 108 | # step 2: addition of these transformation 109 | intermediate_output = self.relu(wg_out + wl_out) 110 | 111 | # step 3: psi linear transformation 112 | attention_coefficients = self.psi(intermediate_output) 113 | 114 | # step 4: Resampling if needed (if sizes don't match) 115 | if attention_coefficients.shape[2:] != input_features.shape[2:]: 116 | attention_coefficients = F.interpolate(attention_coefficients, 117 | size=input_features.shape[2:], 118 | mode="trilinear", 119 | align_corners=True) 120 | 121 | # step 5: multiply with input features 122 | output = attention_coefficients 
* input_features 123 | return output -------------------------------------------------------------------------------- /Semantic Segmentation/UNet++/README.md: -------------------------------------------------------------------------------- 1 | # [UNet++] 2 | 3 | ## Overview 4 | This repository contains the implementation of the UNet++. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation - TBD 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](). 8 | 9 | ## Major Contributions - TBD 10 | The major contributions of the paper include: 11 | 12 | 13 | ## Architecture Scheme - TBD 14 | Below is a schematic representation of the architecture: 15 | 16 | ![Architecture Scheme]() 17 | 18 | ## Reproduced Results (TBD) 19 | The following results were reproduced as per the methodology described in the paper: 20 | - Result 1: [Description and value] 21 | - Result 2: [Description and value] 22 | - Result 3: [Description and value] 23 | - ... 24 | 25 | ## References 26 | - [Original Paper](https://arxiv.org/abs/1912.05074) 27 | - [Detailed Blog Post]() 28 | -------------------------------------------------------------------------------- /Semantic Segmentation/UNet++/UNetPlus.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | from UnetPlusPlus_utils import * 5 | 6 | 7 | class ContractingPath(nn.Module): 8 | def __init__(self, in_channels=3): 9 | super(ContractingPath, self).__init__() 10 | 11 | # Encoder blocks following original U-Net architecture 12 | # Each level doubles the number of channels and halves spatial dimensions 13 | # These features will be shared across all nested U-Nets 14 | self.first_level = ConvBlock(in_channels=in_channels, out_channels=64, maxpool=True) # X₁,₀ 15 | self.second_level = ConvBlock(in_channels=64, out_channels=128, maxpool=True) # X₂,₀ 16 | self.third_level = ConvBlock(in_channels=128, out_channels=256, maxpool=True) # X₃,₀ 17 | self.fourth_level = ConvBlock(in_channels=256, out_channels=512, maxpool=True) # X₄,₀ 18 | 19 | # Bridge (bottom of the U) 20 | self.bridge = ConvBlock(in_channels=512, out_channels=1024, maxpool=True) # X₅,₀ 21 | 22 | def forward(self, x): 23 | # Forward pass through encoder, storing features for skip connections 24 | out1, flv1 = self.first_level(x) # X₁,₀: First encoder block output 25 | out2, flv2 = self.second_level(out1) # X₂,₀: Second encoder block output 26 | out3, flv3 = self.third_level(out2) # X₃,₀: Third encoder block output 27 | out4, flv4 = self.fourth_level(out3) # X₄,₀: Fourth encoder block output 28 | _, brdout = self.bridge(out4) # X₅,₀: Bridge output 29 | 30 | return [flv1, flv2, flv3, flv4, brdout] 31 | 32 | 33 | class ExpandingPath(nn.Module): 34 | def __init__(self, n_classes): 35 | super(ExpandingPath, self).__init__() 36 | 37 | self.cc = CropAndConc() # For skip connections concatenation 38 | 39 | # Upsampling operations 40 | self.up1 = UP(in_channels=1024, out_channels=512) # X₄,₁ upsampling 41 | self.up2 = UP(in_channels=512, out_channels=256) # X₃,₁ upsampling 42 | self.up3 = UP(in_channels=256, out_channels=128) # X₂,₁ upsampling 43 | self.up4 = UP(in_channels=128, out_channels=64) # X₁,₁ upsampling 44 | 45 | # Decoder blocks for each level 46 | # Following paper's notation Xi,j where: 47 | # i: decoder level (1-4 from bottom to top) 48 | # j: block index within level 
(increases left to right) 49 | 50 | # Fourth level (closest to bridge) 51 | self.fourth_level = ConvBlock(in_channels=1024, out_channels=512, maxpool=False) # X₄,₁ 52 | 53 | # Third level blocks 54 | self.third_level = ConvBlock(in_channels=512, out_channels=256, maxpool=False) # X₃,₁ 55 | 56 | # Second level blocks 57 | self.second_level = ConvBlock(in_channels=256, out_channels=128, maxpool=False) # X₂,₁ 58 | 59 | # First level blocks (closest to output) 60 | self.first_level = ConvBlock(in_channels=128, out_channels=64, maxpool=False) # X₁,₁ 61 | 62 | # Final 1x1 conv for class prediction 63 | # Separate output layers for deep supervision 64 | self.output_layer1 = nn.Sequential( 65 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 66 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 67 | ) 68 | self.output_layer2 = nn.Sequential( 69 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 70 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 71 | ) 72 | self.output_layer3 = nn.Sequential( 73 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 74 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 75 | ) 76 | self.output_layer4 = nn.Sequential( 77 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 78 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 79 | ) 80 | 81 | 82 | def forward(self, output_features): 83 | [x0_0, x1_0, x2_0, x3_0, x4_0] = output_features 84 | 85 | # Level 4 - Deepest decoder level 86 | x3_1 = self.up1(x4_0) # Upsample bridge 87 | x3_1 = self.cc(x3_0, x3_1) # Skip connection x3_0 -> x3_1 88 | x3_1 = self.fourth_level(x3_1) # Conv operations 89 | 90 | # Level 3: 2 possible paths 91 | # Path 1: 92 | x2_1 = self.up2(x3_0) 93 | x2_1 = self.cc(x2_0, x2_1) # skip connection x2_0 -> x2_1 94 | x2_1 = self.third_level(x2_1) 95 | 96 | # Path 2 97 | x2_2 = self.up2(x3_1) 98 | x2_2 = self.cc(x2_1, x2_2) # skip connection x2_1 -> x2_2 99 | x2_2 = self.third_level(x2_2) 100 | 101 | # Level 2: 3 possible paths 102 | # Path 1: 103 | x1_1 = self.up3(x2_0) 104 | x1_1 = self.cc(x1_0, x1_1) # skip connection x1_0 -> x1_1 105 | x1_1 = self.second_level(x1_1) 106 | 107 | # Path 2: 108 | x1_2 = self.up3(x2_1) 109 | x1_2 = self.cc(x1_1, x1_2) 110 | x1_2 = self.second_level(x1_2) 111 | 112 | # Path 3: 113 | x1_3 = self.up3(x2_2) 114 | x1_3 = self.cc(x1_2, x1_3) 115 | x1_3 = self.second_level(x1_3) 116 | 117 | # Level 1: 4 possible paths 118 | 119 | # Path 1: 120 | x0_1 = self.up4(x1_0) 121 | x0_1 = self.cc(x0_0, x0_1) 122 | x0_1 = self.first_level(x0_1) 123 | 124 | # Path 2: 125 | x0_2 = self.up4(x1_1) 126 | x0_2 = self.cc(x0_1, x0_2) 127 | x0_2 = self.first_level(x0_2) 128 | 129 | # Path 3: 130 | x0_3 = self.up4(x1_2) 131 | x0_3 = self.cc(x0_2, x0_3) 132 | x0_3 = self.first_level(x0_3) 133 | 134 | # Path 4: 135 | x0_4 = self.up4(x1_3) 136 | x0_4 = self.cc(x0_3, x0_4) 137 | x0_4 = self.first_level(x0_4) 138 | 139 | # Output Path for Deep Supervision 140 | out1 = self.output_layer1(x0_1) 141 | out2 = self.output_layer2(x0_2) 142 | out3 = self.output_layer3(x0_3) 143 | out4 = self.output_layer4(x0_4) 144 | 145 | return out1, out2, out3, out4 146 | 147 | 148 | 149 | 150 | class UNetPlus(nn.Module): 151 | def __init__(self, in_channels = 3, n_classes = 2): 152 | super(UNetPlus, self).__init__() 153 | self.contracting_path = ContractingPath(in_channels=in_channels) 154 | self.expanding_path = ExpandingPath(n_classes=n_classes) 155 | 156 | def forward(self, x): 157 | output_feature = self.contracting_path(x) 158 | output =
self.expanding_path(output_feature) 159 | return output 160 | 161 | 162 | if __name__ == "__main__": 163 | unet = UNetPlus() 164 | summary(unet, (3, 572, 572)) -------------------------------------------------------------------------------- /Semantic Segmentation/UNet++/UNetPlusPlus.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | from UnetPlusPlus_utils import * 5 | 6 | 7 | class ContractingPath(nn.Module): 8 | def __init__(self, in_channels=3): 9 | super(ContractingPath, self).__init__() 10 | 11 | # Encoder blocks following original U-Net architecture 12 | # Each level doubles the number of channels and halves spatial dimensions 13 | # These features will be shared across all nested U-Nets 14 | self.first_level = ConvBlock(in_channels=in_channels, out_channels=64, maxpool=True) # X₁,₀ 15 | self.second_level = ConvBlock(in_channels=64, out_channels=128, maxpool=True) # X₂,₀ 16 | self.third_level = ConvBlock(in_channels=128, out_channels=256, maxpool=True) # X₃,₀ 17 | self.fourth_level = ConvBlock(in_channels=256, out_channels=512, maxpool=True) # X₄,₀ 18 | 19 | # Bridge (bottom of the U) 20 | self.bridge = ConvBlock(in_channels=512, out_channels=1024, maxpool=True) # X₅,₀ 21 | 22 | def forward(self, x): 23 | # Forward pass through encoder, storing features for skip connections 24 | out1, flv1 = self.first_level(x) # X₁,₀: First encoder block output 25 | out2, flv2 = self.second_level(out1) # X₂,₀: Second encoder block output 26 | out3, flv3 = self.third_level(out2) # X₃,₀: Third encoder block output 27 | out4, flv4 = self.fourth_level(out3) # X₄,₀: Fourth encoder block output 28 | _, brdout = self.bridge(out4) # X₅,₀: Bridge output 29 | 30 | return [flv1, flv2, flv3, flv4, brdout] 31 | 32 | 33 | class ExpandingPath(nn.Module): 34 | def __init__(self, n_classes): 35 | super(ExpandingPath, self).__init__() 36 | 37 | self.cc = CropAndConc() # For skip connections concatenation 38 | 39 | # Upsampling operations 40 | self.up1 = UP(in_channels=1024, out_channels=512) # X₄,₁ upsampling 41 | self.up2 = UP(in_channels=512, out_channels=256) # X₃,₁ upsampling 42 | self.up3 = UP(in_channels=256, out_channels=128) # X₂,₁ upsampling 43 | self.up4 = UP(in_channels=128, out_channels=64) # X₁,₁ upsampling 44 | 45 | # Decoder blocks for each level 46 | # Following paper's notation Xi,j where: 47 | # i: decoder level (1-4 from bottom to top) 48 | # j: block index within level (increases left to right) 49 | 50 | # Fourth level (closest to bridge) 51 | self.fourth_level = ConvBlock(in_channels=1024, out_channels=512, maxpool=False) 52 | 53 | # Third level blocks 54 | self.third_level_p1 = ConvBlock(in_channels=512, out_channels=256, maxpool=False) # path 1 55 | self.third_level_p2 = ConvBlock(in_channels=768, out_channels=256, maxpool=False) # path 2 56 | 57 | # Second level blocks 58 | self.second_level_p1 = ConvBlock(in_channels=256, out_channels=128, maxpool=False) # path 1 59 | self.second_level_p2 = ConvBlock(in_channels=384, out_channels=128, maxpool=False) # path 2 60 | self.second_level_p3 = ConvBlock(in_channels=512, out_channels=128, maxpool=False) # path 3 61 | 62 | 63 | # First level blocks (closest to output) 64 | self.first_level_p1 = ConvBlock(in_channels=128, out_channels=64, maxpool=False) # path 1 65 | self.first_level_p2 = ConvBlock(in_channels=192, out_channels=64, maxpool=False) # path 2 66 | self.first_level_p3 = ConvBlock(in_channels=256, out_channels=64, 
maxpool=False) # path 3 67 | self.first_level_p4 = ConvBlock(in_channels=320, out_channels=64, maxpool=False) # path 4 68 | 69 | 70 | # Final 1x1 conv for class prediction 71 | # Separate output layers for deep supervision 72 | self.output_layer1 = nn.Sequential( 73 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 74 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 75 | ) 76 | self.output_layer2 = nn.Sequential( 77 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 78 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 79 | ) 80 | self.output_layer3 = nn.Sequential( 81 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 82 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 83 | ) 84 | self.output_layer4 = nn.Sequential( 85 | nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1), 86 | nn.Sigmoid() if n_classes == 1 else nn.Identity() 87 | ) 88 | 89 | def forward(self, output_features): 90 | [x0_0, x1_0, x2_0, x3_0, x4_0] = output_features 91 | 92 | # Level 4 - Deepest decoder level 93 | x3_1 = self.up1(x4_0) # Upsample bridge 94 | x3_1 = self.cc(x3_0, x3_1) # Skip connection x3_0 -> x4_0 95 | x3_1 = self.fourth_level(x3_1) # Conv operations 96 | 97 | # Level 3: 3 possible paths 98 | # Path 1: 99 | x2_1 = self.up2(x3_0) 100 | x2_1 = self.cc(x2_0, x2_1) # skip connection x2_0 -> x2_1 101 | x2_1 = self.third_level_p1(x2_1) 102 | 103 | # Path 2 104 | x2_2 = self.up2(x3_1) 105 | x2_2 = self.cc(x2_0, x2_1, x2_2) # skip connection x2_1 -> x2_2 106 | x2_2 = self.third_level_p2(x2_2) 107 | 108 | # Level 2: 3 possible paths 109 | # Path 1: 110 | x1_1 = self.up3(x2_0) 111 | x1_1 = self.cc(x1_0, x1_1) # skip connection x1_0 -> x1_1 112 | x1_1 = self.second_level_p1(x1_1) 113 | 114 | # Path 2: 115 | x1_2 = self.up3(x2_1) 116 | x1_2 = self.cc(x1_0, x1_1, x1_2) 117 | x1_2 = self.second_level_p2(x1_2) 118 | 119 | # Path 3: 120 | x1_3 = self.up3(x2_2) 121 | x1_3 = self.cc(x1_0, x1_1, x1_2, x1_3) 122 | x1_3 = self.second_level_p3(x1_3) 123 | 124 | # Level 1: 4 possible paths 125 | 126 | # Path 1: 127 | x0_1 = self.up4(x1_0) 128 | x0_1 = self.cc(x0_0, x0_1) 129 | x0_1 = self.first_level_p1(x0_1) 130 | 131 | # Path 2: 132 | x0_2 = self.up4(x1_1) 133 | x0_2 = self.cc(x0_0, x0_1, x0_2) 134 | x0_2 = self.first_level_p2(x0_2) 135 | 136 | # Path 3: 137 | x0_3 = self.up4(x1_2) 138 | x0_3 = self.cc(x0_0, x0_1, x0_2, x0_3) 139 | x0_3 = self.first_level_p3(x0_3) 140 | 141 | # Path 4: 142 | x0_4 = self.up4(x1_3) 143 | x0_4 = self.cc(x0_0, x0_1, x0_2, x0_3, x0_4) 144 | x0_4 = self.first_level_p4(x0_4) 145 | 146 | # Output Path for Deep Supervision 147 | out1 = self.output_layer1(x0_1) 148 | out2 = self.output_layer2(x0_2) 149 | out3 = self.output_layer3(x0_3) 150 | out4 = self.output_layer4(x0_4) 151 | 152 | return out1, out2, out3, out4 153 | 154 | 155 | 156 | 157 | class UNetPlusPlus(nn.Module): 158 | def __init__(self, in_channels=3, n_classes=2): 159 | super(UNetPlusPlus, self).__init__() 160 | self.contracting_path = ContractingPath(in_channels=in_channels) 161 | self.expanding_path = ExpandingPath(n_classes=n_classes) 162 | 163 | def forward(self, x): 164 | output_feature = self.contracting_path(x) 165 | output = self.expanding_path(output_feature) 166 | return output 167 | 168 | 169 | if __name__ == "__main__": 170 | unet = UNetPlusPlus() 171 | summary(unet, (3, 572, 572)) 172 | -------------------------------------------------------------------------------- /Semantic Segmentation/UNet++/UnetPlusPlus_utils.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def crop_tensor(tensor, target_tensor): 6 | target_size = target_tensor.size()[2] # Get height/width of target 7 | tensor_size = tensor.size()[2] # Get height/width of tensor 8 | delta = tensor_size - target_size 9 | delta = delta // 2 10 | return tensor[:, :, delta:tensor_size - delta, delta:tensor_size - delta] 11 | 12 | 13 | class ConvBlock(nn.Module): 14 | ''' 15 | In the original implementation of the UNet the Covolutional Block used in the contracting path consists of: 16 | - conv 3x3 17 | - ReLu 18 | - conv 3x3 19 | - ReLu 20 | - maxpool 2x2 21 | ''' 22 | 23 | def __init__(self, in_channels, out_channels, maxpool=False): 24 | super(ConvBlock, self).__init__() 25 | 26 | self.maxpool = maxpool 27 | 28 | self.convblock_1 = nn.Sequential( 29 | nn.Conv2d(in_channels, out_channels, kernel_size=3, bias=True), # ** 30 | nn.ReLU(), 31 | ) 32 | 33 | self.convblock_2 = nn.Sequential( 34 | nn.Conv2d(out_channels, out_channels, kernel_size=3, bias=True), # ** 35 | nn.ReLU(), 36 | ) 37 | 38 | self.maxpool2d = nn.MaxPool2d(kernel_size=2, stride=2) 39 | 40 | # ** In the original U-Net paper and implementation, the convolutional layers do include bias terms (bias=True). 41 | # However, it's worth noting that in modern deep learning practice, especially when using batch normalization, 42 | # it's common to set bias=False for the convolution layers that are immediately followed by batch normalization. 43 | # This is because the batch norm layer has its own learnable bias parameter, making the convolution's bias redundant. 44 | 45 | def forward(self, x): 46 | out_conv1 = self.convblock_1(x) 47 | out_conv2 = self.convblock_2(out_conv1) 48 | if self.maxpool: 49 | out = self.maxpool2d(out_conv2) 50 | return out, out_conv2 51 | else: 52 | return out_conv2 53 | 54 | 55 | class UP(nn.Module): 56 | def __init__(self, in_channels, out_channels, scale_factor=2): 57 | super(UP, self).__init__() 58 | 59 | ''' 60 | The original U-Net paper used transposed convolution, but many modern implementations prefer 61 | the following approach for upsampling with bilinear upsampling because: 62 | 63 | - It helps avoid checkerboard artifacts that can occur with transposed convolutions 64 | - It's often more computationally efficient 65 | - It can lead to smoother outputs 66 | ''' 67 | 68 | self.up = nn.Sequential( 69 | nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=True), 70 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) 71 | ) 72 | 73 | def forward(self, x): 74 | return self.up(x) 75 | 76 | 77 | class CropAndConc(nn.Module): 78 | def __init__(self): 79 | super(CropAndConc, self).__init__() 80 | 81 | ''' 82 | Following the original implementation, this submodule perform two operations: 83 | - Crop the feature maps from the contracting path 84 | - Concatenate the cropped feature maps with the feature from the expanding path 85 | 86 | Modern U-Net implementations typically don't use cropping anymore. 87 | Instead, they use padding in the convolutions to maintain spatial dimensions. 88 | This makes the architecture simpler and ensures that the feature maps from the contracting path match exactly with 89 | the expanding path. 
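        In this file the cropping behaviour is kept. Unlike the two-tensor helper used in the plain U-Net,
        this version accepts a variable number of tensors (the dense skip connections of UNet++) and crops
        all of them to the smallest spatial size before concatenating along the channel dimension.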
90 | 91 | Benefits of using padding instead of cropping: 92 | - Simpler implementation 93 | - No loss of border information 94 | - Easier to predict output sizes 95 | - Better feature preservation at boundaries 96 | - More stable training in many cases 97 | ''' 98 | pass 99 | 100 | def forward(self, *tensors): 101 | """ 102 | Crops and concatenates multiple tensors along channel dimension. 103 | The spatial dimensions will be cropped to match the smallest tensor. 104 | 105 | Args: 106 | *tensors: Variable number of tensors to concatenate 107 | 108 | Returns: 109 | Concatenated tensor along channel dimension 110 | """ 111 | if len(tensors) < 2: 112 | raise ValueError("At least two tensors are required for concatenation") 113 | 114 | # Find the smallest spatial dimensions among all tensors 115 | min_spatial_dims = [float('inf')] * (len(tensors[0].shape) - 2) # Exclude batch and channel dims 116 | for tensor in tensors: 117 | spatial_dims = tensor.shape[2:] # Get spatial dimensions 118 | for i, dim in enumerate(spatial_dims): 119 | min_spatial_dims[i] = min(min_spatial_dims[i], dim) 120 | 121 | # Crop all tensors to match the smallest dimensions 122 | cropped_tensors = [] 123 | for tensor in tensors: 124 | if tensor.shape[2:] != tuple(min_spatial_dims): 125 | # Calculate crop for each dimension 126 | dims = len(tensor.shape) - 2 # Number of spatial dimensions 127 | slices = [slice(None), slice(None)] # Keep batch and channel dims as is 128 | 129 | for i in range(dims): 130 | current_size = tensor.shape[i + 2] 131 | target_size = min_spatial_dims[i] 132 | delta = (current_size - target_size) // 2 133 | slices.append(slice(delta, delta + target_size)) 134 | 135 | cropped_tensors.append(tensor[slices]) 136 | else: 137 | cropped_tensors.append(tensor) 138 | 139 | # Concatenate along channel dimension (dim=1) 140 | return torch.cat(cropped_tensors, dim=1) 141 | 142 | 143 | -------------------------------------------------------------------------------- /Semantic Segmentation/UNet++/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Semantic Segmentation/UNet/README.md: -------------------------------------------------------------------------------- 1 | # [UNet] 2 | 3 | ## Overview 4 | This repository contains the implementation of the UNet. Below you will find detailed information and resources related to this architecture. 5 | 6 | ## Detailed Explanation - TBD 7 | For a comprehensive understanding of the paper and its contributions, please refer to the [detailed blog post](). 8 | 9 | ## Major Contributions - TBD 10 | The major contributions of the paper include: 11 | 12 | 13 | ## Architecture Scheme - TBD 14 | Below is a schematic representation of the architecture: 15 | 16 | ![Architecture Scheme]() 17 | 18 | ## Reproduced Results (TBD) 19 | The following results were reproduced as per the methodology described in the paper: 20 | - Result 1: [Description and value] 21 | - Result 2: [Description and value] 22 | - Result 3: [Description and value] 23 | - ... 
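## Example Usage
A minimal sketch (not part of the repository scripts) of running the plain U-Net on a dummy batch; it assumes the interpreter is started from this directory so that `UNet.py` and `unet_utils.py` resolve as local imports.

```python
import torch
from UNet import UNet  # class defined in UNet.py in this folder

model = UNet(in_channels=3, n_classes=2)   # defaults used by the __main__ block
x = torch.randn(1, 3, 572, 572)

with torch.no_grad():
    logits = model(x)                      # unpadded 3x3 convs: 572x572 in -> about 388x388 out
preds = logits.argmax(dim=1)               # per-pixel class indices
print(logits.shape, preds.shape)
```

Because the convolutions are unpadded (as in the original paper), ground-truth masks have to be cropped to the output size, or the model changed to use padded convolutions, before computing a loss.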
24 | 25 | ## References 26 | - [Original Paper](https://arxiv.org/abs/1505.04597) 27 | - [Detailed Blog Post]() 28 | -------------------------------------------------------------------------------- /Semantic Segmentation/UNet/UNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchsummary import summary 4 | from unet_utils import * 5 | 6 | 7 | class ContractingPath(nn.Module): 8 | def __init__(self, in_channels = 3): 9 | super(ContractingPath, self).__init__() 10 | 11 | # In the original U-Net implementation, there are 4 contracting levels (also called downsampling steps). 12 | # Starting from the input, each level reduces the spatial dimensions by half and doubles the channel dimensions. 13 | 14 | self.first_level = ConvBlock(in_channels=in_channels, out_channels=64, maxpool=True) 15 | self.second_level = ConvBlock(in_channels=64, out_channels=128, maxpool=True) 16 | self.third_level = ConvBlock(in_channels=128, out_channels=256, maxpool=True) 17 | self.fourth_level = ConvBlock(in_channels=256, out_channels=512, maxpool=True) 18 | 19 | # Then we have the bridge of the Unet 20 | self.bridge = ConvBlock(in_channels=512, out_channels=1024, maxpool=True) 21 | 22 | 23 | 24 | 25 | def forward(self, x): 26 | out1, flv1 = self.first_level(x) # feature level 1 (flv1), out1 is flv1 after maxpooling 27 | out2, flv2 = self.second_level(out1) # feature level 2 (flv2), out2 is flv2 after maxpooling 28 | out3, flv3 = self.third_level(out2) # feature level 3 (flv3). out3 is flv3 after maxpooling 29 | out4, flv4 = self.fourth_level(out3) # feature level 4 (flv4), out4 is flv4 after maxpooling 30 | _, brdout = self.bridge(out4) # bridge output (brdout) 31 | 32 | return [flv1, flv2, flv3, flv4, brdout] 33 | 34 | 35 | class ExpandingPath(nn.Module): 36 | def __init__(self, n_classes): 37 | super(ExpandingPath, self).__init__() 38 | 39 | # Symmetrically, there are 4 expanding levels (also called upsampling steps). 40 | # Starting from the output of the bridge, each level increases the spatial dimensions by two and half the channel dimensions. 
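        # Decoder channel progression: 1024 -> 512 -> 256 -> 128 -> 64.
        # Each decoder ConvBlock takes twice its output channels as input, because the upsampled
        # feature map is concatenated (after cropping) with the corresponding encoder feature.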
41 | 42 | self.cc = CropAndConc() 43 | 44 | self.up1 = UP(in_channels=1024, out_channels=512) 45 | self.fourth_level = ConvBlock(in_channels=1024, out_channels=512, maxpool=False) 46 | self.up2 = UP(in_channels=512, out_channels=256) 47 | self.third_level = ConvBlock(in_channels=512, out_channels=256, maxpool=False) 48 | self.up3 = UP(in_channels=256, out_channels=128) 49 | self.second_level = ConvBlock(in_channels=256, out_channels=128, maxpool=False) 50 | self.up4 = UP(in_channels=128, out_channels=64) 51 | self.first_level = ConvBlock(in_channels=128, out_channels=64, maxpool=False) 52 | 53 | # output layer 54 | self.output_layer = nn.Conv2d(in_channels=64, out_channels=n_classes, kernel_size=1) 55 | 56 | 57 | 58 | def forward(self, output_features): 59 | [flv1, flv2, flv3, flv4, brdout] = output_features 60 | x = self.up1(brdout) 61 | x = self.cc(flv4, x) 62 | x = self.fourth_level(x) 63 | x = self.up2(x) 64 | x = self.cc(flv3, x) 65 | x = self.third_level(x) 66 | x = self.up3(x) 67 | x = self.cc(flv2, x) 68 | x = self.second_level(x) 69 | x = self.up4(x) 70 | x = self.cc(flv1, x) 71 | x = self.first_level(x) 72 | x = self.output_layer(x) 73 | 74 | return x 75 | 76 | 77 | 78 | 79 | class UNet(nn.Module): 80 | def __init__(self, in_channels = 3, n_classes = 2): 81 | super(UNet, self).__init__() 82 | self.contracting_path = ContractingPath(in_channels=in_channels) 83 | self.expanding_path = ExpandingPath(n_classes=n_classes) 84 | 85 | def forward(self, x): 86 | output_feature = self.contracting_path(x) 87 | output = self.expanding_path(output_feature) 88 | return output 89 | 90 | 91 | 92 | if __name__ == "__main__": 93 | unet = UNet() 94 | summary(unet, (3, 572, 572)) 95 | -------------------------------------------------------------------------------- /Semantic Segmentation/UNet/requirements.txt: -------------------------------------------------------------------------------- 1 | torch=2.3.1+cu118 2 | torchvision=0.18.1+cu118 3 | torchsummary -------------------------------------------------------------------------------- /Semantic Segmentation/UNet/unet_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def crop_tensor(tensor, target_tensor): 6 | target_size = target_tensor.size()[2] # Get height/width of target 7 | tensor_size = tensor.size()[2] # Get height/width of tensor 8 | delta = tensor_size - target_size 9 | delta = delta // 2 10 | return tensor[:, :, delta:tensor_size-delta, delta:tensor_size-delta] 11 | 12 | class ConvBlock(nn.Module): 13 | ''' 14 | In the original implementation of the UNet the Covolutional Block used in the contracting path consists of: 15 | - conv 3x3 16 | - ReLu 17 | - conv 3x3 18 | - ReLu 19 | - maxpool 2x2 20 | ''' 21 | def __init__(self, in_channels, out_channels, maxpool = False): 22 | super(ConvBlock, self).__init__() 23 | 24 | self.maxpool = maxpool 25 | 26 | self.convblock_1 = nn.Sequential( 27 | nn.Conv2d(in_channels, out_channels, kernel_size=3, bias=True), # ** 28 | nn.ReLU(), 29 | ) 30 | 31 | self.convblock_2 = nn.Sequential( 32 | nn.Conv2d(out_channels, out_channels, kernel_size=3, bias=True), # ** 33 | nn.ReLU(), 34 | ) 35 | 36 | self.maxpool2d = nn.MaxPool2d(kernel_size=2, stride=2) 37 | 38 | 39 | 40 | # ** In the original U-Net paper and implementation, the convolutional layers do include bias terms (bias=True). 
41 | # However, it's worth noting that in modern deep learning practice, especially when using batch normalization, 42 | # it's common to set bias=False for the convolution layers that are immediately followed by batch normalization. 43 | # This is because the batch norm layer has its own learnable bias parameter, making the convolution's bias redundant. 44 | 45 | def forward(self, x): 46 | out_conv1 = self.convblock_1(x) 47 | out_conv2 = self.convblock_2(out_conv1) 48 | if self.maxpool: 49 | out = self.maxpool2d(out_conv2) 50 | return out, out_conv2 51 | else: 52 | return out_conv2 53 | 54 | 55 | class UP(nn.Module): 56 | def __init__(self, in_channels, out_channels, scale_factor = 2): 57 | super(UP, self).__init__() 58 | 59 | ''' 60 | The original U-Net paper used transposed convolution, but many modern implementations prefer 61 | the following approach for upsampling with bilinear upsampling because: 62 | 63 | - It helps avoid checkerboard artifacts that can occur with transposed convolutions 64 | - It's often more computationally efficient 65 | - It can lead to smoother outputs 66 | ''' 67 | 68 | self.up = nn.Sequential( 69 | nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=True), 70 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) 71 | ) 72 | 73 | def forward(self, x): 74 | return self.up(x) 75 | 76 | class CropAndConc(nn.Module): 77 | def __init__(self): 78 | super(CropAndConc, self).__init__() 79 | 80 | ''' 81 | Following the original implementation, this submodule perform two operations: 82 | - Crop the feature map from the contracting path 83 | - Concatenate the cropped feature map with the feature from the expanding path 84 | 85 | Modern U-Net implementations typically don't use cropping anymore. 86 | Instead, they use padding in the convolutions to maintain spatial dimensions. 87 | This makes the architecture simpler and ensures that the feature maps from the contracting path match exactly with 88 | the expanding path. 89 | 90 | Benefits of using padding instead of cropping: 91 | - Simpler implementation 92 | - No loss of border information 93 | - Easier to predict output sizes 94 | - Better feature preservation at boundaries 95 | - More stable training in many cases 96 | ''' 97 | pass 98 | 99 | def forward(self, tensor, target_tensor): 100 | cropped_tensor = crop_tensor(tensor, target_tensor) 101 | x = torch.cat([cropped_tensor, target_tensor], 1) 102 | return x 103 | 104 | 105 | --------------------------------------------------------------------------------