├── .gitignore
├── Dockerfile
├── README.md
├── conf
│   ├── .ipynb_checkpoints
│   │   └── global_settings-checkpoint.py
│   ├── __init__.py
│   └── global_settings.py
├── dataset.py
├── logs
│   ├── test_2023-08-01_01-32-35.log
│   ├── test_2023-08-01_01-34-15.log
│   ├── test_2023-08-01_01-56-59.log
│   ├── test_2023-08-01_03-34-38.log
│   └── test_2023-08-01_06-33-18.log
├── lr_finder.py
├── models
│   ├── .ipynb_checkpoints
│   │   └── vgg-checkpoint.py
│   ├── attention.py
│   ├── densenet.py
│   ├── googlenet.py
│   ├── inceptionv3.py
│   ├── inceptionv4.py
│   ├── mobilenet.py
│   ├── mobilenetv2.py
│   ├── nasnet.py
│   ├── preactresnet.py
│   ├── resnet.py
│   ├── resnext.py
│   ├── rir.py
│   ├── senet.py
│   ├── shufflenet.py
│   ├── shufflenetv2.py
│   ├── squeezenet.py
│   ├── stochasticdepth.py
│   ├── vgg.py
│   ├── wideresidual.py
│   └── xception.py
├── requirements.txt
├── test.py
├── train.py
└── utils.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/__pycache__
data
checkpoint
runs
**/*.ipynb_checkpoints/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.9

RUN mkdir /pytorch-cifar100-workspace
COPY . /pytorch-cifar100-workspace/

WORKDIR /pytorch-cifar100-workspace
RUN pip install -r requirements.txt

ENV HOME=/pytorch-cifar100-workspace

CMD [ "python", "train.py", "-net", "vgg19", "-gpu" ]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pytorch-cifar100

practice on cifar100 using pytorch

## Requirements

This is my experiment environment:
- python 3.6
- pytorch 1.6.0+cu101
- tensorboard 2.2.2 (optional)


## Usage

### 1. enter directory
```bash
$ cd pytorch-cifar100
```

### 2. dataset
I will use the cifar100 dataset from torchvision since it's more convenient, but I also
kept the sample code for writing your own dataset module in the dataset folder, as an
example for people who don't know how to write one.

### 3. run tensorboard (optional)
Install tensorboard:
```bash
$ pip install tensorboard
$ mkdir runs
```
Run tensorboard:
```bash
$ tensorboard --logdir='runs' --port=6006 --host='localhost'
```

### 4. train the model
You need to specify the network you want to train using the `-net` argument.

```bash
# use gpu to train vgg16
$ python train.py -net vgg16 -gpu
```

Sometimes you might want to use warmup training by setting `-warm` to 1 or 2, to prevent the
network from diverging during the early training phase.
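For example, to combine GPU training with a two-epoch warmup (any entry from the list below works as the `-net` value):
```bash
$ python train.py -net resnet18 -gpu -warm 2
```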
The supported net args are:
```
squeezenet
mobilenet
mobilenetv2
shufflenet
shufflenetv2
vgg11
vgg13
vgg16
vgg19
densenet121
densenet161
densenet201
googlenet
inceptionv3
inceptionv4
inceptionresnetv2
xception
resnet18
resnet34
resnet50
resnet101
resnet152
preactresnet18
preactresnet34
preactresnet50
preactresnet101
preactresnet152
resnext50
resnext101
resnext152
attention56
attention92
seresnet18
seresnet34
seresnet50
seresnet101
seresnet152
nasnet
wideresnet
stochasticdepth18
stochasticdepth34
stochasticdepth50
stochasticdepth101
```
Normally, the weights file with the best accuracy is written to disk with the name suffix 'best' (in the checkpoint folder by default).


### 5. test the model
Test the model using test.py:
```bash
$ python test.py -net vgg16 -weights path_to_vgg16_weights_file
```
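test.py reports top-1 and top-5 error. For reference, here is a minimal self-contained sketch of how top-k error can be computed in PyTorch (illustrative only, not necessarily the exact code in test.py):
```python
import torch

def topk_error(outputs, labels, k=1):
    """Fraction of samples whose true label is not among the top-k predictions."""
    _, pred = outputs.topk(k, dim=1)           # (N, k) indices of the k largest logits
    correct = pred.eq(labels.view(-1, 1))      # broadcast-compare against the true label
    return 1.0 - correct.any(dim=1).float().mean().item()

# usage on random data: 16 samples, 100 classes
outputs = torch.randn(16, 100)
labels = torch.randint(0, 100, (16,))
print(topk_error(outputs, labels, k=1), topk_error(outputs, labels, k=5))
```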
## Implemented Networks

- vgg [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556v6)
- googlenet [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842v1)
- inceptionv3 [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567v3)
- inceptionv4, inception_resnet_v2 [Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning](https://arxiv.org/abs/1602.07261)
- xception [Xception: Deep Learning with Depthwise Separable Convolutions](https://arxiv.org/abs/1610.02357)
- resnet [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385v1)
- resnext [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431v2)
- resnet in resnet [Resnet in Resnet: Generalizing Residual Architectures](https://arxiv.org/abs/1603.08029v1)
- densenet [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993v5)
- shufflenet [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://arxiv.org/abs/1707.01083v2)
- shufflenetv2 [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164v1)
- mobilenet [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861)
- mobilenetv2 [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)
- residual attention network [Residual Attention Network for Image Classification](https://arxiv.org/abs/1704.06904)
- senet [Squeeze-and-Excitation Networks](https://arxiv.org/abs/1709.01507)
- squeezenet [SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/abs/1602.07360v4)
- nasnet [Learning Transferable Architectures for Scalable Image Recognition](https://arxiv.org/abs/1707.07012v4)
- wide residual network [Wide Residual Networks](https://arxiv.org/abs/1605.07146)
- stochastic depth networks [Deep Networks with Stochastic Depth](https://arxiv.org/abs/1603.09382)

## Training Details
I didn't use any training tricks to improve accuracy; if you want to learn more about training tricks,
please refer to another [repo](https://github.com/weiaicunzai/Bag_of_Tricks_for_Image_Classification_with_Convolutional_Neural_Networks) of mine, which contains
various common training tricks and their pytorch implementations.


I follow the hyperparameter settings in the paper [Improved Regularization of Convolutional Neural Networks with Cutout](https://arxiv.org/abs/1708.04552v2): an initial lr of 0.1 divided by 5 at the 60th, 120th, and 160th epochs, training for 200 epochs with batch size 128, weight decay 5e-4, and Nesterov momentum of 0.9. You could also use the hyperparameters from the papers [Regularizing Neural Networks by Penalizing Confident Output Distributions](https://arxiv.org/abs/1701.06548v1) and [Random Erasing Data Augmentation](https://arxiv.org/abs/1708.04896v2): an initial lr of 0.1 divided by 10 at the 150th and 225th epochs, training for 300 epochs with batch size 128; this is more commonly used. You could decrease the batch size to 64, or whatever suits you, if you don't have enough gpu memory.
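The first schedule maps directly onto `torch.optim`. Here is a minimal illustrative sketch with a hypothetical stand-in model (note that the shipped `conf/global_settings.py` uses `MILESTONES = [1, 50, 100, 150]`, so the repository's actual defaults differ from the paper values quoted above):
```python
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(512, 100)   # hypothetical stand-in for a real network
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                      weight_decay=5e-4, nesterov=True)
# divide the learning rate by 5 (gamma = 0.2) at epochs 60, 120, and 160
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2)

for epoch in range(200):
    # ... run one training epoch here ...
    scheduler.step()
```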
You can choose whether to use TensorBoard to visualize your training procedure.

## Results
Below are the results I got from each model. Since I use the same hyperparameters to train all the networks, some networks might not reach their best result with these hyperparameters; you could try fine-tuning the hyperparameters yourself to get a
better result.

|dataset|network|params|top1 err|top5 err|epoch(lr = 0.1)|epoch(lr = 0.02)|epoch(lr = 0.004)|epoch(lr = 0.0008)|total epoch|
|:-----:|:-----:|:----:|:------:|:------:|:-------------:|:--------------:|:---------------:|:----------------:|:---------:|
|cifar100|mobilenet|3.3M|34.02|10.56|60|60|40|40|200|
|cifar100|mobilenetv2|2.36M|31.92|09.02|60|60|40|40|200|
|cifar100|squeezenet|0.78M|30.59|8.36|60|60|40|40|200|
|cifar100|shufflenet|1.0M|29.94|8.35|60|60|40|40|200|
|cifar100|shufflenetv2|1.3M|30.49|8.49|60|60|40|40|200|
|cifar100|vgg11_bn|28.5M|31.36|11.85|60|60|40|40|200|
|cifar100|vgg13_bn|28.7M|28.00|9.71|60|60|40|40|200|
|cifar100|vgg16_bn|34.0M|27.07|8.84|60|60|40|40|200|
|cifar100|vgg19_bn|39.0M|27.77|8.84|60|60|40|40|200|
|cifar100|resnet18|11.2M|24.39|6.95|60|60|40|40|200|
|cifar100|resnet34|21.3M|23.24|6.63|60|60|40|40|200|
|cifar100|resnet50|23.7M|22.61|6.04|60|60|40|40|200|
|cifar100|resnet101|42.7M|22.22|5.61|60|60|40|40|200|
|cifar100|resnet152|58.3M|22.31|5.81|60|60|40|40|200|
|cifar100|preactresnet18|11.3M|27.08|8.53|60|60|40|40|200|
|cifar100|preactresnet34|21.5M|24.79|7.68|60|60|40|40|200|
|cifar100|preactresnet50|23.9M|25.73|8.15|60|60|40|40|200|
|cifar100|preactresnet101|42.9M|24.84|7.83|60|60|40|40|200|
|cifar100|preactresnet152|58.6M|22.71|6.62|60|60|40|40|200|
|cifar100|resnext50|14.8M|22.23|6.00|60|60|40|40|200|
|cifar100|resnext101|25.3M|22.22|5.99|60|60|40|40|200|
|cifar100|resnext152|33.3M|22.40|5.58|60|60|40|40|200|
|cifar100|attention56|55.7M|33.75|12.90|60|60|40|40|200|
|cifar100|attention92|102.5M|36.52|11.47|60|60|40|40|200|
|cifar100|densenet121|7.0M|22.99|6.45|60|60|40|40|200|
|cifar100|densenet161|26M|21.56|6.04|60|60|40|40|200|
|cifar100|densenet201|18M|21.46|5.9|60|60|40|40|200|
|cifar100|googlenet|6.2M|21.97|5.94|60|60|40|40|200|
|cifar100|inceptionv3|22.3M|22.81|6.39|60|60|40|40|200|
|cifar100|inceptionv4|41.3M|24.14|6.90|60|60|40|40|200|
|cifar100|inceptionresnetv2|65.4M|27.51|9.11|60|60|40|40|200|
|cifar100|xception|21.0M|25.07|7.32|60|60|40|40|200|
|cifar100|seresnet18|11.4M|23.56|6.68|60|60|40|40|200|
|cifar100|seresnet34|21.6M|22.07|6.12|60|60|40|40|200|
|cifar100|seresnet50|26.5M|21.42|5.58|60|60|40|40|200|
|cifar100|seresnet101|47.7M|20.98|5.41|60|60|40|40|200|
|cifar100|seresnet152|66.2M|20.66|5.19|60|60|40|40|200|
|cifar100|nasnet|5.2M|22.71|5.91|60|60|40|40|200|
|cifar100|wideresnet-40-10|55.9M|21.25|5.77|60|60|40|40|200|
|cifar100|stochasticdepth18|11.22M|31.40|8.84|60|60|40|40|200|
|cifar100|stochasticdepth34|21.36M|27.72|7.32|60|60|40|40|200|
|cifar100|stochasticdepth50|23.71M|23.35|5.76|60|60|40|40|200|
|cifar100|stochasticdepth101|42.69M|21.28|5.39|60|60|40|40|200|


--------------------------------------------------------------------------------
/conf/.ipynb_checkpoints/global_settings-checkpoint.py:
--------------------------------------------------------------------------------
""" configurations for this project

author baiyu
"""
import os
from datetime import datetime

#CIFAR100 dataset path (python version)
#CIFAR100_PATH = '/nfs/private/cifar100/cifar-100-python'

#mean and std of the cifar100 dataset
CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)

#CIFAR100_TEST_MEAN = (0.5088964127604166, 0.48739301317401956, 0.44194221124387256)
#CIFAR100_TEST_STD = (0.2682515741720801, 0.2573637364478126, 0.2770957707973042)

#directory to save weights files
CHECKPOINT_PATH = 'checkpoint'

#total training epochs
EPOCH = 200
MILESTONES = [1, 50, 100, 150]

#initial learning rate
#INIT_LR = 0.1

DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time when we run the script
TIME_NOW = datetime.now().strftime(DATE_FORMAT)

#tensorboard log dir
LOG_DIR = 'runs'

#save the weights file every SAVE_EPOCH epochs
SAVE_EPOCH = 10

--------------------------------------------------------------------------------
/conf/__init__.py:
--------------------------------------------------------------------------------
""" dynamically load settings

author baiyu
"""
import conf.global_settings as settings

class Settings:
    def __init__(self, settings):
        #copy every uppercase attribute of the settings module onto this object
        for attr in dir(settings):
            if attr.isupper():
                setattr(self, attr, getattr(settings, attr))

settings = Settings(settings)
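# A hedged usage sketch (hypothetical, not part of the original file): code
# elsewhere in the repository imports the wrapped object defined above, e.g.
#
#   from conf import settings
#   print(settings.EPOCH)                # 200
#   print(settings.MILESTONES)           # [1, 50, 100, 150]
#   print(settings.CIFAR100_TRAIN_MEAN)  # per-channel normalization mean
#
# Copying only the uppercase attributes follows the common convention of
# treating uppercase module-level names as configuration constants.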
--------------------------------------------------------------------------------
/conf/global_settings.py:
--------------------------------------------------------------------------------
""" configurations for this project

author baiyu
"""
import os
from datetime import datetime

#CIFAR100 dataset path (python version)
#CIFAR100_PATH = '/nfs/private/cifar100/cifar-100-python'

#mean and std of the cifar100 dataset
CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)

#CIFAR100_TEST_MEAN = (0.5088964127604166, 0.48739301317401956, 0.44194221124387256)
#CIFAR100_TEST_STD = (0.2682515741720801, 0.2573637364478126, 0.2770957707973042)

#directory to save weights files
CHECKPOINT_PATH = 'checkpoint'

#total training epochs
EPOCH = 200
MILESTONES = [1, 50, 100, 150]

#initial learning rate
#INIT_LR = 0.1

DATE_FORMAT = '%A_%d_%B_%Y_%Hh_%Mm_%Ss'
#time when we run the script
TIME_NOW = datetime.now().strftime(DATE_FORMAT)

#tensorboard log dir
LOG_DIR = 'runs'

#save the weights file every SAVE_EPOCH epochs
SAVE_EPOCH = 10

--------------------------------------------------------------------------------
/dataset.py:
--------------------------------------------------------------------------------
""" train and test dataset

author baiyu
"""
import os
import pickle

import numpy
from torch.utils.data import Dataset

class CIFAR100Train(Dataset):
    """cifar100 train dataset, derived from
    torch.utils.data.Dataset
    """

    def __init__(self, path, transform=None):
        #if a transform is given, we apply it to each image in __getitem__
        with open(os.path.join(path, 'train'), 'rb') as cifar100:
            self.data = pickle.load(cifar100, encoding='bytes')
        self.transform = transform

    def __len__(self):
        return len(self.data['fine_labels'.encode()])

    def __getitem__(self, index):
        label = self.data['fine_labels'.encode()][index]
        #each row stores a 32x32 RGB image as 3072 bytes: 1024 red, 1024 green, 1024 blue
        r = self.data['data'.encode()][index, :1024].reshape(32, 32)
        g = self.data['data'.encode()][index, 1024:2048].reshape(32, 32)
        b = self.data['data'.encode()][index, 2048:].reshape(32, 32)
        image = numpy.dstack((r, g, b))

        if self.transform:
            image = self.transform(image)
        return label, image

class CIFAR100Test(Dataset):
    """cifar100 test dataset, derived from
    torch.utils.data.Dataset
    """

    def __init__(self, path, transform=None):
        with open(os.path.join(path, 'test'), 'rb') as cifar100:
            self.data = pickle.load(cifar100, encoding='bytes')
        self.transform = transform

    def __len__(self):
        return len(self.data['data'.encode()])

    def __getitem__(self, index):
        label = self.data['fine_labels'.encode()][index]
        r = self.data['data'.encode()][index, :1024].reshape(32, 32)
        g = self.data['data'.encode()][index, 1024:2048].reshape(32, 32)
        b = self.data['data'.encode()][index, 2048:].reshape(32, 32)
        image = numpy.dstack((r, g, b))

        if self.transform:
            image = self.transform(image)
        return label, image

--------------------------------------------------------------------------------
/logs/test_2023-08-01_01-32-35.log:
--------------------------------------------------------------------------------
1 | 2023-08-01 01:32:42,502 [INFO] Parsed args: {
2 | "net": "vgg19",
3 | "weights": "checkpoint/vgg19/weights/vgg19-297-best.pth",
4 | "gpu": true,
5 | "b": 16,
6 | "log": "./logs/test_{datetime}.log"
7 | }
8 | 2023-08-01 01:32:43,203 [INFO] VGG(
9 | (features): Sequential(
10 | (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
11 | (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
12 | (2): ReLU(inplace=True)
13 | (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
14 | (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
15 | (5): ReLU(inplace=True)
16 | (6): MaxPool2d(kernel_size=2,
stride=2, padding=0, dilation=1, ceil_mode=False) 17 | (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 18 | (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 19 | (9): ReLU(inplace=True) 20 | (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 21 | (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 22 | (12): ReLU(inplace=True) 23 | (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 24 | (14): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 25 | (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 26 | (16): ReLU(inplace=True) 27 | (17): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 28 | (18): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 29 | (19): ReLU(inplace=True) 30 | (20): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 31 | (21): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 32 | (22): ReLU(inplace=True) 33 | (23): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 34 | (24): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 35 | (25): ReLU(inplace=True) 36 | (26): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 37 | (27): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 38 | (28): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 39 | (29): ReLU(inplace=True) 40 | (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 41 | (31): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 42 | (32): ReLU(inplace=True) 43 | (33): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 44 | (34): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (35): ReLU(inplace=True) 46 | (36): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 47 | (37): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | (38): ReLU(inplace=True) 49 | (39): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 50 | (40): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 51 | (41): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 52 | (42): ReLU(inplace=True) 53 | (43): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 54 | (44): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 55 | (45): ReLU(inplace=True) 56 | (46): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 57 | (47): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 58 | (48): ReLU(inplace=True) 59 | (49): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 60 | (50): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 61 | (51): ReLU(inplace=True) 62 | (52): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 63 | ) 64 | (classifier): Sequential( 65 | (0): Linear(in_features=512, out_features=4096, bias=True) 66 | (1): ReLU(inplace=True) 67 | (2): Dropout(p=0.5, inplace=False) 68 | (3): Linear(in_features=4096, out_features=4096, bias=True) 69 | (4): ReLU(inplace=True) 70 | (5): Dropout(p=0.5, inplace=False) 71 | (6): Linear(in_features=4096, 
out_features=100, bias=True) 72 | ) 73 | ) 74 | 2023-08-01 01:32:43,205 [INFO] 75 | 76 | 2023-08-01 01:32:57,959 [INFO] Average throughput: 1115.7068388150608 77 | 2023-08-01 01:32:57,961 [INFO] Average inference time: 0.014317739610671995 78 | 2023-08-01 01:32:57,961 [INFO] GPU INFO..... 79 | 80 | 2023-08-01 01:32:57,963 [INFO] 81 | |===========================================================================| 82 | | PyTorch CUDA memory summary, device ID 0 | 83 | |---------------------------------------------------------------------------| 84 | | CUDA OOMs: 0 | cudaMalloc retries: 0 | 85 | |===========================================================================| 86 | | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | 87 | |---------------------------------------------------------------------------| 88 | | Allocated memory | 162647 KiB | 309915 KiB | 251994 MiB | 251835 MiB | 89 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 90 | | from small pool | 1367 KiB | 3415 KiB | 13014 MiB | 13013 MiB | 91 | |---------------------------------------------------------------------------| 92 | | Active memory | 162647 KiB | 309915 KiB | 251994 MiB | 251835 MiB | 93 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 94 | | from small pool | 1367 KiB | 3415 KiB | 13014 MiB | 13013 MiB | 95 | |---------------------------------------------------------------------------| 96 | | Requested memory | 162186 KiB | 307333 KiB | 251404 MiB | 251246 MiB | 97 | | from large pool | 160832 KiB | 305024 KiB | 238391 MiB | 238234 MiB | 98 | | from small pool | 1354 KiB | 3402 KiB | 13013 MiB | 13011 MiB | 99 | |---------------------------------------------------------------------------| 100 | | GPU reserved memory | 339968 KiB | 339968 KiB | 339968 KiB | 0 B | 101 | | from large pool | 335872 KiB | 335872 KiB | 335872 KiB | 0 B | 102 | | from small pool | 4096 KiB | 4096 KiB | 4096 KiB | 0 B | 103 | |---------------------------------------------------------------------------| 104 | | Non-releasable memory | 27816 KiB | 55976 KiB | 175476 MiB | 175449 MiB | 105 | | from large pool | 27136 KiB | 53504 KiB | 158574 MiB | 158547 MiB | 106 | | from small pool | 680 KiB | 2600 KiB | 16902 MiB | 16902 MiB | 107 | |---------------------------------------------------------------------------| 108 | | Allocations | 127 | 236 | 45239 | 45112 | 109 | | from large pool | 16 | 30 | 14406 | 14390 | 110 | | from small pool | 111 | 206 | 30833 | 30722 | 111 | |---------------------------------------------------------------------------| 112 | | Active allocs | 127 | 236 | 45239 | 45112 | 113 | | from large pool | 16 | 30 | 14406 | 14390 | 114 | | from small pool | 111 | 206 | 30833 | 30722 | 115 | |---------------------------------------------------------------------------| 116 | | GPU reserved segments | 14 | 14 | 14 | 0 | 117 | | from large pool | 12 | 12 | 12 | 0 | 118 | | from small pool | 2 | 2 | 2 | 0 | 119 | |---------------------------------------------------------------------------| 120 | | Non-releasable allocs | 9 | 11 | 25036 | 25027 | 121 | | from large pool | 5 | 9 | 10022 | 10017 | 122 | | from small pool | 4 | 6 | 15014 | 15010 | 123 | |---------------------------------------------------------------------------| 124 | | Oversize allocations | 0 | 0 | 0 | 0 | 125 | |---------------------------------------------------------------------------| 126 | | Oversize GPU segments | 0 | 0 | 0 | 0 | 127 | 
|===========================================================================| 128 | 129 | 2023-08-01 01:32:57,963 [INFO] 130 | 131 | 2023-08-01 01:32:57,973 [INFO] Top 1 err: 0.2890000343322754 132 | 133 | 2023-08-01 01:32:57,976 [INFO] Top 5 err: 0.12160003185272217 134 | 135 | 2023-08-01 01:32:57,977 [INFO] Parameter numbers: 39327652 136 | -------------------------------------------------------------------------------- /logs/test_2023-08-01_01-34-15.log: -------------------------------------------------------------------------------- 1 | 2023-08-01 01:34:21,603 [INFO] Parsed args: { 2 | "net": "vgg19_lora", 3 | "weights": "checkpoint/vgg19_lora/weights/vgg19_lora-175-best.pth", 4 | "gpu": true, 5 | "b": 16, 6 | "log": "./logs/test_{datetime}.log" 7 | } 8 | 2023-08-01 01:34:21,672 [INFO] VGGLORA( 9 | (features): Sequential( 10 | (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 11 | (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 12 | (2): ReLU(inplace=True) 13 | (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 14 | (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 15 | (5): ReLU(inplace=True) 16 | (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 17 | (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 18 | (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 19 | (9): ReLU(inplace=True) 20 | (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 21 | (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 22 | (12): ReLU(inplace=True) 23 | (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 24 | (14): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 25 | (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 26 | (16): ReLU(inplace=True) 27 | (17): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 28 | (18): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 29 | (19): ReLU(inplace=True) 30 | (20): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 31 | (21): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 32 | (22): ReLU(inplace=True) 33 | (23): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 34 | (24): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 35 | (25): ReLU(inplace=True) 36 | (26): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 37 | (27): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 38 | (28): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 39 | (29): ReLU(inplace=True) 40 | (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 41 | (31): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 42 | (32): ReLU(inplace=True) 43 | (33): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 44 | (34): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (35): ReLU(inplace=True) 46 | (36): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 47 | (37): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | (38): ReLU(inplace=True) 49 | (39): MaxPool2d(kernel_size=2, stride=2, padding=0, 
dilation=1, ceil_mode=False) 50 | (40): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 51 | (41): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 52 | (42): ReLU(inplace=True) 53 | (43): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 54 | (44): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 55 | (45): ReLU(inplace=True) 56 | (46): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 57 | (47): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 58 | (48): ReLU(inplace=True) 59 | (49): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 60 | (50): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 61 | (51): ReLU(inplace=True) 62 | (52): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 63 | ) 64 | (classifier): Sequential( 65 | (0): Linear(in_features=512, out_features=4096, bias=True) 66 | (1): ReLU(inplace=True) 67 | (2): Dropout(p=0.5, inplace=False) 68 | (3): Linear(in_features=4096, out_features=4096, bias=True) 69 | (4): ReLU(inplace=True) 70 | (5): Dropout(p=0.5, inplace=False) 71 | (6): Linear(in_features=4096, out_features=100, bias=True) 72 | ) 73 | ) 74 | 2023-08-01 01:34:21,673 [INFO] 75 | 76 | 2023-08-01 01:34:35,724 [INFO] Average throughput: 1161.935813143864 77 | 2023-08-01 01:34:35,726 [INFO] Average inference time: 0.013748091606521604 78 | 2023-08-01 01:34:35,726 [INFO] GPU INFO..... 79 | 80 | 2023-08-01 01:34:35,728 [INFO] 81 | |===========================================================================| 82 | | PyTorch CUDA memory summary, device ID 0 | 83 | |---------------------------------------------------------------------------| 84 | | CUDA OOMs: 0 | cudaMalloc retries: 0 | 85 | |===========================================================================| 86 | | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | 87 | |---------------------------------------------------------------------------| 88 | | Allocated memory | 164772 KiB | 204964 KiB | 252288 MiB | 252127 MiB | 89 | | from large pool | 161280 KiB | 200448 KiB | 238633 MiB | 238476 MiB | 90 | | from small pool | 3492 KiB | 5540 KiB | 13654 MiB | 13651 MiB | 91 | |---------------------------------------------------------------------------| 92 | | Active memory | 164772 KiB | 204964 KiB | 252288 MiB | 252127 MiB | 93 | | from large pool | 161280 KiB | 200448 KiB | 238633 MiB | 238476 MiB | 94 | | from small pool | 3492 KiB | 5540 KiB | 13654 MiB | 13651 MiB | 95 | |---------------------------------------------------------------------------| 96 | | Requested memory | 164310 KiB | 204502 KiB | 251895 MiB | 251734 MiB | 97 | | from large pool | 160832 KiB | 200000 KiB | 238243 MiB | 238085 MiB | 98 | | from small pool | 3478 KiB | 5526 KiB | 13652 MiB | 13648 MiB | 99 | |---------------------------------------------------------------------------| 100 | | GPU reserved memory | 235520 KiB | 235520 KiB | 235520 KiB | 0 B | 101 | | from large pool | 229376 KiB | 229376 KiB | 229376 KiB | 0 B | 102 | | from small pool | 6144 KiB | 6144 KiB | 6144 KiB | 0 B | 103 | |---------------------------------------------------------------------------| 104 | | Non-releasable memory | 27740 KiB | 54876 KiB | 122702 MiB | 122675 MiB | 105 | | from large pool | 27136 KiB | 54272 KiB | 103578 MiB | 103552 MiB | 106 | | from small pool | 604 KiB | 2533 KiB | 19124 MiB | 19123 MiB | 107 | 
|---------------------------------------------------------------------------| 108 | | Allocations | 133 | 138 | 50758 | 50625 | 109 | | from large pool | 16 | 19 | 14391 | 14375 | 110 | | from small pool | 117 | 122 | 36367 | 36250 | 111 | |---------------------------------------------------------------------------| 112 | | Active allocs | 133 | 138 | 50758 | 50625 | 113 | | from large pool | 16 | 19 | 14391 | 14375 | 114 | | from small pool | 117 | 122 | 36367 | 36250 | 115 | |---------------------------------------------------------------------------| 116 | | GPU reserved segments | 11 | 11 | 11 | 0 | 117 | | from large pool | 8 | 8 | 8 | 0 | 118 | | from small pool | 3 | 3 | 3 | 0 | 119 | |---------------------------------------------------------------------------| 120 | | Non-releasable allocs | 9 | 13 | 34094 | 34085 | 121 | | from large pool | 5 | 6 | 10634 | 10629 | 122 | | from small pool | 4 | 7 | 23460 | 23456 | 123 | |---------------------------------------------------------------------------| 124 | | Oversize allocations | 0 | 0 | 0 | 0 | 125 | |---------------------------------------------------------------------------| 126 | | Oversize GPU segments | 0 | 0 | 0 | 0 | 127 | |===========================================================================| 128 | 129 | 2023-08-01 01:34:35,728 [INFO] 130 | 131 | 2023-08-01 01:34:35,730 [INFO] Top 1 err: 0.9900000095367432 132 | 133 | 2023-08-01 01:34:35,732 [INFO] Top 5 err: 0.949999988079071 134 | 135 | 2023-08-01 01:34:35,733 [INFO] Parameter numbers: 39871524 136 | -------------------------------------------------------------------------------- /logs/test_2023-08-01_01-56-59.log: -------------------------------------------------------------------------------- 1 | 2023-08-01 01:57:06,452 [INFO] Parsed args: { 2 | "net": "vgg19_qlora", 3 | "weights": "checkpoint/vgg19_qlora/weights/vgg19_qlora-72-best.pth", 4 | "gpu": true, 5 | "b": 16, 6 | "log": "./logs/test_{datetime}.log" 7 | } 8 | 2023-08-01 01:57:06,672 [INFO] VGGQLORA( 9 | (features): Sequential( 10 | (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 11 | (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 12 | (2): ReLU(inplace=True) 13 | (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 14 | (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 15 | (5): ReLU(inplace=True) 16 | (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 17 | (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 18 | (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 19 | (9): ReLU(inplace=True) 20 | (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 21 | (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 22 | (12): ReLU(inplace=True) 23 | (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 24 | (14): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 25 | (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 26 | (16): ReLU(inplace=True) 27 | (17): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 28 | (18): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 29 | (19): ReLU(inplace=True) 30 | (20): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 31 | (21): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, 
track_running_stats=True) 32 | (22): ReLU(inplace=True) 33 | (23): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 34 | (24): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 35 | (25): ReLU(inplace=True) 36 | (26): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 37 | (27): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 38 | (28): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 39 | (29): ReLU(inplace=True) 40 | (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 41 | (31): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 42 | (32): ReLU(inplace=True) 43 | (33): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 44 | (34): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (35): ReLU(inplace=True) 46 | (36): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 47 | (37): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | (38): ReLU(inplace=True) 49 | (39): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 50 | (40): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 51 | (41): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 52 | (42): ReLU(inplace=True) 53 | (43): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 54 | (44): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 55 | (45): ReLU(inplace=True) 56 | (46): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 57 | (47): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 58 | (48): ReLU(inplace=True) 59 | (49): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 60 | (50): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 61 | (51): ReLU(inplace=True) 62 | (52): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 63 | ) 64 | (quant): QuantStub() 65 | (classifier): Sequential( 66 | (0): Linear(in_features=512, out_features=4096, bias=True) 67 | (1): ReLU(inplace=True) 68 | (2): Dropout(p=0.5, inplace=False) 69 | (3): Linear(in_features=4096, out_features=4096, bias=True) 70 | (4): ReLU(inplace=True) 71 | (5): Dropout(p=0.5, inplace=False) 72 | (6): Linear(in_features=4096, out_features=100, bias=True) 73 | ) 74 | (dequant): DeQuantStub() 75 | ) 76 | 2023-08-01 01:57:06,674 [INFO] 77 | 78 | 2023-08-01 01:57:18,402 [INFO] Average throughput: 1458.1225934412148 79 | 2023-08-01 01:57:18,404 [INFO] Average inference time: 0.010955457429885863 80 | 2023-08-01 01:57:18,404 [INFO] GPU INFO..... 
81 | 82 | 2023-08-01 01:57:18,408 [INFO] 83 | |===========================================================================| 84 | | PyTorch CUDA memory summary, device ID 0 | 85 | |---------------------------------------------------------------------------| 86 | | CUDA OOMs: 0 | cudaMalloc retries: 0 | 87 | |===========================================================================| 88 | | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | 89 | |---------------------------------------------------------------------------| 90 | | Allocated memory | 164772 KiB | 314164 KiB | 252635 MiB | 252474 MiB | 91 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 92 | | from small pool | 3492 KiB | 6580 KiB | 13655 MiB | 13652 MiB | 93 | |---------------------------------------------------------------------------| 94 | | Active memory | 164772 KiB | 314164 KiB | 252635 MiB | 252474 MiB | 95 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 96 | | from small pool | 3492 KiB | 6580 KiB | 13655 MiB | 13652 MiB | 97 | |---------------------------------------------------------------------------| 98 | | Requested memory | 164310 KiB | 311582 KiB | 252045 MiB | 251884 MiB | 99 | | from large pool | 160832 KiB | 305024 KiB | 238391 MiB | 238234 MiB | 100 | | from small pool | 3478 KiB | 6558 KiB | 13653 MiB | 13650 MiB | 101 | |---------------------------------------------------------------------------| 102 | | GPU reserved memory | 344064 KiB | 344064 KiB | 344064 KiB | 0 B | 103 | | from large pool | 335872 KiB | 335872 KiB | 335872 KiB | 0 B | 104 | | from small pool | 8192 KiB | 8192 KiB | 8192 KiB | 0 B | 105 | |---------------------------------------------------------------------------| 106 | | Non-releasable memory | 27740 KiB | 55900 KiB | 177700 MiB | 177673 MiB | 107 | | from large pool | 27136 KiB | 53504 KiB | 158574 MiB | 158547 MiB | 108 | | from small pool | 604 KiB | 3353 KiB | 19126 MiB | 19125 MiB | 109 | |---------------------------------------------------------------------------| 110 | | Allocations | 133 | 248 | 50876 | 50743 | 111 | | from large pool | 16 | 30 | 14406 | 14390 | 112 | | from small pool | 117 | 218 | 36470 | 36353 | 113 | |---------------------------------------------------------------------------| 114 | | Active allocs | 133 | 248 | 50876 | 50743 | 115 | | from large pool | 16 | 30 | 14406 | 14390 | 116 | | from small pool | 117 | 218 | 36470 | 36353 | 117 | |---------------------------------------------------------------------------| 118 | | GPU reserved segments | 16 | 16 | 16 | 0 | 119 | | from large pool | 12 | 12 | 12 | 0 | 120 | | from small pool | 4 | 4 | 4 | 0 | 121 | |---------------------------------------------------------------------------| 122 | | Non-releasable allocs | 9 | 16 | 33484 | 33475 | 123 | | from large pool | 5 | 9 | 10022 | 10017 | 124 | | from small pool | 4 | 7 | 23462 | 23458 | 125 | |---------------------------------------------------------------------------| 126 | | Oversize allocations | 0 | 0 | 0 | 0 | 127 | |---------------------------------------------------------------------------| 128 | | Oversize GPU segments | 0 | 0 | 0 | 0 | 129 | |===========================================================================| 130 | 131 | 2023-08-01 01:57:18,408 [INFO] 132 | 133 | 2023-08-01 01:57:18,412 [INFO] Top 1 err: 0.2882000207901001 134 | 135 | 2023-08-01 01:57:18,416 [INFO] Top 5 err: 0.12200003862380981 136 | 137 | 2023-08-01 01:57:18,417 [INFO] Parameter numbers: 39871524 138 | 
-------------------------------------------------------------------------------- /logs/test_2023-08-01_03-34-38.log: -------------------------------------------------------------------------------- 1 | 2023-08-01 03:34:44,424 [INFO] Parsed args: { 2 | "net": "vgg19_qlora", 3 | "weights": "checkpoint/vgg19_qlora/weights_r_8/vgg19_qlora-31-best.pth", 4 | "gpu": true, 5 | "b": 16, 6 | "log": "./logs/test_{datetime}.log" 7 | } 8 | 2023-08-01 03:34:44,670 [INFO] VGGQLORA( 9 | (features): Sequential( 10 | (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 11 | (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 12 | (2): ReLU(inplace=True) 13 | (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 14 | (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 15 | (5): ReLU(inplace=True) 16 | (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 17 | (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 18 | (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 19 | (9): ReLU(inplace=True) 20 | (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 21 | (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 22 | (12): ReLU(inplace=True) 23 | (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 24 | (14): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 25 | (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 26 | (16): ReLU(inplace=True) 27 | (17): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 28 | (18): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 29 | (19): ReLU(inplace=True) 30 | (20): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 31 | (21): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 32 | (22): ReLU(inplace=True) 33 | (23): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 34 | (24): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 35 | (25): ReLU(inplace=True) 36 | (26): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 37 | (27): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 38 | (28): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 39 | (29): ReLU(inplace=True) 40 | (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 41 | (31): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 42 | (32): ReLU(inplace=True) 43 | (33): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 44 | (34): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 45 | (35): ReLU(inplace=True) 46 | (36): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 47 | (37): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 48 | (38): ReLU(inplace=True) 49 | (39): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 50 | (40): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 51 | (41): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 52 | (42): ReLU(inplace=True) 53 | (43): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 54 | (44): BatchNorm2d(512, 
eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 55 | (45): ReLU(inplace=True) 56 | (46): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 57 | (47): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 58 | (48): ReLU(inplace=True) 59 | (49): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 60 | (50): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 61 | (51): ReLU(inplace=True) 62 | (52): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 63 | ) 64 | (quant): QuantStub() 65 | (classifier): Sequential( 66 | (0): Linear(in_features=512, out_features=4096, bias=True) 67 | (1): ReLU(inplace=True) 68 | (2): Dropout(p=0.5, inplace=False) 69 | (3): Linear(in_features=4096, out_features=4096, bias=True) 70 | (4): ReLU(inplace=True) 71 | (5): Dropout(p=0.5, inplace=False) 72 | (6): Linear(in_features=4096, out_features=100, bias=True) 73 | ) 74 | (dequant): DeQuantStub() 75 | ) 76 | 2023-08-01 03:34:44,671 [INFO] 77 | 78 | 2023-08-01 03:34:56,223 [INFO] Average throughput: 1329.7352393108458 79 | 2023-08-01 03:34:56,225 [INFO] Average inference time: 0.012013218517303468 80 | 2023-08-01 03:34:56,225 [INFO] GPU INFO..... 81 | 82 | 2023-08-01 03:34:56,226 [INFO] 83 | |===========================================================================| 84 | | PyTorch CUDA memory summary, device ID 0 | 85 | |---------------------------------------------------------------------------| 86 | | CUDA OOMs: 0 | cudaMalloc retries: 0 | 87 | |===========================================================================| 88 | | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | 89 | |---------------------------------------------------------------------------| 90 | | Allocated memory | 163179 KiB | 310978 KiB | 252629 MiB | 252470 MiB | 91 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 92 | | from small pool | 1899 KiB | 3947 KiB | 13649 MiB | 13648 MiB | 93 | |---------------------------------------------------------------------------| 94 | | Active memory | 163179 KiB | 310978 KiB | 252629 MiB | 252470 MiB | 95 | | from large pool | 161280 KiB | 307584 KiB | 238979 MiB | 238822 MiB | 96 | | from small pool | 1899 KiB | 3947 KiB | 13649 MiB | 13648 MiB | 97 | |---------------------------------------------------------------------------| 98 | | Requested memory | 162717 KiB | 308395 KiB | 252039 MiB | 251880 MiB | 99 | | from large pool | 160832 KiB | 305024 KiB | 238391 MiB | 238234 MiB | 100 | | from small pool | 1885 KiB | 3933 KiB | 13647 MiB | 13645 MiB | 101 | |---------------------------------------------------------------------------| 102 | | GPU reserved memory | 342016 KiB | 342016 KiB | 342016 KiB | 0 B | 103 | | from large pool | 335872 KiB | 335872 KiB | 335872 KiB | 0 B | 104 | | from small pool | 6144 KiB | 6144 KiB | 6144 KiB | 0 B | 105 | |---------------------------------------------------------------------------| 106 | | Non-releasable memory | 27285 KiB | 55445 KiB | 179197 MiB | 179170 MiB | 107 | | from large pool | 27136 KiB | 53504 KiB | 158574 MiB | 158547 MiB | 108 | | from small pool | 149 KiB | 2267 KiB | 20623 MiB | 20623 MiB | 109 | |---------------------------------------------------------------------------| 110 | | Allocations | 133 | 248 | 50876 | 50743 | 111 | | from large pool | 16 | 30 | 14406 | 14390 | 112 | | from small pool | 117 | 218 | 36470 | 36353 | 113 | 
|---------------------------------------------------------------------------| 114 | | Active allocs | 133 | 248 | 50876 | 50743 | 115 | | from large pool | 16 | 30 | 14406 | 14390 | 116 | | from small pool | 117 | 218 | 36470 | 36353 | 117 | |---------------------------------------------------------------------------| 118 | | GPU reserved segments | 15 | 15 | 15 | 0 | 119 | | from large pool | 12 | 12 | 12 | 0 | 120 | | from small pool | 3 | 3 | 3 | 0 | 121 | |---------------------------------------------------------------------------| 122 | | Non-releasable allocs | 10 | 13 | 28936 | 28926 | 123 | | from large pool | 5 | 9 | 10022 | 10017 | 124 | | from small pool | 5 | 7 | 18914 | 18909 | 125 | |---------------------------------------------------------------------------| 126 | | Oversize allocations | 0 | 0 | 0 | 0 | 127 | |---------------------------------------------------------------------------| 128 | | Oversize GPU segments | 0 | 0 | 0 | 0 | 129 | |===========================================================================| 130 | 131 | 2023-08-01 03:34:56,226 [INFO] 132 | 133 | 2023-08-01 03:34:56,227 [INFO] Top 1 err: 0.2883000373840332 134 | 135 | 2023-08-01 03:34:56,227 [INFO] Top 5 err: 0.12240004539489746 136 | 137 | 2023-08-01 03:34:56,228 [INFO] Parameter numbers: 39463620 138 | -------------------------------------------------------------------------------- /logs/test_2023-08-01_06-33-18.log: -------------------------------------------------------------------------------- 1 | 2023-08-01 06:33:24,563 [INFO] Parsed args: { 2 | "net": "vgg19_qlora", 3 | "weights": "checkpoint/vgg19_qlora/weights_r_32/vgg19_qlora-3-best.pth", 4 | "gpu": true, 5 | "b": 16, 6 | "log": "./logs/test_{datetime}.log" 7 | } 8 | 2023-08-01 06:33:24,732 [INFO] Sequential( 9 | (0): tofp16() 10 | (1): VGGQLORA( 11 | (features): Sequential( 12 | (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 13 | (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 14 | (2): ReLU(inplace=True) 15 | (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 16 | (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 17 | (5): ReLU(inplace=True) 18 | (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 19 | (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 20 | (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 21 | (9): ReLU(inplace=True) 22 | (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 23 | (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | (12): ReLU(inplace=True) 25 | (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 26 | (14): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 27 | (15): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 28 | (16): ReLU(inplace=True) 29 | (17): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 30 | (18): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 31 | (19): ReLU(inplace=True) 32 | (20): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 33 | (21): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | (22): ReLU(inplace=True) 35 | (23): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 36 | (24): BatchNorm2d(256, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True) 37 | (25): ReLU(inplace=True) 38 | (26): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 39 | (27): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 40 | (28): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 41 | (29): ReLU(inplace=True) 42 | (30): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 43 | (31): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 44 | (32): ReLU(inplace=True) 45 | (33): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 46 | (34): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 47 | (35): ReLU(inplace=True) 48 | (36): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 49 | (37): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 50 | (38): ReLU(inplace=True) 51 | (39): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 52 | (40): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 53 | (41): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 54 | (42): ReLU(inplace=True) 55 | (43): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 56 | (44): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 57 | (45): ReLU(inplace=True) 58 | (46): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 59 | (47): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 60 | (48): ReLU(inplace=True) 61 | (49): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) 62 | (50): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 63 | (51): ReLU(inplace=True) 64 | (52): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) 65 | ) 66 | (quant): QuantStub() 67 | (classifier): Sequential( 68 | (0): Linear(in_features=512, out_features=4096, bias=True) 69 | (1): ReLU(inplace=True) 70 | (2): Dropout(p=0.5, inplace=False) 71 | (3): Linear(in_features=4096, out_features=4096, bias=True) 72 | (4): ReLU(inplace=True) 73 | (5): Dropout(p=0.5, inplace=False) 74 | (6): Linear(in_features=4096, out_features=100, bias=True) 75 | ) 76 | (dequant): DeQuantStub() 77 | ) 78 | ) 79 | 2023-08-01 06:33:24,735 [INFO] 80 | 81 | 2023-08-01 06:33:34,561 [INFO] Average throughput: 1408.2774456313875 82 | 2023-08-01 06:33:34,563 [INFO] Average inference time: 0.011343219370269775 83 | 2023-08-01 06:33:34,563 [INFO] GPU INFO..... 
84 | 85 | 2023-08-01 06:33:34,564 [INFO] 86 | |===========================================================================| 87 | | PyTorch CUDA memory summary, device ID 0 | 88 | |---------------------------------------------------------------------------| 89 | | CUDA OOMs: 0 | cudaMalloc retries: 0 | 90 | |===========================================================================| 91 | | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | 92 | |---------------------------------------------------------------------------| 93 | | Allocated memory | 89925 KiB | 236888 KiB | 76030 MiB | 75942 MiB | 94 | | from large pool | 86656 KiB | 230528 KiB | 64710 MiB | 64625 MiB | 95 | | from small pool | 3269 KiB | 6360 KiB | 11319 MiB | 11316 MiB | 96 | |---------------------------------------------------------------------------| 97 | | Active memory | 89925 KiB | 236888 KiB | 76030 MiB | 75942 MiB | 98 | | from large pool | 86656 KiB | 230528 KiB | 64710 MiB | 64625 MiB | 99 | | from small pool | 3269 KiB | 6360 KiB | 11319 MiB | 11316 MiB | 100 | |---------------------------------------------------------------------------| 101 | | Requested memory | 86455 KiB | 233729 KiB | 76018 MiB | 75934 MiB | 102 | | from large pool | 83200 KiB | 227392 KiB | 64701 MiB | 64620 MiB | 103 | | from small pool | 3255 KiB | 6337 KiB | 11317 MiB | 11314 MiB | 104 | |---------------------------------------------------------------------------| 105 | | GPU reserved memory | 249856 KiB | 249856 KiB | 249856 KiB | 0 B | 106 | | from large pool | 241664 KiB | 241664 KiB | 241664 KiB | 0 B | 107 | | from small pool | 8192 KiB | 8192 KiB | 8192 KiB | 0 B | 108 | |---------------------------------------------------------------------------| 109 | | Non-releasable memory | 30906 KiB | 58531 KiB | 53223 MiB | 53193 MiB | 110 | | from large pool | 28032 KiB | 55680 KiB | 41629 MiB | 41602 MiB | 111 | | from small pool | 2874 KiB | 3839 KiB | 11594 MiB | 11591 MiB | 112 | |---------------------------------------------------------------------------| 113 | | Allocations | 133 | 248 | 51048 | 50915 | 114 | | from large pool | 14 | 28 | 11917 | 11903 | 115 | | from small pool | 119 | 220 | 39131 | 39012 | 116 | |---------------------------------------------------------------------------| 117 | | Active allocs | 133 | 248 | 51048 | 50915 | 118 | | from large pool | 14 | 28 | 11917 | 11903 | 119 | | from small pool | 119 | 220 | 39131 | 39012 | 120 | |---------------------------------------------------------------------------| 121 | | GPU reserved segments | 13 | 13 | 13 | 0 | 122 | | from large pool | 9 | 9 | 9 | 0 | 123 | | from small pool | 4 | 4 | 4 | 0 | 124 | |---------------------------------------------------------------------------| 125 | | Non-releasable allocs | 20 | 26 | 21024 | 21004 | 126 | | from large pool | 2 | 7 | 3768 | 3766 | 127 | | from small pool | 18 | 23 | 17256 | 17238 | 128 | |---------------------------------------------------------------------------| 129 | | Oversize allocations | 0 | 0 | 0 | 0 | 130 | |---------------------------------------------------------------------------| 131 | | Oversize GPU segments | 0 | 0 | 0 | 0 | 132 | |===========================================================================| 133 | 134 | 2023-08-01 06:33:34,565 [INFO] 135 | 136 | 2023-08-01 06:33:34,565 [INFO] Top 1 err: 0.9900000095367432 137 | 138 | 2023-08-01 06:33:34,566 [INFO] Top 5 err: 0.949999988079071 139 | 140 | 2023-08-01 06:33:34,566 [INFO] Parameter numbers: 39871524 141 | 
--------------------------------------------------------------------------------
/lr_finder.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import glob
4 | import os
5 | 
6 | import cv2
7 | import torch
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | from torch.utils.data import DataLoader
11 | import numpy as np
12 | 
13 | from torchvision import transforms
14 | from conf import settings
15 | from utils import *
16 | 
17 | import matplotlib
18 | matplotlib.use('Agg')
19 | import matplotlib.pyplot as plt
20 | 
21 | 
22 | from torch.optim.lr_scheduler import _LRScheduler
23 | 
24 | 
25 | class FindLR(_LRScheduler):
26 |     """exponentially increasing learning rate
27 | 
28 |     Args:
29 |         optimizer: optimizer (e.g. SGD)
30 |         num_iter: total number of iterations of the range test
31 |         max_lr: maximum learning rate
32 |     """
33 |     def __init__(self, optimizer, max_lr=10, num_iter=100, last_epoch=-1):
34 | 
35 |         self.total_iters = num_iter
36 |         self.max_lr = max_lr
37 |         super().__init__(optimizer, last_epoch)
38 | 
39 |     def get_lr(self):
40 | 
41 |         return [base_lr * (self.max_lr / base_lr) ** (self.last_epoch / (self.total_iters + 1e-32)) for base_lr in self.base_lrs]
42 | 
43 | if __name__ == '__main__':
44 |     parser = argparse.ArgumentParser()
45 |     parser.add_argument('-net', type=str, required=True, help='net type')
46 |     parser.add_argument('-b', type=int, default=64, help='batch size for dataloader')
47 |     parser.add_argument('-base_lr', type=float, default=1e-7, help='min learning rate')
48 |     parser.add_argument('-max_lr', type=float, default=10, help='max learning rate')
49 |     parser.add_argument('-num_iter', type=int, default=100, help='num of iterations')
50 |     parser.add_argument('-gpu', type=bool, default=True, help='use gpu or not')
51 |     parser.add_argument('-gpus', nargs='+', type=int, default=0, help='gpu device')
52 |     args = parser.parse_args()
53 | 
54 |     cifar100_training_loader = get_training_dataloader(
55 |         settings.CIFAR100_TRAIN_MEAN,
56 |         settings.CIFAR100_TRAIN_STD,
57 |         num_workers=4,
58 |         batch_size=args.b,
59 |     )
60 | 
61 |     net = get_network(args)
62 | 
63 |     loss_function = nn.CrossEntropyLoss()
64 |     optimizer = optim.SGD(net.parameters(), lr=args.base_lr, momentum=0.9, weight_decay=1e-4, nesterov=True)
65 | 
66 |     #set up the exponentially increasing learning rate schedule for the range test
67 |     lr_scheduler = FindLR(optimizer, max_lr=args.max_lr, num_iter=args.num_iter)
68 |     epochs = int(args.num_iter / len(cifar100_training_loader)) + 1
69 | 
70 |     n = 0
71 | 
72 |     learning_rate = []
73 |     losses = []
74 |     for epoch in range(epochs):
75 | 
76 |         #training procedure
77 |         net.train()
78 | 
79 |         for batch_index, (images, labels) in enumerate(cifar100_training_loader):
80 |             if n > args.num_iter:
81 |                 break
82 | 
83 |             lr_scheduler.step()
84 | 
85 |             images = images.cuda()
86 |             labels = labels.cuda()
87 | 
88 |             optimizer.zero_grad()
89 |             predicts = net(images)
90 |             loss = loss_function(predicts, labels)
91 |             if torch.isnan(loss).any():
92 |                 n += 1e8  # loss diverged to NaN: push n past num_iter so both loops stop
93 |                 break
94 |             loss.backward()
95 |             optimizer.step()
96 | 
97 |             print('Iterations: {iter_num} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.8f}'.format(
98 |                 loss.item(),
99 |                 optimizer.param_groups[0]['lr'],
100 |                 iter_num=n,
101 |                 trained_samples=batch_index * args.b + len(images),
102 |                 total_samples=len(cifar100_training_loader.dataset),
103 |             ))
104 | 
105 |             learning_rate.append(optimizer.param_groups[0]['lr'])
106 |             losses.append(loss.item())
107 |             n += 1
108 | 
109 |     learning_rate = learning_rate[10:-5]
110 |     losses = losses[10:-5]
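    # the slices above drop the first 10 warm-up iterations and the last 5 (near-divergence) points so the loss curve plots cleanly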
111 | 112 | fig, ax = plt.subplots(1,1) 113 | ax.plot(learning_rate, losses) 114 | ax.set_xlabel('learning rate') 115 | ax.set_ylabel('losses') 116 | ax.set_xscale('log') 117 | ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e')) 118 | 119 | fig.savefig('result.jpg') 120 | -------------------------------------------------------------------------------- /models/.ipynb_checkpoints/vgg-checkpoint.py: -------------------------------------------------------------------------------- 1 | """vgg in pytorch 2 | 3 | 4 | [1] Karen Simonyan, Andrew Zisserman 5 | 6 | Very Deep Convolutional Networks for Large-Scale Image Recognition. 7 | https://arxiv.org/abs/1409.1556v6 8 | """ 9 | '''VGG11/13/16/19 in Pytorch.''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | import loralib as lora 14 | 15 | cfg = { 16 | 'A' : [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 17 | 'B' : [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 18 | 'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 19 | 'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] 20 | } 21 | 22 | class VGG(nn.Module): 23 | 24 | def __init__(self, features, num_class=100): 25 | super().__init__() 26 | self.features = features 27 | 28 | self.classifier = nn.Sequential( 29 | nn.Linear(512, 4096), 30 | nn.ReLU(inplace=True), 31 | nn.Dropout(), 32 | nn.Linear(4096, 4096), 33 | nn.ReLU(inplace=True), 34 | nn.Dropout(), 35 | nn.Linear(4096, num_class) 36 | ) 37 | 38 | def forward(self, x): 39 | output = self.features(x) 40 | output = output.view(output.size()[0], -1) 41 | output = self.classifier(output) 42 | 43 | return output 44 | 45 | class VGGLORA(nn.Module): 46 | 47 | def __init__(self, features, num_class=100): 48 | super().__init__() 49 | self.features = features 50 | # self.quant = torch.ao.quantization.QuantStub() 51 | self.classifier = nn.Sequential( 52 | lora.Linear(512, 4096, r=32), 53 | nn.ReLU(inplace=True), 54 | nn.Dropout(), 55 | lora.Linear(4096, 4096, r=32), 56 | nn.ReLU(inplace=True), 57 | nn.Dropout(), 58 | lora.Linear(4096, num_class, r=32), 59 | ) 60 | # self.dequant = torch.ao.quantization.DeQuantStub() 61 | 62 | def forward(self, x): 63 | # x = self.quant(x) 64 | output = self.features(x) 65 | # output = self.dequant(output) 66 | output = output.view(output.size()[0], -1) 67 | output = self.classifier(output) 68 | 69 | return output 70 | 71 | class VGGQLORA(nn.Module): 72 | """Quantize stub module, before calibration. 
73 | 
74 |     VGG variant whose features run between a QuantStub and a DeQuantStub,
75 |     so the convolutional stack can be calibrated and converted for quantized
76 |     inference while the LoRA classifier stays in floating point.
77 |     """
78 |     def __init__(self, features, num_class=100):
79 |         super().__init__()
80 |         self.features = features
81 |         self.quant = torch.ao.quantization.QuantStub()
82 |         self.classifier = nn.Sequential(
83 |             lora.Linear(512, 4096, r=32),
84 |             nn.ReLU(inplace=True),
85 |             nn.Dropout(),
86 |             lora.Linear(4096, 4096, r=32),
87 |             nn.ReLU(inplace=True),
88 |             nn.Dropout(),
89 |             lora.Linear(4096, num_class, r=32),
90 |         )
91 |         self.dequant = torch.ao.quantization.DeQuantStub()
92 | 
93 |     def forward(self, x):
94 |         x = self.quant(x)
95 |         output = self.features(x)
96 |         output = self.dequant(output)
97 |         output = output.view(output.size()[0], -1)
98 |         output = self.classifier(output)
99 | 
100 |         return output
101 | 
102 | def make_layers(cfg, batch_norm=False):
103 |     layers = []
104 | 
105 |     input_channel = 3
106 |     for l in cfg:
107 |         if l == 'M':
108 |             layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
109 |             continue
110 | 
111 |         layers += [nn.Conv2d(input_channel, l, kernel_size=3, padding=1)]
112 | 
113 |         if batch_norm:
114 |             layers += [nn.BatchNorm2d(l)]
115 | 
116 |         layers += [nn.ReLU(inplace=True)]
117 |         input_channel = l
118 | 
119 |     return nn.Sequential(*layers)
120 | 
121 | def vgg11_bn():
122 |     return VGG(make_layers(cfg['A'], batch_norm=True))
123 | 
124 | def vgg13_bn():
125 |     return VGG(make_layers(cfg['B'], batch_norm=True))
126 | 
127 | def vgg16_bn():
128 |     return VGG(make_layers(cfg['D'], batch_norm=True))
129 | 
130 | def vgg19_bn():
131 |     return VGG(make_layers(cfg['E'], batch_norm=True))
132 | 
133 | def vgg19_bn_lora():
134 |     return VGGLORA(make_layers(cfg['E'], batch_norm=True))
135 | 
136 | def vgg19_bn_qlora():
137 |     return VGGQLORA(make_layers(cfg['E'], batch_norm=True))
138 | 
139 | 
--------------------------------------------------------------------------------
/models/attention.py:
--------------------------------------------------------------------------------
1 | """residual attention network in pytorch
2 | 
3 | 
4 | 
5 | [1] Fei Wang, Mengqing Jiang, Chen Qian, Shuo Yang, Cheng Li, Honggang Zhang, Xiaogang Wang, Xiaoou Tang
6 | 
7 |     Residual Attention Network for Image Classification
8 |     https://arxiv.org/abs/1704.06904
9 | """
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | 
15 | #"""The Attention Module is built by pre-activation Residual Unit [11] with the
16 | #number of channels in each stage is the same as ResNet [10]."""
17 | 
18 | class PreActResidualUnit(nn.Module):
19 |     """PreAct Residual Unit
20 |     Args:
21 |         in_channels: residual unit input channel number
22 |         out_channels: residual unit output channel number
23 |         stride: stride of the residual unit; when stride = 2, the feature map is downsampled
24 |     """
25 | 
26 |     def __init__(self, in_channels, out_channels, stride):
27 |         super().__init__()
28 | 
29 |         bottleneck_channels = int(out_channels / 4)
30 |         self.residual_function = nn.Sequential(
31 |             #1x1 conv
32 |             nn.BatchNorm2d(in_channels),
33 |             nn.ReLU(inplace=True),
34 |             nn.Conv2d(in_channels, bottleneck_channels, 1, stride),
35 | 
36 |             #3x3 conv
37 |             nn.BatchNorm2d(bottleneck_channels),
38 |             nn.ReLU(inplace=True),
39 |             nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1),
40 | 
41 |             #1x1 conv
42 |             nn.BatchNorm2d(bottleneck_channels),
43 |             nn.ReLU(inplace=True),
44 |             nn.Conv2d(bottleneck_channels, out_channels, 1)
45 |         )
46 | 
47 |         self.shortcut = nn.Sequential()
48 |         if stride != 1 or 
(in_channels != out_channels): 49 | self.shortcut = nn.Conv2d(in_channels, out_channels, 1, stride=stride) 50 | 51 | def forward(self, x): 52 | 53 | res = self.residual_function(x) 54 | shortcut = self.shortcut(x) 55 | 56 | return res + shortcut 57 | 58 | class AttentionModule1(nn.Module): 59 | 60 | def __init__(self, in_channels, out_channels, p=1, t=2, r=1): 61 | super().__init__() 62 | #"""The hyperparameter p denotes the number of preprocessing Residual 63 | #Units before splitting into trunk branch and mask branch. t denotes 64 | #the number of Residual Units in trunk branch. r denotes the number of 65 | #Residual Units between adjacent pooling layer in the mask branch.""" 66 | assert in_channels == out_channels 67 | 68 | self.pre = self._make_residual(in_channels, out_channels, p) 69 | self.trunk = self._make_residual(in_channels, out_channels, t) 70 | self.soft_resdown1 = self._make_residual(in_channels, out_channels, r) 71 | self.soft_resdown2 = self._make_residual(in_channels, out_channels, r) 72 | self.soft_resdown3 = self._make_residual(in_channels, out_channels, r) 73 | self.soft_resdown4 = self._make_residual(in_channels, out_channels, r) 74 | 75 | self.soft_resup1 = self._make_residual(in_channels, out_channels, r) 76 | self.soft_resup2 = self._make_residual(in_channels, out_channels, r) 77 | self.soft_resup3 = self._make_residual(in_channels, out_channels, r) 78 | self.soft_resup4 = self._make_residual(in_channels, out_channels, r) 79 | 80 | self.shortcut_short = PreActResidualUnit(in_channels, out_channels, 1) 81 | self.shortcut_long = PreActResidualUnit(in_channels, out_channels, 1) 82 | 83 | self.sigmoid = nn.Sequential( 84 | nn.BatchNorm2d(out_channels), 85 | nn.ReLU(inplace=True), 86 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 87 | nn.BatchNorm2d(out_channels), 88 | nn.ReLU(inplace=True), 89 | nn.Conv2d(out_channels, out_channels, kernel_size=1), 90 | nn.Sigmoid() 91 | ) 92 | 93 | self.last = self._make_residual(in_channels, out_channels, p) 94 | 95 | def forward(self, x): 96 | ###We make the size of the smallest output map in each mask branch 7*7 to be consistent 97 | #with the smallest trunk output map size. 98 | ###Thus 3,2,1 max-pooling layers are used in mask branch with input size 56 * 56, 28 * 28, 14 * 14 respectively. 
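        ###(those sizes refer to the paper's ImageNet setting; with 32x32 CIFAR inputs this module receives 16x16 maps after stage1, but the same pooling/upsampling structure is kept)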
99 |         x = self.pre(x)
100 |         input_size = (x.size(2), x.size(3))
101 | 
102 |         x_t = self.trunk(x)
103 | 
104 |         #first downsample out 28
105 |         x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
106 |         x_s = self.soft_resdown1(x_s)
107 | 
108 |         #28 shortcut
109 |         shape1 = (x_s.size(2), x_s.size(3))
110 |         shortcut_long = self.shortcut_long(x_s)
111 | 
112 |         #second downsample out 14
113 |         x_s = F.max_pool2d(x_s, kernel_size=3, stride=2, padding=1)
114 |         x_s = self.soft_resdown2(x_s)
115 | 
116 |         #14 shortcut
117 |         shape2 = (x_s.size(2), x_s.size(3))
118 |         shortcut_short = self.shortcut_short(x_s)
119 | 
120 |         #third downsample out 7
121 |         x_s = F.max_pool2d(x_s, kernel_size=3, stride=2, padding=1)
122 |         x_s = self.soft_resdown3(x_s)
123 | 
124 |         #mid
125 |         x_s = self.soft_resdown4(x_s)
126 |         x_s = self.soft_resup1(x_s)
127 | 
128 |         #first upsample out 14
129 |         x_s = self.soft_resup2(x_s)
130 |         x_s = F.interpolate(x_s, size=shape2)
131 |         x_s += shortcut_short
132 | 
133 |         #second upsample out 28
134 |         x_s = self.soft_resup3(x_s)
135 |         x_s = F.interpolate(x_s, size=shape1)
136 |         x_s += shortcut_long
137 | 
138 |         #third upsample out 56
139 |         x_s = self.soft_resup4(x_s)
140 |         x_s = F.interpolate(x_s, size=input_size)
141 | 
142 |         x_s = self.sigmoid(x_s)
143 |         x = (1 + x_s) * x_t  # attention residual learning: the trunk signal survives even where the mask is near zero
144 |         x = self.last(x)
145 | 
146 |         return x
147 | 
148 |     def _make_residual(self, in_channels, out_channels, p):
149 | 
150 |         layers = []
151 |         for _ in range(p):
152 |             layers.append(PreActResidualUnit(in_channels, out_channels, 1))
153 | 
154 |         return nn.Sequential(*layers)
155 | 
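#AttentionModule2 and AttentionModule3 below follow the same pre/trunk/mask pattern,
#with one and two fewer downsample-upsample stages respectively to match their smaller input maps.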
156 | class AttentionModule2(nn.Module):
157 | 
158 |     def __init__(self, in_channels, out_channels, p=1, t=2, r=1):
159 |         super().__init__()
160 |         #"""The hyperparameter p denotes the number of preprocessing Residual
161 |         #Units before splitting into trunk branch and mask branch. t denotes
162 |         #the number of Residual Units in trunk branch. r denotes the number of
163 |         #Residual Units between adjacent pooling layer in the mask branch."""
164 |         assert in_channels == out_channels
165 | 
166 |         self.pre = self._make_residual(in_channels, out_channels, p)
167 |         self.trunk = self._make_residual(in_channels, out_channels, t)
168 |         self.soft_resdown1 = self._make_residual(in_channels, out_channels, r)
169 |         self.soft_resdown2 = self._make_residual(in_channels, out_channels, r)
170 |         self.soft_resdown3 = self._make_residual(in_channels, out_channels, r)
171 | 
172 |         self.soft_resup1 = self._make_residual(in_channels, out_channels, r)
173 |         self.soft_resup2 = self._make_residual(in_channels, out_channels, r)
174 |         self.soft_resup3 = self._make_residual(in_channels, out_channels, r)
175 | 
176 |         self.shortcut = PreActResidualUnit(in_channels, out_channels, 1)
177 | 
178 |         self.sigmoid = nn.Sequential(
179 |             nn.BatchNorm2d(out_channels),
180 |             nn.ReLU(inplace=True),
181 |             nn.Conv2d(out_channels, out_channels, kernel_size=1),
182 |             nn.BatchNorm2d(out_channels),
183 |             nn.ReLU(inplace=True),
184 |             nn.Conv2d(out_channels, out_channels, kernel_size=1),
185 |             nn.Sigmoid()
186 |         )
187 | 
188 |         self.last = self._make_residual(in_channels, out_channels, p)
189 | 
190 |     def forward(self, x):
191 |         x = self.pre(x)
192 |         input_size = (x.size(2), x.size(3))
193 | 
194 |         x_t = self.trunk(x)
195 | 
196 |         #first downsample out 14
197 |         x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
198 |         x_s = self.soft_resdown1(x_s)
199 | 
200 |         #14 shortcut
201 |         shape1 = (x_s.size(2), x_s.size(3))
202 |         shortcut = self.shortcut(x_s)
203 | 
204 |         #second downsample out 7
205 |         x_s = F.max_pool2d(x_s, kernel_size=3, stride=2, padding=1)
206 |         x_s = self.soft_resdown2(x_s)
207 | 
208 |         #mid
209 |         x_s = self.soft_resdown3(x_s)
210 |         x_s = self.soft_resup1(x_s)
211 | 
212 |         #first upsample out 14
213 |         x_s = self.soft_resup2(x_s)
214 |         x_s = F.interpolate(x_s, size=shape1)
215 |         x_s += shortcut
216 | 
217 |         #second upsample out 28
218 |         x_s = self.soft_resup3(x_s)
219 |         x_s = F.interpolate(x_s, size=input_size)
220 | 
221 |         x_s = self.sigmoid(x_s)
222 |         x = (1 + x_s) * x_t
223 |         x = self.last(x)
224 | 
225 |         return x
226 | 
227 |     def _make_residual(self, in_channels, out_channels, p):
228 | 
229 |         layers = []
230 |         for _ in range(p):
231 |             layers.append(PreActResidualUnit(in_channels, out_channels, 1))
232 | 
233 |         return nn.Sequential(*layers)
234 | 
235 | class AttentionModule3(nn.Module):
236 | 
237 |     def __init__(self, in_channels, out_channels, p=1, t=2, r=1):
238 |         super().__init__()
239 | 
240 |         assert in_channels == out_channels
241 | 
242 |         self.pre = self._make_residual(in_channels, out_channels, p)
243 |         self.trunk = self._make_residual(in_channels, out_channels, t)
244 |         self.soft_resdown1 = self._make_residual(in_channels, out_channels, r)
245 |         self.soft_resdown2 = self._make_residual(in_channels, out_channels, r)
246 | 
247 |         self.soft_resup1 = self._make_residual(in_channels, out_channels, r)
248 |         self.soft_resup2 = self._make_residual(in_channels, out_channels, r)
249 | 
250 |         self.shortcut = PreActResidualUnit(in_channels, out_channels, 1)
251 | 
252 |         self.sigmoid = nn.Sequential(
253 |             nn.BatchNorm2d(out_channels),
254 |             nn.ReLU(inplace=True),
255 |             nn.Conv2d(out_channels, out_channels, kernel_size=1),
256 |             nn.BatchNorm2d(out_channels),
257 |             nn.ReLU(inplace=True),
258 |             nn.Conv2d(out_channels, out_channels, kernel_size=1),
259 |             nn.Sigmoid()
260 |         )
261 | 
262 |         self.last = self._make_residual(in_channels, out_channels, p)
263 | 
264 |     def forward(self, x):
265 |         x 
= self.pre(x) 266 | input_size = (x.size(2), x.size(3)) 267 | 268 | x_t = self.trunk(x) 269 | 270 | #first downsample out 14 271 | x_s = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) 272 | x_s = self.soft_resdown1(x_s) 273 | 274 | #mid 275 | x_s = self.soft_resdown2(x_s) 276 | x_s = self.soft_resup1(x_s) 277 | 278 | #first upsample out 14 279 | x_s = self.soft_resup2(x_s) 280 | x_s = F.interpolate(x_s, size=input_size) 281 | 282 | x_s = self.sigmoid(x_s) 283 | x = (1 + x_s) * x_t 284 | x = self.last(x) 285 | 286 | return x 287 | 288 | def _make_residual(self, in_channels, out_channels, p): 289 | 290 | layers = [] 291 | for _ in range(p): 292 | layers.append(PreActResidualUnit(in_channels, out_channels, 1)) 293 | 294 | return nn.Sequential(*layers) 295 | 296 | class Attention(nn.Module): 297 | """residual attention netowrk 298 | Args: 299 | block_num: attention module number for each stage 300 | """ 301 | 302 | def __init__(self, block_num, class_num=100): 303 | 304 | super().__init__() 305 | self.pre_conv = nn.Sequential( 306 | nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1), 307 | nn.BatchNorm2d(64), 308 | nn.ReLU(inplace=True) 309 | ) 310 | 311 | self.stage1 = self._make_stage(64, 256, block_num[0], AttentionModule1) 312 | self.stage2 = self._make_stage(256, 512, block_num[1], AttentionModule2) 313 | self.stage3 = self._make_stage(512, 1024, block_num[2], AttentionModule3) 314 | self.stage4 = nn.Sequential( 315 | PreActResidualUnit(1024, 2048, 2), 316 | PreActResidualUnit(2048, 2048, 1), 317 | PreActResidualUnit(2048, 2048, 1) 318 | ) 319 | self.avg = nn.AdaptiveAvgPool2d(1) 320 | self.linear = nn.Linear(2048, 100) 321 | 322 | def forward(self, x): 323 | x = self.pre_conv(x) 324 | x = self.stage1(x) 325 | x = self.stage2(x) 326 | x = self.stage3(x) 327 | x = self.stage4(x) 328 | x = self.avg(x) 329 | x = x.view(x.size(0), -1) 330 | x = self.linear(x) 331 | 332 | return x 333 | 334 | def _make_stage(self, in_channels, out_channels, num, block): 335 | 336 | layers = [] 337 | layers.append(PreActResidualUnit(in_channels, out_channels, 2)) 338 | 339 | for _ in range(num): 340 | layers.append(block(out_channels, out_channels)) 341 | 342 | return nn.Sequential(*layers) 343 | 344 | def attention56(): 345 | return Attention([1, 1, 1]) 346 | 347 | def attention92(): 348 | return Attention([1, 2, 3]) 349 | 350 | -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | """dense net in pytorch 2 | 3 | 4 | 5 | [1] Gao Huang, Zhuang Liu, Laurens van der Maaten, Kilian Q. Weinberger. 6 | 7 | Densely Connected Convolutional Networks 8 | https://arxiv.org/abs/1608.06993v5 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | 16 | #"""Bottleneck layers. Although each layer only produces k 17 | #output feature-maps, it typically has many more inputs. 
It 18 | #has been noted in [37, 11] that a 1×1 convolution can be in- 19 | #troduced as bottleneck layer before each 3×3 convolution 20 | #to reduce the number of input feature-maps, and thus to 21 | #improve computational efficiency.""" 22 | class Bottleneck(nn.Module): 23 | def __init__(self, in_channels, growth_rate): 24 | super().__init__() 25 | #"""In our experiments, we let each 1×1 convolution 26 | #produce 4k feature-maps.""" 27 | inner_channel = 4 * growth_rate 28 | 29 | #"""We find this design especially effective for DenseNet and 30 | #we refer to our network with such a bottleneck layer, i.e., 31 | #to the BN-ReLU-Conv(1×1)-BN-ReLU-Conv(3×3) version of H ` , 32 | #as DenseNet-B.""" 33 | self.bottle_neck = nn.Sequential( 34 | nn.BatchNorm2d(in_channels), 35 | nn.ReLU(inplace=True), 36 | nn.Conv2d(in_channels, inner_channel, kernel_size=1, bias=False), 37 | nn.BatchNorm2d(inner_channel), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(inner_channel, growth_rate, kernel_size=3, padding=1, bias=False) 40 | ) 41 | 42 | def forward(self, x): 43 | return torch.cat([x, self.bottle_neck(x)], 1) 44 | 45 | #"""We refer to layers between blocks as transition 46 | #layers, which do convolution and pooling.""" 47 | class Transition(nn.Module): 48 | def __init__(self, in_channels, out_channels): 49 | super().__init__() 50 | #"""The transition layers used in our experiments 51 | #consist of a batch normalization layer and an 1×1 52 | #convolutional layer followed by a 2×2 average pooling 53 | #layer""". 54 | self.down_sample = nn.Sequential( 55 | nn.BatchNorm2d(in_channels), 56 | nn.Conv2d(in_channels, out_channels, 1, bias=False), 57 | nn.AvgPool2d(2, stride=2) 58 | ) 59 | 60 | def forward(self, x): 61 | return self.down_sample(x) 62 | 63 | #DesneNet-BC 64 | #B stands for bottleneck layer(BN-RELU-CONV(1x1)-BN-RELU-CONV(3x3)) 65 | #C stands for compression factor(0<=theta<=1) 66 | class DenseNet(nn.Module): 67 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_class=100): 68 | super().__init__() 69 | self.growth_rate = growth_rate 70 | 71 | #"""Before entering the first dense block, a convolution 72 | #with 16 (or twice the growth rate for DenseNet-BC) 73 | #output channels is performed on the input images.""" 74 | inner_channels = 2 * growth_rate 75 | 76 | #For convolutional layers with kernel size 3×3, each 77 | #side of the inputs is zero-padded by one pixel to keep 78 | #the feature-map size fixed. 79 | self.conv1 = nn.Conv2d(3, inner_channels, kernel_size=3, padding=1, bias=False) 80 | 81 | self.features = nn.Sequential() 82 | 83 | for index in range(len(nblocks) - 1): 84 | self.features.add_module("dense_block_layer_{}".format(index), self._make_dense_layers(block, inner_channels, nblocks[index])) 85 | inner_channels += growth_rate * nblocks[index] 86 | 87 | #"""If a dense block contains m feature-maps, we let the 88 | #following transition layer generate θm output feature- 89 | #maps, where 0 < θ ≤ 1 is referred to as the compression 90 | #fac-tor. 
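            #e.g. for densenet121 (growth_rate=32, nblocks=[6, 12, 24, 16]) the first dense block ends at 64 + 6*32 = 256 channels, and reduction=0.5 compresses it to 128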
91 | out_channels = int(reduction * inner_channels) # int() will automatic floor the value 92 | self.features.add_module("transition_layer_{}".format(index), Transition(inner_channels, out_channels)) 93 | inner_channels = out_channels 94 | 95 | self.features.add_module("dense_block{}".format(len(nblocks) - 1), self._make_dense_layers(block, inner_channels, nblocks[len(nblocks)-1])) 96 | inner_channels += growth_rate * nblocks[len(nblocks) - 1] 97 | self.features.add_module('bn', nn.BatchNorm2d(inner_channels)) 98 | self.features.add_module('relu', nn.ReLU(inplace=True)) 99 | 100 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 101 | 102 | self.linear = nn.Linear(inner_channels, num_class) 103 | 104 | def forward(self, x): 105 | output = self.conv1(x) 106 | output = self.features(output) 107 | output = self.avgpool(output) 108 | output = output.view(output.size()[0], -1) 109 | output = self.linear(output) 110 | return output 111 | 112 | def _make_dense_layers(self, block, in_channels, nblocks): 113 | dense_block = nn.Sequential() 114 | for index in range(nblocks): 115 | dense_block.add_module('bottle_neck_layer_{}'.format(index), block(in_channels, self.growth_rate)) 116 | in_channels += self.growth_rate 117 | return dense_block 118 | 119 | def densenet121(): 120 | return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) 121 | 122 | def densenet169(): 123 | return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) 124 | 125 | def densenet201(): 126 | return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) 127 | 128 | def densenet161(): 129 | return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) 130 | 131 | -------------------------------------------------------------------------------- /models/googlenet.py: -------------------------------------------------------------------------------- 1 | """google net in pytorch 2 | 3 | 4 | 5 | [1] Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 6 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 
7 | 
8 | Going Deeper with Convolutions
9 |     https://arxiv.org/abs/1409.4842v1
10 | """
11 | 
12 | import torch
13 | import torch.nn as nn
14 | 
15 | class Inception(nn.Module):
16 |     def __init__(self, input_channels, n1x1, n3x3_reduce, n3x3, n5x5_reduce, n5x5, pool_proj):
17 |         super().__init__()
18 | 
19 |         #1x1conv branch
20 |         self.b1 = nn.Sequential(
21 |             nn.Conv2d(input_channels, n1x1, kernel_size=1),
22 |             nn.BatchNorm2d(n1x1),
23 |             nn.ReLU(inplace=True)
24 |         )
25 | 
26 |         #1x1conv -> 3x3conv branch
27 |         self.b2 = nn.Sequential(
28 |             nn.Conv2d(input_channels, n3x3_reduce, kernel_size=1),
29 |             nn.BatchNorm2d(n3x3_reduce),
30 |             nn.ReLU(inplace=True),
31 |             nn.Conv2d(n3x3_reduce, n3x3, kernel_size=3, padding=1),
32 |             nn.BatchNorm2d(n3x3),
33 |             nn.ReLU(inplace=True)
34 |         )
35 | 
36 |         #1x1conv -> 5x5conv branch
37 |         #we use 2 3x3 conv filters stacked instead
38 |         #of one 5x5 filter to obtain the same receptive
39 |         #field with fewer parameters
40 |         self.b3 = nn.Sequential(
41 |             nn.Conv2d(input_channels, n5x5_reduce, kernel_size=1),
42 |             nn.BatchNorm2d(n5x5_reduce),
43 |             nn.ReLU(inplace=True),
44 |             nn.Conv2d(n5x5_reduce, n5x5, kernel_size=3, padding=1),
45 |             nn.BatchNorm2d(n5x5),
46 |             nn.ReLU(inplace=True),
47 |             nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
48 |             nn.BatchNorm2d(n5x5),
49 |             nn.ReLU(inplace=True)
50 |         )
51 | 
52 |         #3x3pooling -> 1x1conv
53 |         #same conv
54 |         self.b4 = nn.Sequential(
55 |             nn.MaxPool2d(3, stride=1, padding=1),
56 |             nn.Conv2d(input_channels, pool_proj, kernel_size=1),
57 |             nn.BatchNorm2d(pool_proj),
58 |             nn.ReLU(inplace=True)
59 |         )
60 | 
61 |     def forward(self, x):
62 |         return torch.cat([self.b1(x), self.b2(x), self.b3(x), self.b4(x)], dim=1)
63 | 
64 | 
65 | class GoogleNet(nn.Module):
66 | 
67 |     def __init__(self, num_class=100):
68 |         super().__init__()
69 |         self.prelayer = nn.Sequential(
70 |             nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
71 |             nn.BatchNorm2d(64),
72 |             nn.ReLU(inplace=True),
73 |             nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False),
74 |             nn.BatchNorm2d(64),
75 |             nn.ReLU(inplace=True),
76 |             nn.Conv2d(64, 192, kernel_size=3, padding=1, bias=False),
77 |             nn.BatchNorm2d(192),
78 |             nn.ReLU(inplace=True),
79 |         )
80 | 
81 |         #although our prelayer differs from the original stem,
82 |         #we still use the stage names a3, b3....... 
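        #each Inception block outputs n1x1 + n3x3 + n5x5 + pool_proj channels; e.g. a3 below emits 64 + 128 + 32 + 32 = 256, which is exactly b3's input width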
83 | self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) 84 | self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) 85 | 86 | ##"""In general, an Inception network is a network consisting of 87 | ##modules of the above type stacked upon each other, with occasional 88 | ##max-pooling layers with stride 2 to halve the resolution of the 89 | ##grid""" 90 | self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) 91 | 92 | self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) 93 | self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) 94 | self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) 95 | self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) 96 | self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) 97 | 98 | self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) 99 | self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) 100 | 101 | #input feature size: 8*8*1024 102 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 103 | self.dropout = nn.Dropout2d(p=0.4) 104 | self.linear = nn.Linear(1024, num_class) 105 | 106 | def forward(self, x): 107 | x = self.prelayer(x) 108 | x = self.maxpool(x) 109 | x = self.a3(x) 110 | x = self.b3(x) 111 | 112 | x = self.maxpool(x) 113 | 114 | x = self.a4(x) 115 | x = self.b4(x) 116 | x = self.c4(x) 117 | x = self.d4(x) 118 | x = self.e4(x) 119 | 120 | x = self.maxpool(x) 121 | 122 | x = self.a5(x) 123 | x = self.b5(x) 124 | 125 | #"""It was found that a move from fully connected layers to 126 | #average pooling improved the top-1 accuracy by about 0.6%, 127 | #however the use of dropout remained essential even after 128 | #removing the fully connected layers.""" 129 | x = self.avgpool(x) 130 | x = self.dropout(x) 131 | x = x.view(x.size()[0], -1) 132 | x = self.linear(x) 133 | 134 | return x 135 | 136 | def googlenet(): 137 | return GoogleNet() 138 | 139 | 140 | -------------------------------------------------------------------------------- /models/inceptionv3.py: -------------------------------------------------------------------------------- 1 | """ inceptionv3 in pytorch 2 | 3 | 4 | [1] Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, Zbigniew Wojna 5 | 6 | Rethinking the Inception Architecture for Computer Vision 7 | https://arxiv.org/abs/1512.00567v3 8 | """ 9 | 10 | import torch 11 | import torch.nn as nn 12 | 13 | 14 | class BasicConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, **kwargs): 17 | super().__init__() 18 | self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs) 19 | self.bn = nn.BatchNorm2d(output_channels) 20 | self.relu = nn.ReLU(inplace=True) 21 | 22 | def forward(self, x): 23 | x = self.conv(x) 24 | x = self.bn(x) 25 | x = self.relu(x) 26 | 27 | return x 28 | 29 | #same naive inception module 30 | class InceptionA(nn.Module): 31 | 32 | def __init__(self, input_channels, pool_features): 33 | super().__init__() 34 | self.branch1x1 = BasicConv2d(input_channels, 64, kernel_size=1) 35 | 36 | self.branch5x5 = nn.Sequential( 37 | BasicConv2d(input_channels, 48, kernel_size=1), 38 | BasicConv2d(48, 64, kernel_size=5, padding=2) 39 | ) 40 | 41 | self.branch3x3 = nn.Sequential( 42 | BasicConv2d(input_channels, 64, kernel_size=1), 43 | BasicConv2d(64, 96, kernel_size=3, padding=1), 44 | BasicConv2d(96, 96, kernel_size=3, padding=1) 45 | ) 46 | 47 | self.branchpool = nn.Sequential( 48 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 49 | BasicConv2d(input_channels, pool_features, kernel_size=3, padding=1) 50 | ) 51 | 52 | def forward(self, x): 53 | 54 | #x -> 1x1(same) 55 | branch1x1 = 
self.branch1x1(x) 56 | 57 | #x -> 1x1 -> 5x5(same) 58 | branch5x5 = self.branch5x5(x) 59 | #branch5x5 = self.branch5x5_2(branch5x5) 60 | 61 | #x -> 1x1 -> 3x3 -> 3x3(same) 62 | branch3x3 = self.branch3x3(x) 63 | 64 | #x -> pool -> 1x1(same) 65 | branchpool = self.branchpool(x) 66 | 67 | outputs = [branch1x1, branch5x5, branch3x3, branchpool] 68 | 69 | return torch.cat(outputs, 1) 70 | 71 | #downsample 72 | #Factorization into smaller convolutions 73 | class InceptionB(nn.Module): 74 | 75 | def __init__(self, input_channels): 76 | super().__init__() 77 | 78 | self.branch3x3 = BasicConv2d(input_channels, 384, kernel_size=3, stride=2) 79 | 80 | self.branch3x3stack = nn.Sequential( 81 | BasicConv2d(input_channels, 64, kernel_size=1), 82 | BasicConv2d(64, 96, kernel_size=3, padding=1), 83 | BasicConv2d(96, 96, kernel_size=3, stride=2) 84 | ) 85 | 86 | self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2) 87 | 88 | def forward(self, x): 89 | 90 | #x - > 3x3(downsample) 91 | branch3x3 = self.branch3x3(x) 92 | 93 | #x -> 3x3 -> 3x3(downsample) 94 | branch3x3stack = self.branch3x3stack(x) 95 | 96 | #x -> avgpool(downsample) 97 | branchpool = self.branchpool(x) 98 | 99 | #"""We can use two parallel stride 2 blocks: P and C. P is a pooling 100 | #layer (either average or maximum pooling) the activation, both of 101 | #them are stride 2 the filter banks of which are concatenated as in 102 | #figure 10.""" 103 | outputs = [branch3x3, branch3x3stack, branchpool] 104 | 105 | return torch.cat(outputs, 1) 106 | 107 | #Factorizing Convolutions with Large Filter Size 108 | class InceptionC(nn.Module): 109 | def __init__(self, input_channels, channels_7x7): 110 | super().__init__() 111 | self.branch1x1 = BasicConv2d(input_channels, 192, kernel_size=1) 112 | 113 | c7 = channels_7x7 114 | 115 | #In theory, we could go even further and argue that one can replace any n × n 116 | #convolution by a 1 × n convolution followed by a n × 1 convolution and the 117 | #computational cost saving increases dramatically as n grows (see figure 6). 
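        #e.g. a 7x7 conv uses 49 weights per channel pair, while the 7x1 + 1x7 pair below uses 14, roughly a 3.5x saving for the same receptive field (assuming equal channel widths)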
118 | self.branch7x7 = nn.Sequential( 119 | BasicConv2d(input_channels, c7, kernel_size=1), 120 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 121 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 122 | ) 123 | 124 | self.branch7x7stack = nn.Sequential( 125 | BasicConv2d(input_channels, c7, kernel_size=1), 126 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 127 | BasicConv2d(c7, c7, kernel_size=(1, 7), padding=(0, 3)), 128 | BasicConv2d(c7, c7, kernel_size=(7, 1), padding=(3, 0)), 129 | BasicConv2d(c7, 192, kernel_size=(1, 7), padding=(0, 3)) 130 | ) 131 | 132 | self.branch_pool = nn.Sequential( 133 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 134 | BasicConv2d(input_channels, 192, kernel_size=1), 135 | ) 136 | 137 | def forward(self, x): 138 | 139 | #x -> 1x1(same) 140 | branch1x1 = self.branch1x1(x) 141 | 142 | #x -> 1layer 1*7 and 7*1 (same) 143 | branch7x7 = self.branch7x7(x) 144 | 145 | #x-> 2layer 1*7 and 7*1(same) 146 | branch7x7stack = self.branch7x7stack(x) 147 | 148 | #x-> avgpool (same) 149 | branchpool = self.branch_pool(x) 150 | 151 | outputs = [branch1x1, branch7x7, branch7x7stack, branchpool] 152 | 153 | return torch.cat(outputs, 1) 154 | 155 | class InceptionD(nn.Module): 156 | 157 | def __init__(self, input_channels): 158 | super().__init__() 159 | 160 | self.branch3x3 = nn.Sequential( 161 | BasicConv2d(input_channels, 192, kernel_size=1), 162 | BasicConv2d(192, 320, kernel_size=3, stride=2) 163 | ) 164 | 165 | self.branch7x7 = nn.Sequential( 166 | BasicConv2d(input_channels, 192, kernel_size=1), 167 | BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3)), 168 | BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0)), 169 | BasicConv2d(192, 192, kernel_size=3, stride=2) 170 | ) 171 | 172 | self.branchpool = nn.AvgPool2d(kernel_size=3, stride=2) 173 | 174 | def forward(self, x): 175 | 176 | #x -> 1x1 -> 3x3(downsample) 177 | branch3x3 = self.branch3x3(x) 178 | 179 | #x -> 1x1 -> 1x7 -> 7x1 -> 3x3 (downsample) 180 | branch7x7 = self.branch7x7(x) 181 | 182 | #x -> avgpool (downsample) 183 | branchpool = self.branchpool(x) 184 | 185 | outputs = [branch3x3, branch7x7, branchpool] 186 | 187 | return torch.cat(outputs, 1) 188 | 189 | 190 | #same 191 | class InceptionE(nn.Module): 192 | def __init__(self, input_channels): 193 | super().__init__() 194 | self.branch1x1 = BasicConv2d(input_channels, 320, kernel_size=1) 195 | 196 | self.branch3x3_1 = BasicConv2d(input_channels, 384, kernel_size=1) 197 | self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 198 | self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 199 | 200 | self.branch3x3stack_1 = BasicConv2d(input_channels, 448, kernel_size=1) 201 | self.branch3x3stack_2 = BasicConv2d(448, 384, kernel_size=3, padding=1) 202 | self.branch3x3stack_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1)) 203 | self.branch3x3stack_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0)) 204 | 205 | self.branch_pool = nn.Sequential( 206 | nn.AvgPool2d(kernel_size=3, stride=1, padding=1), 207 | BasicConv2d(input_channels, 192, kernel_size=1) 208 | ) 209 | 210 | def forward(self, x): 211 | 212 | #x -> 1x1 (same) 213 | branch1x1 = self.branch1x1(x) 214 | 215 | # x -> 1x1 -> 3x1 216 | # x -> 1x1 -> 1x3 217 | # concatenate(3x1, 1x3) 218 | #"""7. Inception modules with expanded the filter bank outputs. 
219 | #This architecture is used on the coarsest (8 × 8) grids to promote 220 | #high dimensional representations, as suggested by principle 221 | #2 of Section 2.""" 222 | branch3x3 = self.branch3x3_1(x) 223 | branch3x3 = [ 224 | self.branch3x3_2a(branch3x3), 225 | self.branch3x3_2b(branch3x3) 226 | ] 227 | branch3x3 = torch.cat(branch3x3, 1) 228 | 229 | # x -> 1x1 -> 3x3 -> 1x3 230 | # x -> 1x1 -> 3x3 -> 3x1 231 | #concatenate(1x3, 3x1) 232 | branch3x3stack = self.branch3x3stack_1(x) 233 | branch3x3stack = self.branch3x3stack_2(branch3x3stack) 234 | branch3x3stack = [ 235 | self.branch3x3stack_3a(branch3x3stack), 236 | self.branch3x3stack_3b(branch3x3stack) 237 | ] 238 | branch3x3stack = torch.cat(branch3x3stack, 1) 239 | 240 | branchpool = self.branch_pool(x) 241 | 242 | outputs = [branch1x1, branch3x3, branch3x3stack, branchpool] 243 | 244 | return torch.cat(outputs, 1) 245 | 246 | class InceptionV3(nn.Module): 247 | 248 | def __init__(self, num_classes=100): 249 | super().__init__() 250 | self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, padding=1) 251 | self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3, padding=1) 252 | self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) 253 | self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) 254 | self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) 255 | 256 | #naive inception module 257 | self.Mixed_5b = InceptionA(192, pool_features=32) 258 | self.Mixed_5c = InceptionA(256, pool_features=64) 259 | self.Mixed_5d = InceptionA(288, pool_features=64) 260 | 261 | #downsample 262 | self.Mixed_6a = InceptionB(288) 263 | 264 | self.Mixed_6b = InceptionC(768, channels_7x7=128) 265 | self.Mixed_6c = InceptionC(768, channels_7x7=160) 266 | self.Mixed_6d = InceptionC(768, channels_7x7=160) 267 | self.Mixed_6e = InceptionC(768, channels_7x7=192) 268 | 269 | #downsample 270 | self.Mixed_7a = InceptionD(768) 271 | 272 | self.Mixed_7b = InceptionE(1280) 273 | self.Mixed_7c = InceptionE(2048) 274 | 275 | #6*6 feature size 276 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 277 | self.dropout = nn.Dropout2d() 278 | self.linear = nn.Linear(2048, num_classes) 279 | 280 | def forward(self, x): 281 | 282 | #32 -> 30 283 | x = self.Conv2d_1a_3x3(x) 284 | x = self.Conv2d_2a_3x3(x) 285 | x = self.Conv2d_2b_3x3(x) 286 | x = self.Conv2d_3b_1x1(x) 287 | x = self.Conv2d_4a_3x3(x) 288 | 289 | #30 -> 30 290 | x = self.Mixed_5b(x) 291 | x = self.Mixed_5c(x) 292 | x = self.Mixed_5d(x) 293 | 294 | #30 -> 14 295 | #Efficient Grid Size Reduction to avoid representation 296 | #bottleneck 297 | x = self.Mixed_6a(x) 298 | 299 | #14 -> 14 300 | #"""In practice, we have found that employing this factorization does not 301 | #work well on early layers, but it gives very good results on medium 302 | #grid-sizes (On m × m feature maps, where m ranges between 12 and 20). 
303 | #On that level, very good results can be achieved by using 1 × 7 convolutions 304 | #followed by 7 × 1 convolutions.""" 305 | x = self.Mixed_6b(x) 306 | x = self.Mixed_6c(x) 307 | x = self.Mixed_6d(x) 308 | x = self.Mixed_6e(x) 309 | 310 | #14 -> 6 311 | #Efficient Grid Size Reduction 312 | x = self.Mixed_7a(x) 313 | 314 | #6 -> 6 315 | #We are using this solution only on the coarsest grid, 316 | #since that is the place where producing high dimensional 317 | #sparse representation is the most critical as the ratio of 318 | #local processing (by 1 × 1 convolutions) is increased compared 319 | #to the spatial aggregation.""" 320 | x = self.Mixed_7b(x) 321 | x = self.Mixed_7c(x) 322 | 323 | #6 -> 1 324 | x = self.avgpool(x) 325 | x = self.dropout(x) 326 | x = x.view(x.size(0), -1) 327 | x = self.linear(x) 328 | return x 329 | 330 | 331 | def inceptionv3(): 332 | return InceptionV3() 333 | 334 | 335 | 336 | -------------------------------------------------------------------------------- /models/mobilenet.py: -------------------------------------------------------------------------------- 1 | """mobilenet in pytorch 2 | 3 | 4 | 5 | [1] Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam 6 | 7 | MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications 8 | https://arxiv.org/abs/1704.04861 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | 15 | class DepthSeperabelConv2d(nn.Module): 16 | 17 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 18 | super().__init__() 19 | self.depthwise = nn.Sequential( 20 | nn.Conv2d( 21 | input_channels, 22 | input_channels, 23 | kernel_size, 24 | groups=input_channels, 25 | **kwargs), 26 | nn.BatchNorm2d(input_channels), 27 | nn.ReLU(inplace=True) 28 | ) 29 | 30 | self.pointwise = nn.Sequential( 31 | nn.Conv2d(input_channels, output_channels, 1), 32 | nn.BatchNorm2d(output_channels), 33 | nn.ReLU(inplace=True) 34 | ) 35 | 36 | def forward(self, x): 37 | x = self.depthwise(x) 38 | x = self.pointwise(x) 39 | 40 | return x 41 | 42 | 43 | class BasicConv2d(nn.Module): 44 | 45 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 46 | 47 | super().__init__() 48 | self.conv = nn.Conv2d( 49 | input_channels, output_channels, kernel_size, **kwargs) 50 | self.bn = nn.BatchNorm2d(output_channels) 51 | self.relu = nn.ReLU(inplace=True) 52 | 53 | def forward(self, x): 54 | x = self.conv(x) 55 | x = self.bn(x) 56 | x = self.relu(x) 57 | 58 | return x 59 | 60 | 61 | class MobileNet(nn.Module): 62 | 63 | """ 64 | Args: 65 | width multipler: The role of the width multiplier α is to thin 66 | a network uniformly at each layer. For a given 67 | layer and width multiplier α, the number of 68 | input channels M becomes αM and the number of 69 | output channels N becomes αN. 
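            For example, width_multiplier=0.5 builds a "0.5 MobileNet" in
            which every layer has half the channels of the full network.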
70 | """ 71 | 72 | def __init__(self, width_multiplier=1, class_num=100): 73 | super().__init__() 74 | 75 | alpha = width_multiplier 76 | self.stem = nn.Sequential( 77 | BasicConv2d(3, int(32 * alpha), 3, padding=1, bias=False), 78 | DepthSeperabelConv2d( 79 | int(32 * alpha), 80 | int(64 * alpha), 81 | 3, 82 | padding=1, 83 | bias=False 84 | ) 85 | ) 86 | 87 | #downsample 88 | self.conv1 = nn.Sequential( 89 | DepthSeperabelConv2d( 90 | int(64 * alpha), 91 | int(128 * alpha), 92 | 3, 93 | stride=2, 94 | padding=1, 95 | bias=False 96 | ), 97 | DepthSeperabelConv2d( 98 | int(128 * alpha), 99 | int(128 * alpha), 100 | 3, 101 | padding=1, 102 | bias=False 103 | ) 104 | ) 105 | 106 | #downsample 107 | self.conv2 = nn.Sequential( 108 | DepthSeperabelConv2d( 109 | int(128 * alpha), 110 | int(256 * alpha), 111 | 3, 112 | stride=2, 113 | padding=1, 114 | bias=False 115 | ), 116 | DepthSeperabelConv2d( 117 | int(256 * alpha), 118 | int(256 * alpha), 119 | 3, 120 | padding=1, 121 | bias=False 122 | ) 123 | ) 124 | 125 | #downsample 126 | self.conv3 = nn.Sequential( 127 | DepthSeperabelConv2d( 128 | int(256 * alpha), 129 | int(512 * alpha), 130 | 3, 131 | stride=2, 132 | padding=1, 133 | bias=False 134 | ), 135 | 136 | DepthSeperabelConv2d( 137 | int(512 * alpha), 138 | int(512 * alpha), 139 | 3, 140 | padding=1, 141 | bias=False 142 | ), 143 | DepthSeperabelConv2d( 144 | int(512 * alpha), 145 | int(512 * alpha), 146 | 3, 147 | padding=1, 148 | bias=False 149 | ), 150 | DepthSeperabelConv2d( 151 | int(512 * alpha), 152 | int(512 * alpha), 153 | 3, 154 | padding=1, 155 | bias=False 156 | ), 157 | DepthSeperabelConv2d( 158 | int(512 * alpha), 159 | int(512 * alpha), 160 | 3, 161 | padding=1, 162 | bias=False 163 | ), 164 | DepthSeperabelConv2d( 165 | int(512 * alpha), 166 | int(512 * alpha), 167 | 3, 168 | padding=1, 169 | bias=False 170 | ) 171 | ) 172 | 173 | #downsample 174 | self.conv4 = nn.Sequential( 175 | DepthSeperabelConv2d( 176 | int(512 * alpha), 177 | int(1024 * alpha), 178 | 3, 179 | stride=2, 180 | padding=1, 181 | bias=False 182 | ), 183 | DepthSeperabelConv2d( 184 | int(1024 * alpha), 185 | int(1024 * alpha), 186 | 3, 187 | padding=1, 188 | bias=False 189 | ) 190 | ) 191 | 192 | self.fc = nn.Linear(int(1024 * alpha), class_num) 193 | self.avg = nn.AdaptiveAvgPool2d(1) 194 | 195 | def forward(self, x): 196 | x = self.stem(x) 197 | 198 | x = self.conv1(x) 199 | x = self.conv2(x) 200 | x = self.conv3(x) 201 | x = self.conv4(x) 202 | 203 | x = self.avg(x) 204 | x = x.view(x.size(0), -1) 205 | x = self.fc(x) 206 | return x 207 | 208 | 209 | def mobilenet(alpha=1, class_num=100): 210 | return MobileNet(alpha, class_num) 211 | 212 | -------------------------------------------------------------------------------- /models/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | """mobilenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen 6 | 7 | MobileNetV2: Inverted Residuals and Linear Bottlenecks 8 | https://arxiv.org/abs/1801.04381 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class LinearBottleNeck(nn.Module): 17 | 18 | def __init__(self, in_channels, out_channels, stride, t=6, class_num=100): 19 | super().__init__() 20 | 21 | self.residual = nn.Sequential( 22 | nn.Conv2d(in_channels, in_channels * t, 1), 23 | nn.BatchNorm2d(in_channels * t), 24 | nn.ReLU6(inplace=True), 25 | 26 | nn.Conv2d(in_channels * t, in_channels * t, 
3, stride=stride, padding=1, groups=in_channels * t), 27 | nn.BatchNorm2d(in_channels * t), 28 | nn.ReLU6(inplace=True), 29 | 30 | nn.Conv2d(in_channels * t, out_channels, 1), 31 | nn.BatchNorm2d(out_channels) 32 | ) 33 | 34 | self.stride = stride 35 | self.in_channels = in_channels 36 | self.out_channels = out_channels 37 | 38 | def forward(self, x): 39 | 40 | residual = self.residual(x) 41 | 42 | if self.stride == 1 and self.in_channels == self.out_channels: 43 | residual += x 44 | 45 | return residual 46 | 47 | class MobileNetV2(nn.Module): 48 | 49 | def __init__(self, class_num=100): 50 | super().__init__() 51 | 52 | self.pre = nn.Sequential( 53 | nn.Conv2d(3, 32, 1, padding=1), 54 | nn.BatchNorm2d(32), 55 | nn.ReLU6(inplace=True) 56 | ) 57 | 58 | self.stage1 = LinearBottleNeck(32, 16, 1, 1) 59 | self.stage2 = self._make_stage(2, 16, 24, 2, 6) 60 | self.stage3 = self._make_stage(3, 24, 32, 2, 6) 61 | self.stage4 = self._make_stage(4, 32, 64, 2, 6) 62 | self.stage5 = self._make_stage(3, 64, 96, 1, 6) 63 | self.stage6 = self._make_stage(3, 96, 160, 1, 6) 64 | self.stage7 = LinearBottleNeck(160, 320, 1, 6) 65 | 66 | self.conv1 = nn.Sequential( 67 | nn.Conv2d(320, 1280, 1), 68 | nn.BatchNorm2d(1280), 69 | nn.ReLU6(inplace=True) 70 | ) 71 | 72 | self.conv2 = nn.Conv2d(1280, class_num, 1) 73 | 74 | def forward(self, x): 75 | x = self.pre(x) 76 | x = self.stage1(x) 77 | x = self.stage2(x) 78 | x = self.stage3(x) 79 | x = self.stage4(x) 80 | x = self.stage5(x) 81 | x = self.stage6(x) 82 | x = self.stage7(x) 83 | x = self.conv1(x) 84 | x = F.adaptive_avg_pool2d(x, 1) 85 | x = self.conv2(x) 86 | x = x.view(x.size(0), -1) 87 | 88 | return x 89 | 90 | def _make_stage(self, repeat, in_channels, out_channels, stride, t): 91 | 92 | layers = [] 93 | layers.append(LinearBottleNeck(in_channels, out_channels, stride, t)) 94 | 95 | while repeat - 1: 96 | layers.append(LinearBottleNeck(out_channels, out_channels, 1, t)) 97 | repeat -= 1 98 | 99 | return nn.Sequential(*layers) 100 | 101 | def mobilenetv2(): 102 | return MobileNetV2() -------------------------------------------------------------------------------- /models/nasnet.py: -------------------------------------------------------------------------------- 1 | """nasnet in pytorch 2 | 3 | 4 | 5 | [1] Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. 
Le 6 | 7 | Learning Transferable Architectures for Scalable Image Recognition 8 | https://arxiv.org/abs/1707.07012 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | class SeperableConv2d(nn.Module): 15 | 16 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 17 | 18 | super().__init__() 19 | self.depthwise = nn.Conv2d( 20 | input_channels, 21 | input_channels, 22 | kernel_size, 23 | groups=input_channels, 24 | **kwargs 25 | ) 26 | 27 | self.pointwise = nn.Conv2d( 28 | input_channels, 29 | output_channels, 30 | 1 31 | ) 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class SeperableBranch(nn.Module): 39 | 40 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 41 | """Adds 2 blocks of [relu-separable conv-batchnorm].""" 42 | super().__init__() 43 | self.block1 = nn.Sequential( 44 | nn.ReLU(), 45 | SeperableConv2d(input_channels, output_channels, kernel_size, **kwargs), 46 | nn.BatchNorm2d(output_channels) 47 | ) 48 | 49 | self.block2 = nn.Sequential( 50 | nn.ReLU(), 51 | SeperableConv2d(output_channels, output_channels, kernel_size, stride=1, padding=int(kernel_size / 2)), 52 | nn.BatchNorm2d(output_channels) 53 | ) 54 | 55 | def forward(self, x): 56 | x = self.block1(x) 57 | x = self.block2(x) 58 | 59 | return x 60 | 61 | class Fit(nn.Module): 62 | """Make the cell outputs compatible 63 | 64 | Args: 65 | prev_filters: filter number of tensor prev, needs to be modified 66 | filters: filter number of normal cell branch output filters 67 | """ 68 | 69 | def __init__(self, prev_filters, filters): 70 | super().__init__() 71 | self.relu = nn.ReLU() 72 | 73 | self.p1 = nn.Sequential( 74 | nn.AvgPool2d(1, stride=2), 75 | nn.Conv2d(prev_filters, int(filters / 2), 1) 76 | ) 77 | 78 | #make sure there is no information loss 79 | self.p2 = nn.Sequential( 80 | nn.ConstantPad2d((0, 1, 0, 1), 0), 81 | nn.ConstantPad2d((-1, 0, -1, 0), 0), #cropping 82 | nn.AvgPool2d(1, stride=2), 83 | nn.Conv2d(prev_filters, int(filters / 2), 1) 84 | ) 85 | 86 | self.bn = nn.BatchNorm2d(filters) 87 | 88 | self.dim_reduce = nn.Sequential( 89 | nn.ReLU(), 90 | nn.Conv2d(prev_filters, filters, 1), 91 | nn.BatchNorm2d(filters) 92 | ) 93 | 94 | self.filters = filters 95 | 96 | def forward(self, inputs): 97 | x, prev = inputs 98 | if prev is None: 99 | return x 100 | 101 | #image size does not match 102 | elif x.size(2) != prev.size(2): 103 | prev = self.relu(prev) 104 | p1 = self.p1(prev) 105 | p2 = self.p2(prev) 106 | prev = torch.cat([p1, p2], 1) 107 | prev = self.bn(prev) 108 | 109 | elif prev.size(1) != self.filters: 110 | prev = self.dim_reduce(prev) 111 | 112 | return prev 113 | 114 | 115 | class NormalCell(nn.Module): 116 | 117 | def __init__(self, x_in, prev_in, output_channels): 118 | super().__init__() 119 | 120 | self.dem_reduce = nn.Sequential( 121 | nn.ReLU(), 122 | nn.Conv2d(x_in, output_channels, 1, bias=False), 123 | nn.BatchNorm2d(output_channels) 124 | ) 125 | 126 | self.block1_left = SeperableBranch( 127 | output_channels, 128 | output_channels, 129 | kernel_size=3, 130 | padding=1, 131 | bias=False 132 | ) 133 | self.block1_right = nn.Sequential() 134 | 135 | self.block2_left = SeperableBranch( 136 | output_channels, 137 | output_channels, 138 | kernel_size=3, 139 | padding=1, 140 | bias=False 141 | ) 142 | self.block2_right = SeperableBranch( 143 | output_channels, 144 | output_channels, 145 | kernel_size=5, 146 | padding=2, 147 | bias=False 148 | ) 149 | 150 | self.block3_left 
= nn.AvgPool2d(3, stride=1, padding=1) 151 | self.block3_right = nn.Sequential() 152 | 153 | self.block4_left = nn.AvgPool2d(3, stride=1, padding=1) 154 | self.block4_right = nn.AvgPool2d(3, stride=1, padding=1) 155 | 156 | self.block5_left = SeperableBranch( 157 | output_channels, 158 | output_channels, 159 | kernel_size=5, 160 | padding=2, 161 | bias=False 162 | ) 163 | self.block5_right = SeperableBranch( 164 | output_channels, 165 | output_channels, 166 | kernel_size=3, 167 | padding=1, 168 | bias=False 169 | ) 170 | 171 | self.fit = Fit(prev_in, output_channels) 172 | 173 | def forward(self, x): 174 | x, prev = x 175 | 176 | #return transformed x as new x, and original x as prev 177 | #only prev tensor needs to be modified 178 | prev = self.fit((x, prev)) 179 | 180 | h = self.dem_reduce(x) 181 | 182 | x1 = self.block1_left(h) + self.block1_right(h) 183 | x2 = self.block2_left(prev) + self.block2_right(h) 184 | x3 = self.block3_left(h) + self.block3_right(h) 185 | x4 = self.block4_left(prev) + self.block4_right(prev) 186 | x5 = self.block5_left(prev) + self.block5_right(prev) 187 | 188 | return torch.cat([prev, x1, x2, x3, x4, x5], 1), x 189 | 190 | class ReductionCell(nn.Module): 191 | 192 | def __init__(self, x_in, prev_in, output_channels): 193 | super().__init__() 194 | 195 | self.dim_reduce = nn.Sequential( 196 | nn.ReLU(), 197 | nn.Conv2d(x_in, output_channels, 1), 198 | nn.BatchNorm2d(output_channels) 199 | ) 200 | 201 | #block1 202 | self.layer1block1_left = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 203 | self.layer1block1_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 204 | 205 | #block2 206 | self.layer1block2_left = nn.MaxPool2d(3, stride=2, padding=1) 207 | self.layer1block2_right = SeperableBranch(output_channels, output_channels, 7, stride=2, padding=3) 208 | 209 | #block3 210 | self.layer1block3_left = nn.AvgPool2d(3, 2, 1) 211 | self.layer1block3_right = SeperableBranch(output_channels, output_channels, 5, stride=2, padding=2) 212 | 213 | #block5 214 | self.layer2block1_left = nn.MaxPool2d(3, 2, 1) 215 | self.layer2block1_right = SeperableBranch(output_channels, output_channels, 3, stride=1, padding=1) 216 | 217 | #block4 218 | self.layer2block2_left = nn.AvgPool2d(3, 1, 1) 219 | self.layer2block2_right = nn.Sequential() 220 | 221 | self.fit = Fit(prev_in, output_channels) 222 | 223 | def forward(self, x): 224 | x, prev = x 225 | prev = self.fit((x, prev)) 226 | 227 | h = self.dim_reduce(x) 228 | 229 | layer1block1 = self.layer1block1_left(prev) + self.layer1block1_right(h) 230 | layer1block2 = self.layer1block2_left(h) + self.layer1block2_right(prev) 231 | layer1block3 = self.layer1block3_left(h) + self.layer1block3_right(prev) 232 | layer2block1 = self.layer2block1_left(h) + self.layer2block1_right(layer1block1) 233 | layer2block2 = self.layer2block2_left(layer1block1) + self.layer2block2_right(layer1block2) 234 | 235 | return torch.cat([ 236 | layer1block2, #https://github.com/keras-team/keras-applications/blob/master/keras_applications/nasnet.py line 739 237 | layer1block3, 238 | layer2block1, 239 | layer2block2 240 | ], 1), x 241 | 242 | 243 | class NasNetA(nn.Module): 244 | 245 | def __init__(self, repeat_cell_num, reduction_num, filters, stemfilter, class_num=100): 246 | super().__init__() 247 | 248 | self.stem = nn.Sequential( 249 | nn.Conv2d(3, stemfilter, 3, padding=1, bias=False), 250 | nn.BatchNorm2d(stemfilter) 251 | ) 252 | 253 | self.prev_filters = stemfilter 254 | self.x_filters = 
stemfilter 255 | self.filters = filters 256 | 257 | self.cell_layers = self._make_layers(repeat_cell_num, reduction_num) 258 | 259 | self.relu = nn.ReLU() 260 | self.avg = nn.AdaptiveAvgPool2d(1) 261 | self.fc = nn.Linear(self.filters * 6, class_num) 262 | 263 | 264 | def _make_normal(self, block, repeat, output): 265 | """make normal cell 266 | Args: 267 | block: cell type 268 | repeat: number of repeated normal cell 269 | output: output filters for each branch in normal cell 270 | Returns: 271 | stacked normal cells 272 | """ 273 | 274 | layers = [] 275 | for r in range(repeat): 276 | layers.append(block(self.x_filters, self.prev_filters, output)) 277 | self.prev_filters = self.x_filters 278 | self.x_filters = output * 6 #concatenate 6 branches 279 | 280 | return layers 281 | 282 | def _make_reduction(self, block, output): 283 | """make normal cell 284 | Args: 285 | block: cell type 286 | output: output filters for each branch in reduction cell 287 | Returns: 288 | reduction cell 289 | """ 290 | 291 | reduction = block(self.x_filters, self.prev_filters, output) 292 | self.prev_filters = self.x_filters 293 | self.x_filters = output * 4 #stack for 4 branches 294 | 295 | return reduction 296 | 297 | def _make_layers(self, repeat_cell_num, reduction_num): 298 | 299 | layers = [] 300 | for i in range(reduction_num): 301 | 302 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 303 | self.filters *= 2 304 | layers.append(self._make_reduction(ReductionCell, self.filters)) 305 | 306 | layers.extend(self._make_normal(NormalCell, repeat_cell_num, self.filters)) 307 | 308 | return nn.Sequential(*layers) 309 | 310 | 311 | def forward(self, x): 312 | 313 | x = self.stem(x) 314 | prev = None 315 | x, prev = self.cell_layers((x, prev)) 316 | x = self.relu(x) 317 | x = self.avg(x) 318 | x = x.view(x.size(0), -1) 319 | x = self.fc(x) 320 | 321 | return x 322 | 323 | 324 | def nasnet(): 325 | 326 | #stem filters must be 44, it's a pytorch workaround, cant change to other number 327 | return NasNetA(4, 2, 44, 44) 328 | 329 | -------------------------------------------------------------------------------- /models/preactresnet.py: -------------------------------------------------------------------------------- 1 | """preactresnet in pytorch 2 | 3 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 4 | 5 | Identity Mappings in Deep Residual Networks 6 | https://arxiv.org/abs/1603.05027 7 | """ 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | 13 | class PreActBasic(nn.Module): 14 | 15 | expansion = 1 16 | def __init__(self, in_channels, out_channels, stride): 17 | super().__init__() 18 | self.residual = nn.Sequential( 19 | nn.BatchNorm2d(in_channels), 20 | nn.ReLU(inplace=True), 21 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1), 22 | nn.BatchNorm2d(out_channels), 23 | nn.ReLU(inplace=True), 24 | nn.Conv2d(out_channels, out_channels * PreActBasic.expansion, kernel_size=3, padding=1) 25 | ) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_channels != out_channels * PreActBasic.expansion: 29 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBasic.expansion, 1, stride=stride) 30 | 31 | def forward(self, x): 32 | 33 | res = self.residual(x) 34 | shortcut = self.shortcut(x) 35 | 36 | return res + shortcut 37 | 38 | 39 | class PreActBottleNeck(nn.Module): 40 | 41 | expansion = 4 42 | def __init__(self, in_channels, out_channels, stride): 43 | super().__init__() 44 | 45 | self.residual 
= nn.Sequential( 46 | nn.BatchNorm2d(in_channels), 47 | nn.ReLU(inplace=True), 48 | nn.Conv2d(in_channels, out_channels, 1, stride=stride), 49 | 50 | nn.BatchNorm2d(out_channels), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(out_channels, out_channels, 3, padding=1), 53 | 54 | nn.BatchNorm2d(out_channels), 55 | nn.ReLU(inplace=True), 56 | nn.Conv2d(out_channels, out_channels * PreActBottleNeck.expansion, 1) 57 | ) 58 | 59 | self.shortcut = nn.Sequential() 60 | 61 | if stride != 1 or in_channels != out_channels * PreActBottleNeck.expansion: 62 | self.shortcut = nn.Conv2d(in_channels, out_channels * PreActBottleNeck.expansion, 1, stride=stride) 63 | 64 | def forward(self, x): 65 | 66 | res = self.residual(x) 67 | shortcut = self.shortcut(x) 68 | 69 | return res + shortcut 70 | 71 | class PreActResNet(nn.Module): 72 | 73 | def __init__(self, block, num_block, class_num=100): 74 | super().__init__() 75 | self.input_channels = 64 76 | 77 | self.pre = nn.Sequential( 78 | nn.Conv2d(3, 64, 3, padding=1), 79 | nn.BatchNorm2d(64), 80 | nn.ReLU(inplace=True) 81 | ) 82 | 83 | self.stage1 = self._make_layers(block, num_block[0], 64, 1) 84 | self.stage2 = self._make_layers(block, num_block[1], 128, 2) 85 | self.stage3 = self._make_layers(block, num_block[2], 256, 2) 86 | self.stage4 = self._make_layers(block, num_block[3], 512, 2) 87 | 88 | self.linear = nn.Linear(self.input_channels, class_num) 89 | 90 | def _make_layers(self, block, block_num, out_channels, stride): 91 | layers = [] 92 | 93 | layers.append(block(self.input_channels, out_channels, stride)) 94 | self.input_channels = out_channels * block.expansion 95 | 96 | while block_num - 1: 97 | layers.append(block(self.input_channels, out_channels, 1)) 98 | self.input_channels = out_channels * block.expansion 99 | block_num -= 1 100 | 101 | return nn.Sequential(*layers) 102 | 103 | def forward(self, x): 104 | x = self.pre(x) 105 | 106 | x = self.stage1(x) 107 | x = self.stage2(x) 108 | x = self.stage3(x) 109 | x = self.stage4(x) 110 | 111 | x = F.adaptive_avg_pool2d(x, 1) 112 | x = x.view(x.size(0), -1) 113 | x = self.linear(x) 114 | 115 | return x 116 | 117 | def preactresnet18(): 118 | return PreActResNet(PreActBasic, [2, 2, 2, 2]) 119 | 120 | def preactresnet34(): 121 | return PreActResNet(PreActBasic, [3, 4, 6, 3]) 122 | 123 | def preactresnet50(): 124 | return PreActResNet(PreActBottleNeck, [3, 4, 6, 3]) 125 | 126 | def preactresnet101(): 127 | return PreActResNet(PreActBottleNeck, [3, 4, 23, 3]) 128 | 129 | def preactresnet152(): 130 | return PreActResNet(PreActBottleNeck, [3, 8, 36, 3]) 131 | 132 | -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | """resnet in pytorch 2 | 3 | 4 | 5 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
6 |
7 |     Deep Residual Learning for Image Recognition
8 |     https://arxiv.org/abs/1512.03385v1
9 | """
10 |
11 | import torch
12 | import torch.nn as nn
13 |
14 | class BasicBlock(nn.Module):
15 |     """Basic Block for resnet 18 and resnet 34
16 |
17 |     """
18 |
19 |     #BasicBlock and BottleNeck blocks
20 |     #have different output sizes;
21 |     #we use the class attribute expansion
22 |     #to distinguish them
23 |     expansion = 1
24 |
25 |     def __init__(self, in_channels, out_channels, stride=1):
26 |         super().__init__()
27 |
28 |         #residual function
29 |         self.residual_function = nn.Sequential(
30 |             nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
31 |             nn.BatchNorm2d(out_channels),
32 |             nn.ReLU(inplace=True),
33 |             nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
34 |             nn.BatchNorm2d(out_channels * BasicBlock.expansion)
35 |         )
36 |
37 |         #shortcut
38 |         self.shortcut = nn.Sequential()
39 |
40 |         #when the shortcut output dimension is not the same as the residual function's,
41 |         #use a 1*1 convolution to match the dimension
42 |         if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
43 |             self.shortcut = nn.Sequential(
44 |                 nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
45 |                 nn.BatchNorm2d(out_channels * BasicBlock.expansion)
46 |             )
47 |
48 |     def forward(self, x):
49 |         return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
50 |
51 | class BottleNeck(nn.Module):
52 |     """Residual block for resnet over 50 layers
53 |
54 |     """
55 |     expansion = 4
56 |     def __init__(self, in_channels, out_channels, stride=1):
57 |         super().__init__()
58 |         self.residual_function = nn.Sequential(
59 |             nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
60 |             nn.BatchNorm2d(out_channels),
61 |             nn.ReLU(inplace=True),
62 |             nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
63 |             nn.BatchNorm2d(out_channels),
64 |             nn.ReLU(inplace=True),
65 |             nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False),
66 |             nn.BatchNorm2d(out_channels * BottleNeck.expansion),
67 |         )
68 |
69 |         self.shortcut = nn.Sequential()
70 |
71 |         if stride != 1 or in_channels != out_channels * BottleNeck.expansion:
72 |             self.shortcut = nn.Sequential(
73 |                 nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
74 |                 nn.BatchNorm2d(out_channels * BottleNeck.expansion)
75 |             )
76 |
77 |     def forward(self, x):
78 |         return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x))
79 |
80 | class ResNet(nn.Module):
81 |
82 |     def __init__(self, block, num_block, num_classes=100):
83 |         super().__init__()
84 |
85 |         self.in_channels = 64
86 |
87 |         self.conv1 = nn.Sequential(
88 |             nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
89 |             nn.BatchNorm2d(64),
90 |             nn.ReLU(inplace=True))
91 |         #we use a different input size than the original paper,
92 |         #so conv2_x's stride is 1
93 |         self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
94 |         self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
95 |         self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
96 |         self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
97 |         self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
98 |         self.fc = nn.Linear(512 * block.expansion, num_classes)
99 |
100 |     def _make_layer(self, block, out_channels, num_blocks, stride):
101 |         """make resnet layers (by 'layer' we do not mean a single neural
102 |         network layer, e.g. a conv layer; one layer here may
103 |         contain more than one residual block)
104 |
105 |         Args:
106 |             block: block type, basic block or bottle neck block
107 |             out_channels: output depth channel number of this layer
108 |             num_blocks: how many blocks per layer
109 |             stride: the stride of the first block of this layer
110 |
111 |         Return:
112 |             return a resnet layer
113 |         """
114 |
115 |         # we have num_blocks blocks per layer; the stride of the first
116 |         # block could be 1 or 2, all other blocks always use stride 1
117 |         strides = [stride] + [1] * (num_blocks - 1)
118 |         layers = []
119 |         for stride in strides:
120 |             layers.append(block(self.in_channels, out_channels, stride))
121 |             self.in_channels = out_channels * block.expansion
122 |
123 |         return nn.Sequential(*layers)
124 |
125 |     def forward(self, x):
126 |         output = self.conv1(x)
127 |         output = self.conv2_x(output)
128 |         output = self.conv3_x(output)
129 |         output = self.conv4_x(output)
130 |         output = self.conv5_x(output)
131 |         output = self.avg_pool(output)
132 |         output = output.view(output.size(0), -1)
133 |         output = self.fc(output)
134 |
135 |         return output
136 |
137 | def resnet18():
138 |     """ return a ResNet 18 object
139 |     """
140 |     return ResNet(BasicBlock, [2, 2, 2, 2])
141 |
142 | def resnet34():
143 |     """ return a ResNet 34 object
144 |     """
145 |     return ResNet(BasicBlock, [3, 4, 6, 3])
146 |
147 | def resnet50():
148 |     """ return a ResNet 50 object
149 |     """
150 |     return ResNet(BottleNeck, [3, 4, 6, 3])
151 |
152 | def resnet101():
153 |     """ return a ResNet 101 object
154 |     """
155 |     return ResNet(BottleNeck, [3, 4, 23, 3])
156 |
157 | def resnet152():
158 |     """ return a ResNet 152 object
159 |     """
160 |     return ResNet(BottleNeck, [3, 8, 36, 3])
161 |
162 |
163 |
164 |
--------------------------------------------------------------------------------
/models/resnext.py:
--------------------------------------------------------------------------------
1 | """resnext in pytorch
2 |
3 |
4 |
5 | [1] Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He.
6 |
7 |     Aggregated Residual Transformations for Deep Neural Networks
8 |     https://arxiv.org/abs/1611.05431
9 | """
10 |
11 | import math
12 | import torch
13 | import torch.nn as nn
14 | import torch.nn.functional as F
15 |
16 | #only implements ResNext bottleneck c
17 |
18 |
19 | #"""This strategy exposes a new dimension, which we call “cardinality”
20 | #(the size of the set of transformations), as an essential factor
21 | #in addition to the dimensions of depth and width."""
22 | CARDINALITY = 32
23 | DEPTH = 4
24 | BASEWIDTH = 64
25 |
26 | #"""The grouped convolutional layer in Fig. 3(c) performs 32 groups
27 | #of convolutions whose input and output channels are 4-dimensional.
28 | #The grouped convolutional layer concatenates them as the outputs
29 | #of the layer."""
30 |
31 | class ResNextBottleNeckC(nn.Module):
32 |
33 |     def __init__(self, in_channels, out_channels, stride):
34 |         super().__init__()
35 |
36 |         C = CARDINALITY #how many groups a feature map is split into
37 |
38 |         #"""We note that the input/output width of the template is fixed as
39 |         #256-d (Fig. 3), and all widths are doubled each time
40 |         #when the feature map is subsampled
41 |         #(see Table 1)."""
42 |         D = int(DEPTH * out_channels / BASEWIDTH) #number of channels per group
43 |         self.split_transforms = nn.Sequential(
44 |             nn.Conv2d(in_channels, C * D, kernel_size=1, groups=C, bias=False),
45 |             nn.BatchNorm2d(C * D),
46 |             nn.ReLU(inplace=True),
47 |             nn.Conv2d(C * D, C * D, kernel_size=3, stride=stride, groups=C, padding=1, bias=False),
48 |             nn.BatchNorm2d(C * D),
49 |             nn.ReLU(inplace=True),
50 |             nn.Conv2d(C * D, out_channels * 4, kernel_size=1, bias=False),
51 |             nn.BatchNorm2d(out_channels * 4),
52 |         )
53 |
54 |         self.shortcut = nn.Sequential()
55 |
56 |         if stride != 1 or in_channels != out_channels * 4:
57 |             self.shortcut = nn.Sequential(
58 |                 nn.Conv2d(in_channels, out_channels * 4, stride=stride, kernel_size=1, bias=False),
59 |                 nn.BatchNorm2d(out_channels * 4)
60 |             )
61 |
62 |     def forward(self, x):
63 |         return F.relu(self.split_transforms(x) + self.shortcut(x))
64 |
65 | class ResNext(nn.Module):
66 |
67 |     def __init__(self, block, num_blocks, class_num=100):
68 |         super().__init__()
69 |         self.in_channels = 64
70 |
71 |         self.conv1 = nn.Sequential(
72 |             nn.Conv2d(3, 64, 3, stride=1, padding=1, bias=False),
73 |             nn.BatchNorm2d(64),
74 |             nn.ReLU(inplace=True)
75 |         )
76 |
77 |         self.conv2 = self._make_layer(block, num_blocks[0], 64, 1)
78 |         self.conv3 = self._make_layer(block, num_blocks[1], 128, 2)
79 |         self.conv4 = self._make_layer(block, num_blocks[2], 256, 2)
80 |         self.conv5 = self._make_layer(block, num_blocks[3], 512, 2)
81 |         self.avg = nn.AdaptiveAvgPool2d((1, 1))
82 |         self.fc = nn.Linear(512 * 4, class_num) #use the class_num argument instead of a hardcoded 100
83 |
84 |     def forward(self, x):
85 |         x = self.conv1(x)
86 |         x = self.conv2(x)
87 |         x = self.conv3(x)
88 |         x = self.conv4(x)
89 |         x = self.conv5(x)
90 |         x = self.avg(x)
91 |         x = x.view(x.size(0), -1)
92 |         x = self.fc(x)
93 |         return x
94 |
95 |     def _make_layer(self, block, num_block, out_channels, stride):
96 |         """Build a resnext layer
97 |         Args:
98 |             block: block type (default resnext bottleneck c)
99 |             num_block: number of blocks per layer
100 |             out_channels: output channels per block
101 |             stride: block stride
102 |
103 |         Returns:
104 |             a resnext layer
105 |         """
106 |         strides = [stride] + [1] * (num_block - 1)
107 |         layers = []
108 |         for stride in strides:
109 |             layers.append(block(self.in_channels, out_channels, stride))
110 |             self.in_channels = out_channels * 4
111 |
112 |         return nn.Sequential(*layers)
113 |
114 | def resnext50():
115 |     """ return a resnext50(c32x4d) network
116 |     """
117 |     return ResNext(ResNextBottleNeckC, [3, 4, 6, 3])
118 |
119 | def resnext101():
120 |     """ return a resnext101(c32x4d) network
121 |     """
122 |     return ResNext(ResNextBottleNeckC, [3, 4, 23, 3])
123 |
124 | def resnext152():
125 |     """ return a resnext152(c32x4d) network
126 |     """
127 |     return ResNext(ResNextBottleNeckC, [3, 4, 36, 3])
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/models/rir.py:
--------------------------------------------------------------------------------
1 | """resnet in resnet in pytorch
2 |
3 |
4 |
5 | [1] Sasha Targ, Diogo Almeida, Kevin Lyman.
6 | 7 | Resnet in Resnet: Generalizing Residual Architectures 8 | https://arxiv.org/abs/1603.08029v1 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | #geralized 15 | class ResnetInit(nn.Module): 16 | def __init__(self, in_channel, out_channel, stride): 17 | super().__init__() 18 | 19 | #"""The modular unit of the generalized residual network architecture is a 20 | #generalized residual block consisting of parallel states for a residual stream, 21 | #r, which contains identity shortcut connections and is similar to the structure 22 | #of a residual block from the original ResNet with a single convolutional layer 23 | #(parameters W l,r→r ) 24 | self.residual_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 25 | 26 | #"""and a transient stream, t, which is a standard convolutional layer 27 | #(W l,t→t ).""" 28 | self.transient_stream_conv = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 29 | 30 | #"""Two additional sets of convolutional filters in each block (W l,r→t , W l,t→r ) 31 | #also transfer information across streams.""" 32 | self.residual_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 33 | 34 | #"""We use equal numbers of filters for the residual and transient streams of the 35 | #generalized residual network, but optimizing this hyperparameter could lead to 36 | #further potential improvements.""" 37 | self.transient_stream_conv_across = nn.Conv2d(in_channel, out_channel, 3, padding=1, stride=stride) 38 | 39 | self.residual_bn_relu = nn.Sequential( 40 | nn.BatchNorm2d(out_channel), 41 | nn.ReLU(inplace=True) 42 | ) 43 | 44 | self.transient_bn_relu = nn.Sequential( 45 | nn.BatchNorm2d(out_channel), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | #"""The form of the shortcut connection can be an identity function with 50 | #the appropriate padding or a projection as in He et al. 
(2015b).""" 51 | self.short_cut = nn.Sequential() 52 | if in_channel != out_channel or stride != 1: 53 | self.short_cut = nn.Sequential( 54 | nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 55 | ) 56 | 57 | 58 | def forward(self, x): 59 | x_residual, x_transient = x 60 | residual_r_r = self.residual_stream_conv(x_residual) 61 | residual_r_t = self.residual_stream_conv_across(x_residual) 62 | residual_shortcut = self.short_cut(x_residual) 63 | 64 | transient_t_t = self.transient_stream_conv(x_transient) 65 | transient_t_r = self.transient_stream_conv_across(x_transient) 66 | 67 | #transient_t_t = self.transient_stream_conv(x_residual) 68 | #transient_t_r = self.transient_stream_conv_across(x_residual) 69 | #"""Same-stream and cross-stream activations are summed (along with the 70 | #shortcut connection for the residual stream) before applying batch 71 | #normalization and ReLU nonlinearities (together σ) to get the output 72 | #states of the block (Equation 1) (Ioffe & Szegedy, 2015).""" 73 | x_residual = self.residual_bn_relu(residual_r_r + transient_t_r + residual_shortcut) 74 | x_transient = self.transient_bn_relu(residual_r_t + transient_t_t) 75 | 76 | return x_residual, x_transient 77 | 78 | 79 | 80 | class RiRBlock(nn.Module): 81 | def __init__(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 82 | super().__init__() 83 | self.resnetinit = self._make_layers(in_channel, out_channel, layer_num, stride) 84 | 85 | #self.short_cut = nn.Sequential() 86 | #if stride != 1 or in_channel != out_channel: 87 | # self.short_cut = nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride) 88 | 89 | def forward(self, x): 90 | x_residual, x_transient = self.resnetinit(x) 91 | #x_residual = x_residual + self.short_cut(x[0]) 92 | #x_transient = x_transient + self.short_cut(x[1]) 93 | 94 | return (x_residual, x_transient) 95 | 96 | #"""Replacing each of the convolutional layers within a residual 97 | #block from the original ResNet (Figure 1a) with a generalized residual block 98 | #(Figure 1b) leads us to a new architecture we call ResNet in ResNet (RiR) 99 | #(Figure 1d).""" 100 | def _make_layers(self, in_channel, out_channel, layer_num, stride, layer=ResnetInit): 101 | strides = [stride] + [1] * (layer_num - 1) 102 | layers = nn.Sequential() 103 | for index, s in enumerate(strides): 104 | layers.add_module("generalized layers{}".format(index), layer(in_channel, out_channel, s)) 105 | in_channel = out_channel 106 | 107 | return layers 108 | 109 | class ResnetInResneet(nn.Module): 110 | def __init__(self, num_classes=100): 111 | super().__init__() 112 | base = int(96 / 2) 113 | self.residual_pre_conv = nn.Sequential( 114 | nn.Conv2d(3, base, 3, padding=1), 115 | nn.BatchNorm2d(base), 116 | nn.ReLU(inplace=True) 117 | ) 118 | self.transient_pre_conv = nn.Sequential( 119 | nn.Conv2d(3, base, 3, padding=1), 120 | nn.BatchNorm2d(base), 121 | nn.ReLU(inplace=True) 122 | ) 123 | 124 | self.rir1 = RiRBlock(base, base, 2, 1) 125 | self.rir2 = RiRBlock(base, base, 2, 1) 126 | self.rir3 = RiRBlock(base, base * 2, 2, 2) 127 | self.rir4 = RiRBlock(base * 2, base * 2, 2, 1) 128 | self.rir5 = RiRBlock(base * 2, base * 2, 2, 1) 129 | self.rir6 = RiRBlock(base * 2, base * 4, 2, 2) 130 | self.rir7 = RiRBlock(base * 4, base * 4, 2, 1) 131 | self.rir8 = RiRBlock(base * 4, base * 4, 2, 1) 132 | 133 | self.conv1 = nn.Sequential( 134 | nn.Conv2d(384, num_classes, kernel_size=3, stride=2), #without this convolution, loss will soon be nan 135 | nn.BatchNorm2d(num_classes), 136 | 
nn.ReLU(inplace=True),
137 |         )
138 |
139 |         self.classifier = nn.Sequential(
140 |             nn.Linear(900, 450), #900 = num_classes * 3 * 3, assumes the default num_classes=100 and 32x32 inputs
141 |             nn.ReLU(),
142 |             nn.Dropout(),
143 |             nn.Linear(450, num_classes), #use the num_classes argument instead of a hardcoded 100
144 |         )
145 |
146 |         self._weight_init()
147 |
148 |     def forward(self, x):
149 |         x_residual = self.residual_pre_conv(x)
150 |         x_transient = self.transient_pre_conv(x)
151 |
152 |         x_residual, x_transient = self.rir1((x_residual, x_transient))
153 |         x_residual, x_transient = self.rir2((x_residual, x_transient))
154 |         x_residual, x_transient = self.rir3((x_residual, x_transient))
155 |         x_residual, x_transient = self.rir4((x_residual, x_transient))
156 |         x_residual, x_transient = self.rir5((x_residual, x_transient))
157 |         x_residual, x_transient = self.rir6((x_residual, x_transient))
158 |         x_residual, x_transient = self.rir7((x_residual, x_transient))
159 |         x_residual, x_transient = self.rir8((x_residual, x_transient))
160 |         h = torch.cat([x_residual, x_transient], 1)
161 |         h = self.conv1(h)
162 |         h = h.view(h.size()[0], -1)
163 |         h = self.classifier(h)
164 |
165 |         return h
166 |
167 |     def _weight_init(self):
168 |         for m in self.modules():
169 |             if isinstance(m, nn.Conv2d):
170 |                 torch.nn.init.kaiming_normal_(m.weight) #in-place variant; plain kaiming_normal is deprecated
171 |                 m.bias.data.fill_(0.01)
172 |
173 |
174 | def resnet_in_resnet():
175 |     return ResnetInResneet()
176 |
--------------------------------------------------------------------------------
/models/senet.py:
--------------------------------------------------------------------------------
1 | """senet in pytorch
2 |
3 |
4 |
5 | [1] Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu
6 |
7 |     Squeeze-and-Excitation Networks
8 |     https://arxiv.org/abs/1709.01507
9 | """
10 |
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 |
15 | class BasicResidualSEBlock(nn.Module):
16 |
17 |     expansion = 1
18 |
19 |     def __init__(self, in_channels, out_channels, stride, r=16):
20 |         super().__init__()
21 |
22 |         self.residual = nn.Sequential(
23 |             nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1),
24 |             nn.BatchNorm2d(out_channels),
25 |             nn.ReLU(inplace=True),
26 |
27 |             nn.Conv2d(out_channels, out_channels * self.expansion, 3, padding=1),
28 |             nn.BatchNorm2d(out_channels * self.expansion),
29 |             nn.ReLU(inplace=True)
30 |         )
31 |
32 |         self.shortcut = nn.Sequential()
33 |         if stride != 1 or in_channels != out_channels * self.expansion:
34 |             self.shortcut = nn.Sequential(
35 |                 nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride),
36 |                 nn.BatchNorm2d(out_channels * self.expansion)
37 |             )
38 |
39 |         self.squeeze = nn.AdaptiveAvgPool2d(1)
40 |         self.excitation = nn.Sequential(
41 |             nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r),
42 |             nn.ReLU(inplace=True),
43 |             nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion),
44 |             nn.Sigmoid()
45 |         )
46 |
47 |     def forward(self, x):
48 |         shortcut = self.shortcut(x)
49 |         residual = self.residual(x)
50 |
51 |         squeeze = self.squeeze(residual)
52 |         squeeze = squeeze.view(squeeze.size(0), -1)
53 |         excitation = self.excitation(squeeze)
54 |         excitation = excitation.view(residual.size(0), residual.size(1), 1, 1)
55 |
56 |         x = residual * excitation.expand_as(residual) + shortcut
57 |
58 |         return F.relu(x)
59 |
60 | class BottleneckResidualSEBlock(nn.Module):
61 |
62 |     expansion = 4
63 |
64 |     def __init__(self, in_channels, out_channels, stride, r=16):
65 |         super().__init__()
66 |
67 |         self.residual = nn.Sequential(
68 |             nn.Conv2d(in_channels, out_channels, 1),
69 |             nn.BatchNorm2d(out_channels),
70 |
nn.ReLU(inplace=True), 71 | 72 | nn.Conv2d(out_channels, out_channels, 3, stride=stride, padding=1), 73 | nn.BatchNorm2d(out_channels), 74 | nn.ReLU(inplace=True), 75 | 76 | nn.Conv2d(out_channels, out_channels * self.expansion, 1), 77 | nn.BatchNorm2d(out_channels * self.expansion), 78 | nn.ReLU(inplace=True) 79 | ) 80 | 81 | self.squeeze = nn.AdaptiveAvgPool2d(1) 82 | self.excitation = nn.Sequential( 83 | nn.Linear(out_channels * self.expansion, out_channels * self.expansion // r), 84 | nn.ReLU(inplace=True), 85 | nn.Linear(out_channels * self.expansion // r, out_channels * self.expansion), 86 | nn.Sigmoid() 87 | ) 88 | 89 | self.shortcut = nn.Sequential() 90 | if stride != 1 or in_channels != out_channels * self.expansion: 91 | self.shortcut = nn.Sequential( 92 | nn.Conv2d(in_channels, out_channels * self.expansion, 1, stride=stride), 93 | nn.BatchNorm2d(out_channels * self.expansion) 94 | ) 95 | 96 | def forward(self, x): 97 | 98 | shortcut = self.shortcut(x) 99 | 100 | residual = self.residual(x) 101 | squeeze = self.squeeze(residual) 102 | squeeze = squeeze.view(squeeze.size(0), -1) 103 | excitation = self.excitation(squeeze) 104 | excitation = excitation.view(residual.size(0), residual.size(1), 1, 1) 105 | 106 | x = residual * excitation.expand_as(residual) + shortcut 107 | 108 | return F.relu(x) 109 | 110 | class SEResNet(nn.Module): 111 | 112 | def __init__(self, block, block_num, class_num=100): 113 | super().__init__() 114 | 115 | self.in_channels = 64 116 | 117 | self.pre = nn.Sequential( 118 | nn.Conv2d(3, 64, 3, padding=1), 119 | nn.BatchNorm2d(64), 120 | nn.ReLU(inplace=True) 121 | ) 122 | 123 | self.stage1 = self._make_stage(block, block_num[0], 64, 1) 124 | self.stage2 = self._make_stage(block, block_num[1], 128, 2) 125 | self.stage3 = self._make_stage(block, block_num[2], 256, 2) 126 | self.stage4 = self._make_stage(block, block_num[3], 512, 2) 127 | 128 | self.linear = nn.Linear(self.in_channels, class_num) 129 | 130 | def forward(self, x): 131 | x = self.pre(x) 132 | 133 | x = self.stage1(x) 134 | x = self.stage2(x) 135 | x = self.stage3(x) 136 | x = self.stage4(x) 137 | 138 | x = F.adaptive_avg_pool2d(x, 1) 139 | x = x.view(x.size(0), -1) 140 | 141 | x = self.linear(x) 142 | 143 | return x 144 | 145 | 146 | def _make_stage(self, block, num, out_channels, stride): 147 | 148 | layers = [] 149 | layers.append(block(self.in_channels, out_channels, stride)) 150 | self.in_channels = out_channels * block.expansion 151 | 152 | while num - 1: 153 | layers.append(block(self.in_channels, out_channels, 1)) 154 | num -= 1 155 | 156 | return nn.Sequential(*layers) 157 | 158 | def seresnet18(): 159 | return SEResNet(BasicResidualSEBlock, [2, 2, 2, 2]) 160 | 161 | def seresnet34(): 162 | return SEResNet(BasicResidualSEBlock, [3, 4, 6, 3]) 163 | 164 | def seresnet50(): 165 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 6, 3]) 166 | 167 | def seresnet101(): 168 | return SEResNet(BottleneckResidualSEBlock, [3, 4, 23, 3]) 169 | 170 | def seresnet152(): 171 | return SEResNet(BottleneckResidualSEBlock, [3, 8, 36, 3]) 172 | -------------------------------------------------------------------------------- /models/shufflenet.py: -------------------------------------------------------------------------------- 1 | """shufflenet in pytorch 2 | 3 | 4 | 5 | [1] Xiangyu Zhang, Xinyu Zhou, Mengxiao Lin, Jian Sun. 
6 |
7 |     ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices
8 |     https://arxiv.org/abs/1707.01083v2
9 | """
10 |
11 | from functools import partial
12 |
13 | import torch
14 | import torch.nn as nn
15 |
16 |
17 | class BasicConv2d(nn.Module):
18 |
19 |     def __init__(self, input_channels, output_channels, kernel_size, **kwargs):
20 |         super().__init__()
21 |         self.conv = nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs)
22 |         self.bn = nn.BatchNorm2d(output_channels)
23 |         self.relu = nn.ReLU(inplace=True)
24 |
25 |     def forward(self, x):
26 |         x = self.conv(x)
27 |         x = self.bn(x)
28 |         x = self.relu(x)
29 |         return x
30 |
31 | class ChannelShuffle(nn.Module):
32 |
33 |     def __init__(self, groups):
34 |         super().__init__()
35 |         self.groups = groups
36 |
37 |     def forward(self, x):
38 |         batchsize, channels, height, width = x.data.size()
39 |         channels_per_group = int(channels / self.groups)
40 |
41 |         #"""suppose a convolutional layer with g groups whose output has
42 |         #g x n channels; we first reshape the output channel dimension
43 |         #into (g, n)"""
44 |         x = x.view(batchsize, self.groups, channels_per_group, height, width)
45 |
46 |         #"""transposing and then flattening it back as the input of next layer."""
47 |         x = x.transpose(1, 2).contiguous()
48 |         x = x.view(batchsize, -1, height, width)
49 |
50 |         return x
51 |
52 | class DepthwiseConv2d(nn.Module):
53 |
54 |     def __init__(self, input_channels, output_channels, kernel_size, **kwargs):
55 |         super().__init__()
56 |         self.depthwise = nn.Sequential(
57 |             nn.Conv2d(input_channels, output_channels, kernel_size, **kwargs),
58 |             nn.BatchNorm2d(output_channels)
59 |         )
60 |
61 |     def forward(self, x):
62 |         return self.depthwise(x)
63 |
64 | class PointwiseConv2d(nn.Module):
65 |     def __init__(self, input_channels, output_channels, **kwargs):
66 |         super().__init__()
67 |         self.pointwise = nn.Sequential(
68 |             nn.Conv2d(input_channels, output_channels, 1, **kwargs),
69 |             nn.BatchNorm2d(output_channels)
70 |         )
71 |
72 |     def forward(self, x):
73 |         return self.pointwise(x)
74 |
75 | class ShuffleNetUnit(nn.Module):
76 |
77 |     def __init__(self, input_channels, output_channels, stage, stride, groups):
78 |         super().__init__()
79 |
80 |         #"""Similar to [9], we set the number of bottleneck channels to 1/4
81 |         #of the output channels for each ShuffleNet unit."""
82 |         self.bottleneck = nn.Sequential(
83 |             PointwiseConv2d(
84 |                 input_channels,
85 |                 int(output_channels / 4),
86 |                 groups=groups
87 |             ),
88 |             nn.ReLU(inplace=True)
89 |         )
90 |
91 |         #"""Note that for Stage 2, we do not apply group convolution on the first pointwise
92 |         #layer because the number of input channels is relatively small."""
93 |         if stage == 2:
94 |             self.bottleneck = nn.Sequential(
95 |                 PointwiseConv2d(
96 |                     input_channels,
97 |                     int(output_channels / 4),
98 |                     groups=1 #no group convolution here, per the note above
99 |                 ),
100 |                 nn.ReLU(inplace=True)
101 |             )
102 |
103 |         self.channel_shuffle = ChannelShuffle(groups)
104 |
105 |         self.depthwise = DepthwiseConv2d(
106 |             int(output_channels / 4),
107 |             int(output_channels / 4),
108 |             3,
109 |             groups=int(output_channels / 4),
110 |             stride=stride,
111 |             padding=1
112 |         )
113 |
114 |         self.expand = PointwiseConv2d(
115 |             int(output_channels / 4),
116 |             output_channels,
117 |             groups=groups
118 |         )
119 |
120 |         self.relu = nn.ReLU(inplace=True)
121 |         self.fusion = self._add
122 |         self.shortcut = nn.Sequential()
123 |
124 |         #"""As for the case where ShuffleNet is applied with stride,
125 |         #we simply make two modifications (see Fig 2 (c)):
126 |         #(i) add a 3 × 3 average pooling on the shortcut path;
127 |         #(ii) replace the element-wise addition with channel concatenation,
128 |         #which makes it easy to enlarge channel dimension with little extra
129 |         #computation cost."""
130 |         if stride != 1 or input_channels != output_channels:
131 |             self.shortcut = nn.AvgPool2d(3, stride=2, padding=1)
132 |
133 |             self.expand = PointwiseConv2d(
134 |                 int(output_channels / 4),
135 |                 output_channels - input_channels,
136 |                 groups=groups
137 |             )
138 |
139 |             self.fusion = self._cat
140 |
141 |     def _add(self, x, y):
142 |         return torch.add(x, y)
143 |
144 |     def _cat(self, x, y):
145 |         return torch.cat([x, y], dim=1)
146 |
147 |     def forward(self, x):
148 |         shortcut = self.shortcut(x)
149 |
150 |         shuffled = self.bottleneck(x)
151 |         shuffled = self.channel_shuffle(shuffled)
152 |         shuffled = self.depthwise(shuffled)
153 |         shuffled = self.expand(shuffled)
154 |
155 |         output = self.fusion(shortcut, shuffled)
156 |         output = self.relu(output)
157 |
158 |         return output
159 |
160 | class ShuffleNet(nn.Module):
161 |
162 |     def __init__(self, num_blocks, num_classes=100, groups=3):
163 |         super().__init__()
164 |
165 |         if groups == 1:
166 |             out_channels = [24, 144, 288, 567]
167 |         elif groups == 2:
168 |             out_channels = [24, 200, 400, 800]
169 |         elif groups == 3:
170 |             out_channels = [24, 240, 480, 960]
171 |         elif groups == 4:
172 |             out_channels = [24, 272, 544, 1088]
173 |         elif groups == 8:
174 |             out_channels = [24, 384, 768, 1536]
175 |
176 |         self.conv1 = BasicConv2d(3, out_channels[0], 3, padding=1, stride=1)
177 |         self.input_channels = out_channels[0]
178 |
179 |         self.stage2 = self._make_stage(
180 |             ShuffleNetUnit,
181 |             num_blocks[0],
182 |             out_channels[1],
183 |             stride=2,
184 |             stage=2,
185 |             groups=groups
186 |         )
187 |
188 |         self.stage3 = self._make_stage(
189 |             ShuffleNetUnit,
190 |             num_blocks[1],
191 |             out_channels[2],
192 |             stride=2,
193 |             stage=3,
194 |             groups=groups
195 |         )
196 |
197 |         self.stage4 = self._make_stage(
198 |             ShuffleNetUnit,
199 |             num_blocks[2],
200 |             out_channels[3],
201 |             stride=2,
202 |             stage=4,
203 |             groups=groups
204 |         )
205 |
206 |         self.avg = nn.AdaptiveAvgPool2d((1, 1))
207 |         self.fc = nn.Linear(out_channels[3], num_classes)
208 |
209 |     def forward(self, x):
210 |         x = self.conv1(x)
211 |         x = self.stage2(x)
212 |         x = self.stage3(x)
213 |         x = self.stage4(x)
214 |         x = self.avg(x)
215 |         x = x.view(x.size(0), -1)
216 |         x = self.fc(x)
217 |
218 |         return x
219 |
220 |     def _make_stage(self, block, num_blocks, output_channels, stride, stage, groups):
221 |         """make shufflenet stage
222 |
223 |         Args:
224 |             block: block type, shuffle unit
225 |             output_channels: output depth channel number of this stage
226 |             num_blocks: how many blocks per stage
227 |             stride: the stride of the first block of this stage
228 |             stage: stage index
229 |             groups: group number of group convolution
230 |         Return:
231 |             return a shuffle net stage
232 |         """
233 |         strides = [stride] + [1] * (num_blocks - 1)
234 |
235 |         layers = [] #use a separate name so the stage index argument is not shadowed
236 |
237 |         for stride in strides:
238 |             layers.append(
239 |                 block(
240 |                     self.input_channels,
241 |                     output_channels,
242 |                     stride=stride,
243 |                     stage=stage,
244 |                     groups=groups
245 |                 )
246 |             )
247 |             self.input_channels = output_channels
248 |
249 |         return nn.Sequential(*layers)
250 |
251 | def shufflenet():
252 |     return ShuffleNet([4, 8, 4])
253 |
254 |
255 |
256 |
257 |
--------------------------------------------------------------------------------
/models/shufflenetv2.py:
--------------------------------------------------------------------------------
1 |
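#a quick, hypothetical illustration (toy values, not from the paper) of the
#channel_shuffle operation defined below, for a (N, C, H, W) tensor:
#
#    x = torch.arange(6).view(1, 6, 1, 1)   #channels [0, 1, 2, 3, 4, 5]
#    y = channel_shuffle(x, 2)              #channels [0, 3, 1, 4, 2, 5]
#
#reshaping to (N, groups, C // groups, H, W), transposing the two middle
#axes and flattening back interleaves the channels of the two branches,
#letting information flow between them at no extra FLOP cost.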
"""shufflenetv2 in pytorch 2 | 3 | 4 | 5 | [1] Ningning Ma, Xiangyu Zhang, Hai-Tao Zheng, Jian Sun 6 | 7 | ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design 8 | https://arxiv.org/abs/1807.11164 9 | """ 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | def channel_split(x, split): 17 | """split a tensor into two pieces along channel dimension 18 | Args: 19 | x: input tensor 20 | split:(int) channel size for each pieces 21 | """ 22 | assert x.size(1) == split * 2 23 | return torch.split(x, split, dim=1) 24 | 25 | def channel_shuffle(x, groups): 26 | """channel shuffle operation 27 | Args: 28 | x: input tensor 29 | groups: input branch number 30 | """ 31 | 32 | batch_size, channels, height, width = x.size() 33 | channels_per_group = int(channels // groups) 34 | 35 | x = x.view(batch_size, groups, channels_per_group, height, width) 36 | x = x.transpose(1, 2).contiguous() 37 | x = x.view(batch_size, -1, height, width) 38 | 39 | return x 40 | 41 | class ShuffleUnit(nn.Module): 42 | 43 | def __init__(self, in_channels, out_channels, stride): 44 | super().__init__() 45 | 46 | self.stride = stride 47 | self.in_channels = in_channels 48 | self.out_channels = out_channels 49 | 50 | if stride != 1 or in_channels != out_channels: 51 | self.residual = nn.Sequential( 52 | nn.Conv2d(in_channels, in_channels, 1), 53 | nn.BatchNorm2d(in_channels), 54 | nn.ReLU(inplace=True), 55 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 56 | nn.BatchNorm2d(in_channels), 57 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 58 | nn.BatchNorm2d(int(out_channels / 2)), 59 | nn.ReLU(inplace=True) 60 | ) 61 | 62 | self.shortcut = nn.Sequential( 63 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 64 | nn.BatchNorm2d(in_channels), 65 | nn.Conv2d(in_channels, int(out_channels / 2), 1), 66 | nn.BatchNorm2d(int(out_channels / 2)), 67 | nn.ReLU(inplace=True) 68 | ) 69 | else: 70 | self.shortcut = nn.Sequential() 71 | 72 | in_channels = int(in_channels / 2) 73 | self.residual = nn.Sequential( 74 | nn.Conv2d(in_channels, in_channels, 1), 75 | nn.BatchNorm2d(in_channels), 76 | nn.ReLU(inplace=True), 77 | nn.Conv2d(in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels), 78 | nn.BatchNorm2d(in_channels), 79 | nn.Conv2d(in_channels, in_channels, 1), 80 | nn.BatchNorm2d(in_channels), 81 | nn.ReLU(inplace=True) 82 | ) 83 | 84 | 85 | def forward(self, x): 86 | 87 | if self.stride == 1 and self.out_channels == self.in_channels: 88 | shortcut, residual = channel_split(x, int(self.in_channels / 2)) 89 | else: 90 | shortcut = x 91 | residual = x 92 | 93 | shortcut = self.shortcut(shortcut) 94 | residual = self.residual(residual) 95 | x = torch.cat([shortcut, residual], dim=1) 96 | x = channel_shuffle(x, 2) 97 | 98 | return x 99 | 100 | class ShuffleNetV2(nn.Module): 101 | 102 | def __init__(self, ratio=1, class_num=100): 103 | super().__init__() 104 | if ratio == 0.5: 105 | out_channels = [48, 96, 192, 1024] 106 | elif ratio == 1: 107 | out_channels = [116, 232, 464, 1024] 108 | elif ratio == 1.5: 109 | out_channels = [176, 352, 704, 1024] 110 | elif ratio == 2: 111 | out_channels = [244, 488, 976, 2048] 112 | else: 113 | ValueError('unsupported ratio number') 114 | 115 | self.pre = nn.Sequential( 116 | nn.Conv2d(3, 24, 3, padding=1), 117 | nn.BatchNorm2d(24) 118 | ) 119 | 120 | self.stage2 = self._make_stage(24, out_channels[0], 3) 121 | self.stage3 = 
121 |         self.stage3 = self._make_stage(out_channels[0], out_channels[1], 7)
122 |         self.stage4 = self._make_stage(out_channels[1], out_channels[2], 3)
123 |         self.conv5 = nn.Sequential(
124 |             nn.Conv2d(out_channels[2], out_channels[3], 1),
125 |             nn.BatchNorm2d(out_channels[3]),
126 |             nn.ReLU(inplace=True)
127 |         )
128 |
129 |         self.fc = nn.Linear(out_channels[3], class_num)
130 |
131 |     def forward(self, x):
132 |         x = self.pre(x)
133 |         x = self.stage2(x)
134 |         x = self.stage3(x)
135 |         x = self.stage4(x)
136 |         x = self.conv5(x)
137 |         x = F.adaptive_avg_pool2d(x, 1)
138 |         x = x.view(x.size(0), -1)
139 |         x = self.fc(x)
140 |
141 |         return x
142 |
143 |     def _make_stage(self, in_channels, out_channels, repeat):
144 |         layers = []
145 |         layers.append(ShuffleUnit(in_channels, out_channels, 2))
146 |
147 |         while repeat:
148 |             layers.append(ShuffleUnit(out_channels, out_channels, 1))
149 |             repeat -= 1
150 |
151 |         return nn.Sequential(*layers)
152 |
153 | def shufflenetv2():
154 |     return ShuffleNetV2()
155 |
156 |
157 |
158 |
159 |
160 |
--------------------------------------------------------------------------------
/models/squeezenet.py:
--------------------------------------------------------------------------------
1 | """squeezenet in pytorch
2 |
3 |
4 |
5 | [1] Forrest N. Iandola, Song Han, Matthew W. Moskewicz, Khalid Ashraf, William J. Dally, Kurt Keutzer
6 |
7 |     SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size
8 |     https://arxiv.org/abs/1602.07360v4
9 | """
10 |
11 | import torch
12 | import torch.nn as nn
13 |
14 |
15 | class Fire(nn.Module):
16 |
17 |     def __init__(self, in_channel, out_channel, squeeze_channel):
18 |
19 |         super().__init__()
20 |         self.squeeze = nn.Sequential(
21 |             nn.Conv2d(in_channel, squeeze_channel, 1),
22 |             nn.BatchNorm2d(squeeze_channel),
23 |             nn.ReLU(inplace=True)
24 |         )
25 |
26 |         self.expand_1x1 = nn.Sequential(
27 |             nn.Conv2d(squeeze_channel, int(out_channel / 2), 1),
28 |             nn.BatchNorm2d(int(out_channel / 2)),
29 |             nn.ReLU(inplace=True)
30 |         )
31 |
32 |         self.expand_3x3 = nn.Sequential(
33 |             nn.Conv2d(squeeze_channel, int(out_channel / 2), 3, padding=1),
34 |             nn.BatchNorm2d(int(out_channel / 2)),
35 |             nn.ReLU(inplace=True)
36 |         )
37 |
38 |     def forward(self, x):
39 |
40 |         x = self.squeeze(x)
41 |         x = torch.cat([
42 |             self.expand_1x1(x),
43 |             self.expand_3x3(x)
44 |         ], 1)
45 |
46 |         return x
47 |
48 | class SqueezeNet(nn.Module):
49 |
50 |     """squeezenet with simple bypass"""
51 |     def __init__(self, class_num=100):
52 |
53 |         super().__init__()
54 |         self.stem = nn.Sequential(
55 |             nn.Conv2d(3, 96, 3, padding=1),
56 |             nn.BatchNorm2d(96),
57 |             nn.ReLU(inplace=True),
58 |             nn.MaxPool2d(2, 2)
59 |         )
60 |
61 |         self.fire2 = Fire(96, 128, 16)
62 |         self.fire3 = Fire(128, 128, 16)
63 |         self.fire4 = Fire(128, 256, 32)
64 |         self.fire5 = Fire(256, 256, 32)
65 |         self.fire6 = Fire(256, 384, 48)
66 |         self.fire7 = Fire(384, 384, 48)
67 |         self.fire8 = Fire(384, 512, 64)
68 |         self.fire9 = Fire(512, 512, 64)
69 |
70 |         self.conv10 = nn.Conv2d(512, class_num, 1)
71 |         self.avg = nn.AdaptiveAvgPool2d(1)
72 |         self.maxpool = nn.MaxPool2d(2, 2)
73 |
74 |     def forward(self, x):
75 |         x = self.stem(x)
76 |
77 |         f2 = self.fire2(x)
78 |         f3 = self.fire3(f2) + f2
79 |         f4 = self.fire4(f3)
80 |         f4 = self.maxpool(f4)
81 |
82 |         f5 = self.fire5(f4) + f4
83 |         f6 = self.fire6(f5)
84 |         f7 = self.fire7(f6) + f6
85 |         f8 = self.fire8(f7)
86 |         f8 = self.maxpool(f8)
87 |
88 |         f9 = self.fire9(f8)
89 |         c10 = self.conv10(f9)
90 |
91 |         x = self.avg(c10)
92 |         x = x.view(x.size(0), -1)
93 |
94 |         return x
95 |
96 | def squeezenet(class_num=100):
97 |     return
SqueezeNet(class_num=class_num) 98 | -------------------------------------------------------------------------------- /models/stochasticdepth.py: -------------------------------------------------------------------------------- 1 | """ 2 | resnet with stochastic depth 3 | 4 | [1] Gao Huang, Yu Sun, Zhuang Liu, Daniel Sedra, Kilian Weinberger 5 | Deep Networks with Stochastic Depth 6 | 7 | https://arxiv.org/abs/1603.09382v3 8 | """ 9 | import torch 10 | import torch.nn as nn 11 | from torch.distributions.bernoulli import Bernoulli 12 | import random 13 | 14 | 15 | class StochasticDepthBasicBlock(torch.jit.ScriptModule): 16 | 17 | expansion=1 18 | 19 | def __init__(self, p, in_channels, out_channels, stride=1): 20 | super().__init__() 21 | 22 | #self.p = torch.tensor(p).float() 23 | self.p = p 24 | self.residual = nn.Sequential( 25 | nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1), 26 | nn.BatchNorm2d(out_channels), 27 | nn.ReLU(inplace=True), 28 | nn.Conv2d(out_channels, out_channels * StochasticDepthBasicBlock.expansion, kernel_size=3, padding=1), 29 | nn.BatchNorm2d(out_channels) 30 | ) 31 | 32 | self.shortcut = nn.Sequential() 33 | 34 | if stride != 1 or in_channels != out_channels * StochasticDepthBasicBlock.expansion: 35 | self.shortcut = nn.Sequential( 36 | nn.Conv2d(in_channels, out_channels * StochasticDepthBasicBlock.expansion, kernel_size=1, stride=stride), 37 | nn.BatchNorm2d(out_channels) 38 | ) 39 | def survival(self): 40 | var = torch.bernoulli(torch.tensor(self.p).float()) 41 | return torch.equal(var, torch.tensor(1).float().to(var.device)) 42 | 43 | @torch.jit.script_method 44 | def forward(self, x): 45 | 46 | if self.training: 47 | if self.survival(): 48 | # official torch implementation 49 | # function ResidualDrop:updateOutput(input) 50 | # local skip_forward = self.skip:forward(input) 51 | # self.output:resizeAs(skip_forward):copy(skip_forward) 52 | # if self.train then 53 | # if self.gate then -- only compute convolutional output when gate is open 54 | # self.output:add(self.net:forward(input)) 55 | # end 56 | # else 57 | # self.output:add(self.net:forward(input):mul(1-self.deathRate)) 58 | # end 59 | # return self.output 60 | # end 61 | 62 | # paper: 63 | # Hl = ReLU(bl*fl(Hl−1) + id(Hl−1)). 
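                # put differently: during training the residual branch survives with
                # probability p, so its expected contribution is p * f(x); at test time
                # (the final else branch below) the residual is therefore scaled by self.p,
                #     E[H_l] = p_l * f_l(H_{l-1}) + H_{l-1}
                # keeping train-time and test-time activations on the same scale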
64 |
65 |                 # paper and their official implementation are different:
66 |                 # the paper uses relu after the output,
67 |                 # the official implementation doesn't
68 |                 #
69 |                 # other implementations which use relu:
70 |                 # https://github.com/jiweeo/pytorch-stochastic-depth/blob/a6f95aaffee82d273c1cd73d9ed6ef0718c6683d/models/resnet.py
71 |                 # https://github.com/dblN/stochastic_depth_keras/blob/master/train.py
72 |
73 |                 # implementations which don't use relu:
74 |                 # https://github.com/transcranial/stochastic-depth/blob/master/stochastic-depth.ipynb
75 |                 # https://github.com/shamangary/Pytorch-Stochastic-Depth-Resnet/blob/master/TYY_stodepth_lineardecay.py
76 |
77 |                 # I will just stick with the official implementation; I don't think
78 |                 # adding a relu after the residual will affect the network's
79 |                 # performance too much
80 |                 x = self.residual(x) + self.shortcut(x)
81 |             else:
82 |                 # If bl = 0, the ResBlock reduces to the identity function
83 |                 x = self.shortcut(x)
84 |
85 |         else:
86 |             x = self.residual(x) * self.p + self.shortcut(x)
87 |
88 |         return x
89 |
90 |
91 | class StochasticDepthBottleNeck(torch.jit.ScriptModule):
92 |     """Residual block for resnet over 50 layers
93 |
94 |     """
95 |     expansion = 4
96 |     def __init__(self, p, in_channels, out_channels, stride=1):
97 |         super().__init__()
98 |
99 |         self.p = p
100 |         self.residual = nn.Sequential(
101 |             nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
102 |             nn.BatchNorm2d(out_channels),
103 |             nn.ReLU(inplace=True),
104 |             nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False),
105 |             nn.BatchNorm2d(out_channels),
106 |             nn.ReLU(inplace=True),
107 |             nn.Conv2d(out_channels, out_channels * StochasticDepthBottleNeck.expansion, kernel_size=1, bias=False),
108 |             nn.BatchNorm2d(out_channels * StochasticDepthBottleNeck.expansion),
109 |         )
110 |
111 |         self.shortcut = nn.Sequential()
112 |
113 |         if stride != 1 or in_channels != out_channels * StochasticDepthBottleNeck.expansion:
114 |             self.shortcut = nn.Sequential(
115 |                 nn.Conv2d(in_channels, out_channels * StochasticDepthBottleNeck.expansion, stride=stride, kernel_size=1, bias=False),
116 |                 nn.BatchNorm2d(out_channels * StochasticDepthBottleNeck.expansion)
117 |             )
118 |
119 |     def survival(self):
120 |         var = torch.bernoulli(torch.tensor(self.p).float())
121 |         return torch.equal(var, torch.tensor(1).float().to(var.device))
122 |
123 |     @torch.jit.script_method
124 |     def forward(self, x):
125 |
126 |         if self.training:
127 |             if self.survival():
128 |                 x = self.residual(x) + self.shortcut(x)
129 |             else:
130 |                 x = self.shortcut(x)
131 |         else:
132 |             x = self.residual(x) * self.p + self.shortcut(x)
133 |
134 |         return x
135 |
136 | class StochasticDepthResNet(nn.Module):
137 |
138 |     def __init__(self, block, num_block, num_classes=100):
139 |         super().__init__()
140 |
141 |         self.in_channels = 64
142 |         self.conv1 = nn.Sequential(
143 |             nn.Conv2d(3, 64, kernel_size=3, padding=1),
144 |             nn.BatchNorm2d(64),
145 |             nn.ReLU(inplace=True)
146 |         )
147 |
148 |         self.step = (1 - 0.5) / (sum(num_block) - 1)
149 |         self.pl = 1
150 |         self.conv2_x = self._make_layer(block, 64, num_block[0], 1)
151 |         self.conv3_x = self._make_layer(block, 128, num_block[1], 2)
152 |         self.conv4_x = self._make_layer(block, 256, num_block[2], 2)
153 |         self.conv5_x = self._make_layer(block, 512, num_block[3], 2)
154 |         self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
155 |         self.fc = nn.Linear(512 * block.expansion, num_classes)
156 |
157 |     def _make_layer(self, block, out_channels, num_blocks, stride):
158 |
159 |         strides = [stride] +
[1] * (num_blocks - 1) 160 | layers = [] 161 | for stride in strides: 162 | layers.append(block(self.pl, self.in_channels, out_channels, stride)) 163 | self.in_channels = out_channels * block.expansion 164 | self.pl -= self.step 165 | 166 | return nn.Sequential(*layers) 167 | 168 | def forward(self, x): 169 | output = self.conv1(x) 170 | output = self.conv2_x(output) 171 | output = self.conv3_x(output) 172 | output = self.conv4_x(output) 173 | output = self.conv5_x(output) 174 | output = self.avg_pool(output) 175 | output = output.view(output.size(0), -1) 176 | output = self.fc(output) 177 | 178 | return output 179 | 180 | 181 | def stochastic_depth_resnet18(): 182 | """ return a ResNet 18 object 183 | """ 184 | return StochasticDepthResNet(StochasticDepthBasicBlock, [2, 2, 2, 2]) 185 | 186 | def stochastic_depth_resnet34(): 187 | """ return a ResNet 34 object 188 | """ 189 | return StochasticDepthResNet(StochasticDepthBasicBlock, [3, 4, 6, 3]) 190 | 191 | def stochastic_depth_resnet50(): 192 | 193 | """ return a ResNet 50 object 194 | """ 195 | return StochasticDepthResNet(StochasticDepthBottleNeck, [3, 4, 6, 3]) 196 | 197 | def stochastic_depth_resnet101(): 198 | """ return a ResNet 101 object 199 | """ 200 | return StochasticDepthResNet(StochasticDepthBottleNeck, [3, 4, 23, 3]) 201 | 202 | def stochastic_depth_resnet152(): 203 | """ return a ResNet 152 object 204 | """ 205 | return StochasticDepthResNet(StochasticDepthBottleNeck, [3, 8, 36, 3]) 206 | 207 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | """vgg in pytorch 2 | 3 | 4 | [1] Karen Simonyan, Andrew Zisserman 5 | 6 | Very Deep Convolutional Networks for Large-Scale Image Recognition. 
7 | https://arxiv.org/abs/1409.1556v6 8 | """ 9 | '''VGG11/13/16/19 in Pytorch.''' 10 | 11 | import torch 12 | import torch.nn as nn 13 | import loralib as lora 14 | 15 | cfg = { 16 | 'A' : [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 17 | 'B' : [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 18 | 'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 19 | 'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] 20 | } 21 | 22 | class VGG(nn.Module): 23 | 24 | def __init__(self, features, num_class=100): 25 | super().__init__() 26 | self.features = features 27 | 28 | self.classifier = nn.Sequential( 29 | nn.Linear(512, 4096), 30 | nn.ReLU(inplace=True), 31 | nn.Dropout(), 32 | nn.Linear(4096, 4096), 33 | nn.ReLU(inplace=True), 34 | nn.Dropout(), 35 | nn.Linear(4096, num_class) 36 | ) 37 | 38 | def forward(self, x): 39 | output = self.features(x) 40 | output = output.view(output.size()[0], -1) 41 | output = self.classifier(output) 42 | 43 | return output 44 | 45 | class VGGLORA(nn.Module): 46 | 47 | def __init__(self, features, num_class=100): 48 | super().__init__() 49 | self.features = features 50 | # self.quant = torch.ao.quantization.QuantStub() 51 | self.classifier = nn.Sequential( 52 | lora.Linear(512, 4096, r=32), 53 | nn.ReLU(inplace=True), 54 | nn.Dropout(), 55 | lora.Linear(4096, 4096, r=32), 56 | nn.ReLU(inplace=True), 57 | nn.Dropout(), 58 | lora.Linear(4096, num_class, r=32), 59 | ) 60 | # self.dequant = torch.ao.quantization.DeQuantStub() 61 | 62 | def forward(self, x): 63 | # x = self.quant(x) 64 | output = self.features(x) 65 | # output = self.dequant(output) 66 | output = output.view(output.size()[0], -1) 67 | output = self.classifier(output) 68 | 69 | return output 70 | 71 | class VGGQLORA(nn.Module): 72 | """Quantize stub module, before calibration. 
73 | 74 | Args: 75 | qconfig: quantization configuration for the tensor, 76 | if qconfig is not provided, we will get qconfig from parent modules 77 | """ 78 | def __init__(self, features, num_class=100): 79 | super().__init__() 80 | self.features = features 81 | self.quant = torch.ao.quantization.QuantStub() 82 | self.classifier = nn.Sequential( 83 | lora.Linear(512, 4096, r=32), 84 | nn.ReLU(inplace=True), 85 | nn.Dropout(), 86 | lora.Linear(4096, 4096, r=32), 87 | nn.ReLU(inplace=True), 88 | nn.Dropout(), 89 | lora.Linear(4096, num_class, r=32), 90 | ) 91 | self.dequant = torch.ao.quantization.DeQuantStub() 92 | 93 | def forward(self, x): 94 | x = self.quant(x) 95 | output = self.features(x) 96 | output = self.dequant(output) 97 | output = output.view(output.size()[0], -1) 98 | output = self.classifier(output) 99 | 100 | return output 101 | 102 | def make_layers(cfg, batch_norm=False): 103 | layers = [] 104 | 105 | input_channel = 3 106 | for l in cfg: 107 | if l == 'M': 108 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 109 | continue 110 | 111 | layers += [nn.Conv2d(input_channel, l, kernel_size=3, padding=1)] 112 | 113 | if batch_norm: 114 | layers += [nn.BatchNorm2d(l)] 115 | 116 | layers += [nn.ReLU(inplace=True)] 117 | input_channel = l 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def vgg11_bn(): 122 | return VGG(make_layers(cfg['A'], batch_norm=True)) 123 | 124 | def vgg13_bn(): 125 | return VGG(make_layers(cfg['B'], batch_norm=True)) 126 | 127 | def vgg16_bn(): 128 | return VGG(make_layers(cfg['D'], batch_norm=True)) 129 | 130 | def vgg19_bn(): 131 | return VGG(make_layers(cfg['E'], batch_norm=True)) 132 | 133 | def vgg19_bn_lora(): 134 | return VGGLORA(make_layers(cfg['E'], batch_norm=True)) 135 | 136 | def vgg19_bn_qlora(): 137 | return VGGQLORA(make_layers(cfg['E'], batch_norm=True)) 138 | 139 | -------------------------------------------------------------------------------- /models/wideresidual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class WideBasic(nn.Module): 6 | 7 | def __init__(self, in_channels, out_channels, stride=1): 8 | super().__init__() 9 | self.residual = nn.Sequential( 10 | nn.BatchNorm2d(in_channels), 11 | nn.ReLU(inplace=True), 12 | nn.Conv2d( 13 | in_channels, 14 | out_channels, 15 | kernel_size=3, 16 | stride=stride, 17 | padding=1 18 | ), 19 | nn.BatchNorm2d(out_channels), 20 | nn.ReLU(inplace=True), 21 | nn.Dropout(), 22 | nn.Conv2d( 23 | out_channels, 24 | out_channels, 25 | kernel_size=3, 26 | stride=1, 27 | padding=1 28 | ) 29 | ) 30 | 31 | self.shortcut = nn.Sequential() 32 | 33 | if in_channels != out_channels or stride != 1: 34 | self.shortcut = nn.Sequential( 35 | nn.Conv2d(in_channels, out_channels, 1, stride=stride) 36 | ) 37 | 38 | def forward(self, x): 39 | 40 | residual = self.residual(x) 41 | shortcut = self.shortcut(x) 42 | 43 | return residual + shortcut 44 | 45 | class WideResNet(nn.Module): 46 | def __init__(self, num_classes, block, depth=50, widen_factor=1): 47 | super().__init__() 48 | 49 | self.depth = depth 50 | k = widen_factor 51 | l = int((depth - 4) / 6) 52 | self.in_channels = 16 53 | self.init_conv = nn.Conv2d(3, self.in_channels, 3, 1, padding=1) 54 | self.conv2 = self._make_layer(block, 16 * k, l, 1) 55 | self.conv3 = self._make_layer(block, 32 * k, l, 2) 56 | self.conv4 = self._make_layer(block, 64 * k, l, 2) 57 | self.bn = nn.BatchNorm2d(64 * k) 58 | self.relu = nn.ReLU(inplace=True) 59 | self.avg_pool = 
nn.AdaptiveAvgPool2d((1, 1))
60 |         self.linear = nn.Linear(64 * k, num_classes)
61 |
62 |     def forward(self, x):
63 |         x = self.init_conv(x)
64 |         x = self.conv2(x)
65 |         x = self.conv3(x)
66 |         x = self.conv4(x)
67 |         x = self.bn(x)
68 |         x = self.relu(x)
69 |         x = self.avg_pool(x)
70 |         x = x.view(x.size(0), -1)
71 |         x = self.linear(x)
72 |
73 |         return x
74 |
75 |     def _make_layer(self, block, out_channels, num_blocks, stride):
76 |         """make resnet layers (by 'layer' we do not mean a single neural
77 |         network layer, e.g. a conv layer; one layer here may
78 |         contain more than one residual block)
79 |
80 |         Args:
81 |             block: block type, basic block or bottle neck block
82 |             out_channels: output depth channel number of this layer
83 |             num_blocks: how many blocks per layer
84 |             stride: the stride of the first block of this layer
85 |
86 |         Return:
87 |             return a resnet layer
88 |         """
89 |
90 |         # we have num_blocks blocks per layer; the stride of the first
91 |         # block could be 1 or 2, all other blocks always use stride 1
92 |         strides = [stride] + [1] * (num_blocks - 1)
93 |         layers = []
94 |         for stride in strides:
95 |             layers.append(block(self.in_channels, out_channels, stride))
96 |             self.in_channels = out_channels
97 |
98 |         return nn.Sequential(*layers)
99 |
100 |
101 | # Table 9: Best WRN performance over various datasets, single run results.
102 | def wideresnet(depth=40, widen_factor=10):
103 |     net = WideResNet(100, WideBasic, depth=depth, widen_factor=widen_factor)
104 |     return net
--------------------------------------------------------------------------------
/models/xception.py:
--------------------------------------------------------------------------------
1 | """xception in pytorch
2 |
3 |
4 | [1] François Chollet
5 |
6 |     Xception: Deep Learning with Depthwise Separable Convolutions
7 |     https://arxiv.org/abs/1610.02357
8 | """
9 |
10 | import torch
11 | import torch.nn as nn
12 |
13 | class SeperableConv2d(nn.Module):
14 |
15 |     #"""Figure 4.
An “extreme” version of our Inception module, 16 | #with one spatial convolution per output channel of the 1x1 17 | #convolution.""" 18 | def __init__(self, input_channels, output_channels, kernel_size, **kwargs): 19 | 20 | super().__init__() 21 | self.depthwise = nn.Conv2d( 22 | input_channels, 23 | input_channels, 24 | kernel_size, 25 | groups=input_channels, 26 | bias=False, 27 | **kwargs 28 | ) 29 | 30 | self.pointwise = nn.Conv2d(input_channels, output_channels, 1, bias=False) 31 | 32 | def forward(self, x): 33 | x = self.depthwise(x) 34 | x = self.pointwise(x) 35 | 36 | return x 37 | 38 | class EntryFlow(nn.Module): 39 | 40 | def __init__(self): 41 | 42 | super().__init__() 43 | self.conv1 = nn.Sequential( 44 | nn.Conv2d(3, 32, 3, padding=1, bias=False), 45 | nn.BatchNorm2d(32), 46 | nn.ReLU(inplace=True) 47 | ) 48 | 49 | self.conv2 = nn.Sequential( 50 | nn.Conv2d(32, 64, 3, padding=1, bias=False), 51 | nn.BatchNorm2d(64), 52 | nn.ReLU(inplace=True) 53 | ) 54 | 55 | self.conv3_residual = nn.Sequential( 56 | SeperableConv2d(64, 128, 3, padding=1), 57 | nn.BatchNorm2d(128), 58 | nn.ReLU(inplace=True), 59 | SeperableConv2d(128, 128, 3, padding=1), 60 | nn.BatchNorm2d(128), 61 | nn.MaxPool2d(3, stride=2, padding=1), 62 | ) 63 | 64 | self.conv3_shortcut = nn.Sequential( 65 | nn.Conv2d(64, 128, 1, stride=2), 66 | nn.BatchNorm2d(128), 67 | ) 68 | 69 | self.conv4_residual = nn.Sequential( 70 | nn.ReLU(inplace=True), 71 | SeperableConv2d(128, 256, 3, padding=1), 72 | nn.BatchNorm2d(256), 73 | nn.ReLU(inplace=True), 74 | SeperableConv2d(256, 256, 3, padding=1), 75 | nn.BatchNorm2d(256), 76 | nn.MaxPool2d(3, stride=2, padding=1) 77 | ) 78 | 79 | self.conv4_shortcut = nn.Sequential( 80 | nn.Conv2d(128, 256, 1, stride=2), 81 | nn.BatchNorm2d(256), 82 | ) 83 | 84 | #no downsampling 85 | self.conv5_residual = nn.Sequential( 86 | nn.ReLU(inplace=True), 87 | SeperableConv2d(256, 728, 3, padding=1), 88 | nn.BatchNorm2d(728), 89 | nn.ReLU(inplace=True), 90 | SeperableConv2d(728, 728, 3, padding=1), 91 | nn.BatchNorm2d(728), 92 | nn.MaxPool2d(3, 1, padding=1) 93 | ) 94 | 95 | #no downsampling 96 | self.conv5_shortcut = nn.Sequential( 97 | nn.Conv2d(256, 728, 1), 98 | nn.BatchNorm2d(728) 99 | ) 100 | 101 | def forward(self, x): 102 | x = self.conv1(x) 103 | x = self.conv2(x) 104 | residual = self.conv3_residual(x) 105 | shortcut = self.conv3_shortcut(x) 106 | x = residual + shortcut 107 | residual = self.conv4_residual(x) 108 | shortcut = self.conv4_shortcut(x) 109 | x = residual + shortcut 110 | residual = self.conv5_residual(x) 111 | shortcut = self.conv5_shortcut(x) 112 | x = residual + shortcut 113 | 114 | return x 115 | 116 | class MiddleFLowBlock(nn.Module): 117 | 118 | def __init__(self): 119 | super().__init__() 120 | 121 | self.shortcut = nn.Sequential() 122 | self.conv1 = nn.Sequential( 123 | nn.ReLU(inplace=True), 124 | SeperableConv2d(728, 728, 3, padding=1), 125 | nn.BatchNorm2d(728) 126 | ) 127 | self.conv2 = nn.Sequential( 128 | nn.ReLU(inplace=True), 129 | SeperableConv2d(728, 728, 3, padding=1), 130 | nn.BatchNorm2d(728) 131 | ) 132 | self.conv3 = nn.Sequential( 133 | nn.ReLU(inplace=True), 134 | SeperableConv2d(728, 728, 3, padding=1), 135 | nn.BatchNorm2d(728) 136 | ) 137 | 138 | def forward(self, x): 139 | residual = self.conv1(x) 140 | residual = self.conv2(residual) 141 | residual = self.conv3(residual) 142 | 143 | shortcut = self.shortcut(x) 144 | 145 | return shortcut + residual 146 | 147 | class MiddleFlow(nn.Module): 148 | def __init__(self, block): 149 | super().__init__() 150 
| 151 | #"""then through the middle flow which is repeated eight times""" 152 | self.middel_block = self._make_flow(block, 8) 153 | 154 | def forward(self, x): 155 | x = self.middel_block(x) 156 | return x 157 | 158 | def _make_flow(self, block, times): 159 | flows = [] 160 | for i in range(times): 161 | flows.append(block()) 162 | 163 | return nn.Sequential(*flows) 164 | 165 | 166 | class ExitFLow(nn.Module): 167 | 168 | def __init__(self): 169 | super().__init__() 170 | self.residual = nn.Sequential( 171 | nn.ReLU(), 172 | SeperableConv2d(728, 728, 3, padding=1), 173 | nn.BatchNorm2d(728), 174 | nn.ReLU(), 175 | SeperableConv2d(728, 1024, 3, padding=1), 176 | nn.BatchNorm2d(1024), 177 | nn.MaxPool2d(3, stride=2, padding=1) 178 | ) 179 | 180 | self.shortcut = nn.Sequential( 181 | nn.Conv2d(728, 1024, 1, stride=2), 182 | nn.BatchNorm2d(1024) 183 | ) 184 | 185 | self.conv = nn.Sequential( 186 | SeperableConv2d(1024, 1536, 3, padding=1), 187 | nn.BatchNorm2d(1536), 188 | nn.ReLU(inplace=True), 189 | SeperableConv2d(1536, 2048, 3, padding=1), 190 | nn.BatchNorm2d(2048), 191 | nn.ReLU(inplace=True) 192 | ) 193 | 194 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 195 | 196 | def forward(self, x): 197 | shortcut = self.shortcut(x) 198 | residual = self.residual(x) 199 | output = shortcut + residual 200 | output = self.conv(output) 201 | output = self.avgpool(output) 202 | 203 | return output 204 | 205 | class Xception(nn.Module): 206 | 207 | def __init__(self, block, num_class=100): 208 | super().__init__() 209 | self.entry_flow = EntryFlow() 210 | self.middel_flow = MiddleFlow(block) 211 | self.exit_flow = ExitFLow() 212 | 213 | self.fc = nn.Linear(2048, num_class) 214 | 215 | def forward(self, x): 216 | x = self.entry_flow(x) 217 | x = self.middel_flow(x) 218 | x = self.exit_flow(x) 219 | x = x.view(x.size(0), -1) 220 | x = self.fc(x) 221 | 222 | return x 223 | 224 | def xception(): 225 | return Xception(MiddleFLowBlock) 226 | 227 | 228 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard 2 | torch 3 | torchvision 4 | loralib 5 | tqdm -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #test.py 2 | #!/usr/bin/env python3 3 | 4 | """ test neuron network performace 5 | print top1 and top5 err on test dataset 6 | of a model 7 | 8 | author baiyu 9 | """ 10 | 11 | import argparse 12 | 13 | from matplotlib import pyplot as plt 14 | 15 | import torch 16 | import torchvision.transforms as transforms 17 | from torch.utils.data import DataLoader 18 | 19 | from conf import settings 20 | from utils import get_network, get_test_dataloader 21 | 22 | import os 23 | import logging 24 | from datetime import datetime 25 | from tqdm import tqdm 26 | import json 27 | import numpy as np 28 | 29 | import torch 30 | import torchvision.models as models 31 | from torch.profiler import profile, record_function, ProfilerActivity 32 | 33 | from utils import network_to_half 34 | 35 | start_datetime = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") 36 | starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) 37 | 38 | def print_trainable_parameters(model): 39 | """ 40 | Prints the number of trainable parameters in the model. 
41 | """ 42 | trainable_params = 0 43 | all_param = 0 44 | for _, param in model.named_parameters(): 45 | all_param += param.numel() 46 | if param.requires_grad: 47 | trainable_params += param.numel() 48 | print( 49 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}" 50 | ) 51 | 52 | if __name__ == '__main__': 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('-net', type=str, required=True, help='net type') 56 | parser.add_argument('-weights', type=str, required=True, help='the weights file you want to test') 57 | parser.add_argument('-gpu', action='store_true', default=False, help='use gpu or not') 58 | parser.add_argument('-b', type=int, default=16, help='batch size for dataloader') 59 | parser.add_argument('-log', type=str, default="./logs/test_{datetime}.log", help='log file to save the logging info') 60 | args = parser.parse_args() 61 | 62 | net = get_network(args) 63 | 64 | cifar100_test_loader = get_test_dataloader( 65 | settings.CIFAR100_TRAIN_MEAN, 66 | settings.CIFAR100_TRAIN_STD, 67 | #settings.CIFAR100_PATH, 68 | num_workers=4, 69 | batch_size=args.b, 70 | ) 71 | 72 | if os.path.exists("logs/") is False: 73 | os.makedirs(args.log) 74 | 75 | log_format = '%(asctime)s [%(levelname)s] %(message)s' 76 | log_level = logging.INFO 77 | log_file = args.log 78 | 79 | logging.basicConfig(level=log_level, format=log_format, 80 | filename=log_file.format(datetime=start_datetime.replace(':','-'))) 81 | logging.getLogger().setLevel(log_level) 82 | 83 | logging.info(f'Parsed args: {json.dumps(dict(args.__dict__), indent=2)}') 84 | 85 | if len(args.net.split("_")) == 2 and args.net.split("_")[1] in ["lora","qlora"]: 86 | print("loading {}...".format(args.net)) 87 | net.load_state_dict(torch.load(args.weights), strict=False) 88 | else: 89 | net.load_state_dict(torch.load(args.weights)) 90 | 91 | net = network_to_half(net) 92 | print_trainable_parameters(net) 93 | 94 | logging.info(net) 95 | logging.info("\n") 96 | net.eval() 97 | 98 | correct_1 = 0.0 99 | correct_5 = 0.0 100 | total = 0 101 | 102 | timings=np.zeros((len(cifar100_test_loader), 1)) 103 | total_time = 0 104 | 105 | with torch.no_grad(): 106 | for n_iter, (image, label) in enumerate(tqdm(cifar100_test_loader)): 107 | # print("iteration: {}\ttotal {} iterations".format(n_iter + 1, len(cifar100_test_loader))) 108 | 109 | if args.gpu: 110 | image = image.cuda() 111 | label = label.cuda() 112 | # print('GPU INFO.....') 113 | # print(torch.cuda.memory_summary(), end='') 114 | 115 | starter.record() 116 | 117 | # with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof: 118 | # with record_function("model_inference"): 119 | output = net(image) 120 | 121 | ender.record() 122 | torch.cuda.synchronize() 123 | curr_time = starter.elapsed_time(ender)/1000 124 | timings[n_iter] = curr_time 125 | total_time += curr_time 126 | 127 | _, pred = output.topk(5, 1, largest=True, sorted=True) 128 | 129 | label = label.view(label.size(0), -1).expand_as(pred) 130 | correct = pred.eq(label).float() 131 | 132 | #compute top 5 133 | correct_5 += correct[:, :5].sum() 134 | 135 | #compute top1 136 | correct_1 += correct[:, :1].sum() 137 | 138 | # print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) 139 | 140 | throughput = (n_iter * args.b) / total_time 141 | logging.info('Average throughput: {}'.format(throughput)) 142 | 143 | mean_syn = np.sum(timings) / (n_iter+1) 144 | std_syn = np.std(timings) 145 | 
logging.info("Average inference time: {}".format(mean_syn)) 146 | 147 | if args.gpu: 148 | logging.info('GPU INFO.....\n') 149 | logging.info("\n"+torch.cuda.memory_summary()) 150 | logging.info("\n") 151 | 152 | logging.info("Top 1 err: {}\n".format(1 - correct_1 / len(cifar100_test_loader.dataset))) 153 | logging.info("Top 5 err: {}\n".format(1 - correct_5 / len(cifar100_test_loader.dataset))) 154 | logging.info("Parameter numbers: {}".format(sum(p.numel() for p in net.parameters()))) 155 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # train.py 2 | #!/usr/bin/env python3 3 | 4 | """ train network using pytorch 5 | 6 | author baiyu 7 | """ 8 | 9 | import os 10 | import sys 11 | import argparse 12 | import time 13 | from datetime import datetime 14 | 15 | import numpy as np 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | import torchvision 20 | import torchvision.transforms as transforms 21 | 22 | from torch.utils.data import DataLoader 23 | from torch.utils.tensorboard import SummaryWriter 24 | 25 | from conf import settings 26 | from utils import get_network, get_training_dataloader, get_test_dataloader, WarmUpLR, \ 27 | most_recent_folder, most_recent_weights, last_epoch, best_acc_weights, network_to_half 28 | 29 | import loralib as lora 30 | from tqdm import tqdm 31 | from torch.quantization import MinMaxObserver, QConfig 32 | from torch.quantization.observer import default_observer 33 | 34 | def print_trainable_parameters(model): 35 | """ 36 | Prints the number of trainable parameters in the model. 37 | """ 38 | trainable_params = 0 39 | all_param = 0 40 | for _, param in model.named_parameters(): 41 | all_param += param.numel() 42 | if param.requires_grad: 43 | trainable_params += param.numel() 44 | print( 45 | f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}" 46 | ) 47 | 48 | def train(epoch): 49 | 50 | start = time.time() 51 | net.train() 52 | for batch_index, (images, labels) in enumerate(tqdm(cifar100_training_loader)): 53 | 54 | if args.gpu: 55 | labels = labels.cuda() 56 | images = images.cuda() 57 | 58 | optimizer.zero_grad() 59 | outputs = net(images) 60 | loss = loss_function(outputs, labels) 61 | loss.backward() 62 | optimizer.step() 63 | 64 | n_iter = (epoch - 1) * len(cifar100_training_loader) + batch_index + 1 65 | 66 | last_layer = list(net.children())[-1] 67 | # for name, para in last_layer.named_parameters(): 68 | # if 'weight' in name: 69 | # writer.add_scalar('LastLayerGradients/grad_norm2_weights', para.grad.norm(), n_iter) 70 | # if 'bias' in name: 71 | # writer.add_scalar('LastLayerGradients/grad_norm2_bias', para.grad.norm(), n_iter) 72 | 73 | # print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format( 74 | # loss.item(), 75 | # optimizer.param_groups[0]['lr'], 76 | # epoch=epoch, 77 | # trained_samples=batch_index * args.b + len(images), 78 | # total_samples=len(cifar100_training_loader.dataset) 79 | # )) 80 | 81 | #update training loss for each iteration 82 | writer.add_scalar('Train/loss', loss.item(), n_iter) 83 | 84 | if epoch <= args.warm: 85 | warmup_scheduler.step() 86 | 87 | print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tLR: {:0.6f}'.format( 88 | loss.item(), 89 | optimizer.param_groups[0]['lr'], 90 | epoch=epoch, 91 | 
91 |         trained_samples=batch_index * args.b + len(images),
92 |         total_samples=len(cifar100_training_loader.dataset)
93 |     ))
94 | 
95 |     for name, param in net.named_parameters():
96 |         layer, attr = os.path.splitext(name)  # e.g. 'conv1.weight' -> ('conv1', '.weight')
97 |         attr = attr[1:]
98 |         writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
99 | 
100 |     finish = time.time()
101 | 
102 |     print('epoch {} training time consumed: {:.2f}s'.format(epoch, finish - start))
103 | 
104 | @torch.no_grad()
105 | def eval_training(epoch=0, tb=True):
106 | 
107 |     start = time.time()
108 |     net.eval()
109 | 
110 |     test_loss = 0.0 # cost function error
111 |     correct = 0.0
112 | 
113 |     for (images, labels) in cifar100_test_loader:
114 | 
115 |         if args.gpu:
116 |             images = images.cuda()
117 |             labels = labels.cuda()
118 | 
119 |         outputs = net(images)
120 |         loss = loss_function(outputs, labels)
121 | 
122 |         test_loss += loss.item()
123 |         _, preds = outputs.max(1)
124 |         correct += preds.eq(labels).sum()
125 | 
126 |     finish = time.time()
127 |     if args.gpu:
128 |         print('GPU INFO.....')
129 |         print(torch.cuda.memory_summary(), end='')
130 |     print('Evaluating Network.....')
131 |     print('Test set: Epoch: {}, Average loss: {:.4f}, Accuracy: {:.4f}, Time consumed:{:.2f}s'.format(
132 |         epoch,
133 |         test_loss / len(cifar100_test_loader.dataset),
134 |         correct.float() / len(cifar100_test_loader.dataset),
135 |         finish - start
136 |     ))
137 |     print()
138 | 
139 |     #add information to tensorboard
140 |     if tb:
141 |         writer.add_scalar('Test/Average loss', test_loss / len(cifar100_test_loader.dataset), epoch)
142 |         writer.add_scalar('Test/Accuracy', correct.float() / len(cifar100_test_loader.dataset), epoch)
143 | 
144 |     return correct.float() / len(cifar100_test_loader.dataset)
145 | 
146 | if __name__ == '__main__':
147 | 
148 |     parser = argparse.ArgumentParser()
149 |     parser.add_argument('-net', type=str, required=True, help='net type')
150 |     parser.add_argument('-gpu', action='store_true', default=False, help='use gpu or not')
151 |     parser.add_argument('-b', type=int, default=512, help='batch size for dataloader')
152 |     parser.add_argument('-warm', type=int, default=1, help='warm up training phase')
153 |     parser.add_argument('-lr', type=float, default=0.05, help='initial learning rate')
154 |     parser.add_argument('-resume', action='store_true', default=False, help='resume training')
155 |     parser.add_argument('-finetune', action='store_true', default=False, help='fine tuning')
156 |     parser.add_argument('-weightspath', type=str, default=None, help='path to the weights file')
157 | 
158 |     args = parser.parse_args()
159 | 
160 |     net = get_network(args)
161 | 
162 |     # my_qconfig = QConfig(
163 |     #     activation=MinMaxObserver.with_args(dtype=torch.qint8),
164 |     #     weight=default_observer.with_args(dtype=torch.qint8)
165 |     # )
166 | 
167 |     #data preprocessing:
168 |     cifar100_training_loader = get_training_dataloader(
169 |         settings.CIFAR100_TRAIN_MEAN,
170 |         settings.CIFAR100_TRAIN_STD,
171 |         num_workers=4,
172 |         batch_size=args.b,
173 |         shuffle=True
174 |     )
175 | 
176 |     cifar100_test_loader = get_test_dataloader(
177 |         settings.CIFAR100_TRAIN_MEAN,
178 |         settings.CIFAR100_TRAIN_STD,
179 |         num_workers=4,
180 |         batch_size=args.b,
181 |         shuffle=True
182 |     )
183 | 
184 |     loss_function = nn.CrossEntropyLoss()
185 |     optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
186 |     train_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=settings.MILESTONES, gamma=0.2) #learning rate decay
187 |     iter_per_epoch = len(cifar100_training_loader)
188 |     warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)
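    # warmup_scheduler is stepped once per batch inside train(), so the learning
    # rate ramps linearly from ~0 up to args.lr over the first args.warm epochs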
189 | 
190 |     if args.resume:
191 |         recent_folder = most_recent_folder(os.path.join(settings.CHECKPOINT_PATH, args.net), fmt=settings.DATE_FORMAT)
192 |         if not recent_folder:
193 |             raise Exception('no recent folder was found')
194 | 
195 |         checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder)
196 | 
197 |     else:
198 |         checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, args.net, settings.TIME_NOW)
199 | 
200 |     #use tensorboard
201 |     if not os.path.exists(settings.LOG_DIR):
202 |         os.mkdir(settings.LOG_DIR)
203 | 
204 |     #since tensorboard can't overwrite old values
205 |     #so the only way is to create a new tensorboard log
206 |     writer = SummaryWriter(log_dir=os.path.join(
207 |         settings.LOG_DIR, args.net, settings.TIME_NOW))
208 |     input_tensor = torch.Tensor(1, 3, 32, 32)
209 |     if args.gpu:
210 |         input_tensor = input_tensor.cuda()
211 |     writer.add_graph(net, input_tensor)
212 | 
213 |     #create checkpoint folder to save model
214 |     if not os.path.exists(checkpoint_path):
215 |         os.makedirs(checkpoint_path)
216 |     checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')
217 | 
218 |     best_acc = 0.0
219 |     if args.resume:
220 |         best_weights = best_acc_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
221 |         if best_weights:
222 |             weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, best_weights)
223 |             print('found best acc weights file:{}'.format(weights_path))
224 |             print('load best training file to test acc...')
225 |             net.load_state_dict(torch.load(weights_path))
226 |             best_acc = eval_training(tb=False)
227 |             print('best acc is {:0.2f}'.format(best_acc))
228 | 
229 |         recent_weights_file = most_recent_weights(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
230 |         if not recent_weights_file:
231 |             raise Exception('no recent weights file was found')
232 |         weights_path = os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder, recent_weights_file)
233 |         print('loading weights file {} to resume training.....'.format(weights_path))
234 |         net.load_state_dict(torch.load(weights_path))
235 | 
236 |         resume_epoch = last_epoch(os.path.join(settings.CHECKPOINT_PATH, args.net, recent_folder))
237 |     elif args.finetune:
238 |         print('loading weights file {} for fine-tuning.....'.format(args.weightspath))
239 |         net.load_state_dict(torch.load(args.weightspath), strict=False)
240 | 
241 |     if args.net.split("_")[-1] in ["lora", "qlora"]:
242 |         print('fine-tuning only a part of the original weights file.....')
243 |         lora.mark_only_lora_as_trainable(net)
244 | 
245 |     net = network_to_half(net)
246 | 
247 |     print_trainable_parameters(net)
248 | 
249 |     for epoch in range(1, settings.EPOCH + 1):
250 |         if epoch > args.warm:
251 |             train_scheduler.step(epoch)
252 | 
253 |         if args.resume:
254 |             if epoch <= resume_epoch:
255 |                 continue
256 | 
257 |         train(epoch)
258 |         acc = eval_training(epoch)
259 | 
260 |         #start to save best performance model after learning rate decay to 0.01
261 |         if epoch > settings.MILESTONES[0] and best_acc < acc:
262 |             weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='best')
263 |             print('saving weights file to {}'.format(weights_path))
264 |             if args.net.split("_")[-1] != "lora":
265 |                 torch.save(net.state_dict(), weights_path)
266 |             else:
267 |                 print("saving lora weights")
268 |                 torch.save(lora.lora_state_dict(net), weights_path)
269 |             best_acc = acc
270 |             continue
271 | 
272 |         if not epoch % settings.SAVE_EPOCH:
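            # a 'regular' snapshot is also written every settings.SAVE_EPOCH epochs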
273 |             weights_path = checkpoint_path.format(net=args.net, epoch=epoch, type='regular')
274 |             print('saving weights file to {}'.format(weights_path))
275 |             if args.net.split("_")[-1] != "lora":
276 |                 torch.save(net.state_dict(), weights_path)
277 |             else:
278 |                 print("saving lora weights")
279 |                 torch.save(lora.lora_state_dict(net), weights_path)
280 | 
281 |     writer.close()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """ helper function
2 | 
3 | author baiyu
4 | """
5 | import os
6 | import sys
7 | import re
8 | import datetime
9 | 
10 | import numpy
11 | 
12 | import torch
13 | from torch.optim.lr_scheduler import _LRScheduler
14 | import torchvision
15 | import torchvision.transforms as transforms
16 | from torch.utils.data import DataLoader
17 | from torch import nn
18 | 
19 | class tofp16(nn.Module):
20 |     """
21 |     Utility module that implements::
22 |         def forward(self, input):
23 |             return input.half()
24 |     """
25 | 
26 |     def __init__(self):
27 |         super(tofp16, self).__init__()
28 | 
29 |     def forward(self, input):
30 |         return input.half()
31 | 
32 | def BN_convert_float(module):
33 |     """
34 |     Utility function for network_to_half().
35 |     Retained for legacy purposes.
36 |     """
37 |     if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
38 |         module.float()
39 |     for child in module.children():
40 |         BN_convert_float(child)
41 |     return module
42 | 
43 | def network_to_half(network):
44 |     """
45 |     Convert model to half precision in a batchnorm-safe way.
46 |     Retained for legacy purposes. It is recommended to use FP16Model.
47 |     """
48 |     return nn.Sequential(tofp16(), BN_convert_float(network.half()))
49 | 
50 | def get_network(args):
51 |     """ return given network
52 |     """
53 | 
54 |     if args.net == 'vgg16':
55 |         from models.vgg import vgg16_bn
56 |         net = vgg16_bn()
57 |     elif args.net == 'vgg13':
58 |         from models.vgg import vgg13_bn
59 |         net = vgg13_bn()
60 |     elif args.net == 'vgg11':
61 |         from models.vgg import vgg11_bn
62 |         net = vgg11_bn()
63 |     elif args.net == 'vgg19':
64 |         from models.vgg import vgg19_bn
65 |         net = vgg19_bn()
66 |     elif args.net == 'vgg19_lora':
67 |         from models.vgg import vgg19_bn_lora
68 |         net = vgg19_bn_lora()
69 |     elif args.net == 'vgg19_qlora':
70 |         from models.vgg import vgg19_bn_qlora
71 |         net = vgg19_bn_qlora()
72 |     elif args.net == 'densenet121':
73 |         from models.densenet import densenet121
74 |         net = densenet121()
75 |     elif args.net == 'densenet161':
76 |         from models.densenet import densenet161
77 |         net = densenet161()
78 |     elif args.net == 'densenet169':
79 |         from models.densenet import densenet169
80 |         net = densenet169()
81 |     elif args.net == 'densenet201':
82 |         from models.densenet import densenet201
83 |         net = densenet201()
84 |     elif args.net == 'googlenet':
85 |         from models.googlenet import googlenet
86 |         net = googlenet()
87 |     elif args.net == 'inceptionv3':
88 |         from models.inceptionv3 import inceptionv3
89 |         net = inceptionv3()
90 |     elif args.net == 'inceptionv4':
91 |         from models.inceptionv4 import inceptionv4
92 |         net = inceptionv4()
93 |     elif args.net == 'inceptionresnetv2':
94 |         from models.inceptionv4 import inception_resnet_v2
95 |         net = inception_resnet_v2()
96 |     elif args.net == 'xception':
97 |         from models.xception import xception
98 |         net = xception()
99 |     elif args.net == 'resnet18':
100 |         from models.resnet import resnet18
101 |         net = resnet18()
102 |     elif args.net == 'resnet34':
103 |         from models.resnet import resnet34
104 |         net = resnet34()
105 |     elif args.net == 'resnet50':
106 |         from models.resnet import resnet50
107 |         net = resnet50()
108 |     elif args.net == 'resnet101':
109 |         from models.resnet import resnet101
110 |         net = resnet101()
111 |     elif args.net == 'resnet152':
112 |         from models.resnet import resnet152
113 |         net = resnet152()
114 |     elif args.net == 'preactresnet18':
115 |         from models.preactresnet import preactresnet18
116 |         net = preactresnet18()
117 |     elif args.net == 'preactresnet34':
118 |         from models.preactresnet import preactresnet34
119 |         net = preactresnet34()
120 |     elif args.net == 'preactresnet50':
121 |         from models.preactresnet import preactresnet50
122 |         net = preactresnet50()
123 |     elif args.net == 'preactresnet101':
124 |         from models.preactresnet import preactresnet101
125 |         net = preactresnet101()
126 |     elif args.net == 'preactresnet152':
127 |         from models.preactresnet import preactresnet152
128 |         net = preactresnet152()
129 |     elif args.net == 'resnext50':
130 |         from models.resnext import resnext50
131 |         net = resnext50()
132 |     elif args.net == 'resnext101':
133 |         from models.resnext import resnext101
134 |         net = resnext101()
135 |     elif args.net == 'resnext152':
136 |         from models.resnext import resnext152
137 |         net = resnext152()
138 |     elif args.net == 'shufflenet':
139 |         from models.shufflenet import shufflenet
140 |         net = shufflenet()
141 |     elif args.net == 'shufflenetv2':
142 |         from models.shufflenetv2 import shufflenetv2
143 |         net = shufflenetv2()
144 |     elif args.net == 'squeezenet':
145 |         from models.squeezenet import squeezenet
146 |         net = squeezenet()
147 |     elif args.net == 'mobilenet':
148 |         from models.mobilenet import mobilenet
149 |         net = mobilenet()
150 |     elif args.net == 'mobilenetv2':
151 |         from models.mobilenetv2 import mobilenetv2
152 |         net = mobilenetv2()
153 |     elif args.net == 'nasnet':
154 |         from models.nasnet import nasnet
155 |         net = nasnet()
156 |     elif args.net == 'attention56':
157 |         from models.attention import attention56
158 |         net = attention56()
159 |     elif args.net == 'attention92':
160 |         from models.attention import attention92
161 |         net = attention92()
162 |     elif args.net == 'seresnet18':
163 |         from models.senet import seresnet18
164 |         net = seresnet18()
165 |     elif args.net == 'seresnet34':
166 |         from models.senet import seresnet34
167 |         net = seresnet34()
168 |     elif args.net == 'seresnet50':
169 |         from models.senet import seresnet50
170 |         net = seresnet50()
171 |     elif args.net == 'seresnet101':
172 |         from models.senet import seresnet101
173 |         net = seresnet101()
174 |     elif args.net == 'seresnet152':
175 |         from models.senet import seresnet152
176 |         net = seresnet152()
177 |     elif args.net == 'wideresnet':
178 |         from models.wideresidual import wideresnet
179 |         net = wideresnet()
180 |     elif args.net == 'stochasticdepth18':
181 |         from models.stochasticdepth import stochastic_depth_resnet18
182 |         net = stochastic_depth_resnet18()
183 |     elif args.net == 'stochasticdepth34':
184 |         from models.stochasticdepth import stochastic_depth_resnet34
185 |         net = stochastic_depth_resnet34()
186 |     elif args.net == 'stochasticdepth50':
187 |         from models.stochasticdepth import stochastic_depth_resnet50
188 |         net = stochastic_depth_resnet50()
189 |     elif args.net == 'stochasticdepth101':
190 |         from models.stochasticdepth import stochastic_depth_resnet101
191 |         net = stochastic_depth_resnet101()
192 | 
193 |     else:
194 |         print('the network name you have entered is not supported yet')
195 |         sys.exit()
196 | 
197 |     if args.gpu: #use_gpu
198 |         net = net.cuda()
199 | 
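    # example: get_network(argparse.Namespace(net='resnet18', gpu=False))
    # returns a CPU-resident resnet18 configured for CIFAR-100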
200 |     return net
201 | 
202 | 
203 | def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
204 |     """ return training dataloader
205 |     Args:
206 |         mean: mean of cifar100 training dataset
207 |         std: std of cifar100 training dataset
208 |         path: path to cifar100 training python dataset
209 |         batch_size: dataloader batch size
210 |         num_workers: number of dataloader workers
211 |         shuffle: whether to shuffle
212 |     Returns: train_data_loader:torch dataloader object
213 |     """
214 | 
215 |     transform_train = transforms.Compose([
216 |         #transforms.ToPILImage(),
217 |         transforms.RandomCrop(32, padding=4),
218 |         transforms.RandomHorizontalFlip(),
219 |         transforms.RandomRotation(15),
220 |         transforms.ToTensor(),
221 |         transforms.Normalize(mean, std)
222 |     ])
223 |     #cifar100_training = CIFAR100Train(path, transform=transform_train)
224 |     cifar100_training = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
225 |     cifar100_training_loader = DataLoader(
226 |         cifar100_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)
227 | 
228 |     return cifar100_training_loader
229 | 
230 | def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
231 |     """ return test dataloader
232 |     Args:
233 |         mean: mean of cifar100 test dataset
234 |         std: std of cifar100 test dataset
235 |         path: path to cifar100 test python dataset
236 |         batch_size: dataloader batch size
237 |         num_workers: number of dataloader workers
238 |         shuffle: whether to shuffle
239 |     Returns: cifar100_test_loader:torch dataloader object
240 |     """
241 | 
242 |     transform_test = transforms.Compose([
243 |         transforms.ToTensor(),
244 |         transforms.Normalize(mean, std)
245 |     ])
246 |     #cifar100_test = CIFAR100Test(path, transform=transform_test)
247 |     cifar100_test = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
248 |     cifar100_test_loader = DataLoader(
249 |         cifar100_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)
250 | 
251 |     return cifar100_test_loader
252 | 
253 | def compute_mean_std(cifar100_dataset):
254 |     """compute the mean and std of cifar100 dataset
255 |     Args:
256 |         cifar100_training_dataset or cifar100_test_dataset
257 |         which is derived from class torch.utils.data
258 | 
259 |     Returns:
260 |         a tuple containing the mean and std of the entire dataset
261 |     """
262 | 
263 |     data_r = numpy.dstack([cifar100_dataset[i][1][:, :, 0] for i in range(len(cifar100_dataset))])  # assumes dataset[i] yields (label, image)
264 |     data_g = numpy.dstack([cifar100_dataset[i][1][:, :, 1] for i in range(len(cifar100_dataset))])
265 |     data_b = numpy.dstack([cifar100_dataset[i][1][:, :, 2] for i in range(len(cifar100_dataset))])
266 |     mean = numpy.mean(data_r), numpy.mean(data_g), numpy.mean(data_b)
267 |     std = numpy.std(data_r), numpy.std(data_g), numpy.std(data_b)
268 | 
269 |     return mean, std
270 | 
271 | class WarmUpLR(_LRScheduler):
272 |     """warmup_training learning rate scheduler
273 |     Args:
274 |         optimizer: optimizer (e.g. SGD)
275 |         total_iters: total number of warmup iterations
276 |     """
277 |     def __init__(self, optimizer, total_iters, last_epoch=-1):
278 | 
279 |         self.total_iters = total_iters
280 |         super().__init__(optimizer, last_epoch)
281 | 
282 |     def get_lr(self):
283 |         """we will use the first m batches, and set the learning
284 |         rate to base_lr * m / total_iters
285 |         """
286 |         return [base_lr * self.last_epoch / (self.total_iters + 1e-8) for base_lr in self.base_lrs]
287 | 
288 | 
289 | def most_recent_folder(net_weights, fmt):
290 |     """
291 |         return the most recently created folder under net_weights
292 |         if no non-empty folder is found, return an empty string
293 |     """
294 |     # get subfolders in net_weights
295 |     folders = os.listdir(net_weights)
296 | 
297 |     # filter out empty folders
298 |     folders = [f for f in folders if len(os.listdir(os.path.join(net_weights, f)))]
299 |     if len(folders) == 0:
300 |         return ''
301 | 
302 |     # sort folders by folder created time
303 |     folders = sorted(folders, key=lambda f: datetime.datetime.strptime(f, fmt))
304 |     return folders[-1]
305 | 
306 | def most_recent_weights(weights_folder):
307 |     """
308 |         return the most recently created weights file
309 |         if the folder is empty, return an empty string
310 |     """
311 |     weight_files = os.listdir(weights_folder)
312 |     if len(weight_files) == 0:
313 |         return ''
314 | 
315 |     regex_str = r'([A-Za-z0-9]+)-([0-9]+)-(regular|best)'
316 | 
317 |     # sort files by epoch
318 |     weight_files = sorted(weight_files, key=lambda w: int(re.search(regex_str, w).groups()[1]))
319 | 
320 |     return weight_files[-1]
321 | 
322 | def last_epoch(weights_folder):
323 |     weight_file = most_recent_weights(weights_folder)
324 |     if not weight_file:
325 |         raise Exception('no recent weights were found')
326 |     resume_epoch = int(weight_file.split('-')[1])
327 | 
328 |     return resume_epoch
329 | 
330 | def best_acc_weights(weights_folder):
331 |     """
332 |         return the best acc .pth file in the given folder, if no
333 |         best acc weights file is found, return an empty string
334 |     """
335 |     files = os.listdir(weights_folder)
336 |     if len(files) == 0:
337 |         return ''
338 | 
339 |     regex_str = r'([A-Za-z0-9]+)-([0-9]+)-(regular|best)'
340 |     best_files = [w for w in files if re.search(regex_str, w).groups()[2] == 'best']
341 |     if len(best_files) == 0:
342 |         return ''
343 | 
344 |     best_files = sorted(best_files, key=lambda w: int(re.search(regex_str, w).groups()[1]))
345 |     return best_files[-1]
--------------------------------------------------------------------------------
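The `WarmUpLR` scheduler in utils.py scales each base learning rate linearly with the number of batches seen. A minimal sketch of the resulting ramp, assuming a base lr of 0.1 and a 10-iteration warmup (the dummy parameter exists only so the optimizer has something to schedule):

```python
import torch
from utils import WarmUpLR

param = torch.nn.Parameter(torch.zeros(1))      # dummy parameter
optimizer = torch.optim.SGD([param], lr=0.1)
scheduler = WarmUpLR(optimizer, total_iters=10)

for step in range(1, 11):
    optimizer.step()
    scheduler.step()
    # lr ramps linearly: ~0.01 after step 1, ..., ~0.1 after step 10
    print(step, optimizer.param_groups[0]['lr'])
```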