├── .gitignore ├── LICENSE ├── README.md ├── alexnet ├── alex.py └── inference.py ├── googlenet ├── googlenet.py └── inference.py ├── inception ├── inception.py └── inference.py ├── lenet ├── inference.py └── lenet5.py ├── mlp ├── inference.py └── mlp.py ├── mnasnet ├── inference.py └── mnasnet.py ├── mobilenet ├── README.md ├── inference.py ├── mobilenetv2.py └── mobilenetv3.py ├── pytorch-yolo-v3 ├── .gitignore ├── README-Ayoosh.md ├── README.md ├── __init__.py ├── bbox.py ├── cam_demo.py ├── cfg │ ├── tiny-yolo-voc.cfg │ ├── yolo-voc.cfg │ ├── yolo.cfg │ └── yolov3.cfg ├── darknet.py ├── data │ ├── coco.names │ └── voc.names ├── det_messi.jpg ├── detect.py ├── imgs │ ├── dog.jpg │ ├── eagle.jpg │ ├── giraffe.jpg │ ├── herd_of_horses.jpg │ ├── img1.jpg │ ├── img2.jpg │ ├── img3.jpg │ ├── img4.jpg │ ├── messi.jpg │ ├── person.jpg │ └── scream.jpg ├── inference.py ├── pallete ├── preprocess.py ├── util.py ├── video_demo.py └── video_demo_half.py ├── resnet ├── resnet18 │ ├── inference.py │ └── resnet18.py ├── resnet34 │ ├── inference.py │ └── resnet34.py ├── resnet50 │ ├── inference.py │ └── resnet50.py └── resnext50_32x4d │ ├── inference.py │ └── resnext50.py ├── shufflenet ├── inference.py └── shufflenet.py ├── squeezenet ├── inference.py └── squeezenet.py └── vgg ├── inference.py └── vgg.py /.gitignore: -------------------------------------------------------------------------------- 1 | pytorch-summary/ 2 | */*.pkl 3 | */*.pth 4 | */*.wts 5 | */__pycache__/ 6 | */log* 7 | */*/*.pkl 8 | */*/*.pth 9 | */*/*.wts 10 | */*/__pycache__/ 11 | */*/log* 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wang Xinyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorchx 2 | 3 | This is a brother project with [wang-xinyu/tensorrtx](https://github.com/wang-xinyu/tensorrtx). 4 | 5 | Popular deep learning networks are implemented with pytorch in this project. And then weights files are exported for tensorrt implementation. 6 | 7 | ## Test Environments 8 | 1. Python 3.7.3 9 | 2. cuda 10.0 10 | 3. PyTorch 1.3.0 11 | 4. 
torchvision 0.4.1 12 | 13 | ## Prepare pytorch-summary 14 | 15 | pytorch-summary is a very useful tool for understanding the model structure; for example, it can output the dimensions of each layer. 16 | 17 | Clone the repo, `cd` into it, and install: 18 | 19 | ``` 20 | git clone https://github.com/sksq96/pytorch-summary 21 | python setup.py build 22 | python setup.py install 23 | ``` 24 | 25 | ## Run 26 | 27 | Most of the models are from torchvision, except for yolov3, which has its own readme inside. 28 | 29 | A file named `xxxnet.py` runs inference and saves the model as a .pth file. 30 | A file named `inference.py` then runs inference and saves the weights as a .wts file, which is used by tensorrt. 31 | 32 | For example, for googlenet: 33 | 34 | ``` 35 | cd googlenet 36 | python googlenet.py # do inference and save the model as .pth first 37 | python inference.py # then do inference and save the weights file as .wts 38 | ``` 39 | -------------------------------------------------------------------------------- /alexnet/alex.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.alexnet(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('alexnet out:', out.shape) 16 | torch.save(net, "alexnet.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /alexnet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('alexnet.pth') 11 | net = net.to('cuda:0') 12 | net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | #for l in list(net.classifier.modules())[1:]: 19 | # print('-', l) 20 | 21 | print('output:', out) 22 | 23 | summary(net, (3, 224, 224)) 24 | 25 | f = open("alexnet.wts", 'w') 26 | f.write("{}\n".format(len(net.state_dict().keys()))) 27 | for k,v in net.state_dict().items(): 28 | print('key: ', k) 29 | print('value: ', v.shape) 30 | vr = v.reshape(-1).cpu().numpy() 31 | f.write("{} {}".format(k, len(vr))) 32 | for vv in vr: 33 | f.write(" ") 34 | f.write(struct.pack(">f", float(vv)).hex()) 35 | f.write("\n") 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /googlenet/googlenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.googlenet(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('googlenet out:', out.shape) 16 | torch.save(net, "googlenet.pth") 17 | 18 | if
__name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /googlenet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('googlenet.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 224, 224)) 22 | #return 23 | f = open("googlenet.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /inception/inception.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.inception_v3(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(2, 3, 299, 299).to('cuda:0') 14 | out = net(tmp) 15 | print('inception out:', out.shape) 16 | torch.save(net, "inception.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /inception/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('inception.pth') 11 | net = net.to('cuda:1') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 299, 299).to('cuda:1') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 299, 299), device='cuda:1') 22 | #return 23 | f = open("inception.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /lenet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from lenet5 import Lenet5 4 | import os 5 | import struct 6 | 7 | def main(): 8 | print('cuda device count: ', torch.cuda.device_count()) 9 | net = 
torch.load('lenet5.pth') 10 | net = net.to('cuda:0') 11 | net.eval() 12 | #print('model: ', net) 13 | #print('state dict: ', net.state_dict()['conv1.weight']) 14 | tmp = torch.ones(1, 1, 32, 32).to('cuda:0') 15 | #print('input: ', tmp) 16 | out = net(tmp) 17 | print('lenet out:', out) 18 | 19 | f = open("lenet5.wts", 'w') 20 | f.write("{}\n".format(len(net.state_dict().keys()))) 21 | for k,v in net.state_dict().items(): 22 | #print('key: ', k) 23 | #print('value: ', v.shape) 24 | vr = v.reshape(-1).cpu().numpy() 25 | f.write("{} {}".format(k, len(vr))) 26 | for vv in vr: 27 | f.write(" ") 28 | f.write(struct.pack(">f", float(vv)).hex()) 29 | f.write("\n") 30 | 31 | if __name__ == '__main__': 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /lenet/lenet5.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | class Lenet5(nn.Module): 6 | """ 7 | for cifar10 dataset. 8 | """ 9 | def __init__(self): 10 | super(Lenet5, self).__init__() 11 | 12 | self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0) 13 | self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2, padding=0) 14 | self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0) 15 | self.fc1 = nn.Linear(16*5*5, 120) 16 | self.fc2 = nn.Linear(120, 84) 17 | self.fc3 = nn.Linear(84, 10) 18 | 19 | def forward(self, x): 20 | print('input: ', x.shape) 21 | x = F.relu(self.conv1(x)) 22 | print('conv1',x.shape) 23 | x = self.pool1(x) 24 | print('pool1: ', x.shape) 25 | x = F.relu(self.conv2(x)) 26 | print('conv2',x.shape) 27 | x = self.pool1(x) 28 | print('pool2',x.shape) 29 | x = x.view(x.size(0), -1) 30 | print('view: ', x.shape) 31 | x = F.relu(self.fc1(x)) 32 | print('fc1: ', x.shape) 33 | x = F.relu(self.fc2(x)) 34 | x = F.softmax(self.fc3(x), dim=1) 35 | return x 36 | 37 | def main(): 38 | print('cuda device count: ', torch.cuda.device_count()) 39 | torch.manual_seed(1234) 40 | net = Lenet5() 41 | net = net.to('cuda:0') 42 | net.eval() 43 | tmp = torch.ones(1, 1, 32, 32).to('cuda:0') 44 | out = net(tmp) 45 | print('lenet out shape:', out.shape) 46 | print('lenet out:', out) 47 | torch.save(net, "lenet5.pth") 48 | 49 | if __name__ == '__main__': 50 | main() 51 | 52 | -------------------------------------------------------------------------------- /mlp/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mlp import LinearRegressionModel # This is a must import for torch to load model 3 | import struct 4 | 5 | 6 | def load_model(model_path=''): 7 | """ 8 | Load saved model from file 9 | :param model_path: mlp.pth prepared using mlp.py 10 | :return net: loaded model 11 | """ 12 | print(f'[INFO]: Loading saved model...') 13 | net = torch.load(model_path) 14 | net = net.to('cuda:0') 15 | net.eval() 16 | return net 17 | 18 | 19 | def test_model(mlp_model): 20 | """ 21 | Test model on custom input 22 | :param mlp_model: pre-trained model 23 | :return: 24 | """ 25 | print(f'[INFO]: Testing model on sample input...') 26 | tmp = torch.ones(1, 1).to('cuda:0') 27 | out = mlp_model(tmp) 28 | print(f'[INFO]: Test Result is: ', out.detach().cpu().numpy()) 29 | 30 | 31 | def convert_to_wts(mlp_model): 32 | """ 33 | Convert weights to .wts format for TensorRT Engine 34 | Weights are written in the following format: 35 | 36 | weight.name ... 
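(that is: a first line containing the total-weights-count, then one line per tensor: weight.name, weight-count, and the weight values weight-val1 weight-val2 ..., each value written as a big-endian float32 in hex)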
37 | 38 | -- total-weights-count: is an integer 39 | -- weight.name: is used as key in TensorRT engine 40 | -- weight-count: no. of weights for current layer 41 | -- weight-valxx: float to c-bytes to hexadecimal 42 | 43 | :param mlp_model: pre-trained model 44 | :return: 45 | """ 46 | print(f'[INFO]: Writing weights to .wts ...') 47 | with open('mlp.wts', 'w') as f: 48 | f.write(f'{len(mlp_model.state_dict().keys())}\n') 49 | for k, v in mlp_model.state_dict().items(): 50 | vr = v.reshape(-1).cpu().numpy() 51 | f.write(f'{k} {len(vr)}') 52 | for vv in vr: 53 | f.write(" ") 54 | # convert weights to c-structs 55 | # Big-Endian (byte values) to Hex 56 | f.write(struct.pack('>f', float(vv)).hex()) 57 | f.write('\n') 58 | print('[INFO]: Successfully, converted weights to WTS ') 59 | 60 | 61 | def main(): 62 | mlp_model = load_model('mlp.pth') 63 | test_model(mlp_model) 64 | convert_to_wts(mlp_model) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | 70 | -------------------------------------------------------------------------------- /mlp/mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | class LinearRegressionModel(torch.nn.Module): 6 | """ 7 | Linear Regression Model: 8 | * - input a number 9 | * - output its double 10 | """ 11 | 12 | def __init__(self): 13 | """ 14 | Initialize the model 15 | """ 16 | super(LinearRegressionModel, self).__init__() 17 | 18 | # A single linear layer with one weight and one bias 19 | self.linear = torch.nn.Linear(1, 1) 20 | 21 | def forward(self, x): 22 | """ 23 | Forward function of linear model 24 | :param x: input 25 | :return y_pred: predictions 26 | """ 27 | y_pred = self.linear(x) 28 | return y_pred 29 | 30 | 31 | def get_dataset(): 32 | """ 33 | Create a simple dataset 34 | * - x: numbers 35 | * - y: the labels as 2x of numbers 36 | :return dataset: 37 | """ 38 | print("[INFO]: Building dataset...") 39 | x_data = Variable(torch.Tensor([[1.0], [2.0], [3.0]])) 40 | y_data = Variable(torch.Tensor([[2.0], [4.0], [6.0]])) 41 | return x_data, y_data 42 | 43 | 44 | def get_hyperparams(model): 45 | """ 46 | Create some configurations for model 47 | :param model: 48 | :return loss_function and optimizer: 49 | """ 50 | print("[INFO]: Preparing configurations...") 51 | criterion = torch.nn.MSELoss(size_average=False) 52 | optimizer = torch.optim.SGD(model.parameters(), lr=0.01) 53 | return criterion, optimizer 54 | 55 | 56 | def train(model=None, 57 | x=None, 58 | y=None, 59 | criterion=None, 60 | optimizer=None): 61 | """ 62 | Train the given model 63 | :param model: a pytorch model 64 | :param x: dataset 65 | :param y: labels 66 | :param criterion: loss function 67 | :param optimizer: optimization technique 68 | :return: 69 | """ 70 | print("[INFO]: Starting training...\n") 71 | for epoch in range(500): 72 | # Forward pass: Compute predicted y by passing x to the model 73 | pred_y = model(x) 74 | 75 | # Compute and print loss 76 | loss = criterion(pred_y, y) 77 | 78 | # Zero gradients, 79 | optimizer.zero_grad() 80 | 81 | # perform a backward pass 82 | loss.backward() 83 | 84 | # update the weights. 
85 | optimizer.step() 86 | 87 | # print loss after 50 epochs 88 | if epoch % 50 == 0: 89 | print('[INFO]: Epoch {}, Loss {}'.format(epoch, loss.item())) 90 | 91 | 92 | def check_inference(mlp_model): 93 | """ 94 | Check inference on the given model, currently with a static input 95 | :param mlp_model: 96 | :return: 97 | """ 98 | new_var = Variable(torch.Tensor([[4.0]])) 99 | print("\n[INFO]: Predicted (after training) \n\tinput: ", 4,"\n\toutput:", mlp_model(new_var).item()) 100 | 101 | 102 | def main(): 103 | # initialize the model 104 | mlp_model = LinearRegressionModel() 105 | # create the dataset 106 | x, y = get_dataset() 107 | # prepare configs 108 | criterion, optim = get_hyperparams(mlp_model) 109 | # train the model 110 | train(model=mlp_model, 111 | x=x, y=y, 112 | criterion=criterion, optimizer=optim) 113 | # check inference 114 | check_inference(mlp_model) 115 | # save the model 116 | torch.save(mlp_model, "mlp.pth") 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | 122 | -------------------------------------------------------------------------------- /mnasnet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('mnasnet.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 224, 224)) 22 | #return 23 | f = open("mnasnet.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /mnasnet/mnasnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.mnasnet0_5(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('mnasnet out:', out.shape) 16 | torch.save(net, "mnasnet.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /mobilenet/README.md: -------------------------------------------------------------------------------- 1 | ## Test Environments 2 | 1. Python==3.7.3 3 | 2. cuda>=10.2 4 | 3. PyTorch>=1.8.0 5 | 4. 
torchvision==0.9.1 -------------------------------------------------------------------------------- /mobilenet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('mobilenet.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 224, 224)) 22 | #return 23 | f = open("mobilenet.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.mobilenet_v2(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('mobilenet out:', out.shape) 16 | torch.save(net, "mobilenetv2.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.mobilenet_v3_small(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('mobilenet out:', out.shape) 16 | torch.save(net, "mobilenetv3.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | *.weights 103 | *.wts* 104 | imgss 105 | *.png 106 | det 107 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/README-Ayoosh.md: -------------------------------------------------------------------------------- 1 | # A PyTorch implementation of a YOLO v3 Object Detector 2 | 3 | [UPDATE] : This repo serves as a driver code for my research. I just graduated college, and am very busy looking for research internship / fellowship roles before eventually applying for a masters. I won't have the time to look into issues for the time being. Thank you. 4 | 5 | 6 | This repository contains code for a object detector based on [YOLOv3: An Incremental Improvement](https://pjreddie.com/media/files/papers/YOLOv3.pdf), implementedin PyTorch. The code is based on the official code of [YOLO v3](https://github.com/pjreddie/darknet), as well as a PyTorch 7 | port of the original code, by [marvis](https://github.com/marvis/pytorch-yolo2). One of the goals of this code is to improve 8 | upon the original port by removing redundant parts of the code (The official code is basically a fully blown deep learning 9 | library, and includes stuff like sequence models, which are not used in YOLO). I've also tried to keep the code minimal, and 10 | document it as well as I can. 11 | 12 | ### Tutorial for building this detector from scratch 13 | If you want to understand how to implement this detector by yourself from scratch, then you can go through this very detailed 5-part tutorial series I wrote on Paperspace. Perfect for someone who wants to move from beginner to intermediate pytorch skills. 14 | 15 | [Implement YOLO v3 from scratch](https://blog.paperspace.com/how-to-implement-a-yolo-object-detector-in-pytorch/) 16 | 17 | As of now, the code only contains the detection module, but you should expect the training module soon. :) 18 | 19 | ## Requirements 20 | 1. Python 3.5 21 | 2. OpenCV 22 | 3. PyTorch 0.4 23 | 24 | Using PyTorch 0.3 will break the detector. 25 | 26 | 27 | 28 | ## Detection Example 29 | 30 | ![Detection Example](https://i.imgur.com/m2jwneng.png) 31 | ## Running the detector 32 | 33 | ### On single or multiple images 34 | 35 | Clone, and `cd` into the repo directory. 
The first thing you need to do is to get the weights file. 36 | This time around, for v3, the authors have supplied a weights file only for COCO, available [here](https://pjreddie.com/media/files/yolov3.weights); download it and place 37 | 38 | it into your repo directory. Or, if you're on Linux, you could just type 39 | 40 | ``` 41 | wget https://pjreddie.com/media/files/yolov3.weights 42 | python detect.py --images imgs --det det 43 | ``` 44 | 45 | 46 | The `--images` flag defines the directory to load images from, or a single image file (it will figure it out), and `--det` is the directory 47 | to save images to. Other settings such as batch size (using the `--bs` flag) and object confidence threshold can be tweaked with flags that can be looked up with: 48 | 49 | ``` 50 | python detect.py -h 51 | ``` 52 | 53 | ### Speed Accuracy Tradeoff 54 | You can change the resolution of the input image with the `--reso` flag. The default value is 416. Whatever value you choose, remember **it should be a multiple of 32 and greater than 32**. Weird things will happen if you don't. You've been warned. 55 | 56 | ``` 57 | python detect.py --images imgs --det det --reso 320 58 | ``` 59 | 60 | ### On Video 61 | For this, you should run the file video_demo.py with the --video flag specifying the video file. The video file should be in .avi format, 62 | since that is the input format OpenCV accepts here. 63 | 64 | ``` 65 | python video_demo.py --video video.avi 66 | ``` 67 | 68 | Tweakable settings can be seen with the -h flag. 69 | 70 | ### Speeding up Video Inference 71 | 72 | To speed up video inference, you can try using the video_demo_half.py file instead, which does all the inference with 16-bit half 73 | precision floats instead of 32-bit floats. I haven't seen big improvements, but I attribute that to having an older card 74 | (Tesla K80, Kepler arch). If you have one of the cards with fast float16 support, try it out, and if possible, benchmark it. 75 | 76 | ### On a Camera 77 | Same as the video module, but you don't have to specify a video file, since the feed will be taken from your camera. To be precise, 78 | the feed will be taken from what OpenCV recognises as camera 0. The default image resolution is 160 here, though you can change it with the `reso` flag. 79 | 80 | ``` 81 | python cam_demo.py 82 | ``` 83 | You can easily tweak the code to use different weights files, available at the [yolo website](https://pjreddie.com/darknet/yolo/) 84 | 85 | NOTE: The scales feature has been disabled for better refactoring. 86 | ### Detection across different scales 87 | YOLO v3 makes detections across different scales, each of which specialises in detecting objects of different sizes, depending upon whether they capture coarse features, fine-grained features, or something in between. You can experiment with these scales via the `--scales` flag. 88 | 89 | ``` 90 | python detect.py --scales 1,3 91 | ``` 92 | 93 | 94 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/README.md: -------------------------------------------------------------------------------- 1 | # Pytorch YOLO v3 2 | 3 | This is forked from [ayooshkathuria/pytorch-yolo-v3](https://github.com/ayooshkathuria/pytorch-yolo-v3) 4 | 5 | I added `inference.py` to do inference on one image and export the weights file for tensorrt. 6 | 7 | ## Test Environments 8 | 1. Python 3.7.3 9 | 2. cuda 10.0 10 | 3. PyTorch 1.3.0 11 | 4. torchvision 0.4.1 12 | 13 | ## Run 14 | 15 | Clone, and `cd` into the repo directory.
The first thing you need to do is to get the weights file 16 | This time around, for v3, authors has supplied a weightsfile only for COCO [here](https://pjreddie.com/media/files/yolov3.weights), and place 17 | 18 | the weights file into your repo directory. Or, you could just type (if you're on Linux) 19 | 20 | ``` 21 | wget https://pjreddie.com/media/files/yolov3.weights 22 | python inference.py 23 | ``` 24 | 25 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/__init__.py -------------------------------------------------------------------------------- /pytorch-yolo-v3/bbox.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import random 5 | 6 | import numpy as np 7 | import cv2 8 | 9 | def confidence_filter(result, confidence): 10 | conf_mask = (result[:,:,4] > confidence).float().unsqueeze(2) 11 | result = result*conf_mask 12 | 13 | return result 14 | 15 | def confidence_filter_cls(result, confidence): 16 | max_scores = torch.max(result[:,:,5:25], 2)[0] 17 | res = torch.cat((result, max_scores),2) 18 | print(res.shape) 19 | 20 | 21 | cond_1 = (res[:,:,4] > confidence).float() 22 | cond_2 = (res[:,:,25] > 0.995).float() 23 | 24 | conf = cond_1 + cond_2 25 | conf = torch.clamp(conf, 0.0, 1.0) 26 | conf = conf.unsqueeze(2) 27 | result = result*conf 28 | return result 29 | 30 | 31 | 32 | def get_abs_coord(box): 33 | box[2], box[3] = abs(box[2]), abs(box[3]) 34 | x1 = (box[0] - box[2]/2) - 1 35 | y1 = (box[1] - box[3]/2) - 1 36 | x2 = (box[0] + box[2]/2) - 1 37 | y2 = (box[1] + box[3]/2) - 1 38 | return x1, y1, x2, y2 39 | 40 | 41 | 42 | def sanity_fix(box): 43 | if (box[0] > box[2]): 44 | box[0], box[2] = box[2], box[0] 45 | 46 | if (box[1] > box[3]): 47 | box[1], box[3] = box[3], box[1] 48 | 49 | return box 50 | 51 | def bbox_iou(box1, box2): 52 | """ 53 | Returns the IoU of two bounding boxes 54 | 55 | 56 | """ 57 | #Get the coordinates of bounding boxes 58 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3] 59 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3] 60 | 61 | #get the corrdinates of the intersection rectangle 62 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 63 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 64 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 65 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 66 | 67 | #Intersection area 68 | if torch.cuda.is_available(): 69 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape).cuda())*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape).cuda()) 70 | else: 71 | inter_area = torch.max(inter_rect_x2 - inter_rect_x1 + 1,torch.zeros(inter_rect_x2.shape))*torch.max(inter_rect_y2 - inter_rect_y1 + 1, torch.zeros(inter_rect_x2.shape)) 72 | 73 | #Union Area 74 | b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1) 75 | b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1) 76 | 77 | iou = inter_area / (b1_area + b2_area - inter_area) 78 | 79 | return iou 80 | 81 | 82 | def pred_corner_coord(prediction): 83 | #Get indices of non-zero confidence bboxes 84 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 85 | 86 | box = prediction[ind_nz[0], ind_nz[1]] 87 | 88 | 89 | box_a = box.new(box.shape) 90 
| box_a[:,0] = (box[:,0] - box[:,2]/2) 91 | box_a[:,1] = (box[:,1] - box[:,3]/2) 92 | box_a[:,2] = (box[:,0] + box[:,2]/2) 93 | box_a[:,3] = (box[:,1] + box[:,3]/2) 94 | box[:,:4] = box_a[:,:4] 95 | 96 | prediction[ind_nz[0], ind_nz[1]] = box 97 | 98 | return prediction 99 | 100 | 101 | 102 | 103 | def write(x, batches, results, colors, classes): 104 | c1 = tuple(x[1:3].int()) 105 | c2 = tuple(x[3:5].int()) 106 | img = results[int(x[0])] 107 | cls = int(x[-1]) 108 | label = "{0}".format(classes[cls]) 109 | color = random.choice(colors) 110 | cv2.rectangle(img, c1, c2,color, 1) 111 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 112 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 113 | cv2.rectangle(img, c1, c2,color, -1) 114 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 115 | return img 116 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/cam_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from darknet import Darknet 10 | from preprocess import prep_image, inp_to_image 11 | import pandas as pd 12 | import random 13 | import argparse 14 | import pickle as pkl 15 | 16 | def get_test_input(input_dim, CUDA): 17 | img = cv2.imread("imgs/messi.jpg") 18 | img = cv2.resize(img, (input_dim, input_dim)) 19 | img_ = img[:,:,::-1].transpose((2,0,1)) 20 | img_ = img_[np.newaxis,:,:,:]/255.0 21 | img_ = torch.from_numpy(img_).float() 22 | img_ = Variable(img_) 23 | 24 | if CUDA: 25 | img_ = img_.cuda() 26 | 27 | return img_ 28 | 29 | def prep_image(img, inp_dim): 30 | """ 31 | Prepare image for inputting to the neural network. 32 | 33 | Returns a Variable 34 | """ 35 | 36 | orig_im = img 37 | dim = orig_im.shape[1], orig_im.shape[0] 38 | img = cv2.resize(orig_im, (inp_dim, inp_dim)) 39 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 40 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 41 | return img_, orig_im, dim 42 | 43 | def write(x, img): 44 | c1 = tuple(x[1:3].int()) 45 | c2 = tuple(x[3:5].int()) 46 | cls = int(x[-1]) 47 | label = "{0}".format(classes[cls]) 48 | color = random.choice(colors) 49 | cv2.rectangle(img, c1, c2,color, 1) 50 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 51 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 52 | cv2.rectangle(img, c1, c2,color, -1) 53 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 54 | return img 55 | 56 | def arg_parse(): 57 | """ 58 | Parse arguements to the detect module 59 | 60 | """ 61 | 62 | 63 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 64 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.25) 65 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 66 | parser.add_argument("--reso", dest = 'reso', help = 67 | "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", 68 | default = "160", type = str) 69 | return parser.parse_args() 70 | 71 | 72 | 73 | if __name__ == '__main__': 74 | cfgfile = "cfg/yolov3.cfg" 75 | weightsfile = "yolov3.weights" 76 | num_classes = 80 77 | 78 | args = arg_parse() 79 | confidence = float(args.confidence) 80 | nms_thesh = float(args.nms_thresh) 81 | start = 0 82 | CUDA = torch.cuda.is_available() 83 | 84 | 85 | 86 | 87 | num_classes = 80 88 | bbox_attrs = 5 + num_classes 89 | 90 | model = Darknet(cfgfile) 91 | model.load_weights(weightsfile) 92 | 93 | model.net_info["height"] = args.reso 94 | inp_dim = int(model.net_info["height"]) 95 | 96 | assert inp_dim % 32 == 0 97 | assert inp_dim > 32 98 | 99 | if CUDA: 100 | model.cuda() 101 | 102 | model.eval() 103 | 104 | videofile = 'video.avi' 105 | 106 | cap = cv2.VideoCapture(0) 107 | 108 | assert cap.isOpened(), 'Cannot capture source' 109 | 110 | frames = 0 111 | start = time.time() 112 | while cap.isOpened(): 113 | 114 | ret, frame = cap.read() 115 | if ret: 116 | 117 | img, orig_im, dim = prep_image(frame, inp_dim) 118 | 119 | # im_dim = torch.FloatTensor(dim).repeat(1,2) 120 | 121 | 122 | if CUDA: 123 | im_dim = im_dim.cuda() 124 | img = img.cuda() 125 | 126 | 127 | output = model(Variable(img), CUDA) 128 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 129 | 130 | if type(output) == int: 131 | frames += 1 132 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 133 | cv2.imshow("frame", orig_im) 134 | key = cv2.waitKey(1) 135 | if key & 0xFF == ord('q'): 136 | break 137 | continue 138 | 139 | 140 | 141 | output[:,1:5] = torch.clamp(output[:,1:5], 0.0, float(inp_dim))/inp_dim 142 | 143 | # im_dim = im_dim.repeat(output.size(0), 1) 144 | output[:,[1,3]] *= frame.shape[1] 145 | output[:,[2,4]] *= frame.shape[0] 146 | 147 | 148 | classes = load_classes('data/coco.names') 149 | colors = pkl.load(open("pallete", "rb")) 150 | 151 | list(map(lambda x: write(x, orig_im), output)) 152 | 153 | 154 | cv2.imshow("frame", orig_im) 155 | key = cv2.waitKey(1) 156 | if key & 0xFF == ord('q'): 157 | break 158 | frames += 1 159 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 160 | 161 | 162 | else: 163 | break 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | 
batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | 
stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | 
activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | 
size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | 
stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | 
[convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 
701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/darknet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import count_parameters as count 11 | from util import convert2cpu as cpu 12 | from util import predict_transform 13 | 14 | class test_net(nn.Module): 15 | def __init__(self, num_layers, input_size): 16 | super(test_net, self).__init__() 17 | self.num_layers= num_layers 18 | self.linear_1 = nn.Linear(input_size, 5) 19 | self.middle = nn.ModuleList([nn.Linear(5,5) for x in range(num_layers)]) 20 | self.output = nn.Linear(5,2) 21 | 22 | def forward(self, x): 23 | x = x.view(-1) 24 | fwd = nn.Sequential(self.linear_1, *self.middle, self.output) 25 | return fwd(x) 26 | 27 | def get_test_input(): 28 | img = cv2.imread("dog-cycle-car.png") 29 | img = cv2.resize(img, (416,416)) 30 | img_ = img[:,:,::-1].transpose((2,0,1)) 31 | img_ = img_[np.newaxis,:,:,:]/255.0 32 | img_ = torch.from_numpy(img_).float() 33 | img_ = Variable(img_) 34 | return img_ 35 | 36 | 37 | def parse_cfg(cfgfile): 38 | """ 39 | Takes a configuration file 40 | 41 | Returns a list of blocks. Each blocks describes a block in the neural 42 | network to be built. 
Block is represented as a dictionary in the list 43 | 44 | """ 45 | file = open(cfgfile, 'r') 46 | lines = file.read().split('\n') #store the lines in a list 47 | lines = [x for x in lines if len(x) > 0] #get read of the empty lines 48 | lines = [x for x in lines if x[0] != '#'] 49 | lines = [x.rstrip().lstrip() for x in lines] 50 | 51 | 52 | block = {} 53 | blocks = [] 54 | 55 | for line in lines: 56 | if line[0] == "[": #This marks the start of a new block 57 | if len(block) != 0: 58 | blocks.append(block) 59 | block = {} 60 | block["type"] = line[1:-1].rstrip() 61 | else: 62 | key,value = line.split("=") 63 | block[key.rstrip()] = value.lstrip() 64 | blocks.append(block) 65 | 66 | return blocks 67 | # print('\n\n'.join([repr(x) for x in blocks])) 68 | 69 | import pickle as pkl 70 | 71 | class MaxPoolStride1(nn.Module): 72 | def __init__(self, kernel_size): 73 | super(MaxPoolStride1, self).__init__() 74 | self.kernel_size = kernel_size 75 | self.pad = kernel_size - 1 76 | 77 | def forward(self, x): 78 | padded_x = F.pad(x, (0,self.pad,0,self.pad), mode="replicate") 79 | pooled_x = nn.MaxPool2d(self.kernel_size, self.pad)(padded_x) 80 | return pooled_x 81 | 82 | 83 | class EmptyLayer(nn.Module): 84 | def __init__(self): 85 | super(EmptyLayer, self).__init__() 86 | 87 | 88 | class DetectionLayer(nn.Module): 89 | def __init__(self, anchors): 90 | super(DetectionLayer, self).__init__() 91 | self.anchors = anchors 92 | 93 | def forward(self, x, inp_dim, num_classes, confidence): 94 | x = x.data 95 | global CUDA 96 | prediction = x 97 | prediction = predict_transform(prediction, inp_dim, self.anchors, num_classes, confidence, CUDA) 98 | return prediction 99 | 100 | 101 | 102 | 103 | 104 | class Upsample(nn.Module): 105 | def __init__(self, stride=2): 106 | super(Upsample, self).__init__() 107 | self.stride = stride 108 | 109 | def forward(self, x): 110 | stride = self.stride 111 | assert(x.data.dim() == 4) 112 | B = x.data.size(0) 113 | C = x.data.size(1) 114 | H = x.data.size(2) 115 | W = x.data.size(3) 116 | ws = stride 117 | hs = stride 118 | x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H*stride, W*stride) 119 | return x 120 | # 121 | 122 | class ReOrgLayer(nn.Module): 123 | def __init__(self, stride = 2): 124 | super(ReOrgLayer, self).__init__() 125 | self.stride= stride 126 | 127 | def forward(self,x): 128 | assert(x.data.dim() == 4) 129 | B,C,H,W = x.data.shape 130 | hs = self.stride 131 | ws = self.stride 132 | assert(H % hs == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(H) 133 | assert(W % ws == 0), "The stride " + str(self.stride) + " is not a proper divisor of height " + str(W) 134 | x = x.view(B,C, H // hs, hs, W // ws, ws).transpose(-2,-3).contiguous() 135 | x = x.view(B,C, H // hs * W // ws, hs, ws) 136 | x = x.view(B,C, H // hs * W // ws, hs*ws).transpose(-1,-2).contiguous() 137 | x = x.view(B, C, ws*hs, H // ws, W // ws).transpose(1,2).contiguous() 138 | x = x.view(B, C*ws*hs, H // ws, W // ws) 139 | return x 140 | 141 | 142 | def create_modules(blocks): 143 | net_info = blocks[0] #Captures the information about the input and pre-processing 144 | 145 | module_list = nn.ModuleList() 146 | 147 | index = 0 #indexing blocks helps with implementing route layers (skip connections) 148 | 149 | 150 | prev_filters = 3 151 | 152 | output_filters = [] 153 | 154 | for x in blocks: 155 | module = nn.Sequential() 156 | 157 | if (x["type"] == "net"): 158 | continue 159 | 160 | #If it's a convolutional layer 161 | 
if (x["type"] == "convolutional"): 162 | #Get the info about the layer 163 | activation = x["activation"] 164 | try: 165 | batch_normalize = int(x["batch_normalize"]) 166 | bias = False 167 | except: 168 | batch_normalize = 0 169 | bias = True 170 | 171 | filters= int(x["filters"]) 172 | padding = int(x["pad"]) 173 | kernel_size = int(x["size"]) 174 | stride = int(x["stride"]) 175 | 176 | if padding: 177 | pad = (kernel_size - 1) // 2 178 | else: 179 | pad = 0 180 | 181 | #Add the convolutional layer 182 | conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias = bias) 183 | module.add_module("conv_{0}".format(index), conv) 184 | 185 | #Add the Batch Norm Layer 186 | if batch_normalize: 187 | bn = nn.BatchNorm2d(filters) 188 | module.add_module("batch_norm_{0}".format(index), bn) 189 | 190 | #Check the activation. 191 | #It is either Linear or a Leaky ReLU for YOLO 192 | if activation == "leaky": 193 | activn = nn.LeakyReLU(0.1, inplace = True) 194 | module.add_module("leaky_{0}".format(index), activn) 195 | 196 | 197 | 198 | #If it's an upsampling layer 199 | #We use Bilinear2dUpsampling 200 | 201 | elif (x["type"] == "upsample"): 202 | stride = int(x["stride"]) 203 | # upsample = Upsample(stride) 204 | upsample = nn.Upsample(scale_factor = 2, mode = "nearest") 205 | module.add_module("upsample_{}".format(index), upsample) 206 | 207 | #If it is a route layer 208 | elif (x["type"] == "route"): 209 | x["layers"] = x["layers"].split(',') 210 | 211 | #Start of a route 212 | start = int(x["layers"][0]) 213 | 214 | #end, if there exists one. 215 | try: 216 | end = int(x["layers"][1]) 217 | except: 218 | end = 0 219 | 220 | 221 | 222 | #Positive anotation 223 | if start > 0: 224 | start = start - index 225 | 226 | if end > 0: 227 | end = end - index 228 | 229 | 230 | route = EmptyLayer() 231 | module.add_module("route_{0}".format(index), route) 232 | 233 | 234 | 235 | if end < 0: 236 | filters = output_filters[index + start] + output_filters[index + end] 237 | else: 238 | filters= output_filters[index + start] 239 | 240 | 241 | 242 | #shortcut corresponds to skip connection 243 | elif x["type"] == "shortcut": 244 | from_ = int(x["from"]) 245 | shortcut = EmptyLayer() 246 | module.add_module("shortcut_{}".format(index), shortcut) 247 | 248 | 249 | elif x["type"] == "maxpool": 250 | stride = int(x["stride"]) 251 | size = int(x["size"]) 252 | if stride != 1: 253 | maxpool = nn.MaxPool2d(size, stride) 254 | else: 255 | maxpool = MaxPoolStride1(size) 256 | 257 | module.add_module("maxpool_{}".format(index), maxpool) 258 | 259 | #Yolo is the detection layer 260 | elif x["type"] == "yolo": 261 | mask = x["mask"].split(",") 262 | mask = [int(x) for x in mask] 263 | 264 | 265 | anchors = x["anchors"].split(",") 266 | anchors = [int(a) for a in anchors] 267 | anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors),2)] 268 | anchors = [anchors[i] for i in mask] 269 | 270 | detection = DetectionLayer(anchors) 271 | module.add_module("Detection_{}".format(index), detection) 272 | 273 | 274 | 275 | else: 276 | print("Something I dunno") 277 | assert False 278 | 279 | 280 | module_list.append(module) 281 | prev_filters = filters 282 | output_filters.append(filters) 283 | index += 1 284 | 285 | 286 | return (net_info, module_list) 287 | 288 | 289 | 290 | class Darknet(nn.Module): 291 | def __init__(self, cfgfile): 292 | super(Darknet, self).__init__() 293 | self.blocks = parse_cfg(cfgfile) 294 | self.net_info, self.module_list = create_modules(self.blocks) 295 | self.header = 
torch.IntTensor([0,0,0,0]) 296 | self.seen = 0 297 | 298 | 299 | 300 | def get_blocks(self): 301 | return self.blocks 302 | 303 | def get_module_list(self): 304 | print('get_module_list') 305 | return self.module_list 306 | 307 | 308 | def forward(self, x, CUDA = True): 309 | detections = [] 310 | modules = self.blocks[1:] 311 | outputs = {} #We cache the outputs for the route layer 312 | 313 | 314 | write = 0 315 | for i in range(len(modules)): 316 | 317 | 318 | module_type = (modules[i]["type"]) 319 | print('index -> ', i) 320 | print('type -> ', module_type) 321 | if module_type == "convolutional" or module_type == "upsample" or module_type == "maxpool": 322 | 323 | x = self.module_list[i](x) 324 | outputs[i] = x 325 | 326 | 327 | elif module_type == "route": 328 | layers = modules[i]["layers"] 329 | layers = [int(a) for a in layers] 330 | 331 | if (layers[0]) > 0: 332 | layers[0] = layers[0] - i 333 | 334 | if len(layers) == 1: 335 | x = outputs[i + (layers[0])] 336 | 337 | else: 338 | if (layers[1]) > 0: 339 | layers[1] = layers[1] - i 340 | 341 | map1 = outputs[i + layers[0]] 342 | map2 = outputs[i + layers[1]] 343 | 344 | 345 | x = torch.cat((map1, map2), 1) 346 | outputs[i] = x 347 | 348 | elif module_type == "shortcut": 349 | from_ = int(modules[i]["from"]) 350 | x = outputs[i-1] + outputs[i+from_] 351 | outputs[i] = x 352 | 353 | 354 | 355 | elif module_type == 'yolo': 356 | 357 | anchors = self.module_list[i][0].anchors 358 | #Get the input dimensions 359 | inp_dim = int (self.net_info["height"]) 360 | 361 | #Get the number of classes 362 | num_classes = int (modules[i]["classes"]) 363 | 364 | #Output the result 365 | x = x.data 366 | x = predict_transform(x, inp_dim, anchors, num_classes, CUDA) 367 | 368 | print('yolo shape: ', x.shape) 369 | print('yolo : ', x) 370 | 371 | if type(x) == int: 372 | continue 373 | 374 | 375 | if not write: 376 | detections = x 377 | write = 1 378 | 379 | else: 380 | detections = torch.cat((detections, x), 1) 381 | 382 | outputs[i] = outputs[i-1] 383 | 384 | print('outputs[81]: ', outputs[81].shape) 385 | print('outputs[82]: ', outputs[82].shape) 386 | 387 | try: 388 | return detections 389 | except: 390 | return 0 391 | 392 | 393 | def load_weights(self, weightfile): 394 | 395 | #Open the weights file 396 | fp = open(weightfile, "rb") 397 | 398 | #The first 4 values are header information 399 | # 1. Major version number 400 | # 2. Minor Version Number 401 | # 3. Subversion number 402 | # 4. 
IMages seen 403 | header = np.fromfile(fp, dtype = np.int32, count = 5) 404 | self.header = torch.from_numpy(header) 405 | self.seen = self.header[3] 406 | 407 | #The rest of the values are the weights 408 | # Let's load them up 409 | weights = np.fromfile(fp, dtype = np.float32) 410 | 411 | ptr = 0 412 | for i in range(len(self.module_list)): 413 | module_type = self.blocks[i + 1]["type"] 414 | 415 | if module_type == "convolutional": 416 | model = self.module_list[i] 417 | try: 418 | batch_normalize = int(self.blocks[i+1]["batch_normalize"]) 419 | except: 420 | batch_normalize = 0 421 | 422 | conv = model[0] 423 | 424 | if (batch_normalize): 425 | bn = model[1] 426 | 427 | #Get the number of weights of Batch Norm Layer 428 | num_bn_biases = bn.bias.numel() 429 | 430 | #Load the weights 431 | bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases]) 432 | ptr += num_bn_biases 433 | 434 | bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) 435 | ptr += num_bn_biases 436 | 437 | bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) 438 | ptr += num_bn_biases 439 | 440 | bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases]) 441 | ptr += num_bn_biases 442 | 443 | #Cast the loaded weights into dims of model weights. 444 | bn_biases = bn_biases.view_as(bn.bias.data) 445 | bn_weights = bn_weights.view_as(bn.weight.data) 446 | bn_running_mean = bn_running_mean.view_as(bn.running_mean) 447 | bn_running_var = bn_running_var.view_as(bn.running_var) 448 | 449 | #Copy the data to model 450 | bn.bias.data.copy_(bn_biases) 451 | bn.weight.data.copy_(bn_weights) 452 | bn.running_mean.copy_(bn_running_mean) 453 | bn.running_var.copy_(bn_running_var) 454 | 455 | else: 456 | #Number of biases 457 | num_biases = conv.bias.numel() 458 | 459 | #Load the weights 460 | conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases]) 461 | ptr = ptr + num_biases 462 | 463 | #reshape the loaded weights according to the dims of the model weights 464 | conv_biases = conv_biases.view_as(conv.bias.data) 465 | 466 | #Finally copy the data 467 | conv.bias.data.copy_(conv_biases) 468 | 469 | 470 | #Let us load the weights for the Convolutional layers 471 | num_weights = conv.weight.numel() 472 | 473 | #Do the same as above for weights 474 | conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights]) 475 | ptr = ptr + num_weights 476 | 477 | conv_weights = conv_weights.view_as(conv.weight.data) 478 | conv.weight.data.copy_(conv_weights) 479 | 480 | def save_weights(self, savedfile, cutoff = 0): 481 | 482 | if cutoff <= 0: 483 | cutoff = len(self.blocks) - 1 484 | 485 | fp = open(savedfile, 'wb') 486 | 487 | # Attach the header at the top of the file 488 | self.header[3] = self.seen 489 | header = self.header 490 | 491 | header = header.numpy() 492 | header.tofile(fp) 493 | 494 | # Now, let us save the weights 495 | for i in range(len(self.module_list)): 496 | module_type = self.blocks[i+1]["type"] 497 | 498 | if (module_type) == "convolutional": 499 | model = self.module_list[i] 500 | try: 501 | batch_normalize = int(self.blocks[i+1]["batch_normalize"]) 502 | except: 503 | batch_normalize = 0 504 | 505 | conv = model[0] 506 | 507 | if (batch_normalize): 508 | bn = model[1] 509 | 510 | #If the parameters are on GPU, convert them back to CPU 511 | #We don't convert the parameter to GPU 512 | #Instead. 
we copy the parameter and then convert it to CPU 513 | #This is done as weight are need to be saved during training 514 | cpu(bn.bias.data).numpy().tofile(fp) 515 | cpu(bn.weight.data).numpy().tofile(fp) 516 | cpu(bn.running_mean).numpy().tofile(fp) 517 | cpu(bn.running_var).numpy().tofile(fp) 518 | 519 | 520 | else: 521 | cpu(conv.bias.data).numpy().tofile(fp) 522 | 523 | 524 | #Let us save the weights for the Convolutional layers 525 | cpu(conv.weight.data).numpy().tofile(fp) 526 | 527 | 528 | 529 | 530 | 531 | # 532 | #dn = Darknet('cfg/yolov3.cfg') 533 | #dn.load_weights("yolov3.weights") 534 | #inp = get_test_input() 535 | #a, interms = dn(inp) 536 | #dn.eval() 537 | #a_i, interms_i = dn(inp) 538 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/det_messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/det_messi.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | import argparse 10 | import os 11 | import os.path as osp 12 | from darknet import Darknet 13 | from preprocess import prep_image, inp_to_image 14 | import pandas as pd 15 | import random 16 | import pickle as pkl 17 | import itertools 18 | 19 | class test_net(nn.Module): 20 | def __init__(self, num_layers, input_size): 21 | super(test_net, self).__init__() 22 | self.num_layers= num_layers 23 | self.linear_1 = nn.Linear(input_size, 5) 24 | self.middle = nn.ModuleList([nn.Linear(5,5) 
for x in range(num_layers)]) 25 | self.output = nn.Linear(5,2) 26 | 27 | def forward(self, x): 28 | x = x.view(-1) 29 | fwd = nn.Sequential(self.linear_1, *self.middle, self.output) 30 | return fwd(x) 31 | 32 | def get_test_input(input_dim, CUDA): 33 | img = cv2.imread("dog-cycle-car.png") 34 | img = cv2.resize(img, (input_dim, input_dim)) 35 | img_ = img[:,:,::-1].transpose((2,0,1)) 36 | img_ = img_[np.newaxis,:,:,:]/255.0 37 | img_ = torch.from_numpy(img_).float() 38 | img_ = Variable(img_) 39 | 40 | if CUDA: 41 | img_ = img_.cuda() 42 | num_classes 43 | return img_ 44 | 45 | 46 | 47 | def arg_parse(): 48 | """ 49 | Parse arguements to the detect module 50 | 51 | """ 52 | 53 | 54 | parser = argparse.ArgumentParser(description='YOLO v3 Detection Module') 55 | 56 | parser.add_argument("--images", dest = 'images', help = 57 | "Image / Directory containing images to perform detection upon", 58 | default = "imgs", type = str) 59 | parser.add_argument("--det", dest = 'det', help = 60 | "Image / Directory to store detections to", 61 | default = "det", type = str) 62 | parser.add_argument("--bs", dest = "bs", help = "Batch size", default = 1) 63 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) 64 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 65 | parser.add_argument("--cfg", dest = 'cfgfile', help = 66 | "Config file", 67 | default = "cfg/yolov3.cfg", type = str) 68 | parser.add_argument("--weights", dest = 'weightsfile', help = 69 | "weightsfile", 70 | default = "yolov3.weights", type = str) 71 | parser.add_argument("--reso", dest = 'reso', help = 72 | "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", 73 | default = "320", type = str) 74 | parser.add_argument("--scales", dest = "scales", help = "Scales to use for detection", 75 | default = "1,2,3", type = str) 76 | 77 | return parser.parse_args() 78 | 79 | if __name__ == '__main__': 80 | args = arg_parse() 81 | 82 | scales = args.scales 83 | 84 | 85 | # scales = [int(x) for x in scales.split(',')] 86 | # 87 | # 88 | # 89 | # args.reso = int(args.reso) 90 | # 91 | # num_boxes = [args.reso//32, args.reso//16, args.reso//8] 92 | # scale_indices = [3*(x**2) for x in num_boxes] 93 | # scale_indices = list(itertools.accumulate(scale_indices, lambda x,y : x+y)) 94 | # 95 | # 96 | # li = [] 97 | # i = 0 98 | # for scale in scale_indices: 99 | # li.extend(list(range(i, scale))) 100 | # i = scale 101 | # 102 | # scale_indices = li 103 | 104 | images = args.images 105 | batch_size = int(args.bs) 106 | confidence = float(args.confidence) 107 | nms_thesh = float(args.nms_thresh) 108 | start = 0 109 | 110 | CUDA = torch.cuda.is_available() 111 | 112 | num_classes = 80 113 | classes = load_classes('data/coco.names') 114 | 115 | #Set up the neural network 116 | print("Loading network.....") 117 | model = Darknet(args.cfgfile) 118 | model.load_weights(args.weightsfile) 119 | print("Network successfully loaded") 120 | 121 | model.net_info["height"] = args.reso 122 | inp_dim = int(model.net_info["height"]) 123 | assert inp_dim % 32 == 0 124 | assert inp_dim > 32 125 | 126 | #If there's a GPU availible, put the model on GPU 127 | if CUDA: 128 | model.cuda() 129 | 130 | 131 | #Set the model in evaluation mode 132 | model.eval() 133 | 134 | read_dir = time.time() 135 | #Detection phase 136 | try: 137 | imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images) if 
os.path.splitext(img)[1] == '.png' or os.path.splitext(img)[1] =='.jpeg' or os.path.splitext(img)[1] =='.jpg'] 138 | except NotADirectoryError: 139 | imlist = [] 140 | imlist.append(osp.join(osp.realpath('.'), images)) 141 | except FileNotFoundError: 142 | print ("No file or directory with the name {}".format(images)) 143 | exit() 144 | 145 | if not os.path.exists(args.det): 146 | os.makedirs(args.det) 147 | 148 | load_batch = time.time() 149 | 150 | batches = list(map(prep_image, imlist, [inp_dim for x in range(len(imlist))])) 151 | print(len(batches)) 152 | print(batches) 153 | print(len(batches[0])) 154 | 155 | im_batches = [x[0] for x in batches] 156 | print(im_batches[0].shape) 157 | orig_ims = [x[1] for x in batches] 158 | print(orig_ims[0].shape) 159 | im_dim_list = [x[2] for x in batches] 160 | im_dim_list = torch.FloatTensor(im_dim_list).repeat(1,2) 161 | 162 | 163 | 164 | if CUDA: 165 | im_dim_list = im_dim_list.cuda() 166 | 167 | leftover = 0 168 | 169 | if (len(im_dim_list) % batch_size): 170 | leftover = 1 171 | 172 | 173 | if batch_size != 1: 174 | num_batches = len(imlist) // batch_size + leftover 175 | im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size, 176 | len(im_batches))])) for i in range(num_batches)] 177 | 178 | 179 | i = 0 180 | 181 | 182 | write = False 183 | model(get_test_input(inp_dim, CUDA), CUDA) 184 | 185 | start_det_loop = time.time() 186 | 187 | objs = {} 188 | 189 | 190 | 191 | for batch in im_batches: 192 | #load the image 193 | start = time.time() 194 | if CUDA: 195 | batch = batch.cuda() 196 | 197 | 198 | #Apply offsets to the result predictions 199 | #Tranform the predictions as described in the YOLO paper 200 | #flatten the prediction vector 201 | # B x (bbox cord x no. of anchors) x grid_w x grid_h --> B x bbox x (all the boxes) 202 | # Put every proposed box as a row. 203 | with torch.no_grad(): 204 | prediction = model(Variable(batch), CUDA) 205 | 206 | # prediction = prediction[:,scale_indices] 207 | print('pre: ', prediction.shape) 208 | 209 | 210 | #get the boxes with object confidence > threshold 211 | #Convert the cordinates to absolute coordinates 212 | #perform NMS on these boxes, and save the results 213 | #I could have done NMS and saving seperately to have a better abstraction 214 | #But both these operations require looping, hence 215 | #clubbing these ops in one loop instead of two. 216 | #loops are slower than vectorised operations. 
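        #write_results keeps one row per surviving box:
        #[index of the image within the batch, x1, y1, x2, y2, objectness, class score, class index]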
217 | 218 | prediction = write_results(prediction, confidence, num_classes, nms = True, nms_conf = nms_thesh) 219 | 220 | print('pre1: ', prediction.shape) 221 | 222 | if type(prediction) == int: 223 | i += 1 224 | continue 225 | 226 | end = time.time() 227 | 228 | 229 | # print(end - start) 230 | 231 | 232 | 233 | prediction[:,0] += i*batch_size 234 | 235 | 236 | 237 | 238 | if not write: 239 | output = prediction 240 | write = 1 241 | else: 242 | print('output: ', output.shape) 243 | print('prediction: ', prediction.shape) 244 | output = torch.cat((output,prediction), 1) 245 | 246 | 247 | 248 | 249 | for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]): 250 | im_id = i*batch_size + im_num 251 | objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id] 252 | print("{0:20s} predicted in {1:6.3f} seconds".format(image.split("/")[-1], (end - start)/batch_size)) 253 | print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs))) 254 | print("----------------------------------------------------------") 255 | i += 1 256 | 257 | 258 | if CUDA: 259 | torch.cuda.synchronize() 260 | 261 | try: 262 | output 263 | except NameError: 264 | print("No detections were made") 265 | exit() 266 | 267 | print(output) 268 | print(output[:,0].long()) 269 | print(im_dim_list) 270 | im_dim_list = torch.index_select(im_dim_list, 0, output[:,0].long()) 271 | print(im_dim_list) 272 | 273 | scaling_factor = torch.min(inp_dim/im_dim_list,1)[0].view(-1,1) 274 | print(scaling_factor) 275 | 276 | 277 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 278 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 279 | print(output[:,[1,3]]) 280 | print(output[:,[2,4]]) 281 | 282 | 283 | 284 | output[:,1:5] /= scaling_factor 285 | print(output) 286 | 287 | for i in range(output.shape[0]): 288 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[i,0]) 289 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[i,1]) 290 | 291 | print(output) 292 | 293 | output_recast = time.time() 294 | 295 | 296 | class_load = time.time() 297 | 298 | colors = pkl.load(open("pallete", "rb")) 299 | 300 | 301 | draw = time.time() 302 | 303 | 304 | def write(x, batches, results): 305 | c1 = tuple(x[1:3].int()) 306 | c2 = tuple(x[3:5].int()) 307 | img = results[int(x[0])] 308 | cls = int(x[-1]) 309 | label = "{0}".format(classes[cls]) 310 | color = random.choice(colors) 311 | cv2.rectangle(img, c1, c2,color, 1) 312 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 313 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 314 | cv2.rectangle(img, c1, c2,color, -1) 315 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1) 316 | return img 317 | 318 | 319 | list(map(lambda x: write(x, im_batches, orig_ims), output)) 320 | 321 | det_names = pd.Series(imlist).apply(lambda x: "{}/det_{}".format(args.det,x.split("/")[-1])) 322 | 323 | list(map(cv2.imwrite, det_names, orig_ims)) 324 | 325 | end = time.time() 326 | 327 | print() 328 | print("SUMMARY") 329 | print("----------------------------------------------------------") 330 | print("{:25s}: {}".format("Task", "Time Taken (in seconds)")) 331 | print() 332 | print("{:25s}: {:2.3f}".format("Reading addresses", load_batch - read_dir)) 333 | print("{:25s}: {:2.3f}".format("Loading batch", start_det_loop - load_batch)) 334 | print("{:25s}: {:2.3f}".format("Detection (" + str(len(imlist)) + " images)", output_recast - 
start_det_loop)) 335 | print("{:25s}: {:2.3f}".format("Output Processing", class_load - output_recast)) 336 | print("{:25s}: {:2.3f}".format("Drawing Boxes", end - draw)) 337 | print("{:25s}: {:2.3f}".format("Average time_per_img", (end - load_batch)/len(imlist))) 338 | print("----------------------------------------------------------") 339 | 340 | 341 | torch.cuda.empty_cache() 342 | 343 | 344 | 345 | 346 | 347 | 348 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/dog.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/eagle.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/giraffe.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/herd_of_horses.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/img1.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/img2.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/img3.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/img4.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/messi.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/person.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/person.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/imgs/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/imgs/scream.jpg -------------------------------------------------------------------------------- /pytorch-yolo-v3/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | from darknet import Darknet 8 | from util import * 9 | from preprocess import prep_image, inp_to_image 10 | import cv2 11 | 12 | def main(): 13 | 14 | confidence = 0.5 15 | nms_thesh = 0.4 16 | num_classes = 80 17 | classes = load_classes('data/coco.names') 18 | 19 | print('cuda device count: ', torch.cuda.device_count()) 20 | print("Loading network.....") 21 | net = Darknet('cfg/yolov3.cfg') 22 | net.load_weights('yolov3.weights') 23 | print("Network successfully loaded") 24 | net = net.to('cuda:0') 25 | net = net.eval() 26 | print('print model') 27 | print('model: ', net) 28 | 29 | #------------------------input images------------------------------------------------ 30 | input, origin, dim = prep_image('imgs/dog.jpg', 320); 31 | print('input:', input) 32 | input = input.to('cuda:0') 33 | print(input.shape) 34 | prediction = net(input, True) 35 | print('pre shape: ', prediction.shape) 36 | print('pre : ', prediction) 37 | prediction = write_results(prediction, confidence, num_classes, nms = True, nms_conf = nms_thesh) 38 | print('pre shape1: ', prediction.shape) 39 | print('pre1: ', prediction) 40 | 41 | scaling_factor = min(320/dim[0], 320/dim[1], 1) 42 | print(scaling_factor) 43 | prediction[:,[1,3]] -= (320 - scaling_factor*dim[0]) / 2 44 | prediction[:,[2,4]] -= (320 - scaling_factor*dim[1]) / 2 45 | print('pre2: ', prediction) 46 | prediction[:,1:5] /= scaling_factor 47 | print('pre3: ', prediction) 48 | 49 | for i in range(prediction.shape[0]): 50 | prediction[i, [1,3]] = torch.clamp(prediction[i, [1,3]], 0.0, dim[0]) 51 | prediction[i, [2,4]] = torch.clamp(prediction[i, [2,4]], 0.0, dim[1]) 52 | print('pre4: ', prediction) 53 | 54 | def write(x, batches, res): 55 | c1 = tuple(x[1:3].int()) 56 | c2 = tuple(x[3:5].int()) 57 | img = res 58 | cls = int(x[-1]) 59 | label = "{0}".format(classes[cls]) 60 | cv2.rectangle(img, c1, c2, (255, 0, 0), 1) 61 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 62 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 63 | cv2.rectangle(img, c1, c2, (255, 0, 0), -1) 64 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1) 65 | return img 66 | 67 | list(map(lambda x: write(x, input, origin), prediction)) 68 | cv2.imwrite('infout.png', origin) 69 | 70 | #------------------------input ones------------------------------------------------ 71 | #print('state dict: ', net.state_dict().keys()) 72 | tmp = torch.ones(1, 3, 320, 320).to('cuda:0') 73 | print('input: ', tmp) 74 | out = net(tmp) 75 | 76 | print('output:', out) 77 | 78 | summary(net, (3, 320, 320)) 79 | #return 80 | f = open("yolov3.wts", 'w') 81 | f.write("{}\n".format(len(net.state_dict().keys()))) 82 | for k,v in net.state_dict().items(): 83 | print('key: ', k) 84 | 
print('value: ', v.shape) 85 | vr = v.reshape(-1).cpu().numpy() 86 | f.write("{} {}".format(k, len(vr))) 87 | for vv in vr: 88 | f.write(" ") 89 | f.write(struct.pack(">f", float(vv)).hex()) 90 | f.write("\n") 91 | 92 | if __name__ == '__main__': 93 | main() 94 | 95 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/pallete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/pytorchx/938ba5855cfb72b0dbce91af8c0a6d0e3943f122/pytorch-yolo-v3/pallete -------------------------------------------------------------------------------- /pytorch-yolo-v3/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import cv2 9 | import matplotlib.pyplot as plt 10 | from util import count_parameters as count 11 | from util import convert2cpu as cpu 12 | from PIL import Image, ImageDraw 13 | 14 | 15 | def letterbox_image(img, inp_dim): 16 | '''resize image with unchanged aspect ratio using padding''' 17 | img_w, img_h = img.shape[1], img.shape[0] 18 | w, h = inp_dim 19 | new_w = int(img_w * min(w/img_w, h/img_h)) 20 | new_h = int(img_h * min(w/img_w, h/img_h)) 21 | resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC) 22 | 23 | canvas = np.full((inp_dim[1], inp_dim[0], 3), 128) 24 | 25 | canvas[(h-new_h)//2:(h-new_h)//2 + new_h,(w-new_w)//2:(w-new_w)//2 + new_w, :] = resized_image 26 | 27 | return canvas 28 | 29 | 30 | 31 | def prep_image(img, inp_dim): 32 | """ 33 | Prepare image for inputting to the neural network. 
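    The image is letterboxed to inp_dim x inp_dim with grey padding, flipped from BGR to RGB,
    transposed to CHW and scaled to [0, 1].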
34 | 35 | Returns a Variable 36 | """ 37 | 38 | orig_im = cv2.imread(img) 39 | dim = orig_im.shape[1], orig_im.shape[0] 40 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 41 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 42 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 43 | return img_, orig_im, dim 44 | 45 | def prep_image_pil(img, network_dim): 46 | orig_im = Image.open(img) 47 | img = orig_im.convert('RGB') 48 | dim = img.size 49 | img = img.resize(network_dim) 50 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 51 | img = img.view(*network_dim, 3).transpose(0,1).transpose(0,2).contiguous() 52 | img = img.view(1, 3,*network_dim) 53 | img = img.float().div(255.0) 54 | return (img, orig_im, dim) 55 | 56 | def inp_to_image(inp): 57 | inp = inp.cpu().squeeze() 58 | inp = inp*255 59 | try: 60 | inp = inp.data.numpy() 61 | except RuntimeError: 62 | inp = inp.numpy() 63 | inp = inp.transpose(1,2,0) 64 | 65 | inp = inp[:,:,::-1] 66 | return inp 67 | 68 | 69 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/util.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import cv2 10 | import matplotlib.pyplot as plt 11 | from bbox import bbox_iou 12 | 13 | def count_parameters(model): 14 | return sum(p.numel() for p in model.parameters()) 15 | 16 | def count_learnable_parameters(model): 17 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 18 | 19 | def convert2cpu(matrix): 20 | if matrix.is_cuda: 21 | return torch.FloatTensor(matrix.size()).copy_(matrix) 22 | else: 23 | return matrix 24 | 25 | def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA = True): 26 | batch_size = prediction.size(0) 27 | stride = inp_dim // prediction.size(2) 28 | grid_size = inp_dim // stride 29 | bbox_attrs = 5 + num_classes 30 | num_anchors = len(anchors) 31 | 32 | anchors = [(a[0]/stride, a[1]/stride) for a in anchors] 33 | 34 | 35 | 36 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 37 | prediction = prediction.transpose(1,2).contiguous() 38 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 39 | 40 | 41 | #Sigmoid the centre_X, centre_Y. 
and object confidencce 42 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 43 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 44 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 45 | 46 | 47 | 48 | #Add the center offsets 49 | grid_len = np.arange(grid_size) 50 | a,b = np.meshgrid(grid_len, grid_len) 51 | 52 | x_offset = torch.FloatTensor(a).view(-1,1) 53 | y_offset = torch.FloatTensor(b).view(-1,1) 54 | 55 | if CUDA: 56 | x_offset = x_offset.cuda() 57 | y_offset = y_offset.cuda() 58 | 59 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 60 | 61 | prediction[:,:,:2] += x_y_offset 62 | 63 | #log space transform height and the width 64 | anchors = torch.FloatTensor(anchors) 65 | 66 | if CUDA: 67 | anchors = anchors.cuda() 68 | 69 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 70 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 71 | 72 | #Softmax the class scores 73 | prediction[:,:,5: 5 + num_classes] = torch.sigmoid((prediction[:,:, 5 : 5 + num_classes])) 74 | 75 | prediction[:,:,:4] *= stride 76 | 77 | 78 | return prediction 79 | 80 | def load_classes(namesfile): 81 | fp = open(namesfile, "r") 82 | names = fp.read().split("\n")[:-1] 83 | return names 84 | 85 | def get_im_dim(im): 86 | im = cv2.imread(im) 87 | w,h = im.shape[1], im.shape[0] 88 | return w,h 89 | 90 | def unique(tensor): 91 | tensor_np = tensor.cpu().numpy() 92 | unique_np = np.unique(tensor_np) 93 | unique_tensor = torch.from_numpy(unique_np) 94 | 95 | tensor_res = tensor.new(unique_tensor.shape) 96 | tensor_res.copy_(unique_tensor) 97 | return tensor_res 98 | 99 | def write_results(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 100 | conf_mask = (prediction[:,:,4] > confidence).float().unsqueeze(2) 101 | prediction = prediction*conf_mask 102 | 103 | 104 | try: 105 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 106 | except: 107 | return 0 108 | 109 | 110 | box_a = prediction.new(prediction.shape) 111 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 112 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 113 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 114 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 115 | prediction[:,:,:4] = box_a[:,:,:4] 116 | 117 | 118 | 119 | batch_size = prediction.size(0) 120 | 121 | output = prediction.new(1, prediction.size(2) + 1) 122 | write = False 123 | 124 | 125 | for ind in range(batch_size): 126 | #select the image from the batch 127 | image_pred = prediction[ind] 128 | 129 | 130 | 131 | #Get the class having maximum score, and the index of that class 132 | #Get rid of num_classes softmax scores 133 | #Add the class index and the class score of class having maximum score 134 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 135 | max_conf = max_conf.float().unsqueeze(1) 136 | max_conf_score = max_conf_score.float().unsqueeze(1) 137 | seq = (image_pred[:,:5], max_conf, max_conf_score) 138 | image_pred = torch.cat(seq, 1) 139 | 140 | 141 | 142 | #Get rid of the zero entries 143 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 144 | print('wr pre: ', non_zero_ind.shape) 145 | 146 | 147 | image_pred_ = image_pred[non_zero_ind.squeeze(),:].view(-1,7) 148 | print('wr pre: ', image_pred_.shape) 149 | print('image_pred_: ', image_pred_) 150 | 151 | #Get the various classes detected in the image 152 | try: 153 | img_classes = unique(image_pred_[:,-1]) 154 | except: 155 | continue 156 | print('wr 
img c: ', img_classes.shape) 157 | #WE will do NMS classwise 158 | for cls in img_classes: 159 | #get the detections with one particular class 160 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).float().unsqueeze(1) 161 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 162 | 163 | 164 | image_pred_class = image_pred_[class_mask_ind].view(-1,7) 165 | print('image_pred_class: ', image_pred_class) 166 | 167 | 168 | 169 | #sort the detections such that the entry with the maximum objectness 170 | #confidence is at the top 171 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 172 | image_pred_class = image_pred_class[conf_sort_index] 173 | print('image_pred_class1: ', image_pred_class) 174 | idx = image_pred_class.size(0) 175 | print('image_pred_class2: ', idx) 176 | 177 | #if nms has to be done 178 | if nms: 179 | #For each detection 180 | for i in range(idx): 181 | #Get the IOUs of all boxes that come after the one we are looking at 182 | #in the loop 183 | try: 184 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 185 | except ValueError: 186 | break 187 | 188 | except IndexError: 189 | break 190 | 191 | #Zero out all the detections that have IoU > treshhold 192 | iou_mask = (ious < nms_conf).float().unsqueeze(1) 193 | image_pred_class[i+1:] *= iou_mask 194 | 195 | #Remove the non-zero entries 196 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 197 | image_pred_class = image_pred_class[non_zero_ind].view(-1,7) 198 | 199 | 200 | 201 | #Concatenate the batch_id of the image to the detection 202 | #this helps us identify which image does the detection correspond to 203 | #We use a linear straucture to hold ALL the detections from the batch 204 | #the batch_dim is flattened 205 | #batch is identified by extra batch column 206 | 207 | 208 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 209 | seq = batch_ind, image_pred_class 210 | if not write: 211 | output = torch.cat(seq,1) 212 | write = True 213 | else: 214 | out = torch.cat(seq,1) 215 | output = torch.cat((output,out)) 216 | 217 | return output 218 | 219 | #!/usr/bin/env python3 220 | # -*- coding: utf-8 -*- 221 | """ 222 | Created on Sat Mar 24 00:12:16 2018 223 | 224 | @author: ayooshmac 225 | """ 226 | 227 | def predict_transform_half(prediction, inp_dim, anchors, num_classes, CUDA = True): 228 | batch_size = prediction.size(0) 229 | stride = inp_dim // prediction.size(2) 230 | 231 | bbox_attrs = 5 + num_classes 232 | num_anchors = len(anchors) 233 | grid_size = inp_dim // stride 234 | 235 | 236 | prediction = prediction.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) 237 | prediction = prediction.transpose(1,2).contiguous() 238 | prediction = prediction.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) 239 | 240 | 241 | #Sigmoid the centre_X, centre_Y. 
and object confidencce 242 | prediction[:,:,0] = torch.sigmoid(prediction[:,:,0]) 243 | prediction[:,:,1] = torch.sigmoid(prediction[:,:,1]) 244 | prediction[:,:,4] = torch.sigmoid(prediction[:,:,4]) 245 | 246 | 247 | #Add the center offsets 248 | grid_len = np.arange(grid_size) 249 | a,b = np.meshgrid(grid_len, grid_len) 250 | 251 | x_offset = torch.FloatTensor(a).view(-1,1) 252 | y_offset = torch.FloatTensor(b).view(-1,1) 253 | 254 | if CUDA: 255 | x_offset = x_offset.cuda().half() 256 | y_offset = y_offset.cuda().half() 257 | 258 | x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1,num_anchors).view(-1,2).unsqueeze(0) 259 | 260 | prediction[:,:,:2] += x_y_offset 261 | 262 | #log space transform height and the width 263 | anchors = torch.HalfTensor(anchors) 264 | 265 | if CUDA: 266 | anchors = anchors.cuda() 267 | 268 | anchors = anchors.repeat(grid_size*grid_size, 1).unsqueeze(0) 269 | prediction[:,:,2:4] = torch.exp(prediction[:,:,2:4])*anchors 270 | 271 | #Softmax the class scores 272 | prediction[:,:,5: 5 + num_classes] = nn.Softmax(-1)(Variable(prediction[:,:, 5 : 5 + num_classes])).data 273 | 274 | prediction[:,:,:4] *= stride 275 | 276 | 277 | return prediction 278 | 279 | 280 | def write_results_half(prediction, confidence, num_classes, nms = True, nms_conf = 0.4): 281 | conf_mask = (prediction[:,:,4] > confidence).half().unsqueeze(2) 282 | prediction = prediction*conf_mask 283 | 284 | try: 285 | ind_nz = torch.nonzero(prediction[:,:,4]).transpose(0,1).contiguous() 286 | except: 287 | return 0 288 | 289 | 290 | 291 | box_a = prediction.new(prediction.shape) 292 | box_a[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2) 293 | box_a[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2) 294 | box_a[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2) 295 | box_a[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2) 296 | prediction[:,:,:4] = box_a[:,:,:4] 297 | 298 | 299 | 300 | batch_size = prediction.size(0) 301 | 302 | output = prediction.new(1, prediction.size(2) + 1) 303 | write = False 304 | 305 | for ind in range(batch_size): 306 | #select the image from the batch 307 | image_pred = prediction[ind] 308 | 309 | 310 | #Get the class having maximum score, and the index of that class 311 | #Get rid of num_classes softmax scores 312 | #Add the class index and the class score of class having maximum score 313 | max_conf, max_conf_score = torch.max(image_pred[:,5:5+ num_classes], 1) 314 | max_conf = max_conf.half().unsqueeze(1) 315 | max_conf_score = max_conf_score.half().unsqueeze(1) 316 | seq = (image_pred[:,:5], max_conf, max_conf_score) 317 | image_pred = torch.cat(seq, 1) 318 | 319 | 320 | #Get rid of the zero entries 321 | non_zero_ind = (torch.nonzero(image_pred[:,4])) 322 | try: 323 | image_pred_ = image_pred[non_zero_ind.squeeze(),:] 324 | except: 325 | continue 326 | 327 | 328 | #Get the various classes detected in the image 329 | img_classes = unique(image_pred_[:,-1].long()).half() 330 | 331 | 332 | 333 | 334 | #WE will do NMS classwise 335 | for cls in img_classes: 336 | #get the detections with one particular class 337 | cls_mask = image_pred_*(image_pred_[:,-1] == cls).half().unsqueeze(1) 338 | class_mask_ind = torch.nonzero(cls_mask[:,-2]).squeeze() 339 | 340 | 341 | image_pred_class = image_pred_[class_mask_ind] 342 | 343 | 344 | #sort the detections such that the entry with the maximum objectness 345 | #confidence is at the top 346 | conf_sort_index = torch.sort(image_pred_class[:,4], descending = True )[1] 347 | image_pred_class = image_pred_class[conf_sort_index] 
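            #greedy NMS: walk the sorted boxes of this class and zero out any later box whose IoU with the current one exceeds nms_conf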
348 | idx = image_pred_class.size(0) 349 | 350 | #if nms has to be done 351 | if nms: 352 | #For each detection 353 | for i in range(idx): 354 | #Get the IOUs of all boxes that come after the one we are looking at 355 | #in the loop 356 | try: 357 | ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i+1:]) 358 | except ValueError: 359 | break 360 | 361 | except IndexError: 362 | break 363 | 364 | #Zero out all the detections that have IoU > treshhold 365 | iou_mask = (ious < nms_conf).half().unsqueeze(1) 366 | image_pred_class[i+1:] *= iou_mask 367 | 368 | #Remove the non-zero entries 369 | non_zero_ind = torch.nonzero(image_pred_class[:,4]).squeeze() 370 | image_pred_class = image_pred_class[non_zero_ind] 371 | 372 | 373 | 374 | #Concatenate the batch_id of the image to the detection 375 | #this helps us identify which image does the detection correspond to 376 | #We use a linear straucture to hold ALL the detections from the batch 377 | #the batch_dim is flattened 378 | #batch is identified by extra batch column 379 | batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind) 380 | seq = batch_ind, image_pred_class 381 | 382 | if not write: 383 | output = torch.cat(seq,1) 384 | write = True 385 | else: 386 | out = torch.cat(seq,1) 387 | output = torch.cat((output,out)) 388 | 389 | return output 390 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/video_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from darknet import Darknet 10 | from preprocess import prep_image, inp_to_image, letterbox_image 11 | import pandas as pd 12 | import random 13 | import pickle as pkl 14 | import argparse 15 | 16 | 17 | def get_test_input(input_dim, CUDA): 18 | img = cv2.imread("dog-cycle-car.png") 19 | img = cv2.resize(img, (input_dim, input_dim)) 20 | img_ = img[:,:,::-1].transpose((2,0,1)) 21 | img_ = img_[np.newaxis,:,:,:]/255.0 22 | img_ = torch.from_numpy(img_).float() 23 | img_ = Variable(img_) 24 | 25 | if CUDA: 26 | img_ = img_.cuda() 27 | 28 | return img_ 29 | 30 | def prep_image(img, inp_dim): 31 | """ 32 | Prepare image for inputting to the neural network. 
33 | 34 | Returns a Variable 35 | """ 36 | 37 | orig_im = img 38 | dim = orig_im.shape[1], orig_im.shape[0] 39 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 40 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 41 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 42 | return img_, orig_im, dim 43 | 44 | def write(x, img): 45 | c1 = tuple(x[1:3].int()) 46 | c2 = tuple(x[3:5].int()) 47 | cls = int(x[-1]) 48 | label = "{0}".format(classes[cls]) 49 | color = random.choice(colors) 50 | cv2.rectangle(img, c1, c2,color, 1) 51 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 52 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 53 | cv2.rectangle(img, c1, c2,color, -1) 54 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 55 | return img 56 | 57 | def arg_parse(): 58 | """ 59 | Parse arguements to the detect module 60 | 61 | """ 62 | 63 | 64 | parser = argparse.ArgumentParser(description='YOLO v3 Video Detection Module') 65 | 66 | parser.add_argument("--video", dest = 'video', help = 67 | "Video to run detection upon", 68 | default = "video.avi", type = str) 69 | parser.add_argument("--dataset", dest = "dataset", help = "Dataset on which the network has been trained", default = "pascal") 70 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) 71 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 72 | parser.add_argument("--cfg", dest = 'cfgfile', help = 73 | "Config file", 74 | default = "cfg/yolov3.cfg", type = str) 75 | parser.add_argument("--weights", dest = 'weightsfile', help = 76 | "weightsfile", 77 | default = "yolov3.weights", type = str) 78 | parser.add_argument("--reso", dest = 'reso', help = 79 | "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", 80 | default = "416", type = str) 81 | return parser.parse_args() 82 | 83 | 84 | if __name__ == '__main__': 85 | args = arg_parse() 86 | confidence = float(args.confidence) 87 | nms_thesh = float(args.nms_thresh) 88 | start = 0 89 | 90 | CUDA = torch.cuda.is_available() 91 | 92 | num_classes = 80 93 | 94 | CUDA = torch.cuda.is_available() 95 | 96 | bbox_attrs = 5 + num_classes 97 | 98 | print("Loading network.....") 99 | model = Darknet(args.cfgfile) 100 | model.load_weights(args.weightsfile) 101 | print("Network successfully loaded") 102 | 103 | model.net_info["height"] = args.reso 104 | inp_dim = int(model.net_info["height"]) 105 | assert inp_dim % 32 == 0 106 | assert inp_dim > 32 107 | 108 | if CUDA: 109 | model.cuda() 110 | 111 | model(get_test_input(inp_dim, CUDA), CUDA) 112 | 113 | model.eval() 114 | 115 | videofile = args.video 116 | 117 | cap = cv2.VideoCapture(videofile) 118 | 119 | assert cap.isOpened(), 'Cannot capture source' 120 | 121 | frames = 0 122 | start = time.time() 123 | while cap.isOpened(): 124 | 125 | ret, frame = cap.read() 126 | if ret: 127 | 128 | 129 | img, orig_im, dim = prep_image(frame, inp_dim) 130 | 131 | im_dim = torch.FloatTensor(dim).repeat(1,2) 132 | 133 | 134 | if CUDA: 135 | im_dim = im_dim.cuda() 136 | img = img.cuda() 137 | 138 | with torch.no_grad(): 139 | output = model(Variable(img), CUDA) 140 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 141 | 142 | if type(output) == int: 143 | frames += 1 144 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 145 | cv2.imshow("frame", orig_im) 146 | key = cv2.waitKey(1) 147 | if key & 0xFF == ord('q'): 148 | break 149 | continue 150 | 151 | 152 | 153 | 154 | im_dim = im_dim.repeat(output.size(0), 1) 155 | scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1) 156 | 157 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2 158 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2 159 | 160 | output[:,1:5] /= scaling_factor 161 | 162 | for i in range(output.shape[0]): 163 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0]) 164 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1]) 165 | 166 | classes = load_classes('data/coco.names') 167 | colors = pkl.load(open("pallete", "rb")) 168 | 169 | list(map(lambda x: write(x, orig_im), output)) 170 | 171 | 172 | cv2.imshow("frame", orig_im) 173 | key = cv2.waitKey(1) 174 | if key & 0xFF == ord('q'): 175 | break 176 | frames += 1 177 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 178 | 179 | 180 | else: 181 | break 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /pytorch-yolo-v3/video_demo_half.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import numpy as np 7 | import cv2 8 | from util import * 9 | from darknet import Darknet 10 | from preprocess import prep_image, inp_to_image, letterbox_image 11 | import pandas as pd 12 | import random 13 | import pickle as pkl 14 | import argparse 15 | 16 | 17 | def get_test_input(input_dim, CUDA): 18 | img = cv2.imread("dog-cycle-car.png") 19 | img = cv2.resize(img, (input_dim, input_dim)) 20 | img_ = img[:,:,::-1].transpose((2,0,1)) 21 | img_ = img_[np.newaxis,:,:,:]/255.0 22 | img_ 
= torch.from_numpy(img_).float() 23 | img_ = Variable(img_) 24 | 25 | if CUDA: 26 | img_ = img_.cuda() 27 | 28 | return img_ 29 | 30 | def prep_image(img, inp_dim): 31 | """ 32 | Prepare image for inputting to the neural network. 33 | 34 | Returns a Variable 35 | """ 36 | 37 | orig_im = img 38 | dim = orig_im.shape[1], orig_im.shape[0] 39 | img = (letterbox_image(orig_im, (inp_dim, inp_dim))) 40 | img_ = img[:,:,::-1].transpose((2,0,1)).copy() 41 | img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) 42 | return img_, orig_im, dim 43 | 44 | def write(x, img): 45 | c1 = tuple(x[1:3].int()) 46 | c2 = tuple(x[3:5].int()) 47 | cls = int(x[-1]) 48 | label = "{0}".format(classes[cls]) 49 | color = random.choice(colors) 50 | cv2.rectangle(img, c1, c2,color, 1) 51 | t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0] 52 | c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4 53 | cv2.rectangle(img, c1, c2,color, -1) 54 | cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1); 55 | return img 56 | 57 | def arg_parse(): 58 | """ 59 | Parse arguements to the detect module 60 | 61 | """ 62 | 63 | 64 | parser = argparse.ArgumentParser(description='YOLO v2 Video Detection Module') 65 | 66 | parser.add_argument("--video", dest = 'video', help = 67 | "Video to run detection upon", 68 | default = "video.avi", type = str) 69 | parser.add_argument("--dataset", dest = "dataset", help = "Dataset on which the network has been trained", default = "pascal") 70 | parser.add_argument("--confidence", dest = "confidence", help = "Object Confidence to filter predictions", default = 0.5) 71 | parser.add_argument("--nms_thresh", dest = "nms_thresh", help = "NMS Threshhold", default = 0.4) 72 | parser.add_argument("--cfg", dest = 'cfgfile', help = 73 | "Config file", 74 | default = "cfg/yolov3.cfg", type = str) 75 | parser.add_argument("--weights", dest = 'weightsfile', help = 76 | "weightsfile", 77 | default = "yolov3.weights", type = str) 78 | parser.add_argument("--reso", dest = 'reso', help = 79 | "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", 80 | default = "416", type = str) 81 | return parser.parse_args() 82 | 83 | 84 | if __name__ == '__main__': 85 | args = arg_parse() 86 | confidence = float(args.confidence) 87 | nms_thesh = float(args.nms_thresh) 88 | start = 0 89 | 90 | CUDA = torch.cuda.is_available() 91 | 92 | 93 | 94 | CUDA = torch.cuda.is_available() 95 | num_classes = 80 96 | bbox_attrs = 5 + num_classes 97 | 98 | print("Loading network.....") 99 | model = Darknet(args.cfgfile) 100 | model.load_weights(args.weightsfile) 101 | print("Network successfully loaded") 102 | 103 | model.net_info["height"] = args.reso 104 | inp_dim = int(model.net_info["height"]) 105 | assert inp_dim % 32 == 0 106 | assert inp_dim > 32 107 | 108 | 109 | if CUDA: 110 | model.cuda().half() 111 | 112 | model(get_test_input(inp_dim, CUDA), CUDA) 113 | 114 | model.eval() 115 | 116 | videofile = 'video.avi' 117 | 118 | cap = cv2.VideoCapture(videofile) 119 | 120 | assert cap.isOpened(), 'Cannot capture source' 121 | 122 | frames = 0 123 | start = time.time() 124 | while cap.isOpened(): 125 | 126 | ret, frame = cap.read() 127 | if ret: 128 | 129 | 130 | img, orig_im, dim = prep_image(frame, inp_dim) 131 | 132 | im_dim = torch.FloatTensor(dim).repeat(1,2) 133 | 134 | 135 | if CUDA: 136 | img = img.cuda().half() 137 | im_dim = im_dim.half().cuda() 138 | write_results = write_results_half 139 | predict_transform = predict_transform_half 140 | 141 | 142 | output = model(Variable(img, volatile = True), CUDA) 143 | output = write_results(output, confidence, num_classes, nms = True, nms_conf = nms_thesh) 144 | 145 | 146 | if type(output) == int: 147 | frames += 1 148 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 149 | cv2.imshow("frame", orig_im) 150 | key = cv2.waitKey(1) 151 | if key & 0xFF == ord('q'): 152 | break 153 | continue 154 | 155 | 156 | im_dim = im_dim.repeat(output.size(0), 1) 157 | scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1) 158 | 159 | output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2 160 | output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2 161 | 162 | output[:,1:5] /= scaling_factor 163 | 164 | for i in range(output.shape[0]): 165 | output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0]) 166 | output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1]) 167 | 168 | 169 | classes = load_classes('data/coco.names') 170 | colors = pkl.load(open("pallete", "rb")) 171 | 172 | list(map(lambda x: write(x, orig_im), output)) 173 | 174 | 175 | cv2.imshow("frame", orig_im) 176 | key = cv2.waitKey(1) 177 | if key & 0xFF == ord('q'): 178 | break 179 | frames += 1 180 | print("FPS of the video is {:5.2f}".format( frames / (time.time() - start))) 181 | 182 | 183 | else: 184 | break 185 | 186 | 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /resnet/resnet18/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('resnet18.pth') 11 | net = net.to('cuda:0') 12 | net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | print('output:', out) 19 | 20 | summary(net, 
(3,224,224)) 21 | #return 22 | f = open("resnet18.wts", 'w') 23 | f.write("{}\n".format(len(net.state_dict().keys()))) 24 | for k,v in net.state_dict().items(): 25 | print('key: ', k) 26 | print('value: ', v.shape) 27 | vr = v.reshape(-1).cpu().numpy() 28 | f.write("{} {}".format(k, len(vr))) 29 | for vv in vr: 30 | f.write(" ") 31 | f.write(struct.pack(">f", float(vv)).hex()) 32 | f.write("\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | 37 | -------------------------------------------------------------------------------- /resnet/resnet18/resnet18.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.resnet18(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.to('cuda:0') 11 | net.eval() 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('resnet18 out:', out.shape) 16 | torch.save(net, "resnet18.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /resnet/resnet34/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('resnet34.pth') 11 | net = net.to('cuda:0') 12 | net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | print('output:', out) 19 | 20 | summary(net, (3,224,224)) 21 | #return 22 | f = open("resnet34.wts", 'w') 23 | f.write("{}\n".format(len(net.state_dict().keys()))) 24 | for k,v in net.state_dict().items(): 25 | print('key: ', k) 26 | print('value: ', v.shape) 27 | vr = v.reshape(-1).cpu().numpy() 28 | f.write("{} {}".format(k, len(vr))) 29 | for vv in vr: 30 | f.write(" ") 31 | f.write(struct.pack(">f", float(vv)).hex()) 32 | f.write("\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | 37 | -------------------------------------------------------------------------------- /resnet/resnet34/resnet34.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.resnet34(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.to('cuda:0') 11 | net.eval() 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('resnet34 out:', out.shape) 16 | torch.save(net, "resnet34.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /resnet/resnet50/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('resnet50.pth') 11 | net = 
net.to('cuda:0') 12 | net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | print('output:', out) 19 | 20 | summary(net, (3,224,224)) 21 | #return 22 | f = open("resnet50.wts", 'w') 23 | f.write("{}\n".format(len(net.state_dict().keys()))) 24 | for k,v in net.state_dict().items(): 25 | print('key: ', k) 26 | print('value: ', v.shape) 27 | vr = v.reshape(-1).cpu().numpy() 28 | f.write("{} {}".format(k, len(vr))) 29 | for vv in vr: 30 | f.write(" ") 31 | f.write(struct.pack(">f", float(vv)).hex()) 32 | f.write("\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | 37 | -------------------------------------------------------------------------------- /resnet/resnet50/resnet50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.resnet50(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.to('cuda:0') 11 | net.eval() 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('resnet50 out:', out.shape) 16 | torch.save(net, "resnet50.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /resnet/resnext50_32x4d/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('resnext50.pth') 11 | net = net.to('cuda:0') 12 | net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | print('output:', out) 19 | 20 | summary(net, (3,224,224)) 21 | #return 22 | f = open("resnext50.wts", 'w') 23 | f.write("{}\n".format(len(net.state_dict().keys()))) 24 | for k,v in net.state_dict().items(): 25 | print('key: ', k) 26 | print('value: ', v.shape) 27 | vr = v.reshape(-1).cpu().numpy() 28 | f.write("{} {}".format(k, len(vr))) 29 | for vv in vr: 30 | f.write(" ") 31 | f.write(struct.pack(">f", float(vv)).hex()) 32 | f.write("\n") 33 | 34 | if __name__ == '__main__': 35 | main() 36 | 37 | -------------------------------------------------------------------------------- /resnet/resnext50_32x4d/resnext50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.resnext50_32x4d(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.to('cuda:0') 11 | net.eval() 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('resnext50 out:', out.shape) 16 | torch.save(net, "resnext50.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /shufflenet/inference.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('shufflenet.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 224, 224)) 22 | #return 23 | f = open("shufflenet.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /shufflenet/shufflenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.shufflenet_v2_x0_5(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:0') 14 | out = net(tmp) 15 | print('shufflenet out:', out.shape) 16 | torch.save(net, "shufflenet.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /squeezenet/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('squeezenet.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 227, 227).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 227, 227)) 22 | #return 23 | f = open("squeezenet.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /squeezenet/squeezenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.squeezenet1_1(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:0') 12 | print(net) 13 | 
tmp = torch.ones(2, 3, 227, 227).to('cuda:0') 14 | out = net(tmp) 15 | print('squeezenet out:', out.shape) 16 | torch.save(net, "squeezenet.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | -------------------------------------------------------------------------------- /vgg/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torchvision 4 | import os 5 | import struct 6 | from torchsummary import summary 7 | 8 | def main(): 9 | print('cuda device count: ', torch.cuda.device_count()) 10 | net = torch.load('vgg.pth') 11 | net = net.to('cuda:0') 12 | net = net.eval() 13 | print('model: ', net) 14 | #print('state dict: ', net.state_dict().keys()) 15 | tmp = torch.ones(1, 3, 224, 224).to('cuda:0') 16 | print('input: ', tmp) 17 | out = net(tmp) 18 | 19 | print('output:', out) 20 | 21 | summary(net, (3, 224, 224)) 22 | #return 23 | f = open("vgg.wts", 'w') 24 | f.write("{}\n".format(len(net.state_dict().keys()))) 25 | for k,v in net.state_dict().items(): 26 | print('key: ', k) 27 | print('value: ', v.shape) 28 | vr = v.reshape(-1).cpu().numpy() 29 | f.write("{} {}".format(k, len(vr))) 30 | for vv in vr: 31 | f.write(" ") 32 | f.write(struct.pack(">f", float(vv)).hex()) 33 | f.write("\n") 34 | 35 | if __name__ == '__main__': 36 | main() 37 | 38 | -------------------------------------------------------------------------------- /vgg/vgg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torchvision 5 | 6 | def main(): 7 | print('cuda device count: ', torch.cuda.device_count()) 8 | net = torchvision.models.vgg11(pretrained=True) 9 | #net.fc = nn.Linear(512, 2) 10 | net = net.eval() 11 | net = net.to('cuda:1') 12 | print(net) 13 | tmp = torch.ones(2, 3, 224, 224).to('cuda:1') 14 | out = net(tmp) 15 | print('vgg out:', out.shape) 16 | torch.save(net, "vgg.pth") 17 | 18 | if __name__ == '__main__': 19 | main() 20 | 21 | --------------------------------------------------------------------------------
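A note on the coordinate rescaling in `video_demo.py` and `video_demo_half.py` above: detections come back in the letterboxed network input (e.g. 416x416), so the loop subtracts the letterbox padding and divides by the scaling factor to map each box back onto the original frame before clamping and drawing. As a worked example with assumed numbers (not taken from the repo): for a 1280x720 source video and `--reso 416`, `scaling_factor = min(416/1280, 416/720) = 0.325`, the resized frame is 416x234, and the vertical padding is `(416 - 0.325*720)/2 = 91` px. Each box therefore has 91 subtracted from its y coordinates (the x padding is 0 in this case) and all four coordinates divided by 0.325 to recover frame coordinates.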
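Every `inference.py` above writes its `.wts` file with the same loop: the first line holds the number of state-dict entries, and each following line holds the tensor name, its element count, and the flattened values as big-endian float32 hex strings. Below is a minimal reader sketch for sanity-checking an exported file; the helper name `load_wts` is an assumption for illustration and is not part of this repo.

```
import struct
import numpy as np

def load_wts(path):
    # First line: number of state-dict entries.
    # Each entry line: "<key> <count> <hex1> <hex2> ...",
    # where every hex token is one big-endian float32.
    weights = {}
    with open(path, 'r') as f:
        num = int(f.readline())
        for _ in range(num):
            parts = f.readline().split()
            key, length = parts[0], int(parts[1])
            vals = [struct.unpack('>f', bytes.fromhex(h))[0] for h in parts[2:2 + length]]
            weights[key] = np.array(vals, dtype=np.float32)
    return weights

if __name__ == '__main__':
    w = load_wts('alexnet.wts')  # e.g. the file written by alexnet/inference.py
    k, v = next(iter(w.items()))
    print(len(w), 'tensors; first entry:', k, v.shape)
```

Hex-encoding each value keeps the file plain text while preserving the exact float32 bit pattern, so the weights can be reloaded without any parsing ambiguity.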