├── utils
│   ├── __init__.py
│   └── augmentations.py
├── layers
│   ├── __init__.py
│   ├── functions
│   │   ├── __init__.py
│   │   ├── prior_box.py
│   │   └── detection.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── l2norm.py
│   │   ├── focal_loss.py
│   │   └── multibox_loss.py
│   └── box_utils.py
├── .gitattributes
├── netModel
│   ├── testModel.py
│   ├── multi_flow.py
│   └── resnet.py
├── data
│   ├── validPhoto.py
│   ├── splitTrainVal.py
│   ├── splitTrainVal copy.py
│   ├── xmlPaser.py
│   ├── coco
│   │   └── coco_labels.txt
│   ├── __init__.py
│   ├── resultVisualize.py
│   ├── config.py
│   ├── xmlPaserGenLabel.py
│   ├── voc0712.py
│   ├── custom.py
│   ├── custom_for_visual.py
│   └── coco.py
├── LICENSE
├── README.md
├── .gitignore
├── 实验 4.1
│   ├── ssd_resnet_101.py
│   ├── trainCustom_101.py
│   ├── visualTest_gauge.py
│   └── evalCustom_101.py
└── 实验 4.2
    ├── ssd_resnet_18.py
    ├── visualTest_building.py
    ├── trainCustom_18.py
    └── evalCustom_18.py
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .augmentations import SSDAugmentation -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .l2norm import L2Norm 2 | from .multibox_loss import MultiBoxLoss 3 | 4 | __all__ = ['L2Norm', 'MultiBoxLoss'] 5 | -------------------------------------------------------------------------------- /netModel/testModel.py: -------------------------------------------------------------------------------- 1 | from resnet import resnet101 2 | import torch 3 | 4 | if __name__ == '__main__': 5 | model = resnet101() 6 | input = torch.rand(2,3,512,512) 7 | res = model(input) 8 | print(model) -------------------------------------------------------------------------------- /data/validPhoto.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | import cv2 3 | 4 | if __name__ == '__main__': 5 | p = Path('./piaofu/piao/shenhe/JPEGImages/') 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | for file in files: 8 | # cv2.imread does not raise on an unreadable file, it returns None, 9 | # so check the return value instead of relying on an exception 10 | img = cv2.imread('./piaofu/piao/shenhe/JPEGImages/%s' % file.name, cv2.IMREAD_COLOR) 11 | if img is None: 12 | print(file.name) -------------------------------------------------------------------------------- /data/splitTrainVal.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | 3 | resultPath = "./video/buildingwater/ImageSets/Main/" 4 | def splitDataset(path, filename): 5 | p = Path(path) 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | count = 0 8 | with open(resultPath+filename+'trainval.txt',
'w+') as f: 9 | with open(resultPath+filename+'train.txt', 'w+') as ft: 10 | with open(resultPath+filename+'val.txt', 'w+') as fv: 11 | for file in files: 12 | f.write(file.stem + '\n') 13 | if count % 5 == 4: 14 | fv.write(file.stem + '\n') 15 | else: 16 | ft.write(file.stem + '\n') 17 | count += 1 18 | 19 | splitDataset('./video/buildingwater/Annotations', '') 20 | -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant_(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | #x /= norm 22 | x = torch.div(x,norm) 23 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 24 | return out 25 | -------------------------------------------------------------------------------- /data/splitTrainVal copy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | 3 | resultPath = "./video/buildingwater/ImageSets/Main/" 4 | def splitDataset(path, filename): 5 | p = Path(path) 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | count = 0 8 | with open(resultPath+filename+'trainval0.txt', 'w+') as f: 9 | with open(resultPath+filename+'train0.txt', 'w+') as ft: 10 | with open(resultPath+filename+'val0.txt', 'w+') as fv: 11 | for file in files: 12 | f.write(file.stem + '\n') 13 | if file.stem.find('v1') > -1 or file.stem.find('v2') > -1 or file.stem.find('v4') > -1 or file.stem.find('v5') > -1 or file.stem.find('v6') > -1: 14 | ft.write(file.stem + '\n') 15 | elif file.stem.find('v3') > -1: 16 | fv.write(file.stem + '\n') 17 | count += 1 18 | 19 | splitDataset('./video/buildingwater/Annotations', '') 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/xmlPaser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import xml.sax 5 | from pathlib import Path, PurePath 6 | 7 | total = {} 8 | 9 | class MovieHandler( xml.sax.ContentHandler ): 10 | def __init__(self): 11 | self.CurrentData = "" 12 | self.name = '' 13 | 14 | # 元素开始事件处理 15 | def startElement(self, tag, attributes): 16 | self.CurrentData = tag 17 | 18 | # 元素结束事件处理 19 | def endElement(self, tag): 20 | if self.CurrentData == "name": 21 | if self.name in total: 22 | total[self.name] += 1 23 | else: 24 | total[self.name] = 1 25 | self.CurrentData = "" 26 | 27 | # 内容事件处理 28 | def characters(self, content): 29 | if self.CurrentData == "name": 30 | self.name = content 31 | 32 | if ( __name__ == "__main__"): 33 | 34 | # 创建一个 XMLReader 35 | parser = xml.sax.make_parser() 36 | # turn off namepsaces 37 | parser.setFeature(xml.sax.handler.feature_namespaces, 0) 38 | 39 | # 重写 ContextHandler 40 | Handler = MovieHandler() 41 | parser.setContentHandler( Handler ) 42 | 43 | path = '.\\piaofu\\piao\\shenhe\\Annotations' 44 | p = Path(path) 45 | files = [x for x in p.iterdir() if x.is_file()] 46 | for f in files: 47 | parser.parse(path+'\\'+f.name) 48 | print(total) -------------------------------------------------------------------------------- /data/coco/coco_labels.txt: -------------------------------------------------------------------------------- 1 | 1,1,person 2 | 2,2,bicycle 3 | 3,3,car 4 | 4,4,motorcycle 5 | 5,5,airplane 6 | 6,6,bus 7 | 7,7,train 8 | 8,8,truck 9 | 9,9,boat 10 | 10,10,traffic light 11 | 11,11,fire hydrant 12 | 13,12,stop sign 13 | 14,13,parking meter 14 | 15,14,bench 15 | 16,15,bird 16 | 17,16,cat 17 | 18,17,dog 18 | 19,18,horse 19 | 20,19,sheep 20 | 21,20,cow 21 | 22,21,elephant 22 | 23,22,bear 23 | 24,23,zebra 24 | 25,24,giraffe 25 | 27,25,backpack 26 | 28,26,umbrella 27 | 31,27,handbag 28 | 32,28,tie 29 | 33,29,suitcase 30 | 34,30,frisbee 31 | 35,31,skis 32 | 36,32,snowboard 33 | 37,33,sports ball 34 | 38,34,kite 35 | 39,35,baseball bat 36 | 40,36,baseball glove 37 | 41,37,skateboard 38 | 42,38,surfboard 39 | 43,39,tennis racket 40 | 44,40,bottle 41 | 46,41,wine glass 42 | 47,42,cup 43 | 48,43,fork 44 | 49,44,knife 45 | 50,45,spoon 46 | 51,46,bowl 47 | 52,47,banana 48 | 53,48,apple 49 | 54,49,sandwich 50 | 55,50,orange 51 | 56,51,broccoli 52 | 57,52,carrot 53 | 58,53,hot dog 54 | 59,54,pizza 55 | 60,55,donut 56 | 61,56,cake 57 | 62,57,chair 58 | 63,58,couch 59 | 64,59,potted plant 60 | 65,60,bed 61 | 67,61,dining table 62 | 70,62,toilet 63 | 72,63,tv 64 | 73,64,laptop 65 | 74,65,mouse 66 | 75,66,remote 67 | 76,67,keyboard 68 | 77,68,cell phone 69 | 78,69,microwave 70 | 79,70,oven 71 | 80,71,toaster 72 | 81,72,sink 73 | 82,73,refrigerator 74 | 84,74,book 75 | 85,75,clock 76 | 86,76,vase 77 | 87,77,scissors 78 | 88,78,teddy bear 79 | 89,79,hair drier 80 | 90,80,toothbrush 81 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, 
VOC_CLASSES, VOC_ROOT 2 | from .custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 3 | 4 | # from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 5 | from .config import * 6 | import torch 7 | import cv2 8 | import numpy as np 9 | 10 | def detection_collate(batch): 11 | """Custom collate fn for dealing with batches of images that have a different 12 | number of associated object annotations (bounding boxes). 13 | 14 | Arguments: 15 | batch: (tuple) A tuple of tensor images and lists of annotations 16 | 17 | Return: 18 | A tuple containing: 19 | 1) (tensor) batch of images stacked on their 0 dim 20 | 2) (list of tensors) annotations for a given image are stacked on 21 | 0 dim 22 | """ 23 | targets = [] 24 | imgs = [] 25 | for sample in batch: 26 | imgs.append(sample[0]) 27 | targets.append(torch.FloatTensor(sample[1])) 28 | return torch.stack(imgs, 0), targets 29 | 30 | 31 | def base_transform(image, size, mean): 32 | x = cv2.resize(image, (size, size)).astype(np.float32) 33 | x -= mean 34 | x = x.astype(np.float32) 35 | return x 36 | 37 | 38 | class BaseTransform: 39 | def __init__(self, size, mean): 40 | self.size = size 41 | self.mean = np.array(mean, dtype=np.float32) 42 | 43 | def __call__(self, image, boxes=None, labels=None): 44 | return base_transform(image, self.size, self.mean), boxes, labels 45 | -------------------------------------------------------------------------------- /data/resultVisualize.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os.path as osp 4 | import math 5 | 6 | rootPath = 'F:/ssd/data/video/waterline' 7 | 8 | imgList = {} 9 | 10 | if __name__ == "__main__": 11 | with open('./det_test_waterline_99.txt', 'r') as f: 12 | text_lines = f.readlines() 13 | for line in text_lines: 14 | info = line.split(" ") 15 | name, score, x1, y1, x2, y2 = info 16 | if name in imgList: 17 | if float(score) > imgList[name]['score']: 18 | imgList[name] = { 19 | 'score': float(score), 20 | 'x1': float(x1), 21 | 'y1': float(y1), 22 | 'x2': float(x2), 23 | 'y2': float(y2) 24 | } 25 | else: 26 | imgList[name] = { 27 | 'score': float(score), 28 | 'x1': float(x1), 29 | 'y1': float(y1), 30 | 'x2': float(x2), 31 | 'y2': float(y2) 32 | } 33 | 34 | cv2.namedWindow('w1',1) 35 | img_path = osp.join(rootPath, 'JPEGImages', '%s.jpg') 36 | for obj in imgList.items(): 37 | name, img = obj 38 | image = cv2.imread(img_path % name) 39 | (h, w, c) = image.shape 40 | cv2.rectangle(image, (math.floor(img['x1']), math.floor(img['y1'])), (math.floor(img['x2']), math.floor(img['y2'])), (255,0,0), 5) 41 | # cv2.putText(image, img['score'], (math.floor(img['x1']), math.floor(img['y1'])), cv2.FONT_HERSHEY_COMPLEX, 5, (0, 255, 0), 12) 42 | # sc = min(512, h) / h 43 | # image = cv2.resize(image, (math.floor(w * sc), math.floor(h * sc))) 44 | image = cv2.resize(image, (512, 512)) 45 | cv2.imshow('w1', image) 46 | cv2.waitKey() 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD: Single Shot MultiBox Object Detector, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang, and Alexander C. Berg. 
The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd). 3 | 4 | 5 | 6 | 7 | ### Table of Contents 8 | - Installation 9 | - Train 10 | - Notes 11 | 12 | 13 | 14 | 15 | 16 | ## Installation 17 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 18 | - Clone this repository. 19 | * Note: We currently only support Python 3+. 20 | - Then download the dataset by following the [instructions](#datasets) below. 21 | - We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training! 22 | * To use Visdom in the browser: 23 | ```Shell 24 | # First install Python server and client 25 | pip install visdom 26 | # Start the server (probably in a screen or tmux) 27 | python -m visdom.server 28 | ``` 29 | * Then (during training) navigate to http://localhost:8097/ (see the Train section below for training details). 30 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 31 | 32 | ## Train 33 | 34 | ### Experiment 4.1 (实验 4.1) 35 | 36 | Use the data in gauge.zip to train a water gauge detection model. 37 | 38 | Use the data in mark.zip to train a model that detects the actual water level and the warning water level. 39 | 40 | ### Experiment 4.2 (实验 4.2) 41 | 42 | Use the data in buildingwater.zip to train a model that detects river areas and building areas. 43 | 44 | ## Notes 45 | 46 | Modify CUSTOM_CLASSES in data/custom.py to match the dataset being used. 47 | 48 | Modify num_classes, lr_steps, and max_iter in data/config.py to match the dataset being used. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # atom remote-sync package 92 | .remote-sync.json 93 | 94 | # weights 95 | weights/ 96 | 97 | #DS_Store 98 | .DS_Store 99 | 100 | # dev stuff 101 | eval/ 102 | eval.ipynb 103 | dev.ipynb 104 | .vscode/ 105 | 106 | # not ready 107 | videos/ 108 | templates/ 109 | data/ssd_dataloader.py 110 | data/datasets/ 111 | data/video/ 112 | doc/visualize.py 113 | read_results.py 114 | ssd300_120000/ 115 | demos/live 116 | webdemo.py 117 | test_data_aug.py 118 | 119 | # attributes 120 | 121 | # pycharm 122 | .idea/ 123 | 124 | # temp checkout soln 125 | data/datasets/ 126 | data/ssd_dataloader.py 127 | data/piaofu 128 | data/VOCdevkit 129 | data/*.zip 130 | 131 | # pylint 132 | .pylintrc -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | import torch 5 | 6 | 7 | class PriorBox(object): 8 | """Compute priorbox coordinates in center-offset form for each source 9 | feature map. 
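Each prior is emitted as (cx, cy, w, h), with every value normalized to [0, 1] relative to the input image size ('min_dim' in the config).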
10 | """ 11 | def __init__(self, cfg): 12 | super(PriorBox, self).__init__() 13 | self.image_size = cfg['min_dim'] 14 | # number of priors for feature map location (either 4 or 6) 15 | self.num_priors = len(cfg['aspect_ratios']) 16 | self.variance = cfg['variance'] or [0.1] 17 | self.feature_maps = cfg['feature_maps'] 18 | self.min_sizes = cfg['min_sizes'] 19 | self.max_sizes = cfg['max_sizes'] 20 | self.steps = cfg['steps'] 21 | self.aspect_ratios = cfg['aspect_ratios'] 22 | self.clip = cfg['clip'] 23 | self.version = cfg['name'] 24 | for v in self.variance: 25 | if v <= 0: 26 | raise ValueError('Variances must be greater than 0') 27 | 28 | def forward(self): 29 | mean = [] 30 | for k, f in enumerate(self.feature_maps): 31 | for i, j in product(range(f), repeat=2): 32 | f_k = self.image_size / self.steps[k] 33 | # unit center x,y 34 | cx = (j + 0.5) / f_k 35 | cy = (i + 0.5) / f_k 36 | 37 | # aspect_ratio: 1 38 | # rel size: min_size 39 | s_k = self.min_sizes[k]/self.image_size 40 | mean += [cx, cy, s_k, s_k] 41 | 42 | # aspect_ratio: 1 43 | # rel size: sqrt(s_k * s_(k+1)) 44 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 45 | mean += [cx, cy, s_k_prime, s_k_prime] 46 | 47 | # rest of aspect ratios 48 | for ar in self.aspect_ratios[k]: 49 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 50 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 51 | # back to torch land 52 | output = torch.Tensor(mean).view(-1, 4) 53 | if self.clip: 54 | output.clamp_(max=1, min=0) 55 | return output 56 | -------------------------------------------------------------------------------- /netModel/multi_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | try: 4 | from torch.hub import load_state_dict_from_url 5 | except ImportError: 6 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 7 | 8 | class MultiFlow_Block(nn.Module): 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 11 | base_width=64, dilation=1, norm_layer=None): 12 | super(BasicBlock, self).__init__() 13 | if norm_layer is None: 14 | norm_layer = nn.BatchNorm2d 15 | if groups != 1 or base_width != 64: 16 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 17 | if dilation > 1: 18 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 19 | 20 | class MultiFlow(nn.Module): 21 | expansion = 1 22 | __constants__ = ['downsample'] 23 | 24 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 25 | base_width=64, dilation=1, norm_layer=None): 26 | super(BasicBlock, self).__init__() 27 | if norm_layer is None: 28 | norm_layer = nn.BatchNorm2d 29 | if groups != 1 or base_width != 64: 30 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 31 | if dilation > 1: 32 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 33 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 34 | self.conv1 = conv3x3(inplanes, planes, stride) 35 | self.bn1 = norm_layer(planes) 36 | self.relu = nn.ReLU(inplace=True) 37 | self.conv2 = conv3x3(planes, planes) 38 | self.bn2 = norm_layer(planes) 39 | self.downsample = downsample 40 | self.stride = stride 41 | 42 | def forward(self, x): 43 | identity = x 44 | 45 | out = self.conv1(x) 46 | out = self.bn1(out) 47 | out = self.relu(out) 48 | 49 | out = self.conv2(out) 50 | out = self.bn2(out) 51 | 52 | if self.downsample is not None: 53 | identity = 
self.downsample(x) 54 | 55 | out += identity 56 | out = self.relu(out) 57 | 58 | return out -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | HOME = "F:/ssd/" # os.path.expanduser("~") 6 | 7 | # for making bounding boxes pretty 8 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 9 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 10 | 11 | MEANS = (104, 117, 123) 12 | 13 | # SSD300 CONFIGS 14 | custom = { 15 | 'num_classes': 2, 16 | 'lr_steps': (22500, 30000, 37500), 17 | 'max_iter': 120000, 18 | 'feature_maps': [38, 19, 10, 5, 3, 1], 19 | 'min_dim': 300, 20 | 'steps': [8, 16, 32, 64, 100, 300], 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 'max_sizes': [60, 111, 162, 213, 264, 315], 23 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 24 | 'variance': [0.1, 0.2], 25 | 'clip': True, 26 | 'name': 'CUSTOM', 27 | } 28 | 29 | VOC_300_2 = { 30 | 'num_classes': 5, 31 | 'lr_steps': (100000, 130000, 160000), 32 | 'max_iter': 160000, 33 | 'feature_maps' : [38, 19, 10, 5, 3], 34 | 'min_dim' : 300, 35 | 'steps' : [8, 16, 32, 64, 100], 36 | 'min_sizes' : [30, 60, 111, 162, 213], 37 | 'max_sizes' : [60, 111, 162, 213, 315], 38 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2,3]], 39 | 'variance' : [0.1, 0.2], 40 | 'clip' : True, 41 | 'name': 'CUSTOM', 42 | } 43 | 44 | voc = { 45 | 'num_classes': 21, 46 | 'lr_steps': (80000, 100000, 120000), 47 | 'max_iter': 120000, 48 | 'feature_maps': [38, 19, 10, 5, 3, 1], 49 | 'min_dim': 300, 50 | 'steps': [8, 16, 32, 64, 100, 300], 51 | 'min_sizes': [30, 60, 111, 162, 213, 264], 52 | 'max_sizes': [60, 111, 162, 213, 264, 315], 53 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 54 | 'variance': [0.1, 0.2], 55 | 'clip': True, 56 | 'name': 'VOC', 57 | } 58 | 59 | coco = { 60 | 'num_classes': 201, 61 | 'lr_steps': (280000, 360000, 400000), 62 | 'max_iter': 400000, 63 | 'feature_maps': [38, 19, 10, 5, 3, 1], 64 | 'min_dim': 300, 65 | 'steps': [8, 16, 32, 64, 100, 300], 66 | 'min_sizes': [21, 45, 99, 153, 207, 261], 67 | 'max_sizes': [45, 99, 153, 207, 261, 315], 68 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 69 | 'variance': [0.1, 0.2], 70 | 'clip': True, 71 | 'name': 'COCO', 72 | } 73 | -------------------------------------------------------------------------------- /layers/modules/focal_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | class FocalLoss(nn.Module): 8 | r""" 9 | This criterion is a implemenation of Focal Loss, which is proposed in 10 | Focal Loss for Dense Object Detection. 11 | 12 | Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) 13 | 14 | The losses are averaged across observations for each minibatch. 15 | 16 | Args: 17 | alpha(1D Tensor, Variable) : the scalar factor for this criterion 18 | gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), 19 | putting more focus on hard, misclassified examples 20 | size_average(bool): By default, the losses are averaged over observations for each minibatch. 21 | However, if the field size_average is set to False, the losses are 22 | instead summed for each minibatch. 
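Example (a minimal usage sketch; the 4x3 logits and the class indices below are made-up values):
    >>> criterion = FocalLoss(class_num=3, gamma=2)
    >>> loss = criterion(torch.randn(4, 3), torch.tensor([0, 1, 2, 1]))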
23 | 24 | 25 | """ 26 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 27 | super(FocalLoss, self).__init__() 28 | if alpha is None: 29 | self.alpha = Variable(torch.ones(class_num, 1)) 30 | else: 31 | if isinstance(alpha, Variable): 32 | self.alpha = alpha 33 | else: 34 | self.alpha = Variable(alpha) 35 | self.gamma = gamma 36 | self.class_num = class_num 37 | self.size_average = size_average 38 | 39 | def forward(self, inputs, targets): 40 | N = inputs.size(0) 41 | C = inputs.size(1) 42 | P = F.softmax(inputs, dim=-1) 43 | 44 | class_mask = inputs.data.new(N, C).fill_(0) 45 | class_mask = Variable(class_mask) 46 | ids = targets.view(-1, 1) 47 | class_mask.scatter_(1, ids.data, 1.) 48 | #print(class_mask) 49 | 50 | 51 | if inputs.is_cuda and not self.alpha.is_cuda: 52 | self.alpha = self.alpha.cuda() 53 | alpha = self.alpha[ids.data.view(-1)] 54 | 55 | probs = (P*class_mask).sum(1).view(-1,1) 56 | 57 | log_p = probs.log() 58 | #print('probs size= {}'.format(probs.size())) 59 | #print(probs) 60 | 61 | batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 62 | #print('-----bacth_loss------') 63 | #print(batch_loss) 64 | 65 | 66 | if self.size_average: 67 | loss = batch_loss.mean() 68 | else: 69 | loss = batch_loss.sum() 70 | return loss -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from ..box_utils import decode, nms 4 | from data import voc as cfg 5 | 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 14 | self.num_classes = num_classes 15 | self.background_label = bkg_label 16 | self.top_k = top_k 17 | # Parameters used in nms. 18 | self.nms_thresh = nms_thresh 19 | if nms_thresh <= 0: 20 | raise ValueError('nms_threshold must be non negative.') 21 | self.conf_thresh = conf_thresh 22 | self.variance = cfg['variance'] 23 | 24 | def forward(self, loc_data, conf_data, prior_data): 25 | """ 26 | Args: 27 | loc_data: (tensor) Loc preds from loc layers 28 | Shape: [batch,num_priors*4] 29 | conf_data: (tensor) Shape: Conf preds from conf layers 30 | Shape: [batch*num_priors,num_classes] 31 | prior_data: (tensor) Prior boxes and variances from priorbox layers 32 | Shape: [1,num_priors,4] 33 | """ 34 | num = loc_data.size(0) # batch size 35 | num_priors = prior_data.size(0) 36 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 37 | conf_preds = conf_data.view(num, num_priors, 38 | self.num_classes).transpose(2, 1) 39 | 40 | # Decode predictions into bboxes. 
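# decode() (from layers/box_utils.py) combines the predicted offsets with the priors and variances
# to get (xmin, ymin, xmax, ymax) boxes; the loop below then thresholds scores at conf_thresh and
# runs per-class NMS, keeping at most top_k detections.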
41 | for i in range(num): 42 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 43 | # For each class, perform nms 44 | conf_scores = conf_preds[i].clone() 45 | 46 | for cl in range(1, self.num_classes): 47 | c_mask = conf_scores[cl].gt(self.conf_thresh) 48 | scores = conf_scores[cl][c_mask] 49 | if scores.size(0) == 0: 50 | continue 51 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 52 | boxes = decoded_boxes[l_mask].view(-1, 4) 53 | # idx of highest scoring and non-overlapping boxes per class 54 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 55 | output[i, cl, :count] = \ 56 | torch.cat((scores[ids[:count]].unsqueeze(1), 57 | boxes[ids[:count]]), 1) 58 | flt = output.contiguous().view(num, -1, 5) 59 | _, idx = flt[:, :, 0].sort(1, descending=True) 60 | _, rank = idx.sort(1) 61 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 62 | return output 63 | -------------------------------------------------------------------------------- /data/xmlPaserGenLabel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import xml.sax 5 | from pathlib import Path, PurePath 6 | 7 | r = {} 8 | 9 | result = [] 10 | 11 | # label = { 12 | # 'garbage': 0, 13 | # 'garbagew': 1, 14 | # 'www': 2, 15 | # 'w': 3 16 | # } 17 | 18 | label = { 19 | 'waterline': 0, 20 | } 21 | 22 | class MovieHandler( xml.sax.ContentHandler ): 23 | def __init__(self): 24 | self.tag = "" 25 | self.boxes = [] 26 | self.box = { 27 | 'name': '', 28 | 'xmin': 0, 29 | 'xmax': 0, 30 | 'ymin': 0, 31 | 'ymax': 0 32 | } 33 | self.size = { 34 | 'width': 0, 35 | 'height': 0, 36 | 'depth': 0 37 | } 38 | 39 | # 元素开始事件处理 40 | def startElement(self, tag, attributes): 41 | self.tag = tag 42 | 43 | # 元素结束事件处理 44 | def endElement(self, tag): 45 | if self.tag == 'depth': 46 | r['data']['size'] = self.size 47 | if self.tag == 'ymax': 48 | r['data']['boxes'].append(self.box) 49 | self.tag = "" 50 | 51 | # 内容事件处理 52 | def characters(self, content): 53 | if self.tag == 'size': 54 | self.size = { 55 | 'width': 0, 56 | 'height': 0, 57 | 'depth': 0 58 | } 59 | elif self.tag == 'object': 60 | self.box = { 61 | 'name': '', 62 | 'xmin': 0, 63 | 'xmax': 0, 64 | 'ymin': 0, 65 | 'ymax': 0 66 | } 67 | elif self.tag == 'width': 68 | self.size['width'] = int(content) 69 | elif self.tag == 'height': 70 | self.size['height'] = int(content) 71 | elif self.tag == 'depth': 72 | self.size['depth'] = int(content) 73 | elif self.tag == 'name': 74 | self.box['name'] = content 75 | elif self.tag == 'xmin': 76 | self.box['xmin'] = int(content) 77 | elif self.tag == 'xmax': 78 | self.box['xmax'] = int(content) 79 | elif self.tag == 'ymin': 80 | self.box['ymin'] = int(content) 81 | elif self.tag == 'ymax': 82 | self.box['ymax'] = int(content) 83 | 84 | if ( __name__ == "__main__"): 85 | 86 | # 创建一个 XMLReader 87 | parser = xml.sax.make_parser() 88 | # turn off namepsaces 89 | parser.setFeature(xml.sax.handler.feature_namespaces, 0) 90 | 91 | # 重写 ContextHandler 92 | Handler = MovieHandler() 93 | parser.setContentHandler( Handler ) 94 | 95 | path = './video/waterline/Annotations' 96 | p = Path(path) 97 | files = [x for x in p.iterdir() if x.is_file()] 98 | for f in files: 99 | r = { 100 | 'file': f.name[0: -4], 101 | 'data': { 102 | 'size': {}, 103 | 'boxes': [] 104 | } 105 | } 106 | parser.parse(path+'/'+f.name) 107 | result.append(r) 108 | for r in result: 109 | # with open(".\\labels\\" + r['file'] + ".txt", "w") as f: 110 | width = 
r['data']['size']['width'] 111 | height = r['data']['size']['height'] 112 | for b in r['data']['boxes']: 113 | center_x = (b['xmax'] + b['xmin']) / 2 / width 114 | center_y = (b['ymax'] + b['ymin']) / 2 / height 115 | width_x = (b['xmax'] - b['xmin']) / width 116 | height_y = (b['ymax'] - b['ymin']) / height 117 | label_idx = label[b['name']] 118 | if width_x == 0 or height_y == 0 or (b['name'] != 'waterline'): 119 | print(r['file']) 120 | break 121 | # f.write(str(label_idx) + ' ' + str(center_x) + ' ' + str(center_y) + ' ' + str(width_x) + ' ' + str(height_y) + "\n") -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from data import coco as cfg 7 | from ..box_utils import match, log_sum_exp 8 | from .focal_loss import FocalLoss 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | """SSD Weighted Loss Function 13 | Compute Targets: 14 | 1) Produce Confidence Target Indices by matching ground truth boxes 15 | with (default) 'priorboxes' that have jaccard index > threshold parameter 16 | (default threshold: 0.5). 17 | 2) Produce localization target by 'encoding' variance into offsets of ground 18 | truth boxes and their matched 'priorboxes'. 19 | 3) Hard negative mining to filter the excessive number of negative examples 20 | that comes with using a large number of default bounding boxes. 21 | (default negative:positive ratio 3:1) 22 | Objective Loss: 23 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 24 | Where, Lconf is the confidence loss (focal loss in this implementation) and Lloc is the SmoothL1 Loss 25 | weighted by α which is set to 1 by cross val. 26 | Args: 27 | c: class confidences, 28 | l: predicted boxes, 29 | g: ground truth boxes 30 | N: number of matched default boxes 31 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 32 | """ 33 | 34 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, 35 | bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, 36 | use_gpu=True): 37 | super(MultiBoxLoss, self).__init__() 38 | self.use_gpu = use_gpu 39 | self.num_classes = num_classes 40 | self.threshold = overlap_thresh 41 | self.background_label = bkg_label 42 | self.encode_target = encode_target 43 | self.use_prior_for_matching = prior_for_matching 44 | self.do_neg_mining = neg_mining 45 | self.negpos_ratio = neg_pos 46 | self.neg_overlap = neg_overlap 47 | self.variance = cfg['variance'] 48 | self.FL = FocalLoss(class_num=cfg['num_classes'], alpha=torch.Tensor([[0.25], [0.25]]), size_average=False) 49 | 50 | def forward(self, predictions, targets): 51 | """Multibox Loss 52 | Args: 53 | predictions (tuple): A tuple containing loc preds, conf preds, 54 | and prior boxes from SSD net. 55 | conf shape: torch.size(batch_size,num_priors,num_classes) 56 | loc shape: torch.size(batch_size,num_priors,4) 57 | priors shape: torch.size(num_priors,4) 58 | 59 | targets (tensor): Ground truth boxes and labels for a batch, 60 | shape: [batch_size,num_objs,5] (last idx is the label).
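Returns: a tuple (loss_l, loss_c): the Smooth L1 localization loss and the focal confidence loss, each divided by the number of matched priors N.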
61 | """ 62 | loc_data, conf_data, priors = predictions 63 | num = loc_data.size(0) 64 | priors = priors[:loc_data.size(1), :] 65 | num_priors = (priors.size(0)) 66 | num_classes = self.num_classes 67 | 68 | # match priors (default boxes) and ground truth boxes 69 | loc_t = torch.Tensor(num, num_priors, 4) 70 | conf_t = torch.LongTensor(num, num_priors) 71 | for idx in range(num): 72 | truths = targets[idx][:, :-1].data 73 | labels = targets[idx][:, -1].data 74 | defaults = priors.data 75 | match(self.threshold, truths, defaults, self.variance, labels, 76 | loc_t, conf_t, idx) 77 | if self.use_gpu: 78 | loc_t = loc_t.cuda() 79 | conf_t = conf_t.cuda() 80 | # wrap targets 81 | loc_t = Variable(loc_t, requires_grad=False) 82 | conf_t = Variable(conf_t, requires_grad=False) 83 | 84 | pos = conf_t > 0 85 | num_pos = pos.sum(dim=1, keepdim=True) 86 | 87 | # Localization Loss (Smooth L1) 88 | # Shape: [batch,num_priors,4] 89 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 90 | loc_p = loc_data[pos_idx].view(-1, 4) 91 | loc_t = loc_t[pos_idx].view(-1, 4) 92 | # loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 93 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 94 | 95 | # Compute max conf across batch for hard negative mining 96 | batch_conf = conf_data.view(-1, self.num_classes) 97 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 98 | 99 | # 修复bug 100 | loss_c = loss_c.view(pos.size()[0], pos.size()[1]) 101 | # Hard Negative Mining 102 | loss_c[pos] = 0 # filter out pos boxes for now 103 | loss_c = loss_c.view(num, -1) 104 | _, loss_idx = loss_c.sort(1, descending=True) 105 | _, idx_rank = loss_idx.sort(1) 106 | num_pos = pos.long().sum(1, keepdim=True) 107 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 108 | neg = idx_rank < num_neg.expand_as(idx_rank) 109 | 110 | # Confidence Loss Including Positive and Negative Examples 111 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 112 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 113 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes) 114 | targets_weighted = conf_t[(pos+neg).gt(0)] 115 | # loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 116 | # loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 117 | loss_c = self.FL(conf_p, targets_weighted) 118 | 119 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 120 | 121 | N = num_pos.data.sum() 122 | loss_l /= N 123 | loss_c /= N 124 | return loss_l, loss_c 125 | 126 | 127 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | VOC_CLASSES = ( # always index 0 21 | 'aeroplane', 'bicycle', 'bird', 'boat', 22 | 'bottle', 'bus', 'car', 'cat', 'chair', 23 | 'cow', 'diningtable', 'dog', 'horse', 24 | 'motorbike', 'person', 'pottedplant', 25 | 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # note: if you used our download scripts, 
this should be right 28 | VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") 29 | 30 | 31 | class VOCAnnotationTransform(object): 32 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 33 | Initilized with a dictionary lookup of classnames to indexes 34 | 35 | Arguments: 36 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 37 | (default: alphabetic indexing of VOC's 20 classes) 38 | keep_difficult (bool, optional): keep difficult instances or not 39 | (default: False) 40 | height (int): height 41 | width (int): width 42 | """ 43 | 44 | def __init__(self, class_to_ind=None, keep_difficult=False): 45 | self.class_to_ind = class_to_ind or dict( 46 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 47 | self.keep_difficult = keep_difficult 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Arguments: 52 | target (annotation) : the target annotation to be made usable 53 | will be an ET.Element 54 | Returns: 55 | a list containing lists of bounding boxes [bbox coords, class name] 56 | """ 57 | res = [] 58 | for obj in target.iter('object'): 59 | difficult = int(obj.find('difficult').text) == 1 60 | if not self.keep_difficult and difficult: 61 | continue 62 | name = obj.find('name').text.lower().strip() 63 | bbox = obj.find('bndbox') 64 | 65 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 66 | bndbox = [] 67 | for i, pt in enumerate(pts): 68 | cur_pt = int(bbox.find(pt).text) - 1 69 | # scale height or width 70 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 71 | bndbox.append(cur_pt) 72 | label_idx = self.class_to_ind[name] 73 | bndbox.append(label_idx) 74 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 75 | # img_id = target.find('filename').text[:-4] 76 | 77 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 78 | 79 | 80 | class VOCDetection(data.Dataset): 81 | """VOC Detection Dataset Object 82 | 83 | input is image, target is annotation 84 | 85 | Arguments: 86 | root (string): filepath to VOCdevkit folder. 87 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 88 | transform (callable, optional): transformation to perform on the 89 | input image 90 | target_transform (callable, optional): transformation to perform on the 91 | target `annotation` 92 | (eg: take in caption string, return tensor of word indices) 93 | dataset_name (string, optional): which dataset to load 94 | (default: 'VOC2007') 95 | """ 96 | 97 | def __init__(self, root, 98 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 99 | transform=None, target_transform=VOCAnnotationTransform(), 100 | dataset_name='VOC0712'): 101 | self.root = root 102 | self.image_set = image_sets 103 | self.transform = transform 104 | self.target_transform = target_transform 105 | self.name = dataset_name 106 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 107 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 108 | self.ids = list() 109 | for (year, name) in image_sets: 110 | rootpath = osp.join(self.root, 'VOC' + year) 111 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 112 | self.ids.append((rootpath, line.strip())) 113 | 114 | def __getitem__(self, index): 115 | im, gt, h, w = self.pull_item(index) 116 | 117 | return im, gt 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | def pull_item(self, index): 123 | img_id = self.ids[index] 124 | 125 | target = ET.parse(self._annopath % img_id).getroot() 126 | img = cv2.imread(self._imgpath % img_id) 127 | height, width, channels = img.shape 128 | 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | 132 | if self.transform is not None: 133 | target = np.array(target) 134 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | # img = img.transpose(2, 0, 1) 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | # return torch.from_numpy(img), target, height, width 141 | 142 | def pull_image(self, index): 143 | '''Returns the original image object at index in PIL form 144 | 145 | Note: not using self.__getitem__(), as any transformations passed in 146 | could mess up this functionality. 147 | 148 | Argument: 149 | index (int): index of img to show 150 | Return: 151 | PIL img 152 | ''' 153 | img_id = self.ids[index] 154 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | anno = ET.parse(self._annopath % img_id).getroot() 170 | gt = self.target_transform(anno, 1, 1) 171 | return img_id[1], gt 172 | 173 | def pull_tensor(self, index): 174 | '''Returns the original image at an index in tensor form 175 | 176 | Note: not using self.__getitem__(), as any transformations passed in 177 | could mess up this functionality. 
178 | 179 | Argument: 180 | index (int): index of img to show 181 | Return: 182 | tensorized version of img, squeezed 183 | ''' 184 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) -------------------------------------------------------------------------------- /data/custom.py: -------------------------------------------------------------------------------- 1 | """custom Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/custom_dataset/torchvision/datasets/custom.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | CUSTOM_CLASSES = ( # always index 0 21 | 'gauge',) 22 | 23 | # CUSTOM_CLASSES = ( # always index 0 24 | # 'waterline', 'mark') 25 | 26 | # CUSTOM_CLASSES = ( # always index 0 27 | # 'building', 'water') 28 | 29 | # note: if you used our download scripts, this should be right 30 | CUSTOM_ROOT = osp.join(HOME, "data/video/") 31 | 32 | 33 | class customAnnotationTransform(object): 34 | """Transforms a custom annotation into a Tensor of bbox coords and label index 35 | Initilized with a dictionary lookup of classnames to indexes 36 | 37 | Arguments: 38 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 39 | (default: alphabetic indexing of custom's 20 classes) 40 | keep_difficult (bool, optional): keep difficult instances or not 41 | (default: False) 42 | height (int): height 43 | width (int): width 44 | """ 45 | 46 | def __init__(self, class_to_ind=None, keep_difficult=False): 47 | self.class_to_ind = class_to_ind or dict( 48 | zip(CUSTOM_CLASSES, range(len(CUSTOM_CLASSES)))) 49 | print(self.class_to_ind) 50 | self.keep_difficult = keep_difficult 51 | 52 | def __call__(self, target, width, height): 53 | """ 54 | Arguments: 55 | target (annotation) : the target annotation to be made usable 56 | will be an ET.Element 57 | Returns: 58 | a list containing lists of bounding boxes [bbox coords, class name] 59 | """ 60 | res = [] 61 | for obj in target.iter('object'): 62 | difficult = int(obj.find('difficult').text) == 1 63 | if not self.keep_difficult and difficult: 64 | continue 65 | name = obj.find('name').text.lower().strip() 66 | bbox = obj.find('bndbox') 67 | 68 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 69 | bndbox = [] 70 | for i, pt in enumerate(pts): 71 | cur_pt = int(bbox.find(pt).text) - 1 72 | # scale height or width 73 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 74 | bndbox.append(cur_pt) 75 | label_idx = self.class_to_ind[name] 76 | bndbox.append(label_idx) 77 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 78 | # img_id = target.find('filename').text[:-4] 79 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 80 | 81 | 82 | class customDetection(data.Dataset): 83 | """custom Detection Dataset Object 84 | 85 | input is image, target is annotation 86 | 87 | Arguments: 88 | root (string): filepath to customdevkit folder. 89 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 90 | transform (callable, optional): transformation to perform on the 91 | input image 92 | target_transform (callable, optional): transformation to perform on the 93 | target `annotation` 94 | (eg: take in caption string, return tensor of word indices) 95 | dataset_name (string, optional): which dataset to load 96 | (default: 'VOC2007') 97 | """ 98 | 99 | def __init__(self, root, 100 | # image_sets=[('shenhe', 'train')], 101 | image_sets=[('gauge', 'train')], 102 | transform=None, target_transform=customAnnotationTransform(), 103 | dataset_name='custom'): 104 | self.root = root 105 | self.image_set = image_sets 106 | self.transform = transform 107 | self.target_transform = target_transform 108 | self.name = dataset_name 109 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 110 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 111 | self.ids = list() 112 | for (curDir, name) in image_sets: 113 | rootpath = osp.join(self.root, curDir) 114 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 115 | self.ids.append((rootpath, line.strip())) 116 | 117 | def __getitem__(self, index): 118 | im, gt, h, w = self.pull_item(index) 119 | 120 | return im, gt 121 | 122 | def __len__(self): 123 | return len(self.ids) 124 | 125 | def pull_item(self, index): 126 | img_id = self.ids[index] 127 | 128 | target = ET.parse(self._annopath % img_id).getroot() 129 | img = cv2.imread(self._imgpath % img_id) 130 | height, width, channels = img.shape 131 | 132 | if self.target_transform is not None: 133 | target = self.target_transform(target, width, height) 134 | 135 | if self.transform is not None: 136 | target = np.array(target) 137 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 138 | # to rgb 139 | img = img[:, :, (2, 1, 0)] 140 | # img = img.transpose(2, 0, 1) 141 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 142 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 143 | # return torch.from_numpy(img), target, height, width 144 | 145 | def pull_image(self, index): 146 | '''Returns the original image object at index in PIL form 147 | 148 | Note: not using self.__getitem__(), as any transformations passed in 149 | could mess up this functionality. 150 | 151 | Argument: 152 | index (int): index of img to show 153 | Return: 154 | PIL img 155 | ''' 156 | img_id = self.ids[index] 157 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 158 | 159 | def pull_anno(self, index): 160 | '''Returns the original annotation of image at index 161 | 162 | Note: not using self.__getitem__(), as any transformations passed in 163 | could mess up this functionality. 164 | 165 | Argument: 166 | index (int): index of img to get annotation of 167 | Return: 168 | list: [img_id, [(label, bbox coords),...]] 169 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 170 | ''' 171 | img_id = self.ids[index] 172 | anno = ET.parse(self._annopath % img_id).getroot() 173 | gt = self.target_transform(anno, 1, 1) 174 | return img_id[1], gt 175 | 176 | def pull_tensor(self, index): 177 | '''Returns the original image at an index in tensor form 178 | 179 | Note: not using self.__getitem__(), as any transformations passed in 180 | could mess up this functionality. 
181 | 182 | Argument: 183 | index (int): index of img to show 184 | Return: 185 | tensorized version of img, squeezed 186 | ''' 187 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) -------------------------------------------------------------------------------- /data/custom_for_visual.py: -------------------------------------------------------------------------------- 1 | """custom Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/custom_dataset/torchvision/datasets/custom.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | # {'garbage': 1, 'garbagew': 1, 'www': 1, 'w': 1} 21 | # CUSTOM_CLASSES = ( # always index 0 22 | # 'waterline',) 23 | CUSTOM_CLASSES_GAUGE = ( # always index 0 24 | 'gauge',) 25 | 26 | CUSTOM_CLASSES_WATERLINE = ( # always index 0 27 | 'waterline', 'mark',) 28 | 29 | CUSTOM_CLASSES_BUILDING = ( # always index 0 30 | 'building', 'water',) 31 | 32 | # note: if you used our download scripts, this should be right 33 | # CUSTOM_ROOT = osp.join(HOME, "data/piaofu/piao/") 34 | CUSTOM_ROOT = osp.join(HOME, "data/video/") 35 | 36 | 37 | class customAnnotationTransform(object): 38 | """Transforms a custom annotation into a Tensor of bbox coords and label index 39 | Initilized with a dictionary lookup of classnames to indexes 40 | 41 | Arguments: 42 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 43 | (default: alphabetic indexing of custom's 20 classes) 44 | keep_difficult (bool, optional): keep difficult instances or not 45 | (default: False) 46 | height (int): height 47 | width (int): width 48 | """ 49 | 50 | def __init__(self, class_to_ind=None, keep_difficult=False): 51 | self.class_to_ind = class_to_ind or dict( 52 | zip(CUSTOM_CLASSES_GAUGE, range(len(CUSTOM_CLASSES_GAUGE)))) 53 | self.keep_difficult = keep_difficult 54 | 55 | def __call__(self, target, width, height): 56 | """ 57 | Arguments: 58 | target (annotation) : the target annotation to be made usable 59 | will be an ET.Element 60 | Returns: 61 | a list containing lists of bounding boxes [bbox coords, class name] 62 | """ 63 | res = [] 64 | for obj in target.iter('object'): 65 | difficult = int(obj.find('difficult').text) == 1 66 | if not self.keep_difficult and difficult: 67 | continue 68 | name = obj.find('name').text.lower().strip() 69 | bbox = obj.find('bndbox') 70 | 71 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 72 | bndbox = [] 73 | for i, pt in enumerate(pts): 74 | cur_pt = int(bbox.find(pt).text) - 1 75 | # scale height or width 76 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 77 | bndbox.append(cur_pt) 78 | label_idx = self.class_to_ind[name] 79 | bndbox.append(label_idx) 80 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 81 | # img_id = target.find('filename').text[:-4] 82 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 83 | 84 | 85 | class customDetection(data.Dataset): 86 | """custom Detection Dataset Object 87 | 88 | input is image, target is annotation 89 | 90 | Arguments: 91 | root (string): filepath to customdevkit folder. 92 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 93 | transform (callable, optional): transformation to perform on the 94 | input image 95 | target_transform (callable, optional): transformation to perform on the 96 | target `annotation` 97 | (eg: take in caption string, return tensor of word indices) 98 | dataset_name (string, optional): which dataset to load 99 | (default: 'VOC2007') 100 | """ 101 | 102 | def __init__(self, root, 103 | image_sets=[('gauge', 'train')], 104 | transform=None, target_transform=customAnnotationTransform(), 105 | dataset_name='custom'): 106 | self.root = root 107 | self.image_set = image_sets 108 | self.transform = transform 109 | self.target_transform = target_transform 110 | self.name = dataset_name 111 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 112 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 113 | self.ids = list() 114 | for (curDir, name) in image_sets: 115 | rootpath = osp.join(self.root, curDir) 116 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 117 | self.ids.append((rootpath, line.strip())) 118 | 119 | def __getitem__(self, index): 120 | im, gt, h, w = self.pull_item(index) 121 | 122 | return im, gt 123 | 124 | def __len__(self): 125 | return len(self.ids) 126 | 127 | def pull_item(self, index): 128 | img_id = self.ids[index] 129 | 130 | target = ET.parse(self._annopath % img_id).getroot() 131 | img = cv2.imread(self._imgpath % img_id) 132 | height, width, channels = img.shape 133 | 134 | if self.target_transform is not None: 135 | target = self.target_transform(target, width, height) 136 | 137 | if self.transform is not None: 138 | target = np.array(target) 139 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 140 | # to rgb 141 | img = img[:, :, (2, 1, 0)] 142 | # img = img.transpose(2, 0, 1) 143 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 144 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 145 | # return torch.from_numpy(img), target, height, width 146 | 147 | def pull_image(self, index): 148 | '''Returns the original image object at index in PIL form 149 | 150 | Note: not using self.__getitem__(), as any transformations passed in 151 | could mess up this functionality. 152 | 153 | Argument: 154 | index (int): index of img to show 155 | Return: 156 | PIL img 157 | ''' 158 | img_id = self.ids[index] 159 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 160 | 161 | def pull_anno(self, index): 162 | '''Returns the original annotation of image at index 163 | 164 | Note: not using self.__getitem__(), as any transformations passed in 165 | could mess up this functionality. 166 | 167 | Argument: 168 | index (int): index of img to get annotation of 169 | Return: 170 | list: [img_id, [(label, bbox coords),...]] 171 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 172 | ''' 173 | img_id = self.ids[index] 174 | anno = ET.parse(self._annopath % img_id).getroot() 175 | gt = self.target_transform(anno, 1, 1) 176 | return img_id[1], gt 177 | 178 | def pull_tensor(self, index): 179 | '''Returns the original image at an index in tensor form 180 | 181 | Note: not using self.__getitem__(), as any transformations passed in 182 | could mess up this functionality. 
183 | 184 | Argument: 185 | index (int): index of img to show 186 | Return: 187 | tensorized version of img, squeezed 188 | ''' 189 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 190 | 191 | def pull_img_name(self, index): 192 | return self.ids[index] -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | from .config import HOME 2 | import os 3 | import os.path as osp 4 | import sys 5 | import torch 6 | import torch.utils.data as data 7 | import torchvision.transforms as transforms 8 | import cv2 9 | import numpy as np 10 | 11 | COCO_ROOT = osp.join(HOME, 'data/coco/') 12 | IMAGES = 'images' 13 | ANNOTATIONS = 'annotations' 14 | COCO_API = 'PythonAPI' 15 | INSTANCES_SET = 'instances_{}.json' 16 | COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 17 | 'train', 'truck', 'boat', 'traffic light', 'fire', 'hydrant', 18 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 19 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 20 | 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 21 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 22 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 23 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 24 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 25 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 26 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 27 | 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 28 | 'keyboard', 'cell phone', 'microwave oven', 'toaster', 'sink', 29 | 'refrigerator', 'book', 'clock', 'vase', 'scissors', 30 | 'teddy bear', 'hair drier', 'toothbrush') 31 | 32 | 33 | def get_label_map(label_file): 34 | label_map = {} 35 | labels = open(label_file, 'r') 36 | for line in labels: 37 | ids = line.split(',') 38 | label_map[int(ids[0])] = int(ids[1]) 39 | return label_map 40 | 41 | 42 | class COCOAnnotationTransform(object): 43 | """Transforms a COCO annotation into a Tensor of bbox coords and label index 44 | Initilized with a dictionary lookup of classnames to indexes 45 | """ 46 | def __init__(self): 47 | self.label_map = get_label_map(osp.join(COCO_ROOT, 'coco_labels.txt')) 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Args: 52 | target (dict): COCO target json annotation as a python dict 53 | height (int): height 54 | width (int): width 55 | Returns: 56 | a list containing lists of bounding boxes [bbox coords, class idx] 57 | """ 58 | scale = np.array([width, height, width, height]) 59 | res = [] 60 | for obj in target: 61 | if 'bbox' in obj: 62 | bbox = obj['bbox'] 63 | bbox[2] += bbox[0] 64 | bbox[3] += bbox[1] 65 | label_idx = self.label_map[obj['category_id']] - 1 66 | final_box = list(np.array(bbox)/scale) 67 | final_box.append(label_idx) 68 | res += [final_box] # [xmin, ymin, xmax, ymax, label_idx] 69 | else: 70 | print("no bbox problem!") 71 | 72 | return res # [[xmin, ymin, xmax, ymax, label_idx], ... ] 73 | 74 | 75 | class COCODetection(data.Dataset): 76 | """`MS Coco Detection `_ Dataset. 77 | Args: 78 | root (string): Root directory where images are downloaded to. 79 | set_name (string): Name of the specific set of COCO images. 80 | transform (callable, optional): A function/transform that augments the 81 | raw images` 82 | target_transform (callable, optional): A function/transform that takes 83 | in the target (bbox) and transforms it. 
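COCOAnnotationTransform.__call__ above performs the analogous conversion for COCO's [x, y, width, height] boxes. The sketch below is illustrative only: the annotation dict and the one-entry label map are hypothetical stand-ins for a real `coco.loadAnns` result and for coco_labels.txt.

import numpy as np

width, height = 500, 400
scale = np.array([width, height, width, height])
obj = {'bbox': [30, 40, 100, 200], 'category_id': 18}   # hypothetical annotation entry
label_map = {18: 18}                                     # stand-in for coco_labels.txt

bbox = obj['bbox']
bbox[2] += bbox[0]            # width  -> xmax
bbox[3] += bbox[1]            # height -> ymax
label_idx = label_map[obj['category_id']] - 1
final_box = list(np.array(bbox) / scale) + [label_idx]
print(final_box)              # [0.06, 0.1, 0.26, 0.6, 17]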
84 | """ 85 | 86 | def __init__(self, root, image_set='trainval35k', transform=None, 87 | target_transform=COCOAnnotationTransform(), dataset_name='MS COCO'): 88 | sys.path.append(osp.join(root, COCO_API)) 89 | from pycocotools.coco import COCO 90 | self.root = osp.join(root, IMAGES, image_set) 91 | self.coco = COCO(osp.join(root, ANNOTATIONS, 92 | INSTANCES_SET.format(image_set))) 93 | self.ids = list(self.coco.imgToAnns.keys()) 94 | self.transform = transform 95 | self.target_transform = target_transform 96 | self.name = dataset_name 97 | 98 | def __getitem__(self, index): 99 | """ 100 | Args: 101 | index (int): Index 102 | Returns: 103 | tuple: Tuple (image, target). 104 | target is the object returned by ``coco.loadAnns``. 105 | """ 106 | im, gt, h, w = self.pull_item(index) 107 | return im, gt 108 | 109 | def __len__(self): 110 | return len(self.ids) 111 | 112 | def pull_item(self, index): 113 | """ 114 | Args: 115 | index (int): Index 116 | Returns: 117 | tuple: Tuple (image, target, height, width). 118 | target is the object returned by ``coco.loadAnns``. 119 | """ 120 | img_id = self.ids[index] 121 | target = self.coco.imgToAnns[img_id] 122 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 123 | 124 | target = self.coco.loadAnns(ann_ids) 125 | path = osp.join(self.root, self.coco.loadImgs(img_id)[0]['file_name']) 126 | assert osp.exists(path), 'Image path does not exist: {}'.format(path) 127 | img = cv2.imread(osp.join(self.root, path)) 128 | height, width, _ = img.shape 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | if self.transform is not None: 132 | target = np.array(target) 133 | img, boxes, labels = self.transform(img, target[:, :4], 134 | target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | 141 | def pull_image(self, index): 142 | '''Returns the original image object at index in PIL form 143 | 144 | Note: not using self.__getitem__(), as any transformations passed in 145 | could mess up this functionality. 146 | 147 | Argument: 148 | index (int): index of img to show 149 | Return: 150 | cv2 img 151 | ''' 152 | img_id = self.ids[index] 153 | path = self.coco.loadImgs(img_id)[0]['file_name'] 154 | return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 
161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 170 | return self.coco.loadAnns(ann_ids) 171 | 172 | def __repr__(self): 173 | fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' 174 | fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) 175 | fmt_str += ' Root Location: {}\n'.format(self.root) 176 | tmp = ' Transforms (if any): ' 177 | fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 178 | tmp = ' Target Transforms (if any): ' 179 | fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 180 | return fmt_str 181 | -------------------------------------------------------------------------------- /实验 4.1/ssd_resnet_101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import voc, coco, custom 7 | import os 8 | 9 | from netModel.resnet import resnet18, resnet34, BasicBlock 10 | 11 | 12 | class SSD(nn.Module): 13 | """Single Shot Multibox Architecture 14 | The network is composed of a base VGG network followed by the 15 | added multibox conv layers. Each multibox layer branches into 16 | 1) conv2d for class conf scores 17 | 2) conv2d for localization predictions 18 | 3) associated priorbox layer to produce default bounding 19 | boxes specific to the layer's feature map size. 20 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 21 | 22 | Args: 23 | phase: (string) Can be "test" or "train" 24 | size: input image size 25 | base: resnet layers for input, size of either 300 or 500 26 | extras: extra layers that feed to multibox loc and conf layers 27 | head: "multibox head" consists of loc and conf conv layers 28 | """ 29 | 30 | def __init__(self, phase, size, base, extras, head, num_classes): 31 | super(SSD, self).__init__() 32 | self.phase = phase 33 | self.num_classes = num_classes 34 | self.cfg = custom 35 | self.priorbox = PriorBox(self.cfg) 36 | self.priors = Variable(self.priorbox.forward(), volatile=True) 37 | self.size = size 38 | 39 | # SSD network 40 | self.resnet = nn.ModuleList(base) 41 | # Layer learns to scale the l2 normalized features from conv4_3 42 | self.L2Norm = L2Norm(512, 20) 43 | self.extras = nn.ModuleList(extras) 44 | 45 | self.loc = nn.ModuleList(head[0]) 46 | self.conf = nn.ModuleList(head[1]) 47 | 48 | if phase == 'test': 49 | self.softmax = nn.Softmax(dim=-1) 50 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 51 | 52 | def forward(self, x): 53 | """Applies network layers and ops on input image(s) x. 54 | 55 | Args: 56 | x: input image or batch of images. Shape: [batch,3,300,300]. 57 | 58 | Return: 59 | Depending on phase: 60 | test: 61 | Variable(tensor) of output class label predictions, 62 | confidence score, and corresponding location predictions for 63 | each object detected. 
Shape: [batch,topk,7] 64 | 65 | train: 66 | list of concat outputs from: 67 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 68 | 2: localization layers, Shape: [batch,num_priors*4] 69 | 3: priorbox layers, Shape: [2,num_priors*4] 70 | """ 71 | sources = list() 72 | loc = list() 73 | conf = list() 74 | 75 | # apply resnet up to layer2 76 | for k in range(0,7): 77 | x = self.resnet[k](x) 78 | sources.append(x) 79 | 80 | # apply resnet up to layer4 81 | for k in range(7, len(self.resnet)): 82 | x = self.resnet[k](x) 83 | sources.append(x) 84 | # s = self.L2Norm(x) 85 | # sources.append(s) 86 | 87 | # apply extra layers and cache source layer outputs 88 | for k, v in enumerate(self.extras): 89 | x = F.relu(v(x), inplace=True) 90 | if k % 2 == 1: 91 | sources.append(x) 92 | # apply multibox head to source layers 93 | for (x, l, c) in zip(sources, self.loc, self.conf): 94 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 95 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 96 | 97 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 98 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 99 | if self.phase == "test": 100 | output = self.detect( 101 | loc.view(loc.size(0), -1, 4), # loc preds 102 | self.softmax(conf.view(conf.size(0), -1, 103 | self.num_classes)), # conf preds 104 | self.priors.type(type(x.data)) # default boxes 105 | ) 106 | else: 107 | output = ( 108 | loc.view(loc.size(0), -1, 4), 109 | conf.view(conf.size(0), -1, self.num_classes), 110 | self.priors 111 | ) 112 | return output 113 | 114 | def load_weights(self, base_file): 115 | other, ext = os.path.splitext(base_file) 116 | if ext == '.pkl' or '.pth': 117 | print('Loading weights into state dict...') 118 | self.load_state_dict(torch.load(base_file, 119 | map_location=lambda storage, loc: storage)) 120 | print('Finished!') 121 | else: 122 | print('Sorry only .pth and .pkl files supported.') 123 | 124 | 125 | # This function is derived from torchvision VGG make_layers() 126 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 127 | def vgg(cfg, i, batch_norm=False): 128 | layers = [] 129 | in_channels = i 130 | for v in cfg: 131 | if v == 'M': 132 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 133 | elif v == 'C': 134 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 135 | else: 136 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 137 | if batch_norm: 138 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 139 | else: 140 | layers += [conv2d, nn.ReLU(inplace=True)] 141 | in_channels = v 142 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 143 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 144 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 145 | layers += [pool5, conv6, 146 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 147 | return layers 148 | 149 | def resnet(): 150 | resnet = resnet34(pretrained=True) 151 | layers = [ 152 | resnet.conv1, 153 | resnet.bn1, 154 | resnet.relu, 155 | resnet.maxpool, 156 | resnet.layer1, 157 | resnet.layer2, 158 | resnet.layer3, 159 | resnet.layer4, 160 | ] 161 | return layers 162 | 163 | def add_extras(cfg, i, batch_norm=False): 164 | # Extra layers added to VGG for feature scaling 165 | layers = [] 166 | in_channels = i 167 | flag = False 168 | for k, v in enumerate(cfg): 169 | if in_channels != 'S': 170 | if v == 'S': 171 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 172 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 173 | else: 174 | layers += 
[nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 175 | flag = not flag 176 | in_channels = v 177 | return layers 178 | 179 | 180 | def multibox(resnet, extra_layers, cfg, num_classes): 181 | loc_layers = [] 182 | conf_layers = [] 183 | resnet_source = [-2, -1] 184 | for k, v in enumerate(resnet_source): 185 | loc_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 186 | cfg[k] * 4, kernel_size=3, padding=1)] 187 | conf_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 188 | cfg[k] * num_classes, kernel_size=3, padding=1)] 189 | for k, v in enumerate(extra_layers[1::2], 2): 190 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 191 | * 4, kernel_size=3, padding=1)] 192 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 193 | * num_classes, kernel_size=3, padding=1)] 194 | return resnet, extra_layers, (loc_layers, conf_layers) 195 | 196 | 197 | base = { 198 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 199 | 512, 512, 512], 200 | '512': [], 201 | } 202 | extras = { 203 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 204 | '512': [], 205 | } 206 | mbox = { 207 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 208 | '512': [], 209 | } 210 | 211 | 212 | def build_ssd(phase, size=300, num_classes=21): 213 | if phase != "test" and phase != "train": 214 | print("ERROR: Phase: " + phase + " not recognized") 215 | return 216 | if size != 300: 217 | print("ERROR: You specified size " + repr(size) + ". However, " + 218 | "currently only SSD300 (size=300) is supported!") 219 | return 220 | base_, extras_, head_ = multibox(resnet(), 221 | add_extras(extras[str(size)], 512), 222 | mbox[str(size)], num_classes) 223 | return SSD(phase, size, base_, extras_, head_, num_classes) 224 | -------------------------------------------------------------------------------- /实验 4.2/ssd_resnet_18.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import voc, coco, custom 7 | import os 8 | 9 | from netModel.resnet import resnet18, resnet34, BasicBlock 10 | 11 | 12 | class SSD(nn.Module): 13 | """Single Shot Multibox Architecture 14 | The network is composed of a base VGG network followed by the 15 | added multibox conv layers. Each multibox layer branches into 16 | 1) conv2d for class conf scores 17 | 2) conv2d for localization predictions 18 | 3) associated priorbox layer to produce default bounding 19 | boxes specific to the layer's feature map size. 20 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
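A quick smoke test of the build_ssd factory defined in ssd_resnet_101.py above (ssd_resnet_18.py below is built the same way), in the spirit of netModel/testModel.py. This is a sketch under assumptions: the repository root and the experiment directory must both be importable, data/config.py must provide a `custom` configuration with 'min_dim' 300, and the pretrained ResNet weights must be downloadable.

import torch
from ssd_resnet_101 import build_ssd

if __name__ == '__main__':
    net = build_ssd('train', size=300, num_classes=2)   # e.g. background + 'gauge'
    x = torch.rand(1, 3, 300, 300)                      # dummy batch
    loc, conf, priors = net(x)                          # train phase returns a 3-tuple
    # loc:    [1, num_priors, 4]          box regression offsets from the loc head
    # conf:   [1, num_priors, num_classes] raw class scores (softmax only in test phase)
    # priors: [num_priors_cfg, 4]          default boxes from PriorBox, center-size form
    print(loc.shape, conf.shape, priors.shape)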
21 | 22 | Args: 23 | phase: (string) Can be "test" or "train" 24 | size: input image size 25 | base: resnet layers for input, size of either 300 or 500 26 | extras: extra layers that feed to multibox loc and conf layers 27 | head: "multibox head" consists of loc and conf conv layers 28 | """ 29 | 30 | def __init__(self, phase, size, base, extras, head, num_classes): 31 | super(SSD, self).__init__() 32 | self.phase = phase 33 | self.num_classes = num_classes 34 | self.cfg = custom 35 | self.priorbox = PriorBox(self.cfg) 36 | self.priors = Variable(self.priorbox.forward(), volatile=True) 37 | self.size = size 38 | 39 | # SSD network 40 | self.resnet = nn.ModuleList(base) 41 | # Layer learns to scale the l2 normalized features from conv4_3 42 | self.L2Norm = L2Norm(512, 20) 43 | self.extras = nn.ModuleList(extras) 44 | 45 | self.loc = nn.ModuleList(head[0]) 46 | self.conf = nn.ModuleList(head[1]) 47 | 48 | if phase == 'test': 49 | self.softmax = nn.Softmax(dim=-1) 50 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 51 | 52 | def forward(self, x): 53 | """Applies network layers and ops on input image(s) x. 54 | 55 | Args: 56 | x: input image or batch of images. Shape: [batch,3,300,300]. 57 | 58 | Return: 59 | Depending on phase: 60 | test: 61 | Variable(tensor) of output class label predictions, 62 | confidence score, and corresponding location predictions for 63 | each object detected. Shape: [batch,topk,7] 64 | 65 | train: 66 | list of concat outputs from: 67 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 68 | 2: localization layers, Shape: [batch,num_priors*4] 69 | 3: priorbox layers, Shape: [2,num_priors*4] 70 | """ 71 | sources = list() 72 | loc = list() 73 | conf = list() 74 | 75 | # apply resnet up to layer2 76 | for k in range(0,7): 77 | x = self.resnet[k](x) 78 | sources.append(x) 79 | 80 | # apply resnet up to layer4 81 | for k in range(7, len(self.resnet)): 82 | x = self.resnet[k](x) 83 | sources.append(x) 84 | # s = self.L2Norm(x) 85 | # sources.append(s) 86 | 87 | # apply extra layers and cache source layer outputs 88 | for k, v in enumerate(self.extras): 89 | x = F.relu(v(x), inplace=True) 90 | if k % 2 == 1: 91 | sources.append(x) 92 | # apply multibox head to source layers 93 | for (x, l, c) in zip(sources, self.loc, self.conf): 94 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 95 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 96 | 97 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 98 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 99 | if self.phase == "test": 100 | output = self.detect( 101 | loc.view(loc.size(0), -1, 4), # loc preds 102 | self.softmax(conf.view(conf.size(0), -1, 103 | self.num_classes)), # conf preds 104 | self.priors.type(type(x.data)) # default boxes 105 | ) 106 | else: 107 | output = ( 108 | loc.view(loc.size(0), -1, 4), 109 | conf.view(conf.size(0), -1, self.num_classes), 110 | self.priors 111 | ) 112 | return output 113 | 114 | def load_weights(self, base_file): 115 | other, ext = os.path.splitext(base_file) 116 | if ext == '.pkl' or '.pth': 117 | print('Loading weights into state dict...') 118 | self.load_state_dict(torch.load(base_file, 119 | map_location=lambda storage, loc: storage)) 120 | print('Finished!') 121 | else: 122 | print('Sorry only .pth and .pkl files supported.') 123 | 124 | 125 | # This function is derived from torchvision VGG make_layers() 126 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 127 | def vgg(cfg, i, batch_norm=False): 128 | layers = [] 
129 | in_channels = i 130 | for v in cfg: 131 | if v == 'M': 132 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 133 | elif v == 'C': 134 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 135 | else: 136 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 137 | if batch_norm: 138 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 139 | else: 140 | layers += [conv2d, nn.ReLU(inplace=True)] 141 | in_channels = v 142 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 143 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 144 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 145 | layers += [pool5, conv6, 146 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 147 | return layers 148 | 149 | def resnet(): 150 | resnet = resnet18(pretrained=True) 151 | layers = [ 152 | resnet.conv1, 153 | resnet.bn1, 154 | resnet.relu, 155 | resnet.maxpool, 156 | resnet.layer1, 157 | resnet.layer2, 158 | resnet.layer3, 159 | resnet.layer4, 160 | ] 161 | return layers 162 | 163 | def add_extras(cfg, i, batch_norm=False): 164 | # Extra layers added to VGG for feature scaling 165 | layers = [] 166 | in_channels = i 167 | flag = False 168 | for k, v in enumerate(cfg): 169 | if in_channels != 'S': 170 | if v == 'S': 171 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 172 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 173 | else: 174 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 175 | flag = not flag 176 | in_channels = v 177 | return layers 178 | 179 | 180 | def multibox(resnet, extra_layers, cfg, num_classes): 181 | loc_layers = [] 182 | conf_layers = [] 183 | resnet_source = [-2, -1] 184 | for k, v in enumerate(resnet_source): 185 | loc_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 186 | cfg[k] * 4, kernel_size=3, padding=1)] 187 | conf_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 188 | cfg[k] * num_classes, kernel_size=3, padding=1)] 189 | for k, v in enumerate(extra_layers[1::2], 2): 190 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 191 | * 4, kernel_size=3, padding=1)] 192 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 193 | * num_classes, kernel_size=3, padding=1)] 194 | return resnet, extra_layers, (loc_layers, conf_layers) 195 | 196 | 197 | base = { 198 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 199 | 512, 512, 512], 200 | '512': [], 201 | } 202 | extras = { 203 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 204 | '512': [], 205 | } 206 | mbox = { 207 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 208 | '512': [], 209 | } 210 | 211 | 212 | def build_ssd(phase, size=300, num_classes=21): 213 | if phase != "test" and phase != "train": 214 | print("ERROR: Phase: " + phase + " not recognized") 215 | return 216 | if size != 300: 217 | print("ERROR: You specified size " + repr(size) + ". 
However, " + 218 | "currently only SSD300 (size=300) is supported!") 219 | return 220 | base_, extras_, head_ = multibox(resnet(), 221 | add_extras(extras[str(size)], 512), 222 | mbox[str(size)], num_classes) 223 | return SSD(phase, size, base_, extras_, head_, num_classes) 224 | -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 72 | """Match each prior box with the ground truth box of the highest jaccard 73 | overlap, encode the bounding boxes, then return the matched indices 74 | corresponding to both confidence and location preds. 75 | Args: 76 | threshold: (float) The overlap threshold used when mathing boxes. 77 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 
78 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 79 | variances: (tensor) Variances corresponding to each prior coord, 80 | Shape: [num_priors, 4]. 81 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 82 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 83 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 84 | idx: (int) current batch index 85 | Return: 86 | The matched indices corresponding to 1)location and 2)confidence preds. 87 | """ 88 | # jaccard index 89 | overlaps = jaccard( 90 | truths, 91 | point_form(priors) 92 | ) 93 | # (Bipartite Matching) 94 | # [1,num_objects] best prior for each ground truth 95 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 96 | # [1,num_priors] best ground truth for each prior 97 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 98 | best_truth_idx.squeeze_(0) 99 | best_truth_overlap.squeeze_(0) 100 | best_prior_idx.squeeze_(1) 101 | best_prior_overlap.squeeze_(1) 102 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 103 | # TODO refactor: index best_prior_idx with long tensor 104 | # ensure every gt matches with its prior of max overlap 105 | for j in range(best_prior_idx.size(0)): 106 | best_truth_idx[best_prior_idx[j]] = j 107 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 108 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors] 109 | conf[best_truth_overlap < threshold] = 0 # label as background 110 | loc = encode(matches, priors, variances) 111 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 112 | conf_t[idx] = conf # [num_priors] top class label for each prior 113 | 114 | 115 | def encode(matched, priors, variances): 116 | """Encode the variances from the priorbox layers into the ground truth boxes 117 | we have matched (based on jaccard overlap) with the prior boxes. 118 | Args: 119 | matched: (tensor) Coords of ground truth for each prior in point-form 120 | Shape: [num_priors, 4]. 121 | priors: (tensor) Prior boxes in center-offset form 122 | Shape: [num_priors,4]. 123 | variances: (list[float]) Variances of priorboxes 124 | Return: 125 | encoded boxes (tensor), Shape: [num_priors, 4] 126 | """ 127 | 128 | # dist b/t match center and prior's center 129 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 130 | # encode variance 131 | g_cxcy /= (variances[0] * priors[:, 2:]) 132 | # match wh / prior wh 133 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 134 | g_wh = torch.log(g_wh) / variances[1] 135 | # return target for smooth_l1_loss 136 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 137 | 138 | 139 | # Adapted from https://github.com/Hakuyume/chainer-ssd 140 | def decode(loc, priors, variances): 141 | """Decode locations from predictions using priors to undo 142 | the encoding we did for offset regression at train time. 143 | Args: 144 | loc (tensor): location predictions for loc layers, 145 | Shape: [num_priors,4] 146 | priors (tensor): Prior boxes in center-offset form. 147 | Shape: [num_priors,4]. 
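A round-trip sanity check for the offset encoding: encode() above maps a matched ground-truth box into the regression targets the loc head learns, and decode() just below inverts the mapping. The box, prior, and variances here are made-up illustrative values; the import assumes the repository root is on PYTHONPATH.

import torch
from layers.box_utils import encode, decode

variances = [0.1, 0.2]
matched = torch.tensor([[0.2, 0.2, 0.6, 0.6]])   # ground truth in point form
priors  = torch.tensor([[0.4, 0.4, 0.5, 0.5]])   # one prior in center-size form

offsets = encode(matched, priors, variances)
print(offsets)                             # tensor([[ 0.0000,  0.0000, -1.1157, -1.1157]])
print(decode(offsets, priors, variances))  # recovers [[0.2, 0.2, 0.6, 0.6]] up to rounding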
148 | variances: (list[float]) Variances of priorboxes 149 | Return: 150 | decoded bounding box predictions 151 | """ 152 | 153 | boxes = torch.cat(( 154 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 155 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 156 | boxes[:, :2] -= boxes[:, 2:] / 2 157 | boxes[:, 2:] += boxes[:, :2] 158 | return boxes 159 | 160 | 161 | def log_sum_exp(x): 162 | """Utility function for computing log_sum_exp while determining 163 | This will be used to determine unaveraged confidence loss across 164 | all examples in a batch. 165 | Args: 166 | x (Variable(tensor)): conf_preds from conf layers 167 | """ 168 | x_max = x.data.max() 169 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 170 | 171 | 172 | # Original author: Francisco Massa: 173 | # https://github.com/fmassa/object-detection.torch 174 | # Ported to PyTorch by Max deGroot (02/01/2017) 175 | def nms(boxes, scores, overlap=0.5, top_k=200): 176 | """Apply non-maximum suppression at test time to avoid detecting too many 177 | overlapping bounding boxes for a given object. 178 | Args: 179 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 180 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 181 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 182 | top_k: (int) The Maximum number of box preds to consider. 183 | Return: 184 | The indices of the kept boxes with respect to num_priors. 185 | """ 186 | 187 | keep = scores.new(scores.size(0)).zero_().long() 188 | if boxes.numel() == 0: 189 | return keep 190 | x1 = boxes[:, 0] 191 | y1 = boxes[:, 1] 192 | x2 = boxes[:, 2] 193 | y2 = boxes[:, 3] 194 | area = torch.mul(x2 - x1, y2 - y1) 195 | v, idx = scores.sort(0) # sort in ascending order 196 | # I = I[v >= 0.01] 197 | idx = idx[-top_k:] # indices of the top-k largest vals 198 | xx1 = boxes.new() 199 | yy1 = boxes.new() 200 | xx2 = boxes.new() 201 | yy2 = boxes.new() 202 | w = boxes.new() 203 | h = boxes.new() 204 | 205 | # keep = torch.Tensor() 206 | count = 0 207 | while idx.numel() > 0: 208 | i = idx[-1] # index of current largest val 209 | # keep.append(i) 210 | keep[count] = i 211 | count += 1 212 | if idx.size(0) == 1: 213 | break 214 | idx = idx[:-1] # remove kept element from view 215 | # load bboxes of next highest vals 216 | torch.index_select(x1, 0, idx, out=xx1) 217 | torch.index_select(y1, 0, idx, out=yy1) 218 | torch.index_select(x2, 0, idx, out=xx2) 219 | torch.index_select(y2, 0, idx, out=yy2) 220 | # store element-wise max with next highest score 221 | xx1 = torch.clamp(xx1, min=x1[i]) 222 | yy1 = torch.clamp(yy1, min=y1[i]) 223 | xx2 = torch.clamp(xx2, max=x2[i]) 224 | yy2 = torch.clamp(yy2, max=y2[i]) 225 | w.resize_as_(xx2) 226 | h.resize_as_(yy2) 227 | w = xx2 - xx1 228 | h = yy2 - yy1 229 | # check sizes of xx1 and xx2.. 
after each iteration 230 | w = torch.clamp(w, min=0.0) 231 | h = torch.clamp(h, min=0.0) 232 | inter = w*h 233 | # IoU = i / (area(a) + area(b) - i) 234 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 235 | union = (rem_areas - inter) + area[i] 236 | IoU = inter/union # store result in iou 237 | # keep only elements with an IoU <= overlap 238 | idx = idx[IoU.le(overlap)] 239 | return keep, count 240 | -------------------------------------------------------------------------------- /实验 4.2/visualTest_building.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | from torch.autograd import Variable 6 | import torch.utils.data as data 7 | 8 | from data import BaseTransform 9 | from data.custom_for_visual import CUSTOM_CLASSES_BUILDING as labelmap_building 10 | from data.custom_for_visual import customDetection, customAnnotationTransform, CUSTOM_ROOT, CUSTOM_CLASSES_BUILDING 11 | 12 | # from ssd import build_ssd 13 | from ssd_resnet_18 import build_ssd 14 | 15 | import sys 16 | import os 17 | import time 18 | import argparse 19 | import numpy as np 20 | import pickle 21 | import cv2 22 | import math 23 | 24 | import warnings 25 | warnings.filterwarnings("ignore") 26 | 27 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 28 | parser.add_argument('--trained_model_building', 29 | default='useful_weight/CUSTOM.pth', type=str, 30 | help='Trained state_dict file path to open') 31 | parser.add_argument('--save_folder', default='eval/', type=str, 32 | help='Dir to save results') 33 | parser.add_argument('--visual_threshold', default=0.15, type=float, 34 | help='Final confidence threshold') 35 | parser.add_argument('--cuda', default=True, type=bool, 36 | help='Use cuda to train model') 37 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, help='Location of VOC root directory') 38 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 39 | args = parser.parse_args() 40 | 41 | if args.cuda and torch.cuda.is_available(): 42 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 43 | else: 44 | torch.set_default_tensor_type('torch.FloatTensor') 45 | 46 | if not os.path.exists(args.save_folder): 47 | os.mkdir(args.save_folder) 48 | 49 | 50 | def test_net(save_folder, net, cuda, testset, transform, thresh, labelmap): 51 | # dump predictions and assoc. 
ground truth to text file for now 52 | filename = save_folder + 'result_%s.txt' 53 | num_images = len(testset) 54 | for i in range(num_images): 55 | print('Testing image {:d}/{:d}....'.format(i+1, num_images)) 56 | img = testset.pull_image(i) 57 | img_id, annotation = testset.pull_anno(i) 58 | x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1) 59 | x = Variable(x.unsqueeze(0)) 60 | 61 | if cuda: 62 | x = x.cuda() 63 | 64 | y = net(x) # forward pass 65 | detections = y.data 66 | # scale each detection back up to the image 67 | scale = torch.Tensor([img.shape[1], img.shape[0], 68 | img.shape[1], img.shape[0]]) 69 | pred_num = 0 70 | for i in range(detections.size(1)): 71 | j = 0 72 | while detections[0, i, j, 0] >= args.visual_threshold: 73 | score = detections[0, i, j, 0] 74 | label_name = labelmap[i-1] 75 | pt = (detections[0, i, j, 1:]*scale).cpu().numpy() 76 | coords = (pt[0], pt[1], pt[2], pt[3]) 77 | pred_num += 1 78 | with open(filename % label_name, mode='a') as f: 79 | f.write(str(img_id) + ' ' + 80 | str(score.cpu().numpy()) + ' '+ ' '.join(str(c) for c in coords) + '\n') 81 | j += 1 82 | 83 | def xmlData(name, width, height, label): 84 | return ''' 85 | JPEGImages 86 | %s.jpg 87 | %s.jpg 88 | 89 | Unknown 90 | 91 | 92 | %d 93 | %d 94 | 3 95 | 96 | 0 97 | 98 | %s 99 | Unspecified 100 | 1 101 | 0 102 | 103 | 0 104 | 0 105 | 1 106 | 1 107 | 108 | 109 | ''' % (name, name, width, height, label) 110 | 111 | def get_output_dir(name, phase=""): 112 | filedir = os.path.join(name, phase) 113 | if not os.path.exists(filedir): 114 | os.makedirs(filedir) 115 | return filedir 116 | 117 | def is_rect_intersect(rect1, rect2): 118 | rect1_x1 = math.floor(rect1['x1']) 119 | rect1_y1 = math.floor(rect1['y1']) 120 | rect1_x2 = math.floor(rect1['x2']) 121 | rect1_y2 = math.floor(rect1['y2']) 122 | 123 | rect2_x1 = math.floor(rect2['x1']) 124 | rect2_y1 = math.floor(rect2['y1']) 125 | rect2_x2 = math.floor(rect2['x2']) 126 | rect2_y2 = math.floor(rect2['y2']) 127 | 128 | zx = abs(rect1_x1 + rect1_x2 - rect2_x1 - rect2_x2) 129 | x = abs(rect1_x1 - rect1_x2) + abs(rect2_x1 - rect2_x2) 130 | 131 | zy = abs(rect1_y1 + rect1_y2 - rect2_y1 - rect2_y2) 132 | y = abs(rect1_y1 - rect1_y2) + abs(rect2_y1 - rect2_y2) 133 | 134 | return True if zx <= x and zy <= y else False 135 | 136 | 137 | def test_custom(): 138 | DEBUG = False 139 | set_type = 'test' 140 | 141 | if not os.path.exists(os.path.join(args.save_folder, 'result_building.txt')): 142 | # load net 143 | num_classes_building = len(labelmap_building) + 1 # +1 for background 144 | net = build_ssd('test', 300, num_classes_building) # initialize SSD 145 | net.load_state_dict(torch.load(args.trained_model_building)) 146 | net.eval() 147 | 148 | print('Finished loading model!') 149 | # load data 150 | dataset1 = customDetection(args.custom_root, [('buildingwater', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_BUILDING, range(len(CUSTOM_CLASSES_BUILDING)))))) 151 | if args.cuda: 152 | net = net.cuda() 153 | cudnn.benchmark = True 154 | # evaluation 155 | 156 | test_net(args.save_folder, net, args.cuda, dataset1, 157 | BaseTransform(net.size, (104, 117, 123)), 158 | thresh=args.visual_threshold, labelmap=labelmap_building) 159 | 160 | rootPath = 'F:/ssd/data/video/buildingwater' 161 | img_path = os.path.join(rootPath, 'JPEGImages', '%s.jpg') 162 | imgList_building = {} 163 | imgList_water = {} 164 | 165 | with open(os.path.join(args.save_folder, 'result_building.txt'), 'r') as f: 166 | text_lines = f.readlines() 167 | 
for line in text_lines: 168 | info = line.split(" ") 169 | name, score, x1, y1, x2, y2 = info 170 | if name in imgList_building: 171 | imgList_building[name].append({ 172 | 'score': float(score), 173 | 'x1': float(x1), 174 | 'y1': float(y1), 175 | 'x2': float(x2), 176 | 'y2': float(y2) 177 | }) 178 | else: 179 | imgList_building[name] = [{ 180 | 'score': float(score), 181 | 'x1': float(x1), 182 | 'y1': float(y1), 183 | 'x2': float(x2), 184 | 'y2': float(y2) 185 | }] 186 | 187 | with open(os.path.join(args.save_folder, 'result_water.txt'), 'r') as f: 188 | text_lines = f.readlines() 189 | for line in text_lines: 190 | info = line.split(" ") 191 | name, score, x1, y1, x2, y2 = info 192 | if name in imgList_water: 193 | imgList_water[name].append({ 194 | 'score': float(score), 195 | 'x1': float(x1), 196 | 'y1': float(y1), 197 | 'x2': float(x2), 198 | 'y2': float(y2) 199 | }) 200 | else: 201 | imgList_water[name] = [{ 202 | 'score': float(score), 203 | 'x1': float(x1), 204 | 'y1': float(y1), 205 | 'x2': float(x2), 206 | 'y2': float(y2) 207 | }] 208 | 209 | opacity = 0.8 210 | for name in imgList_building: 211 | img_building = imgList_building[name] 212 | img_water = imgList_water[name] if name in imgList_water else [] 213 | 214 | image = cv2.imread(img_path % name) 215 | (h, w, c) = image.shape 216 | img_black = image.copy() 217 | img_cp = image.copy() 218 | img_black.fill(1) 219 | 220 | 221 | for building in img_building: 222 | for water in img_water: 223 | if is_rect_intersect(building, water): 224 | x1_b = max(math.floor(building['x1']), 0) 225 | y1_b = max(math.floor(building['y1']), 0) 226 | x2_b = min(math.floor(building['x2']), w) 227 | y2_b = min(math.floor(building['y2']), h) 228 | cv2.rectangle(image, (x1_b-2, y1_b-2), (x2_b+2, y2_b+2), (0,0,255), 5) 229 | img_black[y1_b:y2_b, x1_b:x2_b] = 0 230 | 231 | 232 | # for building in img_building: 233 | # x1_b = max(math.floor(building['x1']), 0) 234 | # y1_b = max(math.floor(building['y1']), 0) 235 | # x2_b = min(math.floor(building['x2']), w) 236 | # y2_b = min(math.floor(building['y2']), h) 237 | # # cv2.rectangle(image, (x1_b, y1_b), (x2_b, y2_b), (0,0,255), 5) 238 | # img_black[y1_b:y2_b, x1_b:x2_b] = 0 239 | image[:,:,0] = (1 - img_black[:,:,0]) * (img_cp[:,:,0]) + img_black[:,:,0] * image[:,:,0] 240 | image[:,:,1] = (1 - img_black[:,:,1]) * (img_cp[:,:,1]) + img_black[:,:,1] * image[:,:,1] 241 | image[:,:,2] = (1 - img_black[:,:,2]) * (img_cp[:,:,2] ) + img_black[:,:,2] * image[:,:,2] 242 | 243 | image[:,:,0] = (1 - img_black[:,:,0]) * (image[:,:,0] * opacity + 0 * (1 - opacity)) + img_black[:,:,0] * image[:,:,0] 244 | image[:,:,1] = (1 - img_black[:,:,1]) * (image[:,:,1] * opacity + 0 * (1 - opacity)) + img_black[:,:,1] * image[:,:,1] 245 | image[:,:,2] = (1 - img_black[:,:,2]) * (image[:,:,2] * opacity + 255 * (1 - opacity)) + img_black[:,:,2] * image[:,:,2] 246 | 247 | # for water in img_water: 248 | # x1_w = max(math.floor(water['x1']), 0) 249 | # y1_w = max(math.floor(water['y1']), 0) 250 | # x2_w = min(math.floor(water['x2']), w) 251 | # y2_w = min(math.floor(water['y2']), h) 252 | # cv2.rectangle(image, (x1_w, y1_w), (x2_w, y2_w), (0,255,0), 5) 253 | 254 | image = cv2.resize(image, (512, 512)) 255 | # cv2.putText(image, 'building', (10, 40), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 0, 255), 2) 256 | # cv2.putText(image, 'water', (10, 80), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 255, 0), 2) 257 | cv2.imshow('w2', image) 258 | cv2.waitKey() 259 | 260 | if __name__ == '__main__': 261 | test_custom() 262 | 
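The geometric test behind is_rect_intersect above can be restated on its own: two axis-aligned rectangles overlap exactly when twice the distance between their centres, along each axis, is no larger than the sum of their extents along that axis. The helper and sample boxes below are illustrative only (the script above additionally floors the coordinates before comparing).

def rects_intersect(r1, r2):
    zx = abs(r1['x1'] + r1['x2'] - r2['x1'] - r2['x2'])        # 2 * |dx between centres|
    x  = abs(r1['x1'] - r1['x2']) + abs(r2['x1'] - r2['x2'])   # sum of widths
    zy = abs(r1['y1'] + r1['y2'] - r2['y1'] - r2['y2'])        # 2 * |dy between centres|
    y  = abs(r1['y1'] - r1['y2']) + abs(r2['y1'] - r2['y2'])   # sum of heights
    return zx <= x and zy <= y

building = {'x1': 0, 'y1': 0, 'x2': 10, 'y2': 10}
water    = {'x1': 5, 'y1': 5, 'x2': 20, 'y2': 20}
print(rects_intersect(building, water))                                      # True -> highlighted
print(rects_intersect(building, {'x1': 30, 'y1': 0, 'x2': 40, 'y2': 10}))    # False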
-------------------------------------------------------------------------------- /实验 4.2/trainCustom_18.py: -------------------------------------------------------------------------------- 1 | from data import * 2 | from utils.augmentations import SSDAugmentation 3 | from layers.modules import MultiBoxLoss 4 | from ssd_resnet_18 import build_ssd 5 | import os 6 | import sys 7 | import time 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.nn.init as init 14 | import torch.utils.data as data 15 | import numpy as np 16 | import argparse 17 | 18 | from data.voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 19 | 20 | from data.coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 21 | 22 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 23 | 24 | def str2bool(v): 25 | return v.lower() in ("yes", "true", "t", "1") 26 | 27 | 28 | parser = argparse.ArgumentParser( 29 | description='Single Shot MultiBox Detector Training With Pytorch') 30 | train_set = parser.add_mutually_exclusive_group() 31 | parser.add_argument('--dataset', default='CUSTOM', choices=['VOC', 'COCO', 'CUSTOM'], 32 | type=str, help='VOC or COCO') 33 | parser.add_argument('--dataset_root', default=CUSTOM_ROOT, # VOC_ROOT, 34 | help='Dataset root directory path') 35 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', 36 | help='Pretrained base model') 37 | parser.add_argument('--batch_size', default=32, type=int, 38 | help='Batch size for training') 39 | parser.add_argument('--resume', default=None, type=str, 40 | help='Checkpoint state_dict file to resume training from') 41 | parser.add_argument('--start_iter', default=0, type=int, 42 | help='Resume training at this iter') 43 | parser.add_argument('--num_workers', default=4, type=int, 44 | help='Number of workers used in dataloading') 45 | parser.add_argument('--cuda', default=True, type=str2bool, 46 | help='Use CUDA to train model') 47 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 48 | help='initial learning rate') 49 | parser.add_argument('--momentum', default=0.9, type=float, 50 | help='Momentum value for optim') 51 | parser.add_argument('--weight_decay', default=5e-4, type=float, 52 | help='Weight decay for SGD') 53 | parser.add_argument('--gamma', default=0.1, type=float, 54 | help='Gamma update for SGD') 55 | parser.add_argument('--visdom', default=False, type=str2bool, 56 | help='Use visdom for loss visualization') 57 | parser.add_argument('--save_folder', default='weights/', 58 | help='Directory for saving checkpoint models') 59 | args = parser.parse_args() 60 | 61 | 62 | if torch.cuda.is_available(): 63 | if args.cuda: 64 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 65 | if not args.cuda: 66 | print("WARNING: It looks like you have a CUDA device, but aren't " + 67 | "using CUDA.\nRun with --cuda for optimal training speed.") 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | else: 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | 72 | if not os.path.exists(args.save_folder): 73 | os.mkdir(args.save_folder) 74 | 75 | 76 | def train(): 77 | if args.dataset == 'COCO': 78 | if args.dataset_root == VOC_ROOT: 79 | if not os.path.exists(COCO_ROOT): 80 | parser.error('Must specify dataset_root if specifying dataset') 81 | print("WARNING: Using default COCO dataset_root 
because " + 82 | "--dataset_root was not specified.") 83 | args.dataset_root = COCO_ROOT 84 | cfg = coco 85 | dataset = COCODetection(root=args.dataset_root, 86 | transform=SSDAugmentation(cfg['min_dim'], 87 | MEANS)) 88 | elif args.dataset == 'VOC': 89 | if args.dataset_root == COCO_ROOT: 90 | parser.error('Must specify dataset if specifying dataset_root') 91 | cfg = voc 92 | dataset = VOCDetection(root=args.dataset_root, 93 | transform=SSDAugmentation(cfg['min_dim'], 94 | MEANS)) 95 | 96 | elif args.dataset == 'CUSTOM': 97 | if args.dataset_root == VOC_ROOT or args.dataset_root == COCO_ROOT: 98 | parser.error('Must specify dataset if specifying dataset_root') 99 | cfg = custom 100 | dataset = customDetection(root=args.dataset_root, 101 | transform=SSDAugmentation(cfg['min_dim'], 102 | MEANS)) 103 | 104 | if args.visdom: 105 | import visdom 106 | viz = visdom.Visdom() 107 | 108 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 109 | net = ssd_net 110 | 111 | if args.cuda: 112 | net = torch.nn.DataParallel(ssd_net) 113 | cudnn.benchmark = True 114 | 115 | if args.resume: 116 | print('Resuming training, loading {}...'.format(args.resume)) 117 | ssd_net.load_weights(args.resume) 118 | else: 119 | pass 120 | # resnet_weights = torch.load(args.save_folder + args.basenet) 121 | # print('Loading base network...') 122 | # ssd_net.resnet.load_state_dict(resnet_weights) 123 | 124 | if args.cuda: 125 | net = net.cuda() 126 | 127 | if not args.resume: 128 | print('Initializing weights...') 129 | # initialize newly added layers' weights with xavier method 130 | ssd_net.extras.apply(weights_init) 131 | ssd_net.loc.apply(weights_init) 132 | ssd_net.conf.apply(weights_init) 133 | 134 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, 135 | weight_decay=args.weight_decay) 136 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, 137 | False, args.cuda) 138 | 139 | net.train() 140 | # loss counters 141 | loc_loss = 0 142 | conf_loss = 0 143 | epoch = 0 144 | print('Loading the dataset...') 145 | 146 | epoch_size = len(dataset) // args.batch_size 147 | print('Epochj Size:', epoch_size) 148 | print('Training SSD on:', dataset.name) 149 | print('Using the specified args:') 150 | print(args) 151 | 152 | step_index = 0 153 | 154 | if args.visdom: 155 | vis_title = 'SSD.PyTorch on ' + dataset.name 156 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 157 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 158 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 159 | 160 | data_loader = data.DataLoader(dataset, args.batch_size, 161 | num_workers=args.num_workers, 162 | shuffle=True, collate_fn=detection_collate, 163 | pin_memory=True) 164 | # create batch iterator 165 | batch_iterator = iter(data_loader) 166 | for iteration in range(args.start_iter, cfg['max_iter']): 167 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): 168 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 169 | 'append', epoch_size) 170 | # reset epoch loss counters 171 | loc_loss = 0 172 | conf_loss = 0 173 | epoch += 1 174 | 175 | if iteration in cfg['lr_steps']: 176 | step_index += 1 177 | adjust_learning_rate(optimizer, args.gamma, step_index) 178 | 179 | # load train data 180 | # images, targets = next(batch_iterator) 181 | try: 182 | images, targets = next(batch_iterator) 183 | except StopIteration: 184 | batch_iterator = iter(data_loader) 185 | images, targets = next(batch_iterator) 186 | 187 | 
if args.cuda: 188 | images = Variable(images.cuda()) 189 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 190 | else: 191 | images = Variable(images) 192 | targets = [Variable(ann, volatile=True) for ann in targets] 193 | # forward 194 | t0 = time.time() 195 | out = net(images) 196 | # backprop 197 | optimizer.zero_grad() 198 | loss_l, loss_c = criterion(out, targets) 199 | loss = loss_l + loss_c 200 | loss.backward() 201 | optimizer.step() 202 | t1 = time.time() 203 | # loc_loss += loss_l.data[0] 204 | # conf_loss += loss_c.data[0] 205 | loc_loss += loss_l.item() 206 | conf_loss += loss_c.item() 207 | 208 | if iteration % 10 == 0: 209 | print('timer: %.4f sec.' % (t1 - t0)) 210 | # print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ') 211 | print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ') 212 | 213 | if args.visdom: 214 | # update_vis_plot(iteration, loss_l.data[0], loss_c.data[0], 215 | # iter_plot, epoch_plot, 'append') 216 | update_vis_plot(iteration, loss_l.item(), loss_c.item(), 217 | iter_plot, epoch_plot, 'append') 218 | 219 | if iteration != 0 and iteration % 5000 == 0: 220 | print('Saving state, iter:', iteration) 221 | torch.save(ssd_net.state_dict(), args.save_folder + '/ssd300_COCO_' + 222 | repr(iteration) + '.pth') 223 | torch.save(ssd_net.state_dict(), 224 | args.save_folder + '' + args.dataset + '.pth') 225 | 226 | 227 | def adjust_learning_rate(optimizer, gamma, step): 228 | """Sets the learning rate to the initial LR decayed by 10 at every 229 | specified step 230 | # Adapted from PyTorch Imagenet example: 231 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 232 | """ 233 | lr = args.lr * (gamma ** (step)) 234 | for param_group in optimizer.param_groups: 235 | param_group['lr'] = lr 236 | 237 | 238 | def xavier(param): 239 | init.xavier_uniform(param) 240 | 241 | 242 | def weights_init(m): 243 | if isinstance(m, nn.Conv2d): 244 | xavier(m.weight.data) 245 | m.bias.data.zero_() 246 | 247 | 248 | def create_vis_plot(_xlabel, _ylabel, _title, _legend): 249 | return viz.line( 250 | X=torch.zeros((1,)).cpu(), 251 | Y=torch.zeros((1, 3)).cpu(), 252 | opts=dict( 253 | xlabel=_xlabel, 254 | ylabel=_ylabel, 255 | title=_title, 256 | legend=_legend 257 | ) 258 | ) 259 | 260 | 261 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type, 262 | epoch_size=1): 263 | viz.line( 264 | X=torch.ones((1, 3)).cpu() * iteration, 265 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size, 266 | win=window1, 267 | update=update_type 268 | ) 269 | # initialize epoch plot on first iteration 270 | if iteration == 0: 271 | viz.line( 272 | X=torch.zeros((1, 3)).cpu(), 273 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(), 274 | win=window2, 275 | update=True 276 | ) 277 | 278 | 279 | if __name__ == '__main__': 280 | train() 281 | -------------------------------------------------------------------------------- /实验 4.1/trainCustom_101.py: -------------------------------------------------------------------------------- 1 | from data import * 2 | from utils.augmentations import SSDAugmentation 3 | from layers.modules import MultiBoxLoss 4 | from ssd_resnet_101 import build_ssd 5 | import os 6 | import sys 7 | import time 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.nn.init as init 14 | import torch.utils.data as data 15 | import 
numpy as np 16 | import argparse 17 | 18 | from data.voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 19 | 20 | from data.coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 21 | 22 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 23 | 24 | def str2bool(v): 25 | return v.lower() in ("yes", "true", "t", "1") 26 | 27 | 28 | parser = argparse.ArgumentParser( 29 | description='Single Shot MultiBox Detector Training With Pytorch') 30 | train_set = parser.add_mutually_exclusive_group() 31 | parser.add_argument('--dataset', default='CUSTOM', choices=['VOC', 'COCO', 'CUSTOM'], 32 | type=str, help='VOC or COCO') 33 | parser.add_argument('--dataset_root', default=CUSTOM_ROOT, # VOC_ROOT, 34 | help='Dataset root directory path') 35 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', 36 | help='Pretrained base model') 37 | parser.add_argument('--batch_size', default=32, type=int, 38 | help='Batch size for training') 39 | parser.add_argument('--resume', default=None, type=str, 40 | help='Checkpoint state_dict file to resume training from') 41 | parser.add_argument('--start_iter', default=0, type=int, 42 | help='Resume training at this iter') 43 | parser.add_argument('--num_workers', default=4, type=int, 44 | help='Number of workers used in dataloading') 45 | parser.add_argument('--cuda', default=True, type=str2bool, 46 | help='Use CUDA to train model') 47 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 48 | help='initial learning rate') 49 | parser.add_argument('--momentum', default=0.9, type=float, 50 | help='Momentum value for optim') 51 | parser.add_argument('--weight_decay', default=5e-4, type=float, 52 | help='Weight decay for SGD') 53 | parser.add_argument('--gamma', default=0.1, type=float, 54 | help='Gamma update for SGD') 55 | parser.add_argument('--visdom', default=False, type=str2bool, 56 | help='Use visdom for loss visualization') 57 | parser.add_argument('--save_folder', default='weights/', 58 | help='Directory for saving checkpoint models') 59 | args = parser.parse_args() 60 | 61 | 62 | if torch.cuda.is_available(): 63 | if args.cuda: 64 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 65 | if not args.cuda: 66 | print("WARNING: It looks like you have a CUDA device, but aren't " + 67 | "using CUDA.\nRun with --cuda for optimal training speed.") 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | else: 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | 72 | if not os.path.exists(args.save_folder): 73 | os.mkdir(args.save_folder) 74 | 75 | 76 | def train(): 77 | if args.dataset == 'COCO': 78 | if args.dataset_root == VOC_ROOT: 79 | if not os.path.exists(COCO_ROOT): 80 | parser.error('Must specify dataset_root if specifying dataset') 81 | print("WARNING: Using default COCO dataset_root because " + 82 | "--dataset_root was not specified.") 83 | args.dataset_root = COCO_ROOT 84 | cfg = coco 85 | dataset = COCODetection(root=args.dataset_root, 86 | transform=SSDAugmentation(cfg['min_dim'], 87 | MEANS)) 88 | elif args.dataset == 'VOC': 89 | if args.dataset_root == COCO_ROOT: 90 | parser.error('Must specify dataset if specifying dataset_root') 91 | cfg = voc 92 | dataset = VOCDetection(root=args.dataset_root, 93 | transform=SSDAugmentation(cfg['min_dim'], 94 | MEANS)) 95 | 96 | elif args.dataset == 'CUSTOM': 97 | if args.dataset_root == VOC_ROOT or args.dataset_root == COCO_ROOT: 98 | parser.error('Must specify 
dataset if specifying dataset_root') 99 | cfg = custom 100 | dataset = customDetection(root=args.dataset_root, 101 | transform=SSDAugmentation(cfg['min_dim'], 102 | MEANS)) 103 | 104 | if args.visdom: 105 | import visdom 106 | viz = visdom.Visdom() 107 | 108 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 109 | net = ssd_net 110 | 111 | if args.cuda: 112 | net = torch.nn.DataParallel(ssd_net) 113 | cudnn.benchmark = True 114 | 115 | if args.resume: 116 | print('Resuming training, loading {}...'.format(args.resume)) 117 | ssd_net.load_weights(args.resume) 118 | else: 119 | pass 120 | # resnet_weights = torch.load(args.save_folder + args.basenet) 121 | # print('Loading base network...') 122 | # ssd_net.resnet.load_state_dict(resnet_weights) 123 | 124 | if args.cuda: 125 | net = net.cuda() 126 | 127 | if not args.resume: 128 | print('Initializing weights...') 129 | # initialize newly added layers' weights with xavier method 130 | ssd_net.extras.apply(weights_init) 131 | ssd_net.loc.apply(weights_init) 132 | ssd_net.conf.apply(weights_init) 133 | 134 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, 135 | weight_decay=args.weight_decay) 136 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, 137 | False, args.cuda) 138 | 139 | net.train() 140 | # loss counters 141 | loc_loss = 0 142 | conf_loss = 0 143 | epoch = 0 144 | print('Loading the dataset...') 145 | 146 | epoch_size = len(dataset) // args.batch_size 147 | print('Epochj Size:', epoch_size) 148 | print('Training SSD on:', dataset.name) 149 | print('Using the specified args:') 150 | print(args) 151 | 152 | step_index = 0 153 | 154 | if args.visdom: 155 | vis_title = 'SSD.PyTorch on ' + dataset.name 156 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 157 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 158 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 159 | 160 | data_loader = data.DataLoader(dataset, args.batch_size, 161 | num_workers=args.num_workers, 162 | shuffle=True, collate_fn=detection_collate, 163 | pin_memory=True) 164 | # create batch iterator 165 | batch_iterator = iter(data_loader) 166 | for iteration in range(args.start_iter, cfg['max_iter']): 167 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): 168 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 169 | 'append', epoch_size) 170 | # reset epoch loss counters 171 | loc_loss = 0 172 | conf_loss = 0 173 | epoch += 1 174 | 175 | if iteration in cfg['lr_steps']: 176 | step_index += 1 177 | adjust_learning_rate(optimizer, args.gamma, step_index) 178 | 179 | # load train data 180 | # images, targets = next(batch_iterator) 181 | try: 182 | images, targets = next(batch_iterator) 183 | except StopIteration: 184 | batch_iterator = iter(data_loader) 185 | images, targets = next(batch_iterator) 186 | 187 | if args.cuda: 188 | images = Variable(images.cuda()) 189 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 190 | else: 191 | images = Variable(images) 192 | targets = [Variable(ann, volatile=True) for ann in targets] 193 | # forward 194 | t0 = time.time() 195 | out = net(images) 196 | # backprop 197 | optimizer.zero_grad() 198 | loss_l, loss_c = criterion(out, targets) 199 | loss = loss_l + loss_c 200 | loss.backward() 201 | optimizer.step() 202 | t1 = time.time() 203 | # loc_loss += loss_l.data[0] 204 | # conf_loss += loss_c.data[0] 205 | loc_loss += loss_l.item() 206 | conf_loss += loss_c.item() 207 | 208 
| if iteration % 10 == 0: 209 | print('timer: %.4f sec.' % (t1 - t0)) 210 | # print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ') 211 | print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ') 212 | 213 | if args.visdom: 214 | # update_vis_plot(iteration, loss_l.data[0], loss_c.data[0], 215 | # iter_plot, epoch_plot, 'append') 216 | update_vis_plot(iteration, loss_l.item(), loss_c.item(), 217 | iter_plot, epoch_plot, 'append') 218 | 219 | if iteration != 0 and iteration % 5000 == 0: 220 | print('Saving state, iter:', iteration) 221 | torch.save(ssd_net.state_dict(), args.save_folder + '/ssd300_COCO_' + 222 | repr(iteration) + '.pth') 223 | torch.save(ssd_net.state_dict(), 224 | args.save_folder + '' + args.dataset + '.pth') 225 | 226 | 227 | def adjust_learning_rate(optimizer, gamma, step): 228 | """Sets the learning rate to the initial LR decayed by 10 at every 229 | specified step 230 | # Adapted from PyTorch Imagenet example: 231 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 232 | """ 233 | lr = args.lr * (gamma ** (step)) 234 | for param_group in optimizer.param_groups: 235 | param_group['lr'] = lr 236 | 237 | 238 | def xavier(param): 239 | init.xavier_uniform(param) 240 | 241 | 242 | def weights_init(m): 243 | if isinstance(m, nn.Conv2d): 244 | xavier(m.weight.data) 245 | m.bias.data.zero_() 246 | 247 | 248 | def create_vis_plot(_xlabel, _ylabel, _title, _legend): 249 | return viz.line( 250 | X=torch.zeros((1,)).cpu(), 251 | Y=torch.zeros((1, 3)).cpu(), 252 | opts=dict( 253 | xlabel=_xlabel, 254 | ylabel=_ylabel, 255 | title=_title, 256 | legend=_legend 257 | ) 258 | ) 259 | 260 | 261 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type, 262 | epoch_size=1): 263 | viz.line( 264 | X=torch.ones((1, 3)).cpu() * iteration, 265 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size, 266 | win=window1, 267 | update=update_type 268 | ) 269 | # initialize epoch plot on first iteration 270 | if iteration == 0: 271 | viz.line( 272 | X=torch.zeros((1, 3)).cpu(), 273 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(), 274 | win=window2, 275 | update=True 276 | ) 277 | 278 | 279 | if __name__ == '__main__': 280 | train() 281 | -------------------------------------------------------------------------------- /实验 4.1/visualTest_gauge.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | from torch.autograd import Variable 6 | import torch.utils.data as data 7 | 8 | from data import BaseTransform 9 | from data.custom_for_visual import CUSTOM_CLASSES_GAUGE as labelmap_gauge 10 | from data.custom_for_visual import CUSTOM_CLASSES_WATERLINE as labelmap_waterline 11 | from data.custom_for_visual import customDetection, customAnnotationTransform, CUSTOM_ROOT, CUSTOM_CLASSES_GAUGE, CUSTOM_CLASSES_WATERLINE 12 | 13 | # from ssd import build_ssd 14 | from ssd_resnet_101 import build_ssd 15 | 16 | import sys 17 | import os 18 | import time 19 | import argparse 20 | import numpy as np 21 | import pickle 22 | import cv2 23 | import math 24 | 25 | import warnings 26 | warnings.filterwarnings("ignore") 27 | 28 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 29 | parser.add_argument('--trained_model_gauge', 30 | default='useful_weight/CUSTOM_gauge.pth', type=str, 31 | help='Trained state_dict file path 
to open') 32 | parser.add_argument('--trained_model_waterline', 33 | default='useful_weight/CUSTOM_mark.pth', type=str, 34 | help='Trained state_dict file path to open') 35 | parser.add_argument('--save_folder', default='eval/', type=str, 36 | help='Dir to save results') 37 | parser.add_argument('--visual_threshold', default=0.1, type=float, 38 | help='Final confidence threshold') 39 | parser.add_argument('--cuda', default=True, type=bool, 40 | help='Use cuda to train model') 41 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, help='Location of VOC root directory') 42 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 43 | args = parser.parse_args() 44 | 45 | if args.cuda and torch.cuda.is_available(): 46 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 47 | else: 48 | torch.set_default_tensor_type('torch.FloatTensor') 49 | 50 | if not os.path.exists(args.save_folder): 51 | os.mkdir(args.save_folder) 52 | 53 | 54 | def test_net(save_folder, net, cuda, testset, transform, thresh, labelmap): 55 | # dump predictions and assoc. ground truth to text file for now 56 | filename = save_folder + 'result_%s.txt' 57 | num_images = len(testset) 58 | for i in range(num_images): 59 | print('Testing image {:d}/{:d}....'.format(i+1, num_images)) 60 | img = testset.pull_image(i) 61 | img_id, annotation = testset.pull_anno(i) 62 | x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1) 63 | x = Variable(x.unsqueeze(0)) 64 | 65 | if cuda: 66 | x = x.cuda() 67 | 68 | y = net(x) # forward pass 69 | detections = y.data 70 | # scale each detection back up to the image 71 | scale = torch.Tensor([img.shape[1], img.shape[0], 72 | img.shape[1], img.shape[0]]) 73 | pred_num = 0 74 | for i in range(detections.size(1)): 75 | j = 0 76 | while detections[0, i, j, 0] >= 0.1: 77 | score = detections[0, i, j, 0] 78 | label_name = labelmap[i-1] 79 | pt = (detections[0, i, j, 1:]*scale).cpu().numpy() 80 | coords = (pt[0], pt[1], pt[2], pt[3]) 81 | pred_num += 1 82 | with open(filename % label_name, mode='a') as f: 83 | f.write(str(img_id) + ' ' + 84 | str(score.cpu().numpy()) + ' '+ ' '.join(str(c) for c in coords) + '\n') 85 | j += 1 86 | 87 | def xmlData(name, width, height, label): 88 | return ''' 89 | JPEGImages 90 | %s.jpg 91 | %s.jpg 92 | 93 | Unknown 94 | 95 | 96 | %d 97 | %d 98 | 3 99 | 100 | 0 101 | 102 | %s 103 | Unspecified 104 | 1 105 | 0 106 | 107 | 0 108 | 0 109 | 1 110 | 1 111 | 112 | 113 | ''' % (name, name, width, height, label) 114 | 115 | def get_output_dir(name, phase=""): 116 | filedir = os.path.join(name, phase) 117 | if not os.path.exists(filedir): 118 | os.makedirs(filedir) 119 | return filedir 120 | 121 | def test_custom(): 122 | DEBUG = False 123 | set_type = 'test' 124 | devkit_path = args.custom_root + 'test' 125 | devkit_annopath = os.path.join(args.custom_root, 'test', 'Annotations') 126 | devkit_imgpath = os.path.join(args.custom_root, 'test', 'JPEGImages') 127 | devkit_imgsetpath = os.path.join(args.custom_root, 'test', 'ImageSets', 'Main') 128 | 129 | # load net 130 | num_classes_gauge = len(labelmap_gauge) + 1 # +1 for background 131 | net = build_ssd('test', 300, num_classes_gauge) # initialize SSD 132 | net.load_state_dict(torch.load(args.trained_model_gauge)) 133 | net.eval() 134 | 135 | num_classes_waterline = len(labelmap_waterline) + 1 # +1 for background 136 | net1 = build_ssd('test', 300, num_classes_waterline) # initialize SSD 137 | net1.load_state_dict(torch.load(args.trained_model_waterline)) 138 | 
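# Note on the overall flow (a summary, not code from this file): `net` first localizes the
# gauge in the full frame, the frame is cropped to that box and given a stub annotation, and
# `net1` then detects the waterline/mark inside the crop. Crop-relative boxes are mapped back
# to full-frame coordinates later in this script by adding the gauge box's top-left corner,
# which is what the x1_g + x1_w / y1_g + y1_w terms in the drawing code implement.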
net1.eval() 139 | print('Finished loading model!') 140 | # load data 141 | dataset1 = customDetection(args.custom_root, [('gauge', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_GAUGE, range(len(CUSTOM_CLASSES_GAUGE)))))) 142 | if args.cuda: 143 | net = net.cuda() 144 | cudnn.benchmark = True 145 | # evaluation 146 | test_net(args.save_folder, net, args.cuda, dataset1, 147 | BaseTransform(net.size, (104, 117, 123)), 148 | thresh=args.visual_threshold, labelmap=labelmap_gauge) 149 | 150 | rootPath = 'F:/ssd/data/video/gauge' 151 | rootPath_temp = 'F:/ssd/data/video/test' 152 | imgList_gauge = {} 153 | 154 | with open(os.path.join(args.save_folder, 'result_gauge.txt'), 'r') as f: 155 | text_lines = f.readlines() 156 | for line in text_lines: 157 | info = line.split(" ") 158 | name, score, x1, y1, x2, y2 = info 159 | if name in imgList_gauge: 160 | if float(score) > imgList_gauge[name]['score']: 161 | imgList_gauge[name] = { 162 | 'score': float(score), 163 | 'x1': float(x1), 164 | 'y1': float(y1), 165 | 'x2': float(x2), 166 | 'y2': float(y2) 167 | } 168 | else: 169 | imgList_gauge[name] = { 170 | 'score': float(score), 171 | 'x1': float(x1), 172 | 'y1': float(y1), 173 | 'x2': float(x2), 174 | 'y2': float(y2) 175 | } 176 | 177 | img_path = os.path.join(rootPath, 'JPEGImages', '%s.jpg') 178 | devkit_imgpath = os.path.join(get_output_dir(devkit_imgpath), '%s.jpg') 179 | devkit_imgsetpath = os.path.join(get_output_dir(devkit_imgsetpath), '%s.txt') 180 | devkit_annopath = os.path.join(get_output_dir(devkit_annopath), '%s.xml') 181 | with open(devkit_imgsetpath % ('test'), 'w') as f: 182 | for obj in imgList_gauge.items(): 183 | name, img = obj 184 | image = cv2.imread(img_path % name) 185 | (h, w, c) = image.shape 186 | x1 = max(math.floor(img['x1']), 0) 187 | y1 = max(math.floor(img['y1']), 0) 188 | x2 = min(math.floor(img['x2']), w) 189 | y2 = min(math.floor(img['y2']), h) 190 | if DEBUG: 191 | cv2.rectangle(image, (x1, y1), (x2, y2), (255,0,0), 5) 192 | image = cv2.resize(image, (512, 512)) 193 | cv2.imshow('w1', image) 194 | cv2.waitKey() 195 | else: 196 | image = image[y1:y2, x1:x2] 197 | # cv2.imshow('w1', image) 198 | cv2.imwrite(devkit_imgpath % name, image, [100]) 199 | f.write(name + '\n') 200 | # cv2.waitKey() 201 | with open(devkit_annopath % (name), 'w') as f_a: 202 | f_a.write(xmlData(name, x2 - x1, y2 - y1, 'waterline')) 203 | 204 | dataset2 = customDetection(args.custom_root, [('test', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_WATERLINE, range(len(CUSTOM_CLASSES_WATERLINE)))))) 205 | 206 | if args.cuda: 207 | net1 = net1.cuda() 208 | cudnn.benchmark = True 209 | 210 | # evaluation 211 | test_net(args.save_folder, net1, args.cuda, dataset2, 212 | BaseTransform(net.size, (104, 117, 123)), 213 | thresh=args.visual_threshold, labelmap=labelmap_waterline) 214 | 215 | imgList_waterline = {} 216 | with open(os.path.join(args.save_folder, 'result_waterline.txt'), 'r') as f: 217 | text_lines = f.readlines() 218 | for line in text_lines: 219 | info = line.split(" ") 220 | name, score, x1, y1, x2, y2 = info 221 | if name in imgList_waterline: 222 | if float(score) > imgList_waterline[name]['score']: 223 | imgList_waterline[name] = { 224 | 'score': float(score), 225 | 'x1': float(x1), 226 | 'y1': float(y1), 227 | 'x2': float(x2), 228 | 'y2': float(y2) 229 | } 230 | else: 231 | imgList_waterline[name] = { 232 | 'score': float(score), 233 | 'x1': float(x1), 234 | 'y1': float(y1), 235 | 'x2': float(x2), 236 | 'y2': 
float(y2) 237 | } 238 | 239 | imgList_mark = {} 240 | with open(os.path.join(args.save_folder, 'result_mark.txt'), 'r') as f: 241 | text_lines = f.readlines() 242 | for line in text_lines: 243 | info = line.split(" ") 244 | name, score, x1, y1, x2, y2 = info 245 | if name in imgList_mark: 246 | if float(score) > imgList_mark[name]['score']: 247 | imgList_mark[name] = { 248 | 'score': float(score), 249 | 'x1': float(x1), 250 | 'y1': float(y1), 251 | 'x2': float(x2), 252 | 'y2': float(y2) 253 | } 254 | else: 255 | imgList_mark[name] = { 256 | 'score': float(score), 257 | 'x1': float(x1), 258 | 'y1': float(y1), 259 | 'x2': float(x2), 260 | 'y2': float(y2) 261 | } 262 | 263 | cv2.namedWindow('w2',1) 264 | use_origin = True 265 | 266 | if not use_origin: 267 | img_path = os.path.join(rootPath_temp, 'JPEGImages', '%s.jpg') 268 | count = 0 269 | for name in imgList_gauge: 270 | img_gauge = imgList_gauge[name] 271 | img_waterline = imgList_waterline[name] 272 | img_mark = imgList_mark[name] 273 | 274 | if not use_origin: 275 | image = cv2.imread(img_path % name) 276 | (h, w, c) = image.shape 277 | 278 | x1_w = max(math.floor(img_waterline['x1']), 0) 279 | y1_w = max(math.floor(img_waterline['y1']), 0) 280 | x2_w = min(math.floor(img_waterline['x2']), w) 281 | y2_w = min(math.floor(img_waterline['y2']), h) 282 | 283 | x1_m = max(math.floor(img_mark['x1']), 0) 284 | y1_m = max(math.floor(img_mark['y1']), 0) 285 | x2_m = min(math.floor(img_mark['x2']), w) 286 | y2_m = min(math.floor(img_mark['y2']), h) 287 | 288 | cv2.rectangle(image, (x1_w, y1_w), (x2_w, y2_w), (255,0,0), 5) 289 | cv2.rectangle(image, (x1_m, y1_m), (x2_m, y2_m), (0,255,0), 5) 290 | image = cv2.resize(image, (512, 512)) 291 | cv2.imshow('w2', image) 292 | cv2.waitKey() 293 | else: 294 | image = cv2.imread(img_path % name) 295 | (h, w, c) = image.shape 296 | 297 | x1_g = math.floor(img_gauge['x1']) 298 | y1_g = math.floor(img_gauge['y1']) 299 | x2_g = math.floor(img_gauge['x2']) 300 | y2_g = math.floor(img_gauge['y2']) 301 | 302 | x1_w = max(math.floor(img_waterline['x1']), 0) 303 | y1_w = max(math.floor(img_waterline['y1']), 0) 304 | x2_w = min(math.floor(img_waterline['x2']), w) 305 | y2_w = min(math.floor(img_waterline['y2']), h) 306 | 307 | x1_m = max(math.floor(img_mark['x1']), 0) 308 | y1_m = max(math.floor(img_mark['y1']), 0) 309 | x2_m = min(math.floor(img_mark['x2']), w) 310 | y2_m = min(math.floor(img_mark['y2']), h) 311 | 312 | if (y1_w + y2_w) > (y1_m + y2_m): 313 | count += 1 314 | 315 | cv2.rectangle(image, (x1_g, y1_g), (x2_g, y2_g), (255,0,0), 5) 316 | cv2.rectangle(image, (x1_g + x1_w, y1_g + y1_w), (x1_g + x2_w, y1_g + y2_w), (0,255,0), 5) 317 | cv2.rectangle(image, (x1_g + x1_m, y1_g + y1_m), (x1_g + x2_m, y1_g + y2_m), (0,0,255), 5) 318 | 319 | image = cv2.resize(image, (512, 512)) 320 | cv2.putText(image, 'gauge: %.2f' % img_gauge['score'], (10, 40), cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 0, 0), 2) 321 | cv2.putText(image, 'waterline: %.2f' % img_waterline['score'], (10, 80), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 255, 0), 2) 322 | cv2.putText(image, 'mark: %.2f' % img_mark['score'], (10, 120), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 0, 255), 2) 323 | cv2.imshow('w2', image) 324 | cv2.waitKey() 325 | print('correct count:', count) 326 | 327 | if __name__ == '__main__': 328 | test_custom() 329 | -------------------------------------------------------------------------------- /netModel/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | try: 
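# torch.hub.load_state_dict_from_url is only present in newer PyTorch releases; the except
# branch below falls back to torch.utils.model_zoo.load_url so the same file also works on
# older installs.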
4 | from torch.hub import load_state_dict_from_url 5 | except ImportError: 6 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 7 | 8 | 9 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 11 | 'wide_resnet50_2', 'wide_resnet101_2'] 12 | 13 | 14 | model_urls = { 15 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 16 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 17 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 18 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 19 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 20 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 21 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 22 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 23 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', 24 | } 25 | 26 | 27 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 28 | """3x3 convolution with padding""" 29 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 30 | padding=dilation, groups=groups, bias=False, dilation=dilation) 31 | 32 | 33 | def conv1x1(in_planes, out_planes, stride=1): 34 | """1x1 convolution""" 35 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 36 | 37 | 38 | class BasicBlock(nn.Module): 39 | expansion = 1 40 | __constants__ = ['downsample'] 41 | 42 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 43 | base_width=64, dilation=1, norm_layer=None): 44 | super(BasicBlock, self).__init__() 45 | if norm_layer is None: 46 | norm_layer = nn.BatchNorm2d 47 | if groups != 1 or base_width != 64: 48 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 49 | if dilation > 1: 50 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 51 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 52 | self.conv1 = conv3x3(inplanes, planes, stride) 53 | self.bn1 = norm_layer(planes) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.conv2 = conv3x3(planes, planes) 56 | self.bn2 = norm_layer(planes) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | identity = x 62 | 63 | out = self.conv1(x) 64 | out = self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | 70 | if self.downsample is not None: 71 | identity = self.downsample(x) 72 | 73 | out += identity 74 | out = self.relu(out) 75 | 76 | return out 77 | 78 | 79 | class Bottleneck(nn.Module): 80 | expansion = 4 81 | __constants__ = ['downsample'] 82 | 83 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 84 | base_width=64, dilation=1, norm_layer=None): 85 | super(Bottleneck, self).__init__() 86 | if norm_layer is None: 87 | norm_layer = nn.BatchNorm2d 88 | width = int(planes * (base_width / 64.)) * groups 89 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 90 | self.conv1 = conv1x1(inplanes, width) 91 | self.bn1 = norm_layer(width) 92 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 93 | self.bn2 = norm_layer(width) 94 | self.conv3 = conv1x1(width, planes * self.expansion) 95 | self.bn3 = 
norm_layer(planes * self.expansion) 96 | self.relu = nn.ReLU(inplace=True) 97 | self.downsample = downsample 98 | self.stride = stride 99 | 100 | def forward(self, x): 101 | identity = x 102 | 103 | out = self.conv1(x) 104 | out = self.bn1(out) 105 | out = self.relu(out) 106 | 107 | out = self.conv2(out) 108 | out = self.bn2(out) 109 | out = self.relu(out) 110 | 111 | out = self.conv3(out) 112 | out = self.bn3(out) 113 | 114 | if self.downsample is not None: 115 | identity = self.downsample(x) 116 | 117 | out += identity 118 | out = self.relu(out) 119 | 120 | return out 121 | 122 | 123 | class ResNet(nn.Module): 124 | 125 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 126 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 127 | norm_layer=None): 128 | super(ResNet, self).__init__() 129 | if norm_layer is None: 130 | norm_layer = nn.BatchNorm2d 131 | self._norm_layer = norm_layer 132 | 133 | self.inplanes = 64 134 | self.dilation = 1 135 | if replace_stride_with_dilation is None: 136 | # each element in the tuple indicates if we should replace 137 | # the 2x2 stride with a dilated convolution instead 138 | replace_stride_with_dilation = [False, False, False] 139 | if len(replace_stride_with_dilation) != 3: 140 | raise ValueError("replace_stride_with_dilation should be None " 141 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 142 | self.groups = groups 143 | self.base_width = width_per_group 144 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=1, padding=3, 145 | bias=False) 146 | self.bn1 = norm_layer(self.inplanes) 147 | self.relu = nn.ReLU(inplace=True) 148 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 149 | self.layer1 = self._make_layer(block, 64, layers[0]) 150 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 151 | dilate=replace_stride_with_dilation[0]) 152 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 153 | dilate=replace_stride_with_dilation[1]) 154 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 155 | dilate=replace_stride_with_dilation[2]) 156 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 157 | self.fc = nn.Linear(512 * block.expansion, num_classes) 158 | 159 | for m in self.modules(): 160 | if isinstance(m, nn.Conv2d): 161 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 162 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 163 | nn.init.constant_(m.weight, 1) 164 | nn.init.constant_(m.bias, 0) 165 | 166 | # Zero-initialize the last BN in each residual branch, 167 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
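# (This behaviour is opt-in: pass zero_init_residual=True through any of the factory
#  functions below, e.g. resnet101(zero_init_residual=True).)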
168 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 169 | if zero_init_residual: 170 | for m in self.modules(): 171 | if isinstance(m, Bottleneck): 172 | nn.init.constant_(m.bn3.weight, 0) 173 | elif isinstance(m, BasicBlock): 174 | nn.init.constant_(m.bn2.weight, 0) 175 | 176 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 177 | norm_layer = self._norm_layer 178 | downsample = None 179 | previous_dilation = self.dilation 180 | if dilate: 181 | self.dilation *= stride 182 | stride = 1 183 | if stride != 1 or self.inplanes != planes * block.expansion: 184 | downsample = nn.Sequential( 185 | conv1x1(self.inplanes, planes * block.expansion, stride), 186 | norm_layer(planes * block.expansion), 187 | ) 188 | 189 | layers = [] 190 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 191 | self.base_width, previous_dilation, norm_layer)) 192 | self.inplanes = planes * block.expansion 193 | for _ in range(1, blocks): 194 | layers.append(block(self.inplanes, planes, groups=self.groups, 195 | base_width=self.base_width, dilation=self.dilation, 196 | norm_layer=norm_layer)) 197 | 198 | return nn.Sequential(*layers) 199 | 200 | def _forward_impl(self, x): 201 | # See note [TorchScript super()] 202 | x = self.conv1(x) 203 | x = self.bn1(x) 204 | x = self.relu(x) 205 | x = self.maxpool(x) 206 | 207 | x = self.layer1(x) 208 | x = self.layer2(x) 209 | x = self.layer3(x) 210 | x = self.layer4(x) 211 | 212 | x = self.avgpool(x) 213 | x = torch.flatten(x, 1) 214 | x = self.fc(x) 215 | 216 | return x 217 | 218 | def forward(self, x): 219 | return self._forward_impl(x) 220 | 221 | 222 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 223 | model = ResNet(block, layers, **kwargs) 224 | if pretrained: 225 | state_dict = load_state_dict_from_url(model_urls[arch], 226 | progress=progress) 227 | model.load_state_dict(state_dict) 228 | return model 229 | 230 | 231 | def resnet18(pretrained=False, progress=True, **kwargs): 232 | r"""ResNet-18 model from 233 | `"Deep Residual Learning for Image Recognition" `_ 234 | Args: 235 | pretrained (bool): If True, returns a model pre-trained on ImageNet 236 | progress (bool): If True, displays a progress bar of the download to stderr 237 | """ 238 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 239 | **kwargs) 240 | 241 | 242 | def resnet34(pretrained=False, progress=True, **kwargs): 243 | r"""ResNet-34 model from 244 | `"Deep Residual Learning for Image Recognition" `_ 245 | Args: 246 | pretrained (bool): If True, returns a model pre-trained on ImageNet 247 | progress (bool): If True, displays a progress bar of the download to stderr 248 | """ 249 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, 250 | **kwargs) 251 | 252 | 253 | def resnet50(pretrained=False, progress=True, **kwargs): 254 | r"""ResNet-50 model from 255 | `"Deep Residual Learning for Image Recognition" `_ 256 | Args: 257 | pretrained (bool): If True, returns a model pre-trained on ImageNet 258 | progress (bool): If True, displays a progress bar of the download to stderr 259 | """ 260 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, 261 | **kwargs) 262 | 263 | 264 | def resnet101(pretrained=False, progress=True, **kwargs): 265 | r"""ResNet-101 model from 266 | `"Deep Residual Learning for Image Recognition" `_ 267 | Args: 268 | pretrained (bool): If True, returns a model pre-trained on ImageNet 269 | progress (bool): If 
True, displays a progress bar of the download to stderr 270 | """ 271 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, 272 | **kwargs) 273 | 274 | 275 | def resnet152(pretrained=False, progress=True, **kwargs): 276 | r"""ResNet-152 model from 277 | `"Deep Residual Learning for Image Recognition" `_ 278 | Args: 279 | pretrained (bool): If True, returns a model pre-trained on ImageNet 280 | progress (bool): If True, displays a progress bar of the download to stderr 281 | """ 282 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, 283 | **kwargs) 284 | 285 | 286 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs): 287 | r"""ResNeXt-50 32x4d model from 288 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 289 | Args: 290 | pretrained (bool): If True, returns a model pre-trained on ImageNet 291 | progress (bool): If True, displays a progress bar of the download to stderr 292 | """ 293 | kwargs['groups'] = 32 294 | kwargs['width_per_group'] = 4 295 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], 296 | pretrained, progress, **kwargs) 297 | 298 | 299 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs): 300 | r"""ResNeXt-101 32x8d model from 301 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 302 | Args: 303 | pretrained (bool): If True, returns a model pre-trained on ImageNet 304 | progress (bool): If True, displays a progress bar of the download to stderr 305 | """ 306 | kwargs['groups'] = 32 307 | kwargs['width_per_group'] = 8 308 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], 309 | pretrained, progress, **kwargs) 310 | 311 | 312 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs): 313 | r"""Wide ResNet-50-2 model from 314 | `"Wide Residual Networks" `_ 315 | The model is the same as ResNet except for the bottleneck number of channels 316 | which is twice larger in every block. The number of channels in outer 1x1 317 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 318 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 319 | Args: 320 | pretrained (bool): If True, returns a model pre-trained on ImageNet 321 | progress (bool): If True, displays a progress bar of the download to stderr 322 | """ 323 | kwargs['width_per_group'] = 64 * 2 324 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], 325 | pretrained, progress, **kwargs) 326 | 327 | 328 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs): 329 | r"""Wide ResNet-101-2 model from 330 | `"Wide Residual Networks" `_ 331 | The model is the same as ResNet except for the bottleneck number of channels 332 | which is twice larger in every block. The number of channels in outer 1x1 333 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 334 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
335 | Args: 336 | pretrained (bool): If True, returns a model pre-trained on ImageNet 337 | progress (bool): If True, displays a progress bar of the download to stderr 338 | """ 339 | kwargs['width_per_group'] = 64 * 2 340 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], 341 | pretrained, progress, **kwargs) -------------------------------------------------------------------------------- /utils/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 18 | is simply the intersection over union of two boxes. 19 | E.g.: 20 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 21 | Args: 22 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 23 | box_b: Single bounding box, Shape: [4] 24 | Return: 25 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 26 | """ 27 | inter = intersect(box_a, box_b) 28 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 29 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 30 | area_b = ((box_b[2]-box_b[0]) * 31 | (box_b[3]-box_b[1])) # [A,B] 32 | union = area_a + area_b - inter 33 | return inter / union # [A,B] 34 | 35 | 36 | class Compose(object): 37 | """Composes several augmentations together. 38 | Args: 39 | transforms (List[Transform]): list of transforms to compose. 40 | Example: 41 | >>> augmentations.Compose([ 42 | >>> transforms.CenterCrop(10), 43 | >>> transforms.ToTensor(), 44 | >>> ]) 45 | """ 46 | 47 | def __init__(self, transforms): 48 | self.transforms = transforms 49 | 50 | def __call__(self, img, boxes=None, labels=None): 51 | for t in self.transforms: 52 | img, boxes, labels = t(img, boxes, labels) 53 | return img, boxes, labels 54 | 55 | 56 | class Lambda(object): 57 | """Applies a lambda as a transform.""" 58 | 59 | def __init__(self, lambd): 60 | assert isinstance(lambd, types.LambdaType) 61 | self.lambd = lambd 62 | 63 | def __call__(self, img, boxes=None, labels=None): 64 | return self.lambd(img, boxes, labels) 65 | 66 | 67 | class ConvertFromInts(object): 68 | def __call__(self, image, boxes=None, labels=None): 69 | return image.astype(np.float32), boxes, labels 70 | 71 | 72 | class SubtractMeans(object): 73 | def __init__(self, mean): 74 | self.mean = np.array(mean, dtype=np.float32) 75 | 76 | def __call__(self, image, boxes=None, labels=None): 77 | image = image.astype(np.float32) 78 | image -= self.mean 79 | return image.astype(np.float32), boxes, labels 80 | 81 | 82 | class ToAbsoluteCoords(object): 83 | def __call__(self, image, boxes=None, labels=None): 84 | height, width, channels = image.shape 85 | boxes[:, 0] *= width 86 | boxes[:, 2] *= width 87 | boxes[:, 1] *= height 88 | boxes[:, 3] *= height 89 | 90 | return image, boxes, labels 91 | 92 | 93 | class ToPercentCoords(object): 94 | def __call__(self, image, boxes=None, labels=None): 95 | height, width, channels = image.shape 96 | boxes[:, 0] /= width 97 | boxes[:, 2] /= width 98 | boxes[:, 1] /= height 99 | boxes[:, 3] /= height 100 | 101 | return image, boxes, labels 102 | 103 | 104 | class Resize(object): 105 | def __init__(self, 
size=300): 106 | self.size = size 107 | 108 | def __call__(self, image, boxes=None, labels=None): 109 | image = cv2.resize(image, (self.size, 110 | self.size)) 111 | return image, boxes, labels 112 | 113 | 114 | class RandomSaturation(object): 115 | def __init__(self, lower=0.5, upper=1.5): 116 | self.lower = lower 117 | self.upper = upper 118 | assert self.upper >= self.lower, "contrast upper must be >= lower." 119 | assert self.lower >= 0, "contrast lower must be non-negative." 120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 
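# Note: `random` in this file is numpy.random (see the imports at the top), so
# random.randint(2) draws from {0, 1}; each photometric transform here uses it as a fair
# coin flip and applies its distortion only about half of the time.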
176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 226 | (0.1, None), 227 | (0.3, None), 228 | (0.7, None), 229 | (0.9, None), 230 | # randomly sample a patch 231 | (None, None), 232 | ) 233 | 234 | def __call__(self, image, boxes=None, labels=None): 235 | height, width, _ = image.shape 236 | while True: 237 | # randomly choose a mode 238 | mode = random.choice(self.sample_options) 239 | if mode is None: 240 | return image, boxes, labels 241 | 242 | min_iou, max_iou = mode 243 | if min_iou is None: 244 | min_iou = float('-inf') 245 | if max_iou is None: 246 | max_iou = float('inf') 247 | 248 | # max trails (50) 249 | for _ in range(50): 250 | current_image = image 251 | 252 | w = random.uniform(0.3 * width, width) 253 | h = random.uniform(0.3 * height, height) 254 | 255 | # aspect ratio constraint b/t .5 & 2 256 | if h / w < 0.5 or h / w > 2: 257 | continue 258 | 259 | left = random.uniform(width - w) 260 | top = random.uniform(height - h) 261 | 262 | # convert to integer rect x1,y1,x2,y2 263 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 264 | 265 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 266 | overlap = jaccard_numpy(boxes, rect) 267 | 268 | # is min and max overlap constraint satisfied? 
if not try again 269 | if overlap.min() < min_iou and max_iou < overlap.max(): 270 | continue 271 | 272 | # cut the crop from the image 273 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 274 | :] 275 | 276 | # keep overlap with gt box IF center in sampled patch 277 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 278 | 279 | # mask in all gt boxes that above and to the left of centers 280 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 281 | 282 | # mask in all gt boxes that under and to the right of centers 283 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 284 | 285 | # mask in that both m1 and m2 are true 286 | mask = m1 * m2 287 | 288 | # have any valid boxes? try again if not 289 | if not mask.any(): 290 | continue 291 | 292 | # take only matching gt boxes 293 | current_boxes = boxes[mask, :].copy() 294 | 295 | # take only matching gt labels 296 | current_labels = labels[mask] 297 | 298 | # should we use the box left and top corner or the crop's 299 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 300 | rect[:2]) 301 | # adjust to crop (by substracting crop's left,top) 302 | current_boxes[:, :2] -= rect[:2] 303 | 304 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 305 | rect[2:]) 306 | # adjust to crop (by substracting crop's left,top) 307 | current_boxes[:, 2:] -= rect[:2] 308 | 309 | return current_image, current_boxes, current_labels 310 | 311 | 312 | class Expand(object): 313 | def __init__(self, mean): 314 | self.mean = mean 315 | 316 | def __call__(self, image, boxes, labels): 317 | if random.randint(2): 318 | return image, boxes, labels 319 | 320 | height, width, depth = image.shape 321 | ratio = random.uniform(1, 4) 322 | left = random.uniform(0, width*ratio - width) 323 | top = random.uniform(0, height*ratio - height) 324 | 325 | expand_image = np.zeros( 326 | (int(height*ratio), int(width*ratio), depth), 327 | dtype=image.dtype) 328 | expand_image[:, :, :] = self.mean 329 | expand_image[int(top):int(top + height), 330 | int(left):int(left + width)] = image 331 | image = expand_image 332 | 333 | boxes = boxes.copy() 334 | boxes[:, :2] += (int(left), int(top)) 335 | boxes[:, 2:] += (int(left), int(top)) 336 | 337 | return image, boxes, labels 338 | 339 | 340 | class RandomMirror(object): 341 | def __call__(self, image, boxes, classes): 342 | _, width, _ = image.shape 343 | if random.randint(2): 344 | image = image[:, ::-1] 345 | boxes = boxes.copy() 346 | boxes[:, 0::2] = width - boxes[:, 2::-2] 347 | return image, boxes, classes 348 | 349 | 350 | class SwapChannels(object): 351 | """Transforms a tensorized image by swapping the channels in the order 352 | specified in the swap tuple. 
353 | Args: 354 | swaps (int triple): final order of channels 355 | eg: (2, 1, 0) 356 | """ 357 | 358 | def __init__(self, swaps): 359 | self.swaps = swaps 360 | 361 | def __call__(self, image): 362 | """ 363 | Args: 364 | image (Tensor): image tensor to be transformed 365 | Return: 366 | a tensor with channels swapped according to swap 367 | """ 368 | # if torch.is_tensor(image): 369 | # image = image.data.cpu().numpy() 370 | # else: 371 | # image = np.array(image) 372 | image = image[:, :, self.swaps] 373 | return image 374 | 375 | 376 | class PhotometricDistort(object): 377 | def __init__(self): 378 | self.pd = [ 379 | RandomContrast(), 380 | ConvertColor(transform='HSV'), 381 | RandomSaturation(), 382 | RandomHue(), 383 | ConvertColor(current='HSV', transform='BGR'), 384 | RandomContrast() 385 | ] 386 | self.rand_brightness = RandomBrightness() 387 | self.rand_light_noise = RandomLightingNoise() 388 | 389 | def __call__(self, image, boxes, labels): 390 | im = image.copy() 391 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 392 | if random.randint(2): 393 | distort = Compose(self.pd[:-1]) 394 | else: 395 | distort = Compose(self.pd[1:]) 396 | im, boxes, labels = distort(im, boxes, labels) 397 | return self.rand_light_noise(im, boxes, labels) 398 | 399 | 400 | class SSDAugmentation(object): 401 | def __init__(self, size=300, mean=(104, 117, 123)): 402 | self.mean = mean 403 | self.size = size 404 | self.augment = Compose([ 405 | ConvertFromInts(), 406 | ToAbsoluteCoords(), 407 | PhotometricDistort(), 408 | Expand(self.mean), 409 | RandomSampleCrop(), 410 | RandomMirror(), 411 | ToPercentCoords(), 412 | Resize(self.size), 413 | SubtractMeans(self.mean) 414 | ]) 415 | 416 | def __call__(self, img, boxes, labels): 417 | return self.augment(img, boxes, labels) 418 | -------------------------------------------------------------------------------- /实验 4.1/evalCustom_101.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from __future__ import print_function 8 | import torch 9 | import torch.nn as nn 10 | import torch.backends.cudnn as cudnn 11 | from torch.autograd import Variable 12 | # from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 13 | # from data import VOC_CLASSES as labelmap 14 | import torch.utils.data as data 15 | 16 | from data import BaseTransform 17 | from data.custom import CUSTOM_CLASSES as labelmap 18 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 19 | 20 | # from ssd import build_ssd 21 | from ssd_resnet_101 import build_ssd 22 | 23 | import sys 24 | import os 25 | import time 26 | import argparse 27 | import numpy as np 28 | import pickle 29 | import cv2 30 | 31 | if sys.version_info[0] == 2: 32 | import xml.etree.cElementTree as ET 33 | else: 34 | import xml.etree.ElementTree as ET 35 | 36 | 37 | def str2bool(v): 38 | return v.lower() in ("yes", "true", "t", "1") 39 | 40 | 41 | parser = argparse.ArgumentParser( 42 | description='Single Shot MultiBox Detector Evaluation') 43 | parser.add_argument('--trained_model', 44 | default='weights/CUSTOM.pth', type=str, 45 | help='Trained state_dict file path to open') 46 | parser.add_argument('--save_folder', default='eval/', type=str, 47 | help='File path to 
save results') 48 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 49 | help='Detection confidence threshold') 50 | parser.add_argument('--top_k', default=5, type=int, 51 | help='Further restrict the number of predictions to parse') 52 | parser.add_argument('--cuda', default=True, type=str2bool, 53 | help='Use cuda to train model') 54 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, 55 | help='Location of VOC root directory') 56 | parser.add_argument('--cleanup', default=True, type=str2bool, 57 | help='Cleanup and remove results files following eval') 58 | 59 | args = parser.parse_args() 60 | 61 | if not os.path.exists(args.save_folder): 62 | os.mkdir(args.save_folder) 63 | 64 | if torch.cuda.is_available(): 65 | if args.cuda: 66 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 67 | if not args.cuda: 68 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 69 | CUDA. Run with --cuda for optimal eval speed.") 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | else: 72 | torch.set_default_tensor_type('torch.FloatTensor') 73 | 74 | annopath = os.path.join(args.custom_root, 'shenhe', 'Annotations', '%s.xml') 75 | imgpath = os.path.join(args.custom_root, 'shenhe', 'JPEGImages', '%s.jpg') 76 | imgsetpath = os.path.join(args.custom_root, 'shenhe', 'ImageSets', 'Main', '%s.txt') 77 | 78 | devkit_path = args.custom_root + 'shenhe' 79 | dataset_mean = (104, 117, 123) 80 | set_type = 'test' 81 | 82 | 83 | class Timer(object): 84 | """A simple timer.""" 85 | def __init__(self): 86 | self.total_time = 0. 87 | self.calls = 0 88 | self.start_time = 0. 89 | self.diff = 0. 90 | self.average_time = 0. 91 | 92 | def tic(self): 93 | # using time.time instead of time.clock because time time.clock 94 | # does not normalize for multithreading 95 | self.start_time = time.time() 96 | 97 | def toc(self, average=True): 98 | self.diff = time.time() - self.start_time 99 | self.total_time += self.diff 100 | self.calls += 1 101 | self.average_time = self.total_time / self.calls 102 | if average: 103 | return self.average_time 104 | else: 105 | return self.diff 106 | 107 | 108 | def parse_rec(filename): 109 | """ Parse a PASCAL VOC xml file """ 110 | tree = ET.parse(filename) 111 | objects = [] 112 | for obj in tree.findall('object'): 113 | obj_struct = {} 114 | obj_struct['name'] = obj.find('name').text 115 | obj_struct['pose'] = obj.find('pose').text 116 | obj_struct['truncated'] = int(obj.find('truncated').text) 117 | obj_struct['difficult'] = int(obj.find('difficult').text) 118 | bbox = obj.find('bndbox') 119 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 120 | int(bbox.find('ymin').text) - 1, 121 | int(bbox.find('xmax').text) - 1, 122 | int(bbox.find('ymax').text) - 1] 123 | objects.append(obj_struct) 124 | 125 | return objects 126 | 127 | 128 | def get_output_dir(name, phase): 129 | """Return the directory where experimental artifacts are placed. 130 | If the directory does not exist, it is created. 131 | A canonical path is built using the name from an imdb and a network 132 | (if not None). 
133 | """ 134 | filedir = os.path.join(name, phase) 135 | if not os.path.exists(filedir): 136 | os.makedirs(filedir) 137 | return filedir 138 | 139 | 140 | def get_voc_results_file_template(image_set, cls): 141 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 142 | filename = 'det_' + image_set + '_%s.txt' % (cls) 143 | filedir = os.path.join(devkit_path, 'results') 144 | if not os.path.exists(filedir): 145 | os.makedirs(filedir) 146 | path = os.path.join(filedir, filename) 147 | return path 148 | 149 | 150 | def write_voc_results_file(all_boxes, dataset): 151 | for cls_ind, cls in enumerate(labelmap): 152 | print('Writing {:s} VOC results file'.format(cls)) 153 | filename = get_voc_results_file_template(set_type, cls) 154 | with open(filename, 'wt') as f: 155 | for im_ind, index in enumerate(dataset.ids): 156 | dets = all_boxes[cls_ind+1][im_ind] 157 | if dets == []: 158 | continue 159 | # the VOCdevkit expects 1-based indices 160 | for k in range(dets.shape[0]): 161 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 162 | format(index[1], dets[k, -1], 163 | dets[k, 0] + 1, dets[k, 1] + 1, 164 | dets[k, 2] + 1, dets[k, 3] + 1)) 165 | 166 | 167 | def do_python_eval(output_dir='output', use_07=True): 168 | cachedir = os.path.join(devkit_path, 'annotations_cache') 169 | aps = [] 170 | # The PASCAL VOC metric changed in 2010 171 | use_07_metric = use_07 172 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 173 | if not os.path.isdir(output_dir): 174 | os.mkdir(output_dir) 175 | for i, cls in enumerate(labelmap): 176 | filename = get_voc_results_file_template(set_type, cls) 177 | rec, prec, ap = voc_eval( 178 | filename, annopath, imgsetpath % (set_type), cls, cachedir, 179 | ovthresh=0.1, use_07_metric=use_07_metric) 180 | aps += [ap] 181 | print('AP for {} = {:.4f}'.format(cls, ap)) 182 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 183 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 184 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 185 | print('~~~~~~~~') 186 | print('Results:') 187 | for ap in aps: 188 | print('{:.3f}'.format(ap)) 189 | print('{:.3f}'.format(np.mean(aps))) 190 | print('~~~~~~~~') 191 | print('') 192 | print('--------------------------------------------------------------') 193 | print('Results computed with the **unofficial** Python eval code.') 194 | print('Results should be very close to the official MATLAB eval code.') 195 | print('--------------------------------------------------------------') 196 | 197 | 198 | def voc_ap(rec, prec, use_07_metric=True): 199 | """ ap = voc_ap(rec, prec, [use_07_metric]) 200 | Compute VOC AP given precision and recall. 201 | If use_07_metric is true, uses the 202 | VOC 07 11 point method (default:True). 203 | """ 204 | if use_07_metric: 205 | # 11 point metric 206 | ap = 0. 207 | for t in np.arange(0., 1.1, 0.1): 208 | if np.sum(rec >= t) == 0: 209 | p = 0 210 | else: 211 | p = np.max(prec[rec >= t]) 212 | ap = ap + p / 11. 
213 | else: 214 | # correct AP calculation 215 | # first append sentinel values at the end 216 | mrec = np.concatenate(([0.], rec, [1.])) 217 | mpre = np.concatenate(([0.], prec, [0.])) 218 | 219 | # compute the precision envelope 220 | for i in range(mpre.size - 1, 0, -1): 221 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 222 | 223 | # to calculate area under PR curve, look for points 224 | # where X axis (recall) changes value 225 | i = np.where(mrec[1:] != mrec[:-1])[0] 226 | 227 | # and sum (\Delta recall) * prec 228 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 229 | return ap 230 | 231 | 232 | def voc_eval(detpath, 233 | annopath, 234 | imagesetfile, 235 | classname, 236 | cachedir, 237 | ovthresh=0.5, 238 | use_07_metric=True): 239 | """rec, prec, ap = voc_eval(detpath, 240 | annopath, 241 | imagesetfile, 242 | classname, 243 | [ovthresh], 244 | [use_07_metric]) 245 | Top level function that does the PASCAL VOC evaluation. 246 | detpath: Path to detections 247 | detpath.format(classname) should produce the detection results file. 248 | annopath: Path to annotations 249 | annopath.format(imagename) should be the xml annotations file. 250 | imagesetfile: Text file containing the list of images, one image per line. 251 | classname: Category name (duh) 252 | cachedir: Directory for caching the annotations 253 | [ovthresh]: Overlap threshold (default = 0.5) 254 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 255 | (default True) 256 | """ 257 | # assumes detections are in detpath.format(classname) 258 | # assumes annotations are in annopath.format(imagename) 259 | # assumes imagesetfile is a text file with each line an image name 260 | # cachedir caches the annotations in a pickle file 261 | # first load gt 262 | if not os.path.isdir(cachedir): 263 | os.mkdir(cachedir) 264 | cachefile = os.path.join(cachedir, 'annots.pkl') 265 | # read list of images 266 | with open(imagesetfile, 'r') as f: 267 | lines = f.readlines() 268 | imagenames = [x.strip() for x in lines] 269 | if not os.path.isfile(cachefile): 270 | # load annots 271 | recs = {} 272 | for i, imagename in enumerate(imagenames): 273 | recs[imagename] = parse_rec(annopath % (imagename)) 274 | if i % 100 == 0: 275 | print('Reading annotation for {:d}/{:d}'.format( 276 | i + 1, len(imagenames))) 277 | # save 278 | print('Saving cached annotations to {:s}'.format(cachefile)) 279 | with open(cachefile, 'wb') as f: 280 | pickle.dump(recs, f) 281 | else: 282 | # load 283 | with open(cachefile, 'rb') as f: 284 | recs = pickle.load(f) 285 | 286 | # extract gt objects for this class 287 | class_recs = {} 288 | npos = 0 289 | for imagename in imagenames: 290 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 291 | bbox = np.array([x['bbox'] for x in R]) 292 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 293 | det = [False] * len(R) 294 | npos = npos + sum(~difficult) 295 | class_recs[imagename] = {'bbox': bbox, 296 | 'difficult': difficult, 297 | 'det': det} 298 | 299 | # read dets 300 | detfile = detpath.format(classname) 301 | with open(detfile, 'r') as f: 302 | lines = f.readlines() 303 | if any(lines) == 1: 304 | 305 | splitlines = [x.strip().split(' ') for x in lines] 306 | image_ids = [x[0] for x in splitlines] 307 | confidence = np.array([float(x[1]) for x in splitlines]) 308 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 309 | 310 | # sort by confidence 311 | sorted_ind = np.argsort(-confidence) 312 | sorted_scores = np.sort(-confidence) 313 | BB = 
BB[sorted_ind, :] 314 | image_ids = [image_ids[x] for x in sorted_ind] 315 | 316 | # go down dets and mark TPs and FPs 317 | nd = len(image_ids) 318 | tp = np.zeros(nd) 319 | fp = np.zeros(nd) 320 | for d in range(nd): 321 | R = class_recs[image_ids[d]] 322 | bb = BB[d, :].astype(float) 323 | ovmax = -np.inf 324 | BBGT = R['bbox'].astype(float) 325 | if BBGT.size > 0: 326 | # compute overlaps 327 | # intersection 328 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 329 | iymin = np.maximum(BBGT[:, 1], bb[1]) 330 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 331 | iymax = np.minimum(BBGT[:, 3], bb[3]) 332 | iw = np.maximum(ixmax - ixmin, 0.) 333 | ih = np.maximum(iymax - iymin, 0.) 334 | inters = iw * ih 335 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 336 | (BBGT[:, 2] - BBGT[:, 0]) * 337 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 338 | overlaps = inters / uni 339 | ovmax = np.max(overlaps) 340 | jmax = np.argmax(overlaps) 341 | 342 | if ovmax > ovthresh: 343 | if not R['difficult'][jmax]: 344 | if not R['det'][jmax]: 345 | tp[d] = 1. 346 | R['det'][jmax] = 1 347 | else: 348 | fp[d] = 1. 349 | else: 350 | fp[d] = 1. 351 | 352 | # compute precision recall 353 | fp = np.cumsum(fp) 354 | tp = np.cumsum(tp) 355 | rec = tp / float(npos) 356 | # avoid divide by zero in case the first detection matches a difficult 357 | # ground truth 358 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 359 | ap = voc_ap(rec, prec, use_07_metric) 360 | else: 361 | rec = -1. 362 | prec = -1. 363 | ap = -1. 364 | 365 | return rec, prec, ap 366 | 367 | 368 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 369 | im_size=300, thresh=0.05): 370 | num_images = len(dataset) 371 | # all detections are collected into: 372 | # all_boxes[cls][image] = N x 5 array of detections in 373 | # (x1, y1, x2, y2, score) 374 | all_boxes = [[[] for _ in range(num_images)] 375 | for _ in range(len(labelmap)+1)] 376 | 377 | # timers 378 | _t = {'im_detect': Timer(), 'misc': Timer()} 379 | output_dir = get_output_dir('ssd300_120000', set_type) 380 | det_file = os.path.join(output_dir, 'detections.pkl') 381 | 382 | for i in range(num_images): 383 | im, gt, h, w = dataset.pull_item(i) 384 | 385 | x = Variable(im.unsqueeze(0)) 386 | if args.cuda: 387 | x = x.cuda() 388 | _t['im_detect'].tic() 389 | detections = net(x).data 390 | detect_time = _t['im_detect'].toc(average=False) 391 | 392 | # skip j = 0, because it's the background class 393 | for j in range(1, detections.size(1)): 394 | dets = detections[0, j, :] 395 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 396 | dets = torch.masked_select(dets, mask).view(-1, 5) 397 | if dets.size(0) == 0: 398 | continue 399 | boxes = dets[:, 1:] 400 | boxes[:, 0] *= w 401 | boxes[:, 2] *= w 402 | boxes[:, 1] *= h 403 | boxes[:, 3] *= h 404 | scores = dets[:, 0].cpu().numpy() 405 | cls_dets = np.hstack((boxes.cpu().numpy(), 406 | scores[:, np.newaxis])).astype(np.float32, 407 | copy=False) 408 | all_boxes[j][i] = cls_dets 409 | 410 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 411 | num_images, detect_time)) 412 | 413 | with open(det_file, 'wb') as f: 414 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 415 | 416 | print('Evaluating detections') 417 | evaluate_detections(all_boxes, output_dir, dataset) 418 | 419 | 420 | def evaluate_detections(box_list, output_dir, dataset): 421 | write_voc_results_file(box_list, dataset) 422 | do_python_eval(output_dir) 423 | 424 | 425 | if __name__ == '__main__': 426 | # load net 427 | num_classes = len(labelmap) + 1 # +1 for background 428 
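# Note: num_classes includes the background class, and build_ssd('test', ...) (presumably
# following the upstream ssd.pytorch design this code is adapted from) returns a net whose
# forward pass already applies softmax and NMS, which is why test_net() above reads
# net(x).data directly as a [batch, num_classes, top_k, 5] tensor of
# (score, x1, y1, x2, y2) rows.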
| net = build_ssd('test', 300, num_classes) # initialize SSD 429 | net.load_state_dict(torch.load(args.trained_model)) 430 | net.eval() 431 | print('Finished loading model!') 432 | # load data 433 | dataset = customDetection(args.custom_root, [('shenhe', set_type)], 434 | BaseTransform(300, dataset_mean), 435 | customAnnotationTransform()) 436 | if args.cuda: 437 | net = net.cuda() 438 | cudnn.benchmark = True 439 | # evaluation 440 | test_net(args.save_folder, net, args.cuda, dataset, 441 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 442 | thresh=args.confidence_threshold) 443 | -------------------------------------------------------------------------------- /实验 4.2/evalCustom_18.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from __future__ import print_function 8 | import torch 9 | import torch.nn as nn 10 | import torch.backends.cudnn as cudnn 11 | from torch.autograd import Variable 12 | # from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 13 | # from data import VOC_CLASSES as labelmap 14 | import torch.utils.data as data 15 | 16 | from data import BaseTransform 17 | from data.custom import CUSTOM_CLASSES as labelmap 18 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 19 | 20 | # from ssd import build_ssd 21 | from ssd_resnet_18 import build_ssd 22 | 23 | import sys 24 | import os 25 | import time 26 | import argparse 27 | import numpy as np 28 | import pickle 29 | import cv2 30 | 31 | if sys.version_info[0] == 2: 32 | import xml.etree.cElementTree as ET 33 | else: 34 | import xml.etree.ElementTree as ET 35 | 36 | 37 | def str2bool(v): 38 | return v.lower() in ("yes", "true", "t", "1") 39 | 40 | 41 | parser = argparse.ArgumentParser( 42 | description='Single Shot MultiBox Detector Evaluation') 43 | parser.add_argument('--trained_model', 44 | default='weights/CUSTOM.pth', type=str, 45 | help='Trained state_dict file path to open') 46 | parser.add_argument('--save_folder', default='eval/', type=str, 47 | help='File path to save results') 48 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 49 | help='Detection confidence threshold') 50 | parser.add_argument('--top_k', default=5, type=int, 51 | help='Further restrict the number of predictions to parse') 52 | parser.add_argument('--cuda', default=True, type=str2bool, 53 | help='Use cuda to train model') 54 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, 55 | help='Location of VOC root directory') 56 | parser.add_argument('--cleanup', default=True, type=str2bool, 57 | help='Cleanup and remove results files following eval') 58 | 59 | args = parser.parse_args() 60 | 61 | if not os.path.exists(args.save_folder): 62 | os.mkdir(args.save_folder) 63 | 64 | if torch.cuda.is_available(): 65 | if args.cuda: 66 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 67 | if not args.cuda: 68 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 69 | CUDA. 
Run with --cuda for optimal eval speed.") 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | else: 72 | torch.set_default_tensor_type('torch.FloatTensor') 73 | 74 | annopath = os.path.join(args.custom_root, 'shenhe', 'Annotations', '%s.xml') 75 | imgpath = os.path.join(args.custom_root, 'shenhe', 'JPEGImages', '%s.jpg') 76 | imgsetpath = os.path.join(args.custom_root, 'shenhe', 'ImageSets', 'Main', '%s.txt') 77 | 78 | devkit_path = args.custom_root + 'shenhe' 79 | dataset_mean = (104, 117, 123) 80 | set_type = 'test' 81 | 82 | 83 | class Timer(object): 84 | """A simple timer.""" 85 | def __init__(self): 86 | self.total_time = 0. 87 | self.calls = 0 88 | self.start_time = 0. 89 | self.diff = 0. 90 | self.average_time = 0. 91 | 92 | def tic(self): 93 | # using time.time instead of time.clock because time.clock 94 | # does not normalize for multithreading 95 | self.start_time = time.time() 96 | 97 | def toc(self, average=True): 98 | self.diff = time.time() - self.start_time 99 | self.total_time += self.diff 100 | self.calls += 1 101 | self.average_time = self.total_time / self.calls 102 | if average: 103 | return self.average_time 104 | else: 105 | return self.diff 106 | 107 | 108 | def parse_rec(filename): 109 | """ Parse a PASCAL VOC xml file """ 110 | tree = ET.parse(filename) 111 | objects = [] 112 | for obj in tree.findall('object'): 113 | obj_struct = {} 114 | obj_struct['name'] = obj.find('name').text 115 | obj_struct['pose'] = obj.find('pose').text 116 | obj_struct['truncated'] = int(obj.find('truncated').text) 117 | obj_struct['difficult'] = int(obj.find('difficult').text) 118 | bbox = obj.find('bndbox') 119 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 120 | int(bbox.find('ymin').text) - 1, 121 | int(bbox.find('xmax').text) - 1, 122 | int(bbox.find('ymax').text) - 1] 123 | objects.append(obj_struct) 124 | 125 | return objects 126 | 127 | 128 | def get_output_dir(name, phase): 129 | """Return the directory where experimental artifacts are placed. 130 | If the directory does not exist, it is created. 131 | A canonical path is built using the name from an imdb and a network 132 | (if not None). 133 | """ 134 | filedir = os.path.join(name, phase) 135 | if not os.path.exists(filedir): 136 | os.makedirs(filedir) 137 | return filedir 138 | 139 | 140 | def get_voc_results_file_template(image_set, cls): 141 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 142 | filename = 'det_' + image_set + '_%s.txt' % (cls) 143 | filedir = os.path.join(devkit_path, 'results') 144 | if not os.path.exists(filedir): 145 | os.makedirs(filedir) 146 | path = os.path.join(filedir, filename) 147 | return path 148 | 149 | 150 | def write_voc_results_file(all_boxes, dataset): 151 | for cls_ind, cls in enumerate(labelmap): 152 | print('Writing {:s} VOC results file'.format(cls)) 153 | filename = get_voc_results_file_template(set_type, cls) 154 | with open(filename, 'wt') as f: 155 | for im_ind, index in enumerate(dataset.ids): 156 | dets = all_boxes[cls_ind+1][im_ind] 157 | if len(dets) == 0: 158 | continue 159 | # the VOCdevkit expects 1-based indices 160 | for k in range(dets.shape[0]): 161 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
162 | format(index[1], dets[k, -1], 163 | dets[k, 0] + 1, dets[k, 1] + 1, 164 | dets[k, 2] + 1, dets[k, 3] + 1)) 165 | 166 | 167 | def do_python_eval(output_dir='output', use_07=True): 168 | cachedir = os.path.join(devkit_path, 'annotations_cache') 169 | aps = [] 170 | # The PASCAL VOC metric changed in 2010 171 | use_07_metric = use_07 172 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 173 | if not os.path.isdir(output_dir): 174 | os.mkdir(output_dir) 175 | for i, cls in enumerate(labelmap): 176 | filename = get_voc_results_file_template(set_type, cls) 177 | rec, prec, ap = voc_eval( 178 | filename, annopath, imgsetpath % (set_type), cls, cachedir, 179 | ovthresh=0.1, use_07_metric=use_07_metric) 180 | aps += [ap] 181 | print('AP for {} = {:.4f}'.format(cls, ap)) 182 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 183 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 184 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 185 | print('~~~~~~~~') 186 | print('Results:') 187 | for ap in aps: 188 | print('{:.3f}'.format(ap)) 189 | print('{:.3f}'.format(np.mean(aps))) 190 | print('~~~~~~~~') 191 | print('') 192 | print('--------------------------------------------------------------') 193 | print('Results computed with the **unofficial** Python eval code.') 194 | print('Results should be very close to the official MATLAB eval code.') 195 | print('--------------------------------------------------------------') 196 | 197 | 198 | def voc_ap(rec, prec, use_07_metric=True): 199 | """ ap = voc_ap(rec, prec, [use_07_metric]) 200 | Compute VOC AP given precision and recall. 201 | If use_07_metric is true, uses the 202 | VOC 07 11 point method (default:True). 203 | """ 204 | if use_07_metric: 205 | # 11 point metric 206 | ap = 0. 207 | for t in np.arange(0., 1.1, 0.1): 208 | if np.sum(rec >= t) == 0: 209 | p = 0 210 | else: 211 | p = np.max(prec[rec >= t]) 212 | ap = ap + p / 11. 213 | else: 214 | # correct AP calculation 215 | # first append sentinel values at the end 216 | mrec = np.concatenate(([0.], rec, [1.])) 217 | mpre = np.concatenate(([0.], prec, [0.])) 218 | 219 | # compute the precision envelope 220 | for i in range(mpre.size - 1, 0, -1): 221 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 222 | 223 | # to calculate area under PR curve, look for points 224 | # where X axis (recall) changes value 225 | i = np.where(mrec[1:] != mrec[:-1])[0] 226 | 227 | # and sum (\Delta recall) * prec 228 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 229 | return ap 230 | 231 | 232 | def voc_eval(detpath, 233 | annopath, 234 | imagesetfile, 235 | classname, 236 | cachedir, 237 | ovthresh=0.5, 238 | use_07_metric=True): 239 | """rec, prec, ap = voc_eval(detpath, 240 | annopath, 241 | imagesetfile, 242 | classname, 243 | [ovthresh], 244 | [use_07_metric]) 245 | Top level function that does the PASCAL VOC evaluation. 246 | detpath: Path to detections 247 | detpath.format(classname) should produce the detection results file. 248 | annopath: Path to annotations 249 | annopath.format(imagename) should be the xml annotations file. 250 | imagesetfile: Text file containing the list of images, one image per line. 
251 | classname: Category name 252 | cachedir: Directory for caching the annotations 253 | [ovthresh]: Overlap threshold (default = 0.5) 254 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 255 | (default True) 256 | """ 257 | # assumes detections are in detpath.format(classname) 258 | # assumes annotations are in annopath.format(imagename) 259 | # assumes imagesetfile is a text file with each line an image name 260 | # cachedir caches the annotations in a pickle file 261 | # first load gt 262 | if not os.path.isdir(cachedir): 263 | os.mkdir(cachedir) 264 | cachefile = os.path.join(cachedir, 'annots.pkl') 265 | # read list of images 266 | with open(imagesetfile, 'r') as f: 267 | lines = f.readlines() 268 | imagenames = [x.strip() for x in lines] 269 | if not os.path.isfile(cachefile): 270 | # load annots 271 | recs = {} 272 | for i, imagename in enumerate(imagenames): 273 | recs[imagename] = parse_rec(annopath % (imagename)) 274 | if i % 100 == 0: 275 | print('Reading annotation for {:d}/{:d}'.format( 276 | i + 1, len(imagenames))) 277 | # save 278 | print('Saving cached annotations to {:s}'.format(cachefile)) 279 | with open(cachefile, 'wb') as f: 280 | pickle.dump(recs, f) 281 | else: 282 | # load 283 | with open(cachefile, 'rb') as f: 284 | recs = pickle.load(f) 285 | 286 | # extract gt objects for this class 287 | class_recs = {} 288 | npos = 0 289 | for imagename in imagenames: 290 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 291 | bbox = np.array([x['bbox'] for x in R]) 292 | difficult = np.array([x['difficult'] for x in R]).astype(bool) 293 | det = [False] * len(R) 294 | npos = npos + sum(~difficult) 295 | class_recs[imagename] = {'bbox': bbox, 296 | 'difficult': difficult, 297 | 'det': det} 298 | 299 | # read dets 300 | detfile = detpath.format(classname) 301 | with open(detfile, 'r') as f: 302 | lines = f.readlines() 303 | if any(lines): 304 | 305 | splitlines = [x.strip().split(' ') for x in lines] 306 | image_ids = [x[0] for x in splitlines] 307 | confidence = np.array([float(x[1]) for x in splitlines]) 308 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 309 | 310 | # sort by confidence 311 | sorted_ind = np.argsort(-confidence) 312 | sorted_scores = np.sort(-confidence) 313 | BB = BB[sorted_ind, :] 314 | image_ids = [image_ids[x] for x in sorted_ind] 315 | 316 | # go down dets and mark TPs and FPs 317 | nd = len(image_ids) 318 | tp = np.zeros(nd) 319 | fp = np.zeros(nd) 320 | for d in range(nd): 321 | R = class_recs[image_ids[d]] 322 | bb = BB[d, :].astype(float) 323 | ovmax = -np.inf 324 | BBGT = R['bbox'].astype(float) 325 | if BBGT.size > 0: 326 | # compute overlaps 327 | # intersection 328 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 329 | iymin = np.maximum(BBGT[:, 1], bb[1]) 330 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 331 | iymax = np.minimum(BBGT[:, 3], bb[3]) 332 | iw = np.maximum(ixmax - ixmin, 0.) 333 | ih = np.maximum(iymax - iymin, 0.) 334 | inters = iw * ih 335 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 336 | (BBGT[:, 2] - BBGT[:, 0]) * 337 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 338 | overlaps = inters / uni 339 | ovmax = np.max(overlaps) 340 | jmax = np.argmax(overlaps) 341 | 342 | if ovmax > ovthresh: 343 | if not R['difficult'][jmax]: 344 | if not R['det'][jmax]: 345 | tp[d] = 1. 346 | R['det'][jmax] = 1 347 | else: 348 | fp[d] = 1. 349 | else: 350 | fp[d] = 1. 
351 | 352 | # compute precision recall 353 | fp = np.cumsum(fp) 354 | tp = np.cumsum(tp) 355 | rec = tp / float(npos) 356 | # avoid divide by zero in case the first detection matches a difficult 357 | # ground truth 358 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 359 | ap = voc_ap(rec, prec, use_07_metric) 360 | else: 361 | rec = -1. 362 | prec = -1. 363 | ap = -1. 364 | 365 | return rec, prec, ap 366 | 367 | 368 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 369 | im_size=300, thresh=0.05): 370 | num_images = len(dataset) 371 | # all detections are collected into: 372 | # all_boxes[cls][image] = N x 5 array of detections in 373 | # (x1, y1, x2, y2, score) 374 | all_boxes = [[[] for _ in range(num_images)] 375 | for _ in range(len(labelmap)+1)] 376 | 377 | # timers 378 | _t = {'im_detect': Timer(), 'misc': Timer()} 379 | output_dir = get_output_dir('ssd300_120000', set_type) 380 | det_file = os.path.join(output_dir, 'detections.pkl') 381 | 382 | for i in range(num_images): 383 | im, gt, h, w = dataset.pull_item(i) 384 | 385 | x = Variable(im.unsqueeze(0)) 386 | if args.cuda: 387 | x = x.cuda() 388 | _t['im_detect'].tic() 389 | detections = net(x).data 390 | detect_time = _t['im_detect'].toc(average=False) 391 | 392 | # skip j = 0, because it's the background class 393 | for j in range(1, detections.size(1)): 394 | dets = detections[0, j, :] 395 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 396 | dets = torch.masked_select(dets, mask).view(-1, 5) 397 | if dets.size(0) == 0: 398 | continue 399 | boxes = dets[:, 1:] 400 | boxes[:, 0] *= w 401 | boxes[:, 2] *= w 402 | boxes[:, 1] *= h 403 | boxes[:, 3] *= h 404 | scores = dets[:, 0].cpu().numpy() 405 | cls_dets = np.hstack((boxes.cpu().numpy(), 406 | scores[:, np.newaxis])).astype(np.float32, 407 | copy=False) 408 | all_boxes[j][i] = cls_dets 409 | 410 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 411 | num_images, detect_time)) 412 | 413 | with open(det_file, 'wb') as f: 414 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 415 | 416 | print('Evaluating detections') 417 | evaluate_detections(all_boxes, output_dir, dataset) 418 | 419 | 420 | def evaluate_detections(box_list, output_dir, dataset): 421 | write_voc_results_file(box_list, dataset) 422 | do_python_eval(output_dir) 423 | 424 | 425 | if __name__ == '__main__': 426 | # load net 427 | num_classes = len(labelmap) + 1 # +1 for background 428 | net = build_ssd('test', 300, num_classes) # initialize SSD 429 | net.load_state_dict(torch.load(args.trained_model)) 430 | net.eval() 431 | print('Finished loading model!') 432 | # load data 433 | dataset = customDetection(args.custom_root, [('shenhe', set_type)], 434 | BaseTransform(300, dataset_mean), 435 | customAnnotationTransform()) 436 | if args.cuda: 437 | net = net.cuda() 438 | cudnn.benchmark = True 439 | # evaluation 440 | test_net(args.save_folder, net, args.cuda, dataset, 441 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 442 | thresh=args.confidence_threshold) 443 | --------------------------------------------------------------------------------
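
As a quick illustration of what the voc_ap function in the two eval scripts above computes, here is a minimal standalone sketch (not part of the repository; the rec/prec arrays are toy values chosen only for illustration) that runs both branches of that function, the VOC07 11-point interpolation and the post-2010 area-under-envelope variant, on the same inputs:

# Minimal sketch mirroring the two AP variants in voc_ap above.
# The recall/precision values below are toy numbers for illustration only.
import numpy as np

def ap_11_point(rec, prec):
    # VOC07 metric: average the best precision at recall >= t for t = 0.0, 0.1, ..., 1.0
    ap = 0.
    for t in np.arange(0., 1.1, 0.1):
        p = 0. if np.sum(rec >= t) == 0 else np.max(prec[rec >= t])
        ap += p / 11.
    return ap

def ap_area(rec, prec):
    # Post-2010 metric: area under the monotonically decreasing precision envelope
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    i = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])

if __name__ == '__main__':
    rec = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    prec = np.array([1.0, 0.9, 0.7, 0.6, 0.5])
    print('11-point AP: {:.4f}'.format(ap_11_point(rec, prec)))  # ~0.4273
    print('Area AP:     {:.4f}'.format(ap_area(rec, prec)))      # 0.3700

Running the sketch on the same toy curve makes the difference between the two metrics visible, which is why do_python_eval prints whether the VOC07 metric is in use before reporting per-class AP.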