├── utils
│   ├── __init__.py
│   └── augmentations.py
├── layers
│   ├── __init__.py
│   ├── functions
│   │   ├── __init__.py
│   │   ├── prior_box.py
│   │   └── detection.py
│   ├── modules
│   │   ├── __init__.py
│   │   ├── l2norm.py
│   │   ├── focal_loss.py
│   │   └── multibox_loss.py
│   └── box_utils.py
├── .gitattributes
├── netModel
│   ├── testModel.py
│   ├── multi_flow.py
│   └── resnet.py
├── data
│   ├── validPhoto.py
│   ├── splitTrainVal.py
│   ├── splitTrainVal copy.py
│   ├── xmlPaser.py
│   ├── coco
│   │   └── coco_labels.txt
│   ├── __init__.py
│   ├── resultVisualize.py
│   ├── config.py
│   ├── xmlPaserGenLabel.py
│   ├── voc0712.py
│   ├── custom.py
│   ├── custom_for_visual.py
│   └── coco.py
├── LICENSE
├── README.md
├── .gitignore
├── 实验 4.1
│   ├── ssd_resnet_101.py
│   ├── trainCustom_101.py
│   ├── visualTest_gauge.py
│   └── evalCustom_101.py
└── 实验 4.2
    ├── ssd_resnet_18.py
    ├── visualTest_building.py
    ├── trainCustom_18.py
    └── evalCustom_18.py
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .augmentations import SSDAugmentation -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .l2norm import L2Norm 2 | from .multibox_loss import MultiBoxLoss 3 | 4 | __all__ = ['L2Norm', 'MultiBoxLoss'] 5 | -------------------------------------------------------------------------------- /netModel/testModel.py: -------------------------------------------------------------------------------- 1 | from resnet import resnet101 2 | import torch 3 | 4 | if __name__ == '__main__': 5 | model = resnet101() 6 | input = torch.rand(2,3,512,512) 7 | res = model(input) 8 | print(model) -------------------------------------------------------------------------------- /data/validPhoto.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | import cv2 3 | 4 | if __name__ == '__main__': 5 | p = Path('./piaofu/piao/shenhe/JPEGImages/') 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | for file in files: 8 | # cv2.imread does not raise on an unreadable file, it returns None, 9 | # so check the return value instead of relying on an exception 10 | img = cv2.imread('./piaofu/piao/shenhe/JPEGImages/%s' % file.name, cv2.IMREAD_COLOR) 11 | if img is None: 12 | print(file.name) -------------------------------------------------------------------------------- /data/splitTrainVal.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | 3 | resultPath = "./video/buildingwater/ImageSets/Main/" 4 | def splitDataset(path, filename): 5 | p = Path(path) 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | count = 0 8 | with open(resultPath+filename+'trainval.txt',
'w+') as f: 9 | with open(resultPath+filename+'train.txt', 'w+') as ft: 10 | with open(resultPath+filename+'val.txt', 'w+') as fv: 11 | for file in files: 12 | f.write(file.stem + '\n') 13 | if count % 5 == 4: 14 | fv.write(file.stem + '\n') 15 | else: 16 | ft.write(file.stem + '\n') 17 | count += 1 18 | 19 | splitDataset('./video/buildingwater/Annotations', '') 20 | -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant_(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | #x /= norm 22 | x = torch.div(x,norm) 23 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 24 | return out 25 | -------------------------------------------------------------------------------- /data/splitTrainVal copy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path, PurePath 2 | 3 | resultPath = "./video/buildingwater/ImageSets/Main/" 4 | def splitDataset(path, filename): 5 | p = Path(path) 6 | files = [x for x in p.iterdir() if x.is_file()] 7 | count = 0 8 | with open(resultPath+filename+'trainval0.txt', 'w+') as f: 9 | with open(resultPath+filename+'train0.txt', 'w+') as ft: 10 | with open(resultPath+filename+'val0.txt', 'w+') as fv: 11 | for file in files: 12 | f.write(file.stem + '\n') 13 | if file.stem.find('v1') > -1 or file.stem.find('v2') > -1 or file.stem.find('v4') > -1 or file.stem.find('v5') > -1 or file.stem.find('v6') > -1: 14 | ft.write(file.stem + '\n') 15 | elif file.stem.find('v3') > -1: 16 | fv.write(file.stem + '\n') 17 | count += 1 18 | 19 | splitDataset('./video/buildingwater/Annotations', '') 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data/xmlPaser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import xml.sax 5 | from pathlib import Path, PurePath 6 | 7 | total = {} 8 | 9 | class MovieHandler( xml.sax.ContentHandler ): 10 | def __init__(self): 11 | self.CurrentData = "" 12 | self.name = '' 13 | 14 | # 元素开始事件处理 15 | def startElement(self, tag, attributes): 16 | self.CurrentData = tag 17 | 18 | # 元素结束事件处理 19 | def endElement(self, tag): 20 | if self.CurrentData == "name": 21 | if self.name in total: 22 | total[self.name] += 1 23 | else: 24 | total[self.name] = 1 25 | self.CurrentData = "" 26 | 27 | # 内容事件处理 28 | def characters(self, content): 29 | if self.CurrentData == "name": 30 | self.name = content 31 | 32 | if ( __name__ == "__main__"): 33 | 34 | # 创建一个 XMLReader 35 | parser = xml.sax.make_parser() 36 | # turn off namepsaces 37 | parser.setFeature(xml.sax.handler.feature_namespaces, 0) 38 | 39 | # 重写 ContextHandler 40 | Handler = MovieHandler() 41 | parser.setContentHandler( Handler ) 42 | 43 | path = '.\\piaofu\\piao\\shenhe\\Annotations' 44 | p = Path(path) 45 | files = [x for x in p.iterdir() if x.is_file()] 46 | for f in files: 47 | parser.parse(path+'\\'+f.name) 48 | print(total) -------------------------------------------------------------------------------- /data/coco/coco_labels.txt: -------------------------------------------------------------------------------- 1 | 1,1,person 2 | 2,2,bicycle 3 | 3,3,car 4 | 4,4,motorcycle 5 | 5,5,airplane 6 | 6,6,bus 7 | 7,7,train 8 | 8,8,truck 9 | 9,9,boat 10 | 10,10,traffic light 11 | 11,11,fire hydrant 12 | 13,12,stop sign 13 | 14,13,parking meter 14 | 15,14,bench 15 | 16,15,bird 16 | 17,16,cat 17 | 18,17,dog 18 | 19,18,horse 19 | 20,19,sheep 20 | 21,20,cow 21 | 22,21,elephant 22 | 23,22,bear 23 | 24,23,zebra 24 | 25,24,giraffe 25 | 27,25,backpack 26 | 28,26,umbrella 27 | 31,27,handbag 28 | 32,28,tie 29 | 33,29,suitcase 30 | 34,30,frisbee 31 | 35,31,skis 32 | 36,32,snowboard 33 | 37,33,sports ball 34 | 38,34,kite 35 | 39,35,baseball bat 36 | 40,36,baseball glove 37 | 41,37,skateboard 38 | 42,38,surfboard 39 | 43,39,tennis racket 40 | 44,40,bottle 41 | 46,41,wine glass 42 | 47,42,cup 43 | 48,43,fork 44 | 49,44,knife 45 | 50,45,spoon 46 | 51,46,bowl 47 | 52,47,banana 48 | 53,48,apple 49 | 54,49,sandwich 50 | 55,50,orange 51 | 56,51,broccoli 52 | 57,52,carrot 53 | 58,53,hot dog 54 | 59,54,pizza 55 | 60,55,donut 56 | 61,56,cake 57 | 62,57,chair 58 | 63,58,couch 59 | 64,59,potted plant 60 | 65,60,bed 61 | 67,61,dining table 62 | 70,62,toilet 63 | 72,63,tv 64 | 73,64,laptop 65 | 74,65,mouse 66 | 75,66,remote 67 | 76,67,keyboard 68 | 77,68,cell phone 69 | 78,69,microwave 70 | 79,70,oven 71 | 80,71,toaster 72 | 81,72,sink 73 | 82,73,refrigerator 74 | 84,74,book 75 | 85,75,clock 76 | 86,76,vase 77 | 87,77,scissors 78 | 88,78,teddy bear 79 | 89,79,hair drier 80 | 90,80,toothbrush 81 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, 
VOC_CLASSES, VOC_ROOT 2 | from .custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 3 | 4 | # from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 5 | from .config import * 6 | import torch 7 | import cv2 8 | import numpy as np 9 | 10 | def detection_collate(batch): 11 | """Custom collate fn for dealing with batches of images that have a different 12 | number of associated object annotations (bounding boxes). 13 | 14 | Arguments: 15 | batch: (tuple) A tuple of tensor images and lists of annotations 16 | 17 | Return: 18 | A tuple containing: 19 | 1) (tensor) batch of images stacked on their 0 dim 20 | 2) (list of tensors) annotations for a given image are stacked on 21 | 0 dim 22 | """ 23 | targets = [] 24 | imgs = [] 25 | for sample in batch: 26 | imgs.append(sample[0]) 27 | targets.append(torch.FloatTensor(sample[1])) 28 | return torch.stack(imgs, 0), targets 29 | 30 | 31 | def base_transform(image, size, mean): 32 | x = cv2.resize(image, (size, size)).astype(np.float32) 33 | x -= mean 34 | x = x.astype(np.float32) 35 | return x 36 | 37 | 38 | class BaseTransform: 39 | def __init__(self, size, mean): 40 | self.size = size 41 | self.mean = np.array(mean, dtype=np.float32) 42 | 43 | def __call__(self, image, boxes=None, labels=None): 44 | return base_transform(image, self.size, self.mean), boxes, labels 45 | -------------------------------------------------------------------------------- /data/resultVisualize.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os.path as osp 4 | import math 5 | 6 | rootPath = 'F:/ssd/data/video/waterline' 7 | 8 | imgList = {} 9 | 10 | if __name__ == "__main__": 11 | with open('./det_test_waterline_99.txt', 'r') as f: 12 | text_lines = f.readlines() 13 | for line in text_lines: 14 | info = line.split(" ") 15 | name, score, x1, y1, x2, y2 = info 16 | if name in imgList: 17 | if float(score) > imgList[name]['score']: 18 | imgList[name] = { 19 | 'score': float(score), 20 | 'x1': float(x1), 21 | 'y1': float(y1), 22 | 'x2': float(x2), 23 | 'y2': float(y2) 24 | } 25 | else: 26 | imgList[name] = { 27 | 'score': float(score), 28 | 'x1': float(x1), 29 | 'y1': float(y1), 30 | 'x2': float(x2), 31 | 'y2': float(y2) 32 | } 33 | 34 | cv2.namedWindow('w1',1) 35 | img_path = osp.join(rootPath, 'JPEGImages', '%s.jpg') 36 | for obj in imgList.items(): 37 | name, img = obj 38 | image = cv2.imread(img_path % name) 39 | (h, w, c) = image.shape 40 | cv2.rectangle(image, (math.floor(img['x1']), math.floor(img['y1'])), (math.floor(img['x2']), math.floor(img['y2'])), (255,0,0), 5) 41 | # cv2.putText(image, img['score'], (math.floor(img['x1']), math.floor(img['y1'])), cv2.FONT_HERSHEY_COMPLEX, 5, (0, 255, 0), 12) 42 | # sc = min(512, h) / h 43 | # image = cv2.resize(image, (math.floor(w * sc), math.floor(h * sc))) 44 | image = cv2.resize(image, (512, 512)) 45 | cv2.imshow('w1', image) 46 | cv2.waitKey() 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD: Single Shot MultiBox Object Detector, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang, and Alexander C. Berg. 
The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd). 3 | 4 | 5 | 6 | 7 | ### Table of Contents 8 | - Installation 9 | - Train 10 | - Notes 11 | 12 | 13 | 14 | 15 | 16 | ## Installation 17 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 18 | - Clone this repository. 19 | * Note: We currently only support Python 3+. 20 | - Then download the dataset by following the [instructions](#datasets) below. 21 | - We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training! 22 | * To use Visdom in the browser: 23 | ```Shell 24 | # First install Python server and client 25 | pip install visdom 26 | # Start the server (probably in a screen or tmux) 27 | python -m visdom.server 28 | ``` 29 | * Then (during training) navigate to http://localhost:8097/ (see the Train section below for training details). 30 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 31 | 32 | ## Train 33 | 34 | ### Experiment 4.1 (实验 4.1) 35 | 36 | Use the data in gauge.zip to train a water gauge detection model. 37 | 38 | Use the data in mark.zip to train a model that detects the actual water level and the warning water level. 39 | 40 | ### Experiment 4.2 (实验 4.2) 41 | 42 | Use the data in buildingwater.zip to train a model that detects river areas and building areas. 43 | 44 | ## Notes 45 | 46 | Modify CUSTOM_CLASSES in data/custom.py to match the dataset being used. 47 | 48 | Modify num_classes, lr_steps, and max_iter in data/config.py to match the dataset being used. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # atom remote-sync package 92 | .remote-sync.json 93 | 94 | # weights 95 | weights/ 96 | 97 | #DS_Store 98 | .DS_Store 99 | 100 | # dev stuff 101 | eval/ 102 | eval.ipynb 103 | dev.ipynb 104 | .vscode/ 105 | 106 | # not ready 107 | videos/ 108 | templates/ 109 | data/ssd_dataloader.py 110 | data/datasets/ 111 | data/video/ 112 | doc/visualize.py 113 | read_results.py 114 | ssd300_120000/ 115 | demos/live 116 | webdemo.py 117 | test_data_aug.py 118 | 119 | # attributes 120 | 121 | # pycharm 122 | .idea/ 123 | 124 | # temp checkout soln 125 | data/datasets/ 126 | data/ssd_dataloader.py 127 | data/piaofu 128 | data/VOCdevkit 129 | data/*.zip 130 | 131 | # pylint 132 | .pylintrc -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | import torch 5 | 6 | 7 | class PriorBox(object): 8 | """Compute priorbox coordinates in center-offset form for each source 9 | feature map. 
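Each prior is emitted as (cx, cy, w, h), with every value normalized to [0, 1] relative to the input image size ('min_dim' in the config).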
10 | """ 11 | def __init__(self, cfg): 12 | super(PriorBox, self).__init__() 13 | self.image_size = cfg['min_dim'] 14 | # number of priors for feature map location (either 4 or 6) 15 | self.num_priors = len(cfg['aspect_ratios']) 16 | self.variance = cfg['variance'] or [0.1] 17 | self.feature_maps = cfg['feature_maps'] 18 | self.min_sizes = cfg['min_sizes'] 19 | self.max_sizes = cfg['max_sizes'] 20 | self.steps = cfg['steps'] 21 | self.aspect_ratios = cfg['aspect_ratios'] 22 | self.clip = cfg['clip'] 23 | self.version = cfg['name'] 24 | for v in self.variance: 25 | if v <= 0: 26 | raise ValueError('Variances must be greater than 0') 27 | 28 | def forward(self): 29 | mean = [] 30 | for k, f in enumerate(self.feature_maps): 31 | for i, j in product(range(f), repeat=2): 32 | f_k = self.image_size / self.steps[k] 33 | # unit center x,y 34 | cx = (j + 0.5) / f_k 35 | cy = (i + 0.5) / f_k 36 | 37 | # aspect_ratio: 1 38 | # rel size: min_size 39 | s_k = self.min_sizes[k]/self.image_size 40 | mean += [cx, cy, s_k, s_k] 41 | 42 | # aspect_ratio: 1 43 | # rel size: sqrt(s_k * s_(k+1)) 44 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 45 | mean += [cx, cy, s_k_prime, s_k_prime] 46 | 47 | # rest of aspect ratios 48 | for ar in self.aspect_ratios[k]: 49 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 50 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 51 | # back to torch land 52 | output = torch.Tensor(mean).view(-1, 4) 53 | if self.clip: 54 | output.clamp_(max=1, min=0) 55 | return output 56 | -------------------------------------------------------------------------------- /netModel/multi_flow.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | try: 4 | from torch.hub import load_state_dict_from_url 5 | except ImportError: 6 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 7 | 8 | class MultiFlow_Block(nn.Module): 9 | 10 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 11 | base_width=64, dilation=1, norm_layer=None): 12 | super(BasicBlock, self).__init__() 13 | if norm_layer is None: 14 | norm_layer = nn.BatchNorm2d 15 | if groups != 1 or base_width != 64: 16 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 17 | if dilation > 1: 18 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 19 | 20 | class MultiFlow(nn.Module): 21 | expansion = 1 22 | __constants__ = ['downsample'] 23 | 24 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 25 | base_width=64, dilation=1, norm_layer=None): 26 | super(BasicBlock, self).__init__() 27 | if norm_layer is None: 28 | norm_layer = nn.BatchNorm2d 29 | if groups != 1 or base_width != 64: 30 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 31 | if dilation > 1: 32 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 33 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 34 | self.conv1 = conv3x3(inplanes, planes, stride) 35 | self.bn1 = norm_layer(planes) 36 | self.relu = nn.ReLU(inplace=True) 37 | self.conv2 = conv3x3(planes, planes) 38 | self.bn2 = norm_layer(planes) 39 | self.downsample = downsample 40 | self.stride = stride 41 | 42 | def forward(self, x): 43 | identity = x 44 | 45 | out = self.conv1(x) 46 | out = self.bn1(out) 47 | out = self.relu(out) 48 | 49 | out = self.conv2(out) 50 | out = self.bn2(out) 51 | 52 | if self.downsample is not None: 53 | identity = 
self.downsample(x) 54 | 55 | out += identity 56 | out = self.relu(out) 57 | 58 | return out -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | HOME = "F:/ssd/" # os.path.expanduser("~") 6 | 7 | # for making bounding boxes pretty 8 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 9 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 10 | 11 | MEANS = (104, 117, 123) 12 | 13 | # SSD300 CONFIGS 14 | custom = { 15 | 'num_classes': 2, 16 | 'lr_steps': (22500, 30000, 37500), 17 | 'max_iter': 120000, 18 | 'feature_maps': [38, 19, 10, 5, 3, 1], 19 | 'min_dim': 300, 20 | 'steps': [8, 16, 32, 64, 100, 300], 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 'max_sizes': [60, 111, 162, 213, 264, 315], 23 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 24 | 'variance': [0.1, 0.2], 25 | 'clip': True, 26 | 'name': 'CUSTOM', 27 | } 28 | 29 | VOC_300_2 = { 30 | 'num_classes': 5, 31 | 'lr_steps': (100000, 130000, 160000), 32 | 'max_iter': 160000, 33 | 'feature_maps' : [38, 19, 10, 5, 3], 34 | 'min_dim' : 300, 35 | 'steps' : [8, 16, 32, 64, 100], 36 | 'min_sizes' : [30, 60, 111, 162, 213], 37 | 'max_sizes' : [60, 111, 162, 213, 315], 38 | 'aspect_ratios' : [[2,3], [2, 3], [2, 3], [2, 3], [2,3]], 39 | 'variance' : [0.1, 0.2], 40 | 'clip' : True, 41 | 'name': 'CUSTOM', 42 | } 43 | 44 | voc = { 45 | 'num_classes': 21, 46 | 'lr_steps': (80000, 100000, 120000), 47 | 'max_iter': 120000, 48 | 'feature_maps': [38, 19, 10, 5, 3, 1], 49 | 'min_dim': 300, 50 | 'steps': [8, 16, 32, 64, 100, 300], 51 | 'min_sizes': [30, 60, 111, 162, 213, 264], 52 | 'max_sizes': [60, 111, 162, 213, 264, 315], 53 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 54 | 'variance': [0.1, 0.2], 55 | 'clip': True, 56 | 'name': 'VOC', 57 | } 58 | 59 | coco = { 60 | 'num_classes': 201, 61 | 'lr_steps': (280000, 360000, 400000), 62 | 'max_iter': 400000, 63 | 'feature_maps': [38, 19, 10, 5, 3, 1], 64 | 'min_dim': 300, 65 | 'steps': [8, 16, 32, 64, 100, 300], 66 | 'min_sizes': [21, 45, 99, 153, 207, 261], 67 | 'max_sizes': [45, 99, 153, 207, 261, 315], 68 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 69 | 'variance': [0.1, 0.2], 70 | 'clip': True, 71 | 'name': 'COCO', 72 | } 73 | -------------------------------------------------------------------------------- /layers/modules/focal_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | class FocalLoss(nn.Module): 8 | r""" 9 | This criterion is a implemenation of Focal Loss, which is proposed in 10 | Focal Loss for Dense Object Detection. 11 | 12 | Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) 13 | 14 | The losses are averaged across observations for each minibatch. 15 | 16 | Args: 17 | alpha(1D Tensor, Variable) : the scalar factor for this criterion 18 | gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), 19 | putting more focus on hard, misclassified examples 20 | size_average(bool): By default, the losses are averaged over observations for each minibatch. 21 | However, if the field size_average is set to False, the losses are 22 | instead summed for each minibatch. 
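Example (a minimal usage sketch; the 4x3 logits and the class indices below are made-up values):
    >>> criterion = FocalLoss(class_num=3, gamma=2)
    >>> loss = criterion(torch.randn(4, 3), torch.tensor([0, 1, 2, 1]))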
23 | 24 | 25 | """ 26 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 27 | super(FocalLoss, self).__init__() 28 | if alpha is None: 29 | self.alpha = Variable(torch.ones(class_num, 1)) 30 | else: 31 | if isinstance(alpha, Variable): 32 | self.alpha = alpha 33 | else: 34 | self.alpha = Variable(alpha) 35 | self.gamma = gamma 36 | self.class_num = class_num 37 | self.size_average = size_average 38 | 39 | def forward(self, inputs, targets): 40 | N = inputs.size(0) 41 | C = inputs.size(1) 42 | P = F.softmax(inputs, dim=-1) 43 | 44 | class_mask = inputs.data.new(N, C).fill_(0) 45 | class_mask = Variable(class_mask) 46 | ids = targets.view(-1, 1) 47 | class_mask.scatter_(1, ids.data, 1.) 48 | #print(class_mask) 49 | 50 | 51 | if inputs.is_cuda and not self.alpha.is_cuda: 52 | self.alpha = self.alpha.cuda() 53 | alpha = self.alpha[ids.data.view(-1)] 54 | 55 | probs = (P*class_mask).sum(1).view(-1,1) 56 | 57 | log_p = probs.log() 58 | #print('probs size= {}'.format(probs.size())) 59 | #print(probs) 60 | 61 | batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 62 | #print('-----bacth_loss------') 63 | #print(batch_loss) 64 | 65 | 66 | if self.size_average: 67 | loss = batch_loss.mean() 68 | else: 69 | loss = batch_loss.sum() 70 | return loss -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from ..box_utils import decode, nms 4 | from data import voc as cfg 5 | 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 14 | self.num_classes = num_classes 15 | self.background_label = bkg_label 16 | self.top_k = top_k 17 | # Parameters used in nms. 18 | self.nms_thresh = nms_thresh 19 | if nms_thresh <= 0: 20 | raise ValueError('nms_threshold must be non negative.') 21 | self.conf_thresh = conf_thresh 22 | self.variance = cfg['variance'] 23 | 24 | def forward(self, loc_data, conf_data, prior_data): 25 | """ 26 | Args: 27 | loc_data: (tensor) Loc preds from loc layers 28 | Shape: [batch,num_priors*4] 29 | conf_data: (tensor) Shape: Conf preds from conf layers 30 | Shape: [batch*num_priors,num_classes] 31 | prior_data: (tensor) Prior boxes and variances from priorbox layers 32 | Shape: [1,num_priors,4] 33 | """ 34 | num = loc_data.size(0) # batch size 35 | num_priors = prior_data.size(0) 36 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 37 | conf_preds = conf_data.view(num, num_priors, 38 | self.num_classes).transpose(2, 1) 39 | 40 | # Decode predictions into bboxes. 
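# decode() (from layers/box_utils.py) combines the predicted offsets with the priors and variances
# to get (xmin, ymin, xmax, ymax) boxes; the loop below then thresholds scores at conf_thresh and
# runs per-class NMS, keeping at most top_k detections.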
41 | for i in range(num): 42 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 43 | # For each class, perform nms 44 | conf_scores = conf_preds[i].clone() 45 | 46 | for cl in range(1, self.num_classes): 47 | c_mask = conf_scores[cl].gt(self.conf_thresh) 48 | scores = conf_scores[cl][c_mask] 49 | if scores.size(0) == 0: 50 | continue 51 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 52 | boxes = decoded_boxes[l_mask].view(-1, 4) 53 | # idx of highest scoring and non-overlapping boxes per class 54 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 55 | output[i, cl, :count] = \ 56 | torch.cat((scores[ids[:count]].unsqueeze(1), 57 | boxes[ids[:count]]), 1) 58 | flt = output.contiguous().view(num, -1, 5) 59 | _, idx = flt[:, :, 0].sort(1, descending=True) 60 | _, rank = idx.sort(1) 61 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 62 | return output 63 | -------------------------------------------------------------------------------- /data/xmlPaserGenLabel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import xml.sax 5 | from pathlib import Path, PurePath 6 | 7 | r = {} 8 | 9 | result = [] 10 | 11 | # label = { 12 | # 'garbage': 0, 13 | # 'garbagew': 1, 14 | # 'www': 2, 15 | # 'w': 3 16 | # } 17 | 18 | label = { 19 | 'waterline': 0, 20 | } 21 | 22 | class MovieHandler( xml.sax.ContentHandler ): 23 | def __init__(self): 24 | self.tag = "" 25 | self.boxes = [] 26 | self.box = { 27 | 'name': '', 28 | 'xmin': 0, 29 | 'xmax': 0, 30 | 'ymin': 0, 31 | 'ymax': 0 32 | } 33 | self.size = { 34 | 'width': 0, 35 | 'height': 0, 36 | 'depth': 0 37 | } 38 | 39 | # 元素开始事件处理 40 | def startElement(self, tag, attributes): 41 | self.tag = tag 42 | 43 | # 元素结束事件处理 44 | def endElement(self, tag): 45 | if self.tag == 'depth': 46 | r['data']['size'] = self.size 47 | if self.tag == 'ymax': 48 | r['data']['boxes'].append(self.box) 49 | self.tag = "" 50 | 51 | # 内容事件处理 52 | def characters(self, content): 53 | if self.tag == 'size': 54 | self.size = { 55 | 'width': 0, 56 | 'height': 0, 57 | 'depth': 0 58 | } 59 | elif self.tag == 'object': 60 | self.box = { 61 | 'name': '', 62 | 'xmin': 0, 63 | 'xmax': 0, 64 | 'ymin': 0, 65 | 'ymax': 0 66 | } 67 | elif self.tag == 'width': 68 | self.size['width'] = int(content) 69 | elif self.tag == 'height': 70 | self.size['height'] = int(content) 71 | elif self.tag == 'depth': 72 | self.size['depth'] = int(content) 73 | elif self.tag == 'name': 74 | self.box['name'] = content 75 | elif self.tag == 'xmin': 76 | self.box['xmin'] = int(content) 77 | elif self.tag == 'xmax': 78 | self.box['xmax'] = int(content) 79 | elif self.tag == 'ymin': 80 | self.box['ymin'] = int(content) 81 | elif self.tag == 'ymax': 82 | self.box['ymax'] = int(content) 83 | 84 | if ( __name__ == "__main__"): 85 | 86 | # 创建一个 XMLReader 87 | parser = xml.sax.make_parser() 88 | # turn off namepsaces 89 | parser.setFeature(xml.sax.handler.feature_namespaces, 0) 90 | 91 | # 重写 ContextHandler 92 | Handler = MovieHandler() 93 | parser.setContentHandler( Handler ) 94 | 95 | path = './video/waterline/Annotations' 96 | p = Path(path) 97 | files = [x for x in p.iterdir() if x.is_file()] 98 | for f in files: 99 | r = { 100 | 'file': f.name[0: -4], 101 | 'data': { 102 | 'size': {}, 103 | 'boxes': [] 104 | } 105 | } 106 | parser.parse(path+'/'+f.name) 107 | result.append(r) 108 | for r in result: 109 | # with open(".\\labels\\" + r['file'] + ".txt", "w") as f: 110 | width = 
r['data']['size']['width'] 111 | height = r['data']['size']['height'] 112 | for b in r['data']['boxes']: 113 | center_x = (b['xmax'] + b['xmin']) / 2 / width 114 | center_y = (b['ymax'] + b['ymin']) / 2 / height 115 | width_x = (b['xmax'] - b['xmin']) / width 116 | height_y = (b['ymax'] - b['ymin']) / height 117 | label_idx = label[b['name']] 118 | if width_x == 0 or height_y == 0 or (b['name'] != 'waterline'): 119 | print(r['file']) 120 | break 121 | # f.write(str(label_idx) + ' ' + str(center_x) + ' ' + str(center_y) + ' ' + str(width_x) + ' ' + str(height_y) + "\n") -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from data import coco as cfg 7 | from ..box_utils import match, log_sum_exp 8 | from .focal_loss import FocalLoss 9 | 10 | 11 | class MultiBoxLoss(nn.Module): 12 | """SSD Weighted Loss Function 13 | Compute Targets: 14 | 1) Produce Confidence Target Indices by matching ground truth boxes 15 | with (default) 'priorboxes' that have jaccard index > threshold parameter 16 | (default threshold: 0.5). 17 | 2) Produce localization target by 'encoding' variance into offsets of ground 18 | truth boxes and their matched 'priorboxes'. 19 | 3) Hard negative mining to filter the excessive number of negative examples 20 | that comes with using a large number of default bounding boxes. 21 | (default negative:positive ratio 3:1) 22 | Objective Loss: 23 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 24 | Where, Lconf is the confidence loss (focal loss in this implementation) and Lloc is the SmoothL1 Loss 25 | weighted by α which is set to 1 by cross val. 26 | Args: 27 | c: class confidences, 28 | l: predicted boxes, 29 | g: ground truth boxes 30 | N: number of matched default boxes 31 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 32 | """ 33 | 34 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, 35 | bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, 36 | use_gpu=True): 37 | super(MultiBoxLoss, self).__init__() 38 | self.use_gpu = use_gpu 39 | self.num_classes = num_classes 40 | self.threshold = overlap_thresh 41 | self.background_label = bkg_label 42 | self.encode_target = encode_target 43 | self.use_prior_for_matching = prior_for_matching 44 | self.do_neg_mining = neg_mining 45 | self.negpos_ratio = neg_pos 46 | self.neg_overlap = neg_overlap 47 | self.variance = cfg['variance'] 48 | self.FL = FocalLoss(class_num=cfg['num_classes'], alpha=torch.Tensor([[0.25], [0.25]]), size_average=False) 49 | 50 | def forward(self, predictions, targets): 51 | """Multibox Loss 52 | Args: 53 | predictions (tuple): A tuple containing loc preds, conf preds, 54 | and prior boxes from SSD net. 55 | conf shape: torch.size(batch_size,num_priors,num_classes) 56 | loc shape: torch.size(batch_size,num_priors,4) 57 | priors shape: torch.size(num_priors,4) 58 | 59 | targets (tensor): Ground truth boxes and labels for a batch, 60 | shape: [batch_size,num_objs,5] (last idx is the label).
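Returns: a tuple (loss_l, loss_c): the Smooth L1 localization loss and the focal confidence loss, each divided by the number of matched priors N.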
61 | """ 62 | loc_data, conf_data, priors = predictions 63 | num = loc_data.size(0) 64 | priors = priors[:loc_data.size(1), :] 65 | num_priors = (priors.size(0)) 66 | num_classes = self.num_classes 67 | 68 | # match priors (default boxes) and ground truth boxes 69 | loc_t = torch.Tensor(num, num_priors, 4) 70 | conf_t = torch.LongTensor(num, num_priors) 71 | for idx in range(num): 72 | truths = targets[idx][:, :-1].data 73 | labels = targets[idx][:, -1].data 74 | defaults = priors.data 75 | match(self.threshold, truths, defaults, self.variance, labels, 76 | loc_t, conf_t, idx) 77 | if self.use_gpu: 78 | loc_t = loc_t.cuda() 79 | conf_t = conf_t.cuda() 80 | # wrap targets 81 | loc_t = Variable(loc_t, requires_grad=False) 82 | conf_t = Variable(conf_t, requires_grad=False) 83 | 84 | pos = conf_t > 0 85 | num_pos = pos.sum(dim=1, keepdim=True) 86 | 87 | # Localization Loss (Smooth L1) 88 | # Shape: [batch,num_priors,4] 89 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 90 | loc_p = loc_data[pos_idx].view(-1, 4) 91 | loc_t = loc_t[pos_idx].view(-1, 4) 92 | # loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 93 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 94 | 95 | # Compute max conf across batch for hard negative mining 96 | batch_conf = conf_data.view(-1, self.num_classes) 97 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 98 | 99 | # 修复bug 100 | loss_c = loss_c.view(pos.size()[0], pos.size()[1]) 101 | # Hard Negative Mining 102 | loss_c[pos] = 0 # filter out pos boxes for now 103 | loss_c = loss_c.view(num, -1) 104 | _, loss_idx = loss_c.sort(1, descending=True) 105 | _, idx_rank = loss_idx.sort(1) 106 | num_pos = pos.long().sum(1, keepdim=True) 107 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 108 | neg = idx_rank < num_neg.expand_as(idx_rank) 109 | 110 | # Confidence Loss Including Positive and Negative Examples 111 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 112 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 113 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes) 114 | targets_weighted = conf_t[(pos+neg).gt(0)] 115 | # loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False) 116 | # loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 117 | loss_c = self.FL(conf_p, targets_weighted) 118 | 119 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 120 | 121 | N = num_pos.data.sum() 122 | loss_l /= N 123 | loss_c /= N 124 | return loss_l, loss_c 125 | 126 | 127 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | VOC_CLASSES = ( # always index 0 21 | 'aeroplane', 'bicycle', 'bird', 'boat', 22 | 'bottle', 'bus', 'car', 'cat', 'chair', 23 | 'cow', 'diningtable', 'dog', 'horse', 24 | 'motorbike', 'person', 'pottedplant', 25 | 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # note: if you used our download scripts, 
this should be right 28 | VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") 29 | 30 | 31 | class VOCAnnotationTransform(object): 32 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 33 | Initilized with a dictionary lookup of classnames to indexes 34 | 35 | Arguments: 36 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 37 | (default: alphabetic indexing of VOC's 20 classes) 38 | keep_difficult (bool, optional): keep difficult instances or not 39 | (default: False) 40 | height (int): height 41 | width (int): width 42 | """ 43 | 44 | def __init__(self, class_to_ind=None, keep_difficult=False): 45 | self.class_to_ind = class_to_ind or dict( 46 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 47 | self.keep_difficult = keep_difficult 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Arguments: 52 | target (annotation) : the target annotation to be made usable 53 | will be an ET.Element 54 | Returns: 55 | a list containing lists of bounding boxes [bbox coords, class name] 56 | """ 57 | res = [] 58 | for obj in target.iter('object'): 59 | difficult = int(obj.find('difficult').text) == 1 60 | if not self.keep_difficult and difficult: 61 | continue 62 | name = obj.find('name').text.lower().strip() 63 | bbox = obj.find('bndbox') 64 | 65 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 66 | bndbox = [] 67 | for i, pt in enumerate(pts): 68 | cur_pt = int(bbox.find(pt).text) - 1 69 | # scale height or width 70 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 71 | bndbox.append(cur_pt) 72 | label_idx = self.class_to_ind[name] 73 | bndbox.append(label_idx) 74 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 75 | # img_id = target.find('filename').text[:-4] 76 | 77 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 78 | 79 | 80 | class VOCDetection(data.Dataset): 81 | """VOC Detection Dataset Object 82 | 83 | input is image, target is annotation 84 | 85 | Arguments: 86 | root (string): filepath to VOCdevkit folder. 87 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 88 | transform (callable, optional): transformation to perform on the 89 | input image 90 | target_transform (callable, optional): transformation to perform on the 91 | target `annotation` 92 | (eg: take in caption string, return tensor of word indices) 93 | dataset_name (string, optional): which dataset to load 94 | (default: 'VOC2007') 95 | """ 96 | 97 | def __init__(self, root, 98 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 99 | transform=None, target_transform=VOCAnnotationTransform(), 100 | dataset_name='VOC0712'): 101 | self.root = root 102 | self.image_set = image_sets 103 | self.transform = transform 104 | self.target_transform = target_transform 105 | self.name = dataset_name 106 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 107 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 108 | self.ids = list() 109 | for (year, name) in image_sets: 110 | rootpath = osp.join(self.root, 'VOC' + year) 111 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 112 | self.ids.append((rootpath, line.strip())) 113 | 114 | def __getitem__(self, index): 115 | im, gt, h, w = self.pull_item(index) 116 | 117 | return im, gt 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | def pull_item(self, index): 123 | img_id = self.ids[index] 124 | 125 | target = ET.parse(self._annopath % img_id).getroot() 126 | img = cv2.imread(self._imgpath % img_id) 127 | height, width, channels = img.shape 128 | 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | 132 | if self.transform is not None: 133 | target = np.array(target) 134 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | # img = img.transpose(2, 0, 1) 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | # return torch.from_numpy(img), target, height, width 141 | 142 | def pull_image(self, index): 143 | '''Returns the original image object at index in PIL form 144 | 145 | Note: not using self.__getitem__(), as any transformations passed in 146 | could mess up this functionality. 147 | 148 | Argument: 149 | index (int): index of img to show 150 | Return: 151 | PIL img 152 | ''' 153 | img_id = self.ids[index] 154 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | anno = ET.parse(self._annopath % img_id).getroot() 170 | gt = self.target_transform(anno, 1, 1) 171 | return img_id[1], gt 172 | 173 | def pull_tensor(self, index): 174 | '''Returns the original image at an index in tensor form 175 | 176 | Note: not using self.__getitem__(), as any transformations passed in 177 | could mess up this functionality. 
178 | 179 | Argument: 180 | index (int): index of img to show 181 | Return: 182 | tensorized version of img, squeezed 183 | ''' 184 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) -------------------------------------------------------------------------------- /data/custom.py: -------------------------------------------------------------------------------- 1 | """custom Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/custom_dataset/torchvision/datasets/custom.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | CUSTOM_CLASSES = ( # always index 0 21 | 'gauge',) 22 | 23 | # CUSTOM_CLASSES = ( # always index 0 24 | # 'waterline', 'mark') 25 | 26 | # CUSTOM_CLASSES = ( # always index 0 27 | # 'building', 'water') 28 | 29 | # note: if you used our download scripts, this should be right 30 | CUSTOM_ROOT = osp.join(HOME, "data/video/") 31 | 32 | 33 | class customAnnotationTransform(object): 34 | """Transforms a custom annotation into a Tensor of bbox coords and label index 35 | Initilized with a dictionary lookup of classnames to indexes 36 | 37 | Arguments: 38 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 39 | (default: alphabetic indexing of custom's 20 classes) 40 | keep_difficult (bool, optional): keep difficult instances or not 41 | (default: False) 42 | height (int): height 43 | width (int): width 44 | """ 45 | 46 | def __init__(self, class_to_ind=None, keep_difficult=False): 47 | self.class_to_ind = class_to_ind or dict( 48 | zip(CUSTOM_CLASSES, range(len(CUSTOM_CLASSES)))) 49 | print(self.class_to_ind) 50 | self.keep_difficult = keep_difficult 51 | 52 | def __call__(self, target, width, height): 53 | """ 54 | Arguments: 55 | target (annotation) : the target annotation to be made usable 56 | will be an ET.Element 57 | Returns: 58 | a list containing lists of bounding boxes [bbox coords, class name] 59 | """ 60 | res = [] 61 | for obj in target.iter('object'): 62 | difficult = int(obj.find('difficult').text) == 1 63 | if not self.keep_difficult and difficult: 64 | continue 65 | name = obj.find('name').text.lower().strip() 66 | bbox = obj.find('bndbox') 67 | 68 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 69 | bndbox = [] 70 | for i, pt in enumerate(pts): 71 | cur_pt = int(bbox.find(pt).text) - 1 72 | # scale height or width 73 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 74 | bndbox.append(cur_pt) 75 | label_idx = self.class_to_ind[name] 76 | bndbox.append(label_idx) 77 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 78 | # img_id = target.find('filename').text[:-4] 79 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 80 | 81 | 82 | class customDetection(data.Dataset): 83 | """custom Detection Dataset Object 84 | 85 | input is image, target is annotation 86 | 87 | Arguments: 88 | root (string): filepath to customdevkit folder. 89 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 90 | transform (callable, optional): transformation to perform on the 91 | input image 92 | target_transform (callable, optional): transformation to perform on the 93 | target `annotation` 94 | (eg: take in caption string, return tensor of word indices) 95 | dataset_name (string, optional): which dataset to load 96 | (default: 'VOC2007') 97 | """ 98 | 99 | def __init__(self, root, 100 | # image_sets=[('shenhe', 'train')], 101 | image_sets=[('gauge', 'train')], 102 | transform=None, target_transform=customAnnotationTransform(), 103 | dataset_name='custom'): 104 | self.root = root 105 | self.image_set = image_sets 106 | self.transform = transform 107 | self.target_transform = target_transform 108 | self.name = dataset_name 109 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 110 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 111 | self.ids = list() 112 | for (curDir, name) in image_sets: 113 | rootpath = osp.join(self.root, curDir) 114 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 115 | self.ids.append((rootpath, line.strip())) 116 | 117 | def __getitem__(self, index): 118 | im, gt, h, w = self.pull_item(index) 119 | 120 | return im, gt 121 | 122 | def __len__(self): 123 | return len(self.ids) 124 | 125 | def pull_item(self, index): 126 | img_id = self.ids[index] 127 | 128 | target = ET.parse(self._annopath % img_id).getroot() 129 | img = cv2.imread(self._imgpath % img_id) 130 | height, width, channels = img.shape 131 | 132 | if self.target_transform is not None: 133 | target = self.target_transform(target, width, height) 134 | 135 | if self.transform is not None: 136 | target = np.array(target) 137 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 138 | # to rgb 139 | img = img[:, :, (2, 1, 0)] 140 | # img = img.transpose(2, 0, 1) 141 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 142 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 143 | # return torch.from_numpy(img), target, height, width 144 | 145 | def pull_image(self, index): 146 | '''Returns the original image object at index in PIL form 147 | 148 | Note: not using self.__getitem__(), as any transformations passed in 149 | could mess up this functionality. 150 | 151 | Argument: 152 | index (int): index of img to show 153 | Return: 154 | PIL img 155 | ''' 156 | img_id = self.ids[index] 157 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 158 | 159 | def pull_anno(self, index): 160 | '''Returns the original annotation of image at index 161 | 162 | Note: not using self.__getitem__(), as any transformations passed in 163 | could mess up this functionality. 164 | 165 | Argument: 166 | index (int): index of img to get annotation of 167 | Return: 168 | list: [img_id, [(label, bbox coords),...]] 169 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 170 | ''' 171 | img_id = self.ids[index] 172 | anno = ET.parse(self._annopath % img_id).getroot() 173 | gt = self.target_transform(anno, 1, 1) 174 | return img_id[1], gt 175 | 176 | def pull_tensor(self, index): 177 | '''Returns the original image at an index in tensor form 178 | 179 | Note: not using self.__getitem__(), as any transformations passed in 180 | could mess up this functionality. 
181 | 182 | Argument: 183 | index (int): index of img to show 184 | Return: 185 | tensorized version of img, squeezed 186 | ''' 187 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) -------------------------------------------------------------------------------- /data/custom_for_visual.py: -------------------------------------------------------------------------------- 1 | """custom Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/custom_dataset/torchvision/datasets/custom.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | # {'garbage': 1, 'garbagew': 1, 'www': 1, 'w': 1} 21 | # CUSTOM_CLASSES = ( # always index 0 22 | # 'waterline',) 23 | CUSTOM_CLASSES_GAUGE = ( # always index 0 24 | 'gauge',) 25 | 26 | CUSTOM_CLASSES_WATERLINE = ( # always index 0 27 | 'waterline', 'mark',) 28 | 29 | CUSTOM_CLASSES_BUILDING = ( # always index 0 30 | 'building', 'water',) 31 | 32 | # note: if you used our download scripts, this should be right 33 | # CUSTOM_ROOT = osp.join(HOME, "data/piaofu/piao/") 34 | CUSTOM_ROOT = osp.join(HOME, "data/video/") 35 | 36 | 37 | class customAnnotationTransform(object): 38 | """Transforms a custom annotation into a Tensor of bbox coords and label index 39 | Initilized with a dictionary lookup of classnames to indexes 40 | 41 | Arguments: 42 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 43 | (default: alphabetic indexing of custom's 20 classes) 44 | keep_difficult (bool, optional): keep difficult instances or not 45 | (default: False) 46 | height (int): height 47 | width (int): width 48 | """ 49 | 50 | def __init__(self, class_to_ind=None, keep_difficult=False): 51 | self.class_to_ind = class_to_ind or dict( 52 | zip(CUSTOM_CLASSES_GAUGE, range(len(CUSTOM_CLASSES_GAUGE)))) 53 | self.keep_difficult = keep_difficult 54 | 55 | def __call__(self, target, width, height): 56 | """ 57 | Arguments: 58 | target (annotation) : the target annotation to be made usable 59 | will be an ET.Element 60 | Returns: 61 | a list containing lists of bounding boxes [bbox coords, class name] 62 | """ 63 | res = [] 64 | for obj in target.iter('object'): 65 | difficult = int(obj.find('difficult').text) == 1 66 | if not self.keep_difficult and difficult: 67 | continue 68 | name = obj.find('name').text.lower().strip() 69 | bbox = obj.find('bndbox') 70 | 71 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 72 | bndbox = [] 73 | for i, pt in enumerate(pts): 74 | cur_pt = int(bbox.find(pt).text) - 1 75 | # scale height or width 76 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 77 | bndbox.append(cur_pt) 78 | label_idx = self.class_to_ind[name] 79 | bndbox.append(label_idx) 80 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 81 | # img_id = target.find('filename').text[:-4] 82 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 83 | 84 | 85 | class customDetection(data.Dataset): 86 | """custom Detection Dataset Object 87 | 88 | input is image, target is annotation 89 | 90 | Arguments: 91 | root (string): filepath to customdevkit folder. 92 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 93 | transform (callable, optional): transformation to perform on the 94 | input image 95 | target_transform (callable, optional): transformation to perform on the 96 | target `annotation` 97 | (eg: take in caption string, return tensor of word indices) 98 | dataset_name (string, optional): which dataset to load 99 | (default: 'VOC2007') 100 | """ 101 | 102 | def __init__(self, root, 103 | image_sets=[('gauge', 'train')], 104 | transform=None, target_transform=customAnnotationTransform(), 105 | dataset_name='custom'): 106 | self.root = root 107 | self.image_set = image_sets 108 | self.transform = transform 109 | self.target_transform = target_transform 110 | self.name = dataset_name 111 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 112 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 113 | self.ids = list() 114 | for (curDir, name) in image_sets: 115 | rootpath = osp.join(self.root, curDir) 116 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 117 | self.ids.append((rootpath, line.strip())) 118 | 119 | def __getitem__(self, index): 120 | im, gt, h, w = self.pull_item(index) 121 | 122 | return im, gt 123 | 124 | def __len__(self): 125 | return len(self.ids) 126 | 127 | def pull_item(self, index): 128 | img_id = self.ids[index] 129 | 130 | target = ET.parse(self._annopath % img_id).getroot() 131 | img = cv2.imread(self._imgpath % img_id) 132 | height, width, channels = img.shape 133 | 134 | if self.target_transform is not None: 135 | target = self.target_transform(target, width, height) 136 | 137 | if self.transform is not None: 138 | target = np.array(target) 139 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 140 | # to rgb 141 | img = img[:, :, (2, 1, 0)] 142 | # img = img.transpose(2, 0, 1) 143 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 144 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 145 | # return torch.from_numpy(img), target, height, width 146 | 147 | def pull_image(self, index): 148 | '''Returns the original image object at index in PIL form 149 | 150 | Note: not using self.__getitem__(), as any transformations passed in 151 | could mess up this functionality. 152 | 153 | Argument: 154 | index (int): index of img to show 155 | Return: 156 | PIL img 157 | ''' 158 | img_id = self.ids[index] 159 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 160 | 161 | def pull_anno(self, index): 162 | '''Returns the original annotation of image at index 163 | 164 | Note: not using self.__getitem__(), as any transformations passed in 165 | could mess up this functionality. 166 | 167 | Argument: 168 | index (int): index of img to get annotation of 169 | Return: 170 | list: [img_id, [(label, bbox coords),...]] 171 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 172 | ''' 173 | img_id = self.ids[index] 174 | anno = ET.parse(self._annopath % img_id).getroot() 175 | gt = self.target_transform(anno, 1, 1) 176 | return img_id[1], gt 177 | 178 | def pull_tensor(self, index): 179 | '''Returns the original image at an index in tensor form 180 | 181 | Note: not using self.__getitem__(), as any transformations passed in 182 | could mess up this functionality. 
183 | 184 | Argument: 185 | index (int): index of img to show 186 | Return: 187 | tensorized version of img, squeezed 188 | ''' 189 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 190 | 191 | def pull_img_name(self, index): 192 | return self.ids[index] -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | from .config import HOME 2 | import os 3 | import os.path as osp 4 | import sys 5 | import torch 6 | import torch.utils.data as data 7 | import torchvision.transforms as transforms 8 | import cv2 9 | import numpy as np 10 | 11 | COCO_ROOT = osp.join(HOME, 'data/coco/') 12 | IMAGES = 'images' 13 | ANNOTATIONS = 'annotations' 14 | COCO_API = 'PythonAPI' 15 | INSTANCES_SET = 'instances_{}.json' 16 | COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 17 | 'train', 'truck', 'boat', 'traffic light', 'fire', 'hydrant', 18 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 19 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 20 | 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 21 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 22 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 23 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 24 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 25 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 26 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 27 | 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 28 | 'keyboard', 'cell phone', 'microwave oven', 'toaster', 'sink', 29 | 'refrigerator', 'book', 'clock', 'vase', 'scissors', 30 | 'teddy bear', 'hair drier', 'toothbrush') 31 | 32 | 33 | def get_label_map(label_file): 34 | label_map = {} 35 | labels = open(label_file, 'r') 36 | for line in labels: 37 | ids = line.split(',') 38 | label_map[int(ids[0])] = int(ids[1]) 39 | return label_map 40 | 41 | 42 | class COCOAnnotationTransform(object): 43 | """Transforms a COCO annotation into a Tensor of bbox coords and label index 44 | Initilized with a dictionary lookup of classnames to indexes 45 | """ 46 | def __init__(self): 47 | self.label_map = get_label_map(osp.join(COCO_ROOT, 'coco_labels.txt')) 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Args: 52 | target (dict): COCO target json annotation as a python dict 53 | height (int): height 54 | width (int): width 55 | Returns: 56 | a list containing lists of bounding boxes [bbox coords, class idx] 57 | """ 58 | scale = np.array([width, height, width, height]) 59 | res = [] 60 | for obj in target: 61 | if 'bbox' in obj: 62 | bbox = obj['bbox'] 63 | bbox[2] += bbox[0] 64 | bbox[3] += bbox[1] 65 | label_idx = self.label_map[obj['category_id']] - 1 66 | final_box = list(np.array(bbox)/scale) 67 | final_box.append(label_idx) 68 | res += [final_box] # [xmin, ymin, xmax, ymax, label_idx] 69 | else: 70 | print("no bbox problem!") 71 | 72 | return res # [[xmin, ymin, xmax, ymax, label_idx], ... ] 73 | 74 | 75 | class COCODetection(data.Dataset): 76 | """`MS Coco Detection `_ Dataset. 77 | Args: 78 | root (string): Root directory where images are downloaded to. 79 | set_name (string): Name of the specific set of COCO images. 80 | transform (callable, optional): A function/transform that augments the 81 | raw images` 82 | target_transform (callable, optional): A function/transform that takes 83 | in the target (bbox) and transforms it. 
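COCOAnnotationTransform.__call__ above performs the analogous conversion for COCO's [x, y, width, height] boxes. The sketch below is illustrative only: the annotation dict and the one-entry label map are hypothetical stand-ins for a real `coco.loadAnns` result and for coco_labels.txt.

import numpy as np

width, height = 500, 400
scale = np.array([width, height, width, height])
obj = {'bbox': [30, 40, 100, 200], 'category_id': 18}   # hypothetical annotation entry
label_map = {18: 18}                                     # stand-in for coco_labels.txt

bbox = obj['bbox']
bbox[2] += bbox[0]            # width  -> xmax
bbox[3] += bbox[1]            # height -> ymax
label_idx = label_map[obj['category_id']] - 1
final_box = list(np.array(bbox) / scale) + [label_idx]
print(final_box)              # [0.06, 0.1, 0.26, 0.6, 17]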
84 | """ 85 | 86 | def __init__(self, root, image_set='trainval35k', transform=None, 87 | target_transform=COCOAnnotationTransform(), dataset_name='MS COCO'): 88 | sys.path.append(osp.join(root, COCO_API)) 89 | from pycocotools.coco import COCO 90 | self.root = osp.join(root, IMAGES, image_set) 91 | self.coco = COCO(osp.join(root, ANNOTATIONS, 92 | INSTANCES_SET.format(image_set))) 93 | self.ids = list(self.coco.imgToAnns.keys()) 94 | self.transform = transform 95 | self.target_transform = target_transform 96 | self.name = dataset_name 97 | 98 | def __getitem__(self, index): 99 | """ 100 | Args: 101 | index (int): Index 102 | Returns: 103 | tuple: Tuple (image, target). 104 | target is the object returned by ``coco.loadAnns``. 105 | """ 106 | im, gt, h, w = self.pull_item(index) 107 | return im, gt 108 | 109 | def __len__(self): 110 | return len(self.ids) 111 | 112 | def pull_item(self, index): 113 | """ 114 | Args: 115 | index (int): Index 116 | Returns: 117 | tuple: Tuple (image, target, height, width). 118 | target is the object returned by ``coco.loadAnns``. 119 | """ 120 | img_id = self.ids[index] 121 | target = self.coco.imgToAnns[img_id] 122 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 123 | 124 | target = self.coco.loadAnns(ann_ids) 125 | path = osp.join(self.root, self.coco.loadImgs(img_id)[0]['file_name']) 126 | assert osp.exists(path), 'Image path does not exist: {}'.format(path) 127 | img = cv2.imread(osp.join(self.root, path)) 128 | height, width, _ = img.shape 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | if self.transform is not None: 132 | target = np.array(target) 133 | img, boxes, labels = self.transform(img, target[:, :4], 134 | target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | 141 | def pull_image(self, index): 142 | '''Returns the original image object at index in PIL form 143 | 144 | Note: not using self.__getitem__(), as any transformations passed in 145 | could mess up this functionality. 146 | 147 | Argument: 148 | index (int): index of img to show 149 | Return: 150 | cv2 img 151 | ''' 152 | img_id = self.ids[index] 153 | path = self.coco.loadImgs(img_id)[0]['file_name'] 154 | return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 
161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 170 | return self.coco.loadAnns(ann_ids) 171 | 172 | def __repr__(self): 173 | fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' 174 | fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) 175 | fmt_str += ' Root Location: {}\n'.format(self.root) 176 | tmp = ' Transforms (if any): ' 177 | fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 178 | tmp = ' Target Transforms (if any): ' 179 | fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 180 | return fmt_str 181 | -------------------------------------------------------------------------------- /实验 4.1/ssd_resnet_101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import voc, coco, custom 7 | import os 8 | 9 | from netModel.resnet import resnet18, resnet34, BasicBlock 10 | 11 | 12 | class SSD(nn.Module): 13 | """Single Shot Multibox Architecture 14 | The network is composed of a base VGG network followed by the 15 | added multibox conv layers. Each multibox layer branches into 16 | 1) conv2d for class conf scores 17 | 2) conv2d for localization predictions 18 | 3) associated priorbox layer to produce default bounding 19 | boxes specific to the layer's feature map size. 20 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 21 | 22 | Args: 23 | phase: (string) Can be "test" or "train" 24 | size: input image size 25 | base: resnet layers for input, size of either 300 or 500 26 | extras: extra layers that feed to multibox loc and conf layers 27 | head: "multibox head" consists of loc and conf conv layers 28 | """ 29 | 30 | def __init__(self, phase, size, base, extras, head, num_classes): 31 | super(SSD, self).__init__() 32 | self.phase = phase 33 | self.num_classes = num_classes 34 | self.cfg = custom 35 | self.priorbox = PriorBox(self.cfg) 36 | self.priors = Variable(self.priorbox.forward(), volatile=True) 37 | self.size = size 38 | 39 | # SSD network 40 | self.resnet = nn.ModuleList(base) 41 | # Layer learns to scale the l2 normalized features from conv4_3 42 | self.L2Norm = L2Norm(512, 20) 43 | self.extras = nn.ModuleList(extras) 44 | 45 | self.loc = nn.ModuleList(head[0]) 46 | self.conf = nn.ModuleList(head[1]) 47 | 48 | if phase == 'test': 49 | self.softmax = nn.Softmax(dim=-1) 50 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 51 | 52 | def forward(self, x): 53 | """Applies network layers and ops on input image(s) x. 54 | 55 | Args: 56 | x: input image or batch of images. Shape: [batch,3,300,300]. 57 | 58 | Return: 59 | Depending on phase: 60 | test: 61 | Variable(tensor) of output class label predictions, 62 | confidence score, and corresponding location predictions for 63 | each object detected. 
Shape: [batch,topk,7] 64 | 65 | train: 66 | list of concat outputs from: 67 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 68 | 2: localization layers, Shape: [batch,num_priors*4] 69 | 3: priorbox layers, Shape: [2,num_priors*4] 70 | """ 71 | sources = list() 72 | loc = list() 73 | conf = list() 74 | 75 | # apply resnet up to layer2 76 | for k in range(0,7): 77 | x = self.resnet[k](x) 78 | sources.append(x) 79 | 80 | # apply resnet up to layer4 81 | for k in range(7, len(self.resnet)): 82 | x = self.resnet[k](x) 83 | sources.append(x) 84 | # s = self.L2Norm(x) 85 | # sources.append(s) 86 | 87 | # apply extra layers and cache source layer outputs 88 | for k, v in enumerate(self.extras): 89 | x = F.relu(v(x), inplace=True) 90 | if k % 2 == 1: 91 | sources.append(x) 92 | # apply multibox head to source layers 93 | for (x, l, c) in zip(sources, self.loc, self.conf): 94 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 95 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 96 | 97 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 98 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 99 | if self.phase == "test": 100 | output = self.detect( 101 | loc.view(loc.size(0), -1, 4), # loc preds 102 | self.softmax(conf.view(conf.size(0), -1, 103 | self.num_classes)), # conf preds 104 | self.priors.type(type(x.data)) # default boxes 105 | ) 106 | else: 107 | output = ( 108 | loc.view(loc.size(0), -1, 4), 109 | conf.view(conf.size(0), -1, self.num_classes), 110 | self.priors 111 | ) 112 | return output 113 | 114 | def load_weights(self, base_file): 115 | other, ext = os.path.splitext(base_file) 116 | if ext == '.pkl' or '.pth': 117 | print('Loading weights into state dict...') 118 | self.load_state_dict(torch.load(base_file, 119 | map_location=lambda storage, loc: storage)) 120 | print('Finished!') 121 | else: 122 | print('Sorry only .pth and .pkl files supported.') 123 | 124 | 125 | # This function is derived from torchvision VGG make_layers() 126 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 127 | def vgg(cfg, i, batch_norm=False): 128 | layers = [] 129 | in_channels = i 130 | for v in cfg: 131 | if v == 'M': 132 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 133 | elif v == 'C': 134 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 135 | else: 136 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 137 | if batch_norm: 138 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 139 | else: 140 | layers += [conv2d, nn.ReLU(inplace=True)] 141 | in_channels = v 142 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 143 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 144 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 145 | layers += [pool5, conv6, 146 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 147 | return layers 148 | 149 | def resnet(): 150 | resnet = resnet34(pretrained=True) 151 | layers = [ 152 | resnet.conv1, 153 | resnet.bn1, 154 | resnet.relu, 155 | resnet.maxpool, 156 | resnet.layer1, 157 | resnet.layer2, 158 | resnet.layer3, 159 | resnet.layer4, 160 | ] 161 | return layers 162 | 163 | def add_extras(cfg, i, batch_norm=False): 164 | # Extra layers added to VGG for feature scaling 165 | layers = [] 166 | in_channels = i 167 | flag = False 168 | for k, v in enumerate(cfg): 169 | if in_channels != 'S': 170 | if v == 'S': 171 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 172 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 173 | else: 174 | layers += 
[nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 175 | flag = not flag 176 | in_channels = v 177 | return layers 178 | 179 | 180 | def multibox(resnet, extra_layers, cfg, num_classes): 181 | loc_layers = [] 182 | conf_layers = [] 183 | resnet_source = [-2, -1] 184 | for k, v in enumerate(resnet_source): 185 | loc_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 186 | cfg[k] * 4, kernel_size=3, padding=1)] 187 | conf_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 188 | cfg[k] * num_classes, kernel_size=3, padding=1)] 189 | for k, v in enumerate(extra_layers[1::2], 2): 190 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 191 | * 4, kernel_size=3, padding=1)] 192 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 193 | * num_classes, kernel_size=3, padding=1)] 194 | return resnet, extra_layers, (loc_layers, conf_layers) 195 | 196 | 197 | base = { 198 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 199 | 512, 512, 512], 200 | '512': [], 201 | } 202 | extras = { 203 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 204 | '512': [], 205 | } 206 | mbox = { 207 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 208 | '512': [], 209 | } 210 | 211 | 212 | def build_ssd(phase, size=300, num_classes=21): 213 | if phase != "test" and phase != "train": 214 | print("ERROR: Phase: " + phase + " not recognized") 215 | return 216 | if size != 300: 217 | print("ERROR: You specified size " + repr(size) + ". However, " + 218 | "currently only SSD300 (size=300) is supported!") 219 | return 220 | base_, extras_, head_ = multibox(resnet(), 221 | add_extras(extras[str(size)], 512), 222 | mbox[str(size)], num_classes) 223 | return SSD(phase, size, base_, extras_, head_, num_classes) 224 | -------------------------------------------------------------------------------- /实验 4.2/ssd_resnet_18.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import voc, coco, custom 7 | import os 8 | 9 | from netModel.resnet import resnet18, resnet34, BasicBlock 10 | 11 | 12 | class SSD(nn.Module): 13 | """Single Shot Multibox Architecture 14 | The network is composed of a base VGG network followed by the 15 | added multibox conv layers. Each multibox layer branches into 16 | 1) conv2d for class conf scores 17 | 2) conv2d for localization predictions 18 | 3) associated priorbox layer to produce default bounding 19 | boxes specific to the layer's feature map size. 20 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
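A quick smoke test of the build_ssd factory defined in ssd_resnet_101.py above (ssd_resnet_18.py below is built the same way), in the spirit of netModel/testModel.py. This is a sketch under assumptions: the repository root and the experiment directory must both be importable, data/config.py must provide a `custom` configuration with 'min_dim' 300, and the pretrained ResNet weights must be downloadable.

import torch
from ssd_resnet_101 import build_ssd

if __name__ == '__main__':
    net = build_ssd('train', size=300, num_classes=2)   # e.g. background + 'gauge'
    x = torch.rand(1, 3, 300, 300)                      # dummy batch
    loc, conf, priors = net(x)                          # train phase returns a 3-tuple
    # loc:    [1, num_priors, 4]          box regression offsets from the loc head
    # conf:   [1, num_priors, num_classes] raw class scores (softmax only in test phase)
    # priors: [num_priors_cfg, 4]          default boxes from PriorBox, center-size form
    print(loc.shape, conf.shape, priors.shape)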
21 | 22 | Args: 23 | phase: (string) Can be "test" or "train" 24 | size: input image size 25 | base: resnet layers for input, size of either 300 or 500 26 | extras: extra layers that feed to multibox loc and conf layers 27 | head: "multibox head" consists of loc and conf conv layers 28 | """ 29 | 30 | def __init__(self, phase, size, base, extras, head, num_classes): 31 | super(SSD, self).__init__() 32 | self.phase = phase 33 | self.num_classes = num_classes 34 | self.cfg = custom 35 | self.priorbox = PriorBox(self.cfg) 36 | self.priors = Variable(self.priorbox.forward(), volatile=True) 37 | self.size = size 38 | 39 | # SSD network 40 | self.resnet = nn.ModuleList(base) 41 | # Layer learns to scale the l2 normalized features from conv4_3 42 | self.L2Norm = L2Norm(512, 20) 43 | self.extras = nn.ModuleList(extras) 44 | 45 | self.loc = nn.ModuleList(head[0]) 46 | self.conf = nn.ModuleList(head[1]) 47 | 48 | if phase == 'test': 49 | self.softmax = nn.Softmax(dim=-1) 50 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 51 | 52 | def forward(self, x): 53 | """Applies network layers and ops on input image(s) x. 54 | 55 | Args: 56 | x: input image or batch of images. Shape: [batch,3,300,300]. 57 | 58 | Return: 59 | Depending on phase: 60 | test: 61 | Variable(tensor) of output class label predictions, 62 | confidence score, and corresponding location predictions for 63 | each object detected. Shape: [batch,topk,7] 64 | 65 | train: 66 | list of concat outputs from: 67 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 68 | 2: localization layers, Shape: [batch,num_priors*4] 69 | 3: priorbox layers, Shape: [2,num_priors*4] 70 | """ 71 | sources = list() 72 | loc = list() 73 | conf = list() 74 | 75 | # apply resnet up to layer2 76 | for k in range(0,7): 77 | x = self.resnet[k](x) 78 | sources.append(x) 79 | 80 | # apply resnet up to layer4 81 | for k in range(7, len(self.resnet)): 82 | x = self.resnet[k](x) 83 | sources.append(x) 84 | # s = self.L2Norm(x) 85 | # sources.append(s) 86 | 87 | # apply extra layers and cache source layer outputs 88 | for k, v in enumerate(self.extras): 89 | x = F.relu(v(x), inplace=True) 90 | if k % 2 == 1: 91 | sources.append(x) 92 | # apply multibox head to source layers 93 | for (x, l, c) in zip(sources, self.loc, self.conf): 94 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 95 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 96 | 97 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 98 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 99 | if self.phase == "test": 100 | output = self.detect( 101 | loc.view(loc.size(0), -1, 4), # loc preds 102 | self.softmax(conf.view(conf.size(0), -1, 103 | self.num_classes)), # conf preds 104 | self.priors.type(type(x.data)) # default boxes 105 | ) 106 | else: 107 | output = ( 108 | loc.view(loc.size(0), -1, 4), 109 | conf.view(conf.size(0), -1, self.num_classes), 110 | self.priors 111 | ) 112 | return output 113 | 114 | def load_weights(self, base_file): 115 | other, ext = os.path.splitext(base_file) 116 | if ext == '.pkl' or '.pth': 117 | print('Loading weights into state dict...') 118 | self.load_state_dict(torch.load(base_file, 119 | map_location=lambda storage, loc: storage)) 120 | print('Finished!') 121 | else: 122 | print('Sorry only .pth and .pkl files supported.') 123 | 124 | 125 | # This function is derived from torchvision VGG make_layers() 126 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 127 | def vgg(cfg, i, batch_norm=False): 128 | layers = [] 
129 | in_channels = i 130 | for v in cfg: 131 | if v == 'M': 132 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 133 | elif v == 'C': 134 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 135 | else: 136 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 137 | if batch_norm: 138 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 139 | else: 140 | layers += [conv2d, nn.ReLU(inplace=True)] 141 | in_channels = v 142 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 143 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 144 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 145 | layers += [pool5, conv6, 146 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 147 | return layers 148 | 149 | def resnet(): 150 | resnet = resnet18(pretrained=True) 151 | layers = [ 152 | resnet.conv1, 153 | resnet.bn1, 154 | resnet.relu, 155 | resnet.maxpool, 156 | resnet.layer1, 157 | resnet.layer2, 158 | resnet.layer3, 159 | resnet.layer4, 160 | ] 161 | return layers 162 | 163 | def add_extras(cfg, i, batch_norm=False): 164 | # Extra layers added to VGG for feature scaling 165 | layers = [] 166 | in_channels = i 167 | flag = False 168 | for k, v in enumerate(cfg): 169 | if in_channels != 'S': 170 | if v == 'S': 171 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 172 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 173 | else: 174 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 175 | flag = not flag 176 | in_channels = v 177 | return layers 178 | 179 | 180 | def multibox(resnet, extra_layers, cfg, num_classes): 181 | loc_layers = [] 182 | conf_layers = [] 183 | resnet_source = [-2, -1] 184 | for k, v in enumerate(resnet_source): 185 | loc_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 186 | cfg[k] * 4, kernel_size=3, padding=1)] 187 | conf_layers += [nn.Conv2d(resnet[v][-1].conv2.out_channels, 188 | cfg[k] * num_classes, kernel_size=3, padding=1)] 189 | for k, v in enumerate(extra_layers[1::2], 2): 190 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 191 | * 4, kernel_size=3, padding=1)] 192 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 193 | * num_classes, kernel_size=3, padding=1)] 194 | return resnet, extra_layers, (loc_layers, conf_layers) 195 | 196 | 197 | base = { 198 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 199 | 512, 512, 512], 200 | '512': [], 201 | } 202 | extras = { 203 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 204 | '512': [], 205 | } 206 | mbox = { 207 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 208 | '512': [], 209 | } 210 | 211 | 212 | def build_ssd(phase, size=300, num_classes=21): 213 | if phase != "test" and phase != "train": 214 | print("ERROR: Phase: " + phase + " not recognized") 215 | return 216 | if size != 300: 217 | print("ERROR: You specified size " + repr(size) + ". 
However, " + 218 | "currently only SSD300 (size=300) is supported!") 219 | return 220 | base_, extras_, head_ = multibox(resnet(), 221 | add_extras(extras[str(size)], 512), 222 | mbox[str(size)], num_classes) 223 | return SSD(phase, size, base_, extras_, head_, num_classes) 224 | -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 72 | """Match each prior box with the ground truth box of the highest jaccard 73 | overlap, encode the bounding boxes, then return the matched indices 74 | corresponding to both confidence and location preds. 75 | Args: 76 | threshold: (float) The overlap threshold used when mathing boxes. 77 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 
78 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 79 | variances: (tensor) Variances corresponding to each prior coord, 80 | Shape: [num_priors, 4]. 81 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 82 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 83 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 84 | idx: (int) current batch index 85 | Return: 86 | The matched indices corresponding to 1)location and 2)confidence preds. 87 | """ 88 | # jaccard index 89 | overlaps = jaccard( 90 | truths, 91 | point_form(priors) 92 | ) 93 | # (Bipartite Matching) 94 | # [1,num_objects] best prior for each ground truth 95 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 96 | # [1,num_priors] best ground truth for each prior 97 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 98 | best_truth_idx.squeeze_(0) 99 | best_truth_overlap.squeeze_(0) 100 | best_prior_idx.squeeze_(1) 101 | best_prior_overlap.squeeze_(1) 102 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 103 | # TODO refactor: index best_prior_idx with long tensor 104 | # ensure every gt matches with its prior of max overlap 105 | for j in range(best_prior_idx.size(0)): 106 | best_truth_idx[best_prior_idx[j]] = j 107 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 108 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors] 109 | conf[best_truth_overlap < threshold] = 0 # label as background 110 | loc = encode(matches, priors, variances) 111 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 112 | conf_t[idx] = conf # [num_priors] top class label for each prior 113 | 114 | 115 | def encode(matched, priors, variances): 116 | """Encode the variances from the priorbox layers into the ground truth boxes 117 | we have matched (based on jaccard overlap) with the prior boxes. 118 | Args: 119 | matched: (tensor) Coords of ground truth for each prior in point-form 120 | Shape: [num_priors, 4]. 121 | priors: (tensor) Prior boxes in center-offset form 122 | Shape: [num_priors,4]. 123 | variances: (list[float]) Variances of priorboxes 124 | Return: 125 | encoded boxes (tensor), Shape: [num_priors, 4] 126 | """ 127 | 128 | # dist b/t match center and prior's center 129 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 130 | # encode variance 131 | g_cxcy /= (variances[0] * priors[:, 2:]) 132 | # match wh / prior wh 133 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 134 | g_wh = torch.log(g_wh) / variances[1] 135 | # return target for smooth_l1_loss 136 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 137 | 138 | 139 | # Adapted from https://github.com/Hakuyume/chainer-ssd 140 | def decode(loc, priors, variances): 141 | """Decode locations from predictions using priors to undo 142 | the encoding we did for offset regression at train time. 143 | Args: 144 | loc (tensor): location predictions for loc layers, 145 | Shape: [num_priors,4] 146 | priors (tensor): Prior boxes in center-offset form. 147 | Shape: [num_priors,4]. 
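A round-trip sanity check for the offset encoding: encode() above maps a matched ground-truth box into the regression targets the loc head learns, and decode() just below inverts the mapping. The box, prior, and variances here are made-up illustrative values; the import assumes the repository root is on PYTHONPATH.

import torch
from layers.box_utils import encode, decode

variances = [0.1, 0.2]
matched = torch.tensor([[0.2, 0.2, 0.6, 0.6]])   # ground truth in point form
priors  = torch.tensor([[0.4, 0.4, 0.5, 0.5]])   # one prior in center-size form

offsets = encode(matched, priors, variances)
print(offsets)                             # tensor([[ 0.0000,  0.0000, -1.1157, -1.1157]])
print(decode(offsets, priors, variances))  # recovers [[0.2, 0.2, 0.6, 0.6]] up to rounding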
148 | variances: (list[float]) Variances of priorboxes 149 | Return: 150 | decoded bounding box predictions 151 | """ 152 | 153 | boxes = torch.cat(( 154 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 155 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 156 | boxes[:, :2] -= boxes[:, 2:] / 2 157 | boxes[:, 2:] += boxes[:, :2] 158 | return boxes 159 | 160 | 161 | def log_sum_exp(x): 162 | """Utility function for computing log_sum_exp while determining 163 | This will be used to determine unaveraged confidence loss across 164 | all examples in a batch. 165 | Args: 166 | x (Variable(tensor)): conf_preds from conf layers 167 | """ 168 | x_max = x.data.max() 169 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 170 | 171 | 172 | # Original author: Francisco Massa: 173 | # https://github.com/fmassa/object-detection.torch 174 | # Ported to PyTorch by Max deGroot (02/01/2017) 175 | def nms(boxes, scores, overlap=0.5, top_k=200): 176 | """Apply non-maximum suppression at test time to avoid detecting too many 177 | overlapping bounding boxes for a given object. 178 | Args: 179 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 180 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 181 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 182 | top_k: (int) The Maximum number of box preds to consider. 183 | Return: 184 | The indices of the kept boxes with respect to num_priors. 185 | """ 186 | 187 | keep = scores.new(scores.size(0)).zero_().long() 188 | if boxes.numel() == 0: 189 | return keep 190 | x1 = boxes[:, 0] 191 | y1 = boxes[:, 1] 192 | x2 = boxes[:, 2] 193 | y2 = boxes[:, 3] 194 | area = torch.mul(x2 - x1, y2 - y1) 195 | v, idx = scores.sort(0) # sort in ascending order 196 | # I = I[v >= 0.01] 197 | idx = idx[-top_k:] # indices of the top-k largest vals 198 | xx1 = boxes.new() 199 | yy1 = boxes.new() 200 | xx2 = boxes.new() 201 | yy2 = boxes.new() 202 | w = boxes.new() 203 | h = boxes.new() 204 | 205 | # keep = torch.Tensor() 206 | count = 0 207 | while idx.numel() > 0: 208 | i = idx[-1] # index of current largest val 209 | # keep.append(i) 210 | keep[count] = i 211 | count += 1 212 | if idx.size(0) == 1: 213 | break 214 | idx = idx[:-1] # remove kept element from view 215 | # load bboxes of next highest vals 216 | torch.index_select(x1, 0, idx, out=xx1) 217 | torch.index_select(y1, 0, idx, out=yy1) 218 | torch.index_select(x2, 0, idx, out=xx2) 219 | torch.index_select(y2, 0, idx, out=yy2) 220 | # store element-wise max with next highest score 221 | xx1 = torch.clamp(xx1, min=x1[i]) 222 | yy1 = torch.clamp(yy1, min=y1[i]) 223 | xx2 = torch.clamp(xx2, max=x2[i]) 224 | yy2 = torch.clamp(yy2, max=y2[i]) 225 | w.resize_as_(xx2) 226 | h.resize_as_(yy2) 227 | w = xx2 - xx1 228 | h = yy2 - yy1 229 | # check sizes of xx1 and xx2.. 
after each iteration 230 | w = torch.clamp(w, min=0.0) 231 | h = torch.clamp(h, min=0.0) 232 | inter = w*h 233 | # IoU = i / (area(a) + area(b) - i) 234 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 235 | union = (rem_areas - inter) + area[i] 236 | IoU = inter/union # store result in iou 237 | # keep only elements with an IoU <= overlap 238 | idx = idx[IoU.le(overlap)] 239 | return keep, count 240 | -------------------------------------------------------------------------------- /实验 4.2/visualTest_building.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | from torch.autograd import Variable 6 | import torch.utils.data as data 7 | 8 | from data import BaseTransform 9 | from data.custom_for_visual import CUSTOM_CLASSES_BUILDING as labelmap_building 10 | from data.custom_for_visual import customDetection, customAnnotationTransform, CUSTOM_ROOT, CUSTOM_CLASSES_BUILDING 11 | 12 | # from ssd import build_ssd 13 | from ssd_resnet_18 import build_ssd 14 | 15 | import sys 16 | import os 17 | import time 18 | import argparse 19 | import numpy as np 20 | import pickle 21 | import cv2 22 | import math 23 | 24 | import warnings 25 | warnings.filterwarnings("ignore") 26 | 27 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 28 | parser.add_argument('--trained_model_building', 29 | default='useful_weight/CUSTOM.pth', type=str, 30 | help='Trained state_dict file path to open') 31 | parser.add_argument('--save_folder', default='eval/', type=str, 32 | help='Dir to save results') 33 | parser.add_argument('--visual_threshold', default=0.15, type=float, 34 | help='Final confidence threshold') 35 | parser.add_argument('--cuda', default=True, type=bool, 36 | help='Use cuda to train model') 37 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, help='Location of VOC root directory') 38 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 39 | args = parser.parse_args() 40 | 41 | if args.cuda and torch.cuda.is_available(): 42 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 43 | else: 44 | torch.set_default_tensor_type('torch.FloatTensor') 45 | 46 | if not os.path.exists(args.save_folder): 47 | os.mkdir(args.save_folder) 48 | 49 | 50 | def test_net(save_folder, net, cuda, testset, transform, thresh, labelmap): 51 | # dump predictions and assoc. 
ground truth to text file for now 52 | filename = save_folder + 'result_%s.txt' 53 | num_images = len(testset) 54 | for i in range(num_images): 55 | print('Testing image {:d}/{:d}....'.format(i+1, num_images)) 56 | img = testset.pull_image(i) 57 | img_id, annotation = testset.pull_anno(i) 58 | x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1) 59 | x = Variable(x.unsqueeze(0)) 60 | 61 | if cuda: 62 | x = x.cuda() 63 | 64 | y = net(x) # forward pass 65 | detections = y.data 66 | # scale each detection back up to the image 67 | scale = torch.Tensor([img.shape[1], img.shape[0], 68 | img.shape[1], img.shape[0]]) 69 | pred_num = 0 70 | for i in range(detections.size(1)): 71 | j = 0 72 | while detections[0, i, j, 0] >= args.visual_threshold: 73 | score = detections[0, i, j, 0] 74 | label_name = labelmap[i-1] 75 | pt = (detections[0, i, j, 1:]*scale).cpu().numpy() 76 | coords = (pt[0], pt[1], pt[2], pt[3]) 77 | pred_num += 1 78 | with open(filename % label_name, mode='a') as f: 79 | f.write(str(img_id) + ' ' + 80 | str(score.cpu().numpy()) + ' '+ ' '.join(str(c) for c in coords) + '\n') 81 | j += 1 82 | 83 | def xmlData(name, width, height, label): 84 | return ''' 85 | JPEGImages 86 | %s.jpg 87 | %s.jpg 88 | 89 | Unknown 90 | 91 | 92 | %d 93 | %d 94 | 3 95 | 96 | 0 97 | 98 | %s 99 | Unspecified 100 | 1 101 | 0 102 | 103 | 0 104 | 0 105 | 1 106 | 1 107 | 108 | 109 | ''' % (name, name, width, height, label) 110 | 111 | def get_output_dir(name, phase=""): 112 | filedir = os.path.join(name, phase) 113 | if not os.path.exists(filedir): 114 | os.makedirs(filedir) 115 | return filedir 116 | 117 | def is_rect_intersect(rect1, rect2): 118 | rect1_x1 = math.floor(rect1['x1']) 119 | rect1_y1 = math.floor(rect1['y1']) 120 | rect1_x2 = math.floor(rect1['x2']) 121 | rect1_y2 = math.floor(rect1['y2']) 122 | 123 | rect2_x1 = math.floor(rect2['x1']) 124 | rect2_y1 = math.floor(rect2['y1']) 125 | rect2_x2 = math.floor(rect2['x2']) 126 | rect2_y2 = math.floor(rect2['y2']) 127 | 128 | zx = abs(rect1_x1 + rect1_x2 - rect2_x1 - rect2_x2) 129 | x = abs(rect1_x1 - rect1_x2) + abs(rect2_x1 - rect2_x2) 130 | 131 | zy = abs(rect1_y1 + rect1_y2 - rect2_y1 - rect2_y2) 132 | y = abs(rect1_y1 - rect1_y2) + abs(rect2_y1 - rect2_y2) 133 | 134 | return True if zx <= x and zy <= y else False 135 | 136 | 137 | def test_custom(): 138 | DEBUG = False 139 | set_type = 'test' 140 | 141 | if not os.path.exists(os.path.join(args.save_folder, 'result_building.txt')): 142 | # load net 143 | num_classes_building = len(labelmap_building) + 1 # +1 for background 144 | net = build_ssd('test', 300, num_classes_building) # initialize SSD 145 | net.load_state_dict(torch.load(args.trained_model_building)) 146 | net.eval() 147 | 148 | print('Finished loading model!') 149 | # load data 150 | dataset1 = customDetection(args.custom_root, [('buildingwater', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_BUILDING, range(len(CUSTOM_CLASSES_BUILDING)))))) 151 | if args.cuda: 152 | net = net.cuda() 153 | cudnn.benchmark = True 154 | # evaluation 155 | 156 | test_net(args.save_folder, net, args.cuda, dataset1, 157 | BaseTransform(net.size, (104, 117, 123)), 158 | thresh=args.visual_threshold, labelmap=labelmap_building) 159 | 160 | rootPath = 'F:/ssd/data/video/buildingwater' 161 | img_path = os.path.join(rootPath, 'JPEGImages', '%s.jpg') 162 | imgList_building = {} 163 | imgList_water = {} 164 | 165 | with open(os.path.join(args.save_folder, 'result_building.txt'), 'r') as f: 166 | text_lines = f.readlines() 167 | 
for line in text_lines: 168 | info = line.split(" ") 169 | name, score, x1, y1, x2, y2 = info 170 | if name in imgList_building: 171 | imgList_building[name].append({ 172 | 'score': float(score), 173 | 'x1': float(x1), 174 | 'y1': float(y1), 175 | 'x2': float(x2), 176 | 'y2': float(y2) 177 | }) 178 | else: 179 | imgList_building[name] = [{ 180 | 'score': float(score), 181 | 'x1': float(x1), 182 | 'y1': float(y1), 183 | 'x2': float(x2), 184 | 'y2': float(y2) 185 | }] 186 | 187 | with open(os.path.join(args.save_folder, 'result_water.txt'), 'r') as f: 188 | text_lines = f.readlines() 189 | for line in text_lines: 190 | info = line.split(" ") 191 | name, score, x1, y1, x2, y2 = info 192 | if name in imgList_water: 193 | imgList_water[name].append({ 194 | 'score': float(score), 195 | 'x1': float(x1), 196 | 'y1': float(y1), 197 | 'x2': float(x2), 198 | 'y2': float(y2) 199 | }) 200 | else: 201 | imgList_water[name] = [{ 202 | 'score': float(score), 203 | 'x1': float(x1), 204 | 'y1': float(y1), 205 | 'x2': float(x2), 206 | 'y2': float(y2) 207 | }] 208 | 209 | opacity = 0.8 210 | for name in imgList_building: 211 | img_building = imgList_building[name] 212 | img_water = imgList_water[name] if name in imgList_water else [] 213 | 214 | image = cv2.imread(img_path % name) 215 | (h, w, c) = image.shape 216 | img_black = image.copy() 217 | img_cp = image.copy() 218 | img_black.fill(1) 219 | 220 | 221 | for building in img_building: 222 | for water in img_water: 223 | if is_rect_intersect(building, water): 224 | x1_b = max(math.floor(building['x1']), 0) 225 | y1_b = max(math.floor(building['y1']), 0) 226 | x2_b = min(math.floor(building['x2']), w) 227 | y2_b = min(math.floor(building['y2']), h) 228 | cv2.rectangle(image, (x1_b-2, y1_b-2), (x2_b+2, y2_b+2), (0,0,255), 5) 229 | img_black[y1_b:y2_b, x1_b:x2_b] = 0 230 | 231 | 232 | # for building in img_building: 233 | # x1_b = max(math.floor(building['x1']), 0) 234 | # y1_b = max(math.floor(building['y1']), 0) 235 | # x2_b = min(math.floor(building['x2']), w) 236 | # y2_b = min(math.floor(building['y2']), h) 237 | # # cv2.rectangle(image, (x1_b, y1_b), (x2_b, y2_b), (0,0,255), 5) 238 | # img_black[y1_b:y2_b, x1_b:x2_b] = 0 239 | image[:,:,0] = (1 - img_black[:,:,0]) * (img_cp[:,:,0]) + img_black[:,:,0] * image[:,:,0] 240 | image[:,:,1] = (1 - img_black[:,:,1]) * (img_cp[:,:,1]) + img_black[:,:,1] * image[:,:,1] 241 | image[:,:,2] = (1 - img_black[:,:,2]) * (img_cp[:,:,2] ) + img_black[:,:,2] * image[:,:,2] 242 | 243 | image[:,:,0] = (1 - img_black[:,:,0]) * (image[:,:,0] * opacity + 0 * (1 - opacity)) + img_black[:,:,0] * image[:,:,0] 244 | image[:,:,1] = (1 - img_black[:,:,1]) * (image[:,:,1] * opacity + 0 * (1 - opacity)) + img_black[:,:,1] * image[:,:,1] 245 | image[:,:,2] = (1 - img_black[:,:,2]) * (image[:,:,2] * opacity + 255 * (1 - opacity)) + img_black[:,:,2] * image[:,:,2] 246 | 247 | # for water in img_water: 248 | # x1_w = max(math.floor(water['x1']), 0) 249 | # y1_w = max(math.floor(water['y1']), 0) 250 | # x2_w = min(math.floor(water['x2']), w) 251 | # y2_w = min(math.floor(water['y2']), h) 252 | # cv2.rectangle(image, (x1_w, y1_w), (x2_w, y2_w), (0,255,0), 5) 253 | 254 | image = cv2.resize(image, (512, 512)) 255 | # cv2.putText(image, 'building', (10, 40), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 0, 255), 2) 256 | # cv2.putText(image, 'water', (10, 80), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 255, 0), 2) 257 | cv2.imshow('w2', image) 258 | cv2.waitKey() 259 | 260 | if __name__ == '__main__': 261 | test_custom() 262 | 
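The geometric test behind is_rect_intersect above can be restated on its own: two axis-aligned rectangles overlap exactly when twice the distance between their centres, along each axis, is no larger than the sum of their extents along that axis. The helper and sample boxes below are illustrative only (the script above additionally floors the coordinates before comparing).

def rects_intersect(r1, r2):
    zx = abs(r1['x1'] + r1['x2'] - r2['x1'] - r2['x2'])        # 2 * |dx between centres|
    x  = abs(r1['x1'] - r1['x2']) + abs(r2['x1'] - r2['x2'])   # sum of widths
    zy = abs(r1['y1'] + r1['y2'] - r2['y1'] - r2['y2'])        # 2 * |dy between centres|
    y  = abs(r1['y1'] - r1['y2']) + abs(r2['y1'] - r2['y2'])   # sum of heights
    return zx <= x and zy <= y

building = {'x1': 0, 'y1': 0, 'x2': 10, 'y2': 10}
water    = {'x1': 5, 'y1': 5, 'x2': 20, 'y2': 20}
print(rects_intersect(building, water))                                      # True -> highlighted
print(rects_intersect(building, {'x1': 30, 'y1': 0, 'x2': 40, 'y2': 10}))    # False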
-------------------------------------------------------------------------------- /实验 4.2/trainCustom_18.py: -------------------------------------------------------------------------------- 1 | from data import * 2 | from utils.augmentations import SSDAugmentation 3 | from layers.modules import MultiBoxLoss 4 | from ssd_resnet_18 import build_ssd 5 | import os 6 | import sys 7 | import time 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.nn.init as init 14 | import torch.utils.data as data 15 | import numpy as np 16 | import argparse 17 | 18 | from data.voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 19 | 20 | from data.coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 21 | 22 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 23 | 24 | def str2bool(v): 25 | return v.lower() in ("yes", "true", "t", "1") 26 | 27 | 28 | parser = argparse.ArgumentParser( 29 | description='Single Shot MultiBox Detector Training With Pytorch') 30 | train_set = parser.add_mutually_exclusive_group() 31 | parser.add_argument('--dataset', default='CUSTOM', choices=['VOC', 'COCO', 'CUSTOM'], 32 | type=str, help='VOC or COCO') 33 | parser.add_argument('--dataset_root', default=CUSTOM_ROOT, # VOC_ROOT, 34 | help='Dataset root directory path') 35 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', 36 | help='Pretrained base model') 37 | parser.add_argument('--batch_size', default=32, type=int, 38 | help='Batch size for training') 39 | parser.add_argument('--resume', default=None, type=str, 40 | help='Checkpoint state_dict file to resume training from') 41 | parser.add_argument('--start_iter', default=0, type=int, 42 | help='Resume training at this iter') 43 | parser.add_argument('--num_workers', default=4, type=int, 44 | help='Number of workers used in dataloading') 45 | parser.add_argument('--cuda', default=True, type=str2bool, 46 | help='Use CUDA to train model') 47 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 48 | help='initial learning rate') 49 | parser.add_argument('--momentum', default=0.9, type=float, 50 | help='Momentum value for optim') 51 | parser.add_argument('--weight_decay', default=5e-4, type=float, 52 | help='Weight decay for SGD') 53 | parser.add_argument('--gamma', default=0.1, type=float, 54 | help='Gamma update for SGD') 55 | parser.add_argument('--visdom', default=False, type=str2bool, 56 | help='Use visdom for loss visualization') 57 | parser.add_argument('--save_folder', default='weights/', 58 | help='Directory for saving checkpoint models') 59 | args = parser.parse_args() 60 | 61 | 62 | if torch.cuda.is_available(): 63 | if args.cuda: 64 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 65 | if not args.cuda: 66 | print("WARNING: It looks like you have a CUDA device, but aren't " + 67 | "using CUDA.\nRun with --cuda for optimal training speed.") 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | else: 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | 72 | if not os.path.exists(args.save_folder): 73 | os.mkdir(args.save_folder) 74 | 75 | 76 | def train(): 77 | if args.dataset == 'COCO': 78 | if args.dataset_root == VOC_ROOT: 79 | if not os.path.exists(COCO_ROOT): 80 | parser.error('Must specify dataset_root if specifying dataset') 81 | print("WARNING: Using default COCO dataset_root 
because " + 82 | "--dataset_root was not specified.") 83 | args.dataset_root = COCO_ROOT 84 | cfg = coco 85 | dataset = COCODetection(root=args.dataset_root, 86 | transform=SSDAugmentation(cfg['min_dim'], 87 | MEANS)) 88 | elif args.dataset == 'VOC': 89 | if args.dataset_root == COCO_ROOT: 90 | parser.error('Must specify dataset if specifying dataset_root') 91 | cfg = voc 92 | dataset = VOCDetection(root=args.dataset_root, 93 | transform=SSDAugmentation(cfg['min_dim'], 94 | MEANS)) 95 | 96 | elif args.dataset == 'CUSTOM': 97 | if args.dataset_root == VOC_ROOT or args.dataset_root == COCO_ROOT: 98 | parser.error('Must specify dataset if specifying dataset_root') 99 | cfg = custom 100 | dataset = customDetection(root=args.dataset_root, 101 | transform=SSDAugmentation(cfg['min_dim'], 102 | MEANS)) 103 | 104 | if args.visdom: 105 | import visdom 106 | viz = visdom.Visdom() 107 | 108 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 109 | net = ssd_net 110 | 111 | if args.cuda: 112 | net = torch.nn.DataParallel(ssd_net) 113 | cudnn.benchmark = True 114 | 115 | if args.resume: 116 | print('Resuming training, loading {}...'.format(args.resume)) 117 | ssd_net.load_weights(args.resume) 118 | else: 119 | pass 120 | # resnet_weights = torch.load(args.save_folder + args.basenet) 121 | # print('Loading base network...') 122 | # ssd_net.resnet.load_state_dict(resnet_weights) 123 | 124 | if args.cuda: 125 | net = net.cuda() 126 | 127 | if not args.resume: 128 | print('Initializing weights...') 129 | # initialize newly added layers' weights with xavier method 130 | ssd_net.extras.apply(weights_init) 131 | ssd_net.loc.apply(weights_init) 132 | ssd_net.conf.apply(weights_init) 133 | 134 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, 135 | weight_decay=args.weight_decay) 136 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, 137 | False, args.cuda) 138 | 139 | net.train() 140 | # loss counters 141 | loc_loss = 0 142 | conf_loss = 0 143 | epoch = 0 144 | print('Loading the dataset...') 145 | 146 | epoch_size = len(dataset) // args.batch_size 147 | print('Epochj Size:', epoch_size) 148 | print('Training SSD on:', dataset.name) 149 | print('Using the specified args:') 150 | print(args) 151 | 152 | step_index = 0 153 | 154 | if args.visdom: 155 | vis_title = 'SSD.PyTorch on ' + dataset.name 156 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 157 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 158 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 159 | 160 | data_loader = data.DataLoader(dataset, args.batch_size, 161 | num_workers=args.num_workers, 162 | shuffle=True, collate_fn=detection_collate, 163 | pin_memory=True) 164 | # create batch iterator 165 | batch_iterator = iter(data_loader) 166 | for iteration in range(args.start_iter, cfg['max_iter']): 167 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): 168 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 169 | 'append', epoch_size) 170 | # reset epoch loss counters 171 | loc_loss = 0 172 | conf_loss = 0 173 | epoch += 1 174 | 175 | if iteration in cfg['lr_steps']: 176 | step_index += 1 177 | adjust_learning_rate(optimizer, args.gamma, step_index) 178 | 179 | # load train data 180 | # images, targets = next(batch_iterator) 181 | try: 182 | images, targets = next(batch_iterator) 183 | except StopIteration: 184 | batch_iterator = iter(data_loader) 185 | images, targets = next(batch_iterator) 186 | 187 | 
if args.cuda: 188 | images = Variable(images.cuda()) 189 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 190 | else: 191 | images = Variable(images) 192 | targets = [Variable(ann, volatile=True) for ann in targets] 193 | # forward 194 | t0 = time.time() 195 | out = net(images) 196 | # backprop 197 | optimizer.zero_grad() 198 | loss_l, loss_c = criterion(out, targets) 199 | loss = loss_l + loss_c 200 | loss.backward() 201 | optimizer.step() 202 | t1 = time.time() 203 | # loc_loss += loss_l.data[0] 204 | # conf_loss += loss_c.data[0] 205 | loc_loss += loss_l.item() 206 | conf_loss += loss_c.item() 207 | 208 | if iteration % 10 == 0: 209 | print('timer: %.4f sec.' % (t1 - t0)) 210 | # print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ') 211 | print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ') 212 | 213 | if args.visdom: 214 | # update_vis_plot(iteration, loss_l.data[0], loss_c.data[0], 215 | # iter_plot, epoch_plot, 'append') 216 | update_vis_plot(iteration, loss_l.item(), loss_c.item(), 217 | iter_plot, epoch_plot, 'append') 218 | 219 | if iteration != 0 and iteration % 5000 == 0: 220 | print('Saving state, iter:', iteration) 221 | torch.save(ssd_net.state_dict(), args.save_folder + '/ssd300_COCO_' + 222 | repr(iteration) + '.pth') 223 | torch.save(ssd_net.state_dict(), 224 | args.save_folder + '' + args.dataset + '.pth') 225 | 226 | 227 | def adjust_learning_rate(optimizer, gamma, step): 228 | """Sets the learning rate to the initial LR decayed by 10 at every 229 | specified step 230 | # Adapted from PyTorch Imagenet example: 231 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 232 | """ 233 | lr = args.lr * (gamma ** (step)) 234 | for param_group in optimizer.param_groups: 235 | param_group['lr'] = lr 236 | 237 | 238 | def xavier(param): 239 | init.xavier_uniform(param) 240 | 241 | 242 | def weights_init(m): 243 | if isinstance(m, nn.Conv2d): 244 | xavier(m.weight.data) 245 | m.bias.data.zero_() 246 | 247 | 248 | def create_vis_plot(_xlabel, _ylabel, _title, _legend): 249 | return viz.line( 250 | X=torch.zeros((1,)).cpu(), 251 | Y=torch.zeros((1, 3)).cpu(), 252 | opts=dict( 253 | xlabel=_xlabel, 254 | ylabel=_ylabel, 255 | title=_title, 256 | legend=_legend 257 | ) 258 | ) 259 | 260 | 261 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type, 262 | epoch_size=1): 263 | viz.line( 264 | X=torch.ones((1, 3)).cpu() * iteration, 265 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size, 266 | win=window1, 267 | update=update_type 268 | ) 269 | # initialize epoch plot on first iteration 270 | if iteration == 0: 271 | viz.line( 272 | X=torch.zeros((1, 3)).cpu(), 273 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(), 274 | win=window2, 275 | update=True 276 | ) 277 | 278 | 279 | if __name__ == '__main__': 280 | train() 281 | -------------------------------------------------------------------------------- /实验 4.1/trainCustom_101.py: -------------------------------------------------------------------------------- 1 | from data import * 2 | from utils.augmentations import SSDAugmentation 3 | from layers.modules import MultiBoxLoss 4 | from ssd_resnet_101 import build_ssd 5 | import os 6 | import sys 7 | import time 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.nn.init as init 14 | import torch.utils.data as data 15 | import 
numpy as np 16 | import argparse 17 | 18 | from data.voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 19 | 20 | from data.coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 21 | 22 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 23 | 24 | def str2bool(v): 25 | return v.lower() in ("yes", "true", "t", "1") 26 | 27 | 28 | parser = argparse.ArgumentParser( 29 | description='Single Shot MultiBox Detector Training With Pytorch') 30 | train_set = parser.add_mutually_exclusive_group() 31 | parser.add_argument('--dataset', default='CUSTOM', choices=['VOC', 'COCO', 'CUSTOM'], 32 | type=str, help='VOC or COCO') 33 | parser.add_argument('--dataset_root', default=CUSTOM_ROOT, # VOC_ROOT, 34 | help='Dataset root directory path') 35 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', 36 | help='Pretrained base model') 37 | parser.add_argument('--batch_size', default=32, type=int, 38 | help='Batch size for training') 39 | parser.add_argument('--resume', default=None, type=str, 40 | help='Checkpoint state_dict file to resume training from') 41 | parser.add_argument('--start_iter', default=0, type=int, 42 | help='Resume training at this iter') 43 | parser.add_argument('--num_workers', default=4, type=int, 44 | help='Number of workers used in dataloading') 45 | parser.add_argument('--cuda', default=True, type=str2bool, 46 | help='Use CUDA to train model') 47 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 48 | help='initial learning rate') 49 | parser.add_argument('--momentum', default=0.9, type=float, 50 | help='Momentum value for optim') 51 | parser.add_argument('--weight_decay', default=5e-4, type=float, 52 | help='Weight decay for SGD') 53 | parser.add_argument('--gamma', default=0.1, type=float, 54 | help='Gamma update for SGD') 55 | parser.add_argument('--visdom', default=False, type=str2bool, 56 | help='Use visdom for loss visualization') 57 | parser.add_argument('--save_folder', default='weights/', 58 | help='Directory for saving checkpoint models') 59 | args = parser.parse_args() 60 | 61 | 62 | if torch.cuda.is_available(): 63 | if args.cuda: 64 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 65 | if not args.cuda: 66 | print("WARNING: It looks like you have a CUDA device, but aren't " + 67 | "using CUDA.\nRun with --cuda for optimal training speed.") 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | else: 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | 72 | if not os.path.exists(args.save_folder): 73 | os.mkdir(args.save_folder) 74 | 75 | 76 | def train(): 77 | if args.dataset == 'COCO': 78 | if args.dataset_root == VOC_ROOT: 79 | if not os.path.exists(COCO_ROOT): 80 | parser.error('Must specify dataset_root if specifying dataset') 81 | print("WARNING: Using default COCO dataset_root because " + 82 | "--dataset_root was not specified.") 83 | args.dataset_root = COCO_ROOT 84 | cfg = coco 85 | dataset = COCODetection(root=args.dataset_root, 86 | transform=SSDAugmentation(cfg['min_dim'], 87 | MEANS)) 88 | elif args.dataset == 'VOC': 89 | if args.dataset_root == COCO_ROOT: 90 | parser.error('Must specify dataset if specifying dataset_root') 91 | cfg = voc 92 | dataset = VOCDetection(root=args.dataset_root, 93 | transform=SSDAugmentation(cfg['min_dim'], 94 | MEANS)) 95 | 96 | elif args.dataset == 'CUSTOM': 97 | if args.dataset_root == VOC_ROOT or args.dataset_root == COCO_ROOT: 98 | parser.error('Must specify 
dataset if specifying dataset_root') 99 | cfg = custom 100 | dataset = customDetection(root=args.dataset_root, 101 | transform=SSDAugmentation(cfg['min_dim'], 102 | MEANS)) 103 | 104 | if args.visdom: 105 | import visdom 106 | viz = visdom.Visdom() 107 | 108 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 109 | net = ssd_net 110 | 111 | if args.cuda: 112 | net = torch.nn.DataParallel(ssd_net) 113 | cudnn.benchmark = True 114 | 115 | if args.resume: 116 | print('Resuming training, loading {}...'.format(args.resume)) 117 | ssd_net.load_weights(args.resume) 118 | else: 119 | pass 120 | # resnet_weights = torch.load(args.save_folder + args.basenet) 121 | # print('Loading base network...') 122 | # ssd_net.resnet.load_state_dict(resnet_weights) 123 | 124 | if args.cuda: 125 | net = net.cuda() 126 | 127 | if not args.resume: 128 | print('Initializing weights...') 129 | # initialize newly added layers' weights with xavier method 130 | ssd_net.extras.apply(weights_init) 131 | ssd_net.loc.apply(weights_init) 132 | ssd_net.conf.apply(weights_init) 133 | 134 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, 135 | weight_decay=args.weight_decay) 136 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, 137 | False, args.cuda) 138 | 139 | net.train() 140 | # loss counters 141 | loc_loss = 0 142 | conf_loss = 0 143 | epoch = 0 144 | print('Loading the dataset...') 145 | 146 | epoch_size = len(dataset) // args.batch_size 147 | print('Epochj Size:', epoch_size) 148 | print('Training SSD on:', dataset.name) 149 | print('Using the specified args:') 150 | print(args) 151 | 152 | step_index = 0 153 | 154 | if args.visdom: 155 | vis_title = 'SSD.PyTorch on ' + dataset.name 156 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 157 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 158 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 159 | 160 | data_loader = data.DataLoader(dataset, args.batch_size, 161 | num_workers=args.num_workers, 162 | shuffle=True, collate_fn=detection_collate, 163 | pin_memory=True) 164 | # create batch iterator 165 | batch_iterator = iter(data_loader) 166 | for iteration in range(args.start_iter, cfg['max_iter']): 167 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): 168 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 169 | 'append', epoch_size) 170 | # reset epoch loss counters 171 | loc_loss = 0 172 | conf_loss = 0 173 | epoch += 1 174 | 175 | if iteration in cfg['lr_steps']: 176 | step_index += 1 177 | adjust_learning_rate(optimizer, args.gamma, step_index) 178 | 179 | # load train data 180 | # images, targets = next(batch_iterator) 181 | try: 182 | images, targets = next(batch_iterator) 183 | except StopIteration: 184 | batch_iterator = iter(data_loader) 185 | images, targets = next(batch_iterator) 186 | 187 | if args.cuda: 188 | images = Variable(images.cuda()) 189 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 190 | else: 191 | images = Variable(images) 192 | targets = [Variable(ann, volatile=True) for ann in targets] 193 | # forward 194 | t0 = time.time() 195 | out = net(images) 196 | # backprop 197 | optimizer.zero_grad() 198 | loss_l, loss_c = criterion(out, targets) 199 | loss = loss_l + loss_c 200 | loss.backward() 201 | optimizer.step() 202 | t1 = time.time() 203 | # loc_loss += loss_l.data[0] 204 | # conf_loss += loss_c.data[0] 205 | loc_loss += loss_l.item() 206 | conf_loss += loss_c.item() 207 | 208 
| if iteration % 10 == 0: 209 | print('timer: %.4f sec.' % (t1 - t0)) 210 | # print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data[0]), end=' ') 211 | print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ') 212 | 213 | if args.visdom: 214 | # update_vis_plot(iteration, loss_l.data[0], loss_c.data[0], 215 | # iter_plot, epoch_plot, 'append') 216 | update_vis_plot(iteration, loss_l.item(), loss_c.item(), 217 | iter_plot, epoch_plot, 'append') 218 | 219 | if iteration != 0 and iteration % 5000 == 0: 220 | print('Saving state, iter:', iteration) 221 | torch.save(ssd_net.state_dict(), args.save_folder + '/ssd300_COCO_' + 222 | repr(iteration) + '.pth') 223 | torch.save(ssd_net.state_dict(), 224 | args.save_folder + '' + args.dataset + '.pth') 225 | 226 | 227 | def adjust_learning_rate(optimizer, gamma, step): 228 | """Sets the learning rate to the initial LR decayed by 10 at every 229 | specified step 230 | # Adapted from PyTorch Imagenet example: 231 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 232 | """ 233 | lr = args.lr * (gamma ** (step)) 234 | for param_group in optimizer.param_groups: 235 | param_group['lr'] = lr 236 | 237 | 238 | def xavier(param): 239 | init.xavier_uniform(param) 240 | 241 | 242 | def weights_init(m): 243 | if isinstance(m, nn.Conv2d): 244 | xavier(m.weight.data) 245 | m.bias.data.zero_() 246 | 247 | 248 | def create_vis_plot(_xlabel, _ylabel, _title, _legend): 249 | return viz.line( 250 | X=torch.zeros((1,)).cpu(), 251 | Y=torch.zeros((1, 3)).cpu(), 252 | opts=dict( 253 | xlabel=_xlabel, 254 | ylabel=_ylabel, 255 | title=_title, 256 | legend=_legend 257 | ) 258 | ) 259 | 260 | 261 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type, 262 | epoch_size=1): 263 | viz.line( 264 | X=torch.ones((1, 3)).cpu() * iteration, 265 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size, 266 | win=window1, 267 | update=update_type 268 | ) 269 | # initialize epoch plot on first iteration 270 | if iteration == 0: 271 | viz.line( 272 | X=torch.zeros((1, 3)).cpu(), 273 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(), 274 | win=window2, 275 | update=True 276 | ) 277 | 278 | 279 | if __name__ == '__main__': 280 | train() 281 | -------------------------------------------------------------------------------- /实验 4.1/visualTest_gauge.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | from torch.autograd import Variable 6 | import torch.utils.data as data 7 | 8 | from data import BaseTransform 9 | from data.custom_for_visual import CUSTOM_CLASSES_GAUGE as labelmap_gauge 10 | from data.custom_for_visual import CUSTOM_CLASSES_WATERLINE as labelmap_waterline 11 | from data.custom_for_visual import customDetection, customAnnotationTransform, CUSTOM_ROOT, CUSTOM_CLASSES_GAUGE, CUSTOM_CLASSES_WATERLINE 12 | 13 | # from ssd import build_ssd 14 | from ssd_resnet_101 import build_ssd 15 | 16 | import sys 17 | import os 18 | import time 19 | import argparse 20 | import numpy as np 21 | import pickle 22 | import cv2 23 | import math 24 | 25 | import warnings 26 | warnings.filterwarnings("ignore") 27 | 28 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 29 | parser.add_argument('--trained_model_gauge', 30 | default='useful_weight/CUSTOM_gauge.pth', type=str, 31 | help='Trained state_dict file path 
to open') 32 | parser.add_argument('--trained_model_waterline', 33 | default='useful_weight/CUSTOM_mark.pth', type=str, 34 | help='Trained state_dict file path to open') 35 | parser.add_argument('--save_folder', default='eval/', type=str, 36 | help='Dir to save results') 37 | parser.add_argument('--visual_threshold', default=0.1, type=float, 38 | help='Final confidence threshold') 39 | parser.add_argument('--cuda', default=True, type=bool, 40 | help='Use cuda to train model') 41 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, help='Location of VOC root directory') 42 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 43 | args = parser.parse_args() 44 | 45 | if args.cuda and torch.cuda.is_available(): 46 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 47 | else: 48 | torch.set_default_tensor_type('torch.FloatTensor') 49 | 50 | if not os.path.exists(args.save_folder): 51 | os.mkdir(args.save_folder) 52 | 53 | 54 | def test_net(save_folder, net, cuda, testset, transform, thresh, labelmap): 55 | # dump predictions and assoc. ground truth to text file for now 56 | filename = save_folder + 'result_%s.txt' 57 | num_images = len(testset) 58 | for i in range(num_images): 59 | print('Testing image {:d}/{:d}....'.format(i+1, num_images)) 60 | img = testset.pull_image(i) 61 | img_id, annotation = testset.pull_anno(i) 62 | x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1) 63 | x = Variable(x.unsqueeze(0)) 64 | 65 | if cuda: 66 | x = x.cuda() 67 | 68 | y = net(x) # forward pass 69 | detections = y.data 70 | # scale each detection back up to the image 71 | scale = torch.Tensor([img.shape[1], img.shape[0], 72 | img.shape[1], img.shape[0]]) 73 | pred_num = 0 74 | for i in range(detections.size(1)): 75 | j = 0 76 | while detections[0, i, j, 0] >= 0.1: 77 | score = detections[0, i, j, 0] 78 | label_name = labelmap[i-1] 79 | pt = (detections[0, i, j, 1:]*scale).cpu().numpy() 80 | coords = (pt[0], pt[1], pt[2], pt[3]) 81 | pred_num += 1 82 | with open(filename % label_name, mode='a') as f: 83 | f.write(str(img_id) + ' ' + 84 | str(score.cpu().numpy()) + ' '+ ' '.join(str(c) for c in coords) + '\n') 85 | j += 1 86 | 87 | def xmlData(name, width, height, label): 88 | return ''' 89 | JPEGImages 90 | %s.jpg 91 | %s.jpg 92 | 93 | Unknown 94 | 95 | 96 | %d 97 | %d 98 | 3 99 | 100 | 0 101 | 102 | %s 103 | Unspecified 104 | 1 105 | 0 106 | 107 | 0 108 | 0 109 | 1 110 | 1 111 | 112 | 113 | ''' % (name, name, width, height, label) 114 | 115 | def get_output_dir(name, phase=""): 116 | filedir = os.path.join(name, phase) 117 | if not os.path.exists(filedir): 118 | os.makedirs(filedir) 119 | return filedir 120 | 121 | def test_custom(): 122 | DEBUG = False 123 | set_type = 'test' 124 | devkit_path = args.custom_root + 'test' 125 | devkit_annopath = os.path.join(args.custom_root, 'test', 'Annotations') 126 | devkit_imgpath = os.path.join(args.custom_root, 'test', 'JPEGImages') 127 | devkit_imgsetpath = os.path.join(args.custom_root, 'test', 'ImageSets', 'Main') 128 | 129 | # load net 130 | num_classes_gauge = len(labelmap_gauge) + 1 # +1 for background 131 | net = build_ssd('test', 300, num_classes_gauge) # initialize SSD 132 | net.load_state_dict(torch.load(args.trained_model_gauge)) 133 | net.eval() 134 | 135 | num_classes_waterline = len(labelmap_waterline) + 1 # +1 for background 136 | net1 = build_ssd('test', 300, num_classes_waterline) # initialize SSD 137 | net1.load_state_dict(torch.load(args.trained_model_waterline)) 138 | 
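# Note on the overall flow (a summary, not code from this file): `net` first localizes the
# gauge in the full frame, the frame is cropped to that box and given a stub annotation, and
# `net1` then detects the waterline/mark inside the crop. Crop-relative boxes are mapped back
# to full-frame coordinates later in this script by adding the gauge box's top-left corner,
# which is what the x1_g + x1_w / y1_g + y1_w terms in the drawing code implement.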
net1.eval() 139 | print('Finished loading model!') 140 | # load data 141 | dataset1 = customDetection(args.custom_root, [('gauge', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_GAUGE, range(len(CUSTOM_CLASSES_GAUGE)))))) 142 | if args.cuda: 143 | net = net.cuda() 144 | cudnn.benchmark = True 145 | # evaluation 146 | test_net(args.save_folder, net, args.cuda, dataset1, 147 | BaseTransform(net.size, (104, 117, 123)), 148 | thresh=args.visual_threshold, labelmap=labelmap_gauge) 149 | 150 | rootPath = 'F:/ssd/data/video/gauge' 151 | rootPath_temp = 'F:/ssd/data/video/test' 152 | imgList_gauge = {} 153 | 154 | with open(os.path.join(args.save_folder, 'result_gauge.txt'), 'r') as f: 155 | text_lines = f.readlines() 156 | for line in text_lines: 157 | info = line.split(" ") 158 | name, score, x1, y1, x2, y2 = info 159 | if name in imgList_gauge: 160 | if float(score) > imgList_gauge[name]['score']: 161 | imgList_gauge[name] = { 162 | 'score': float(score), 163 | 'x1': float(x1), 164 | 'y1': float(y1), 165 | 'x2': float(x2), 166 | 'y2': float(y2) 167 | } 168 | else: 169 | imgList_gauge[name] = { 170 | 'score': float(score), 171 | 'x1': float(x1), 172 | 'y1': float(y1), 173 | 'x2': float(x2), 174 | 'y2': float(y2) 175 | } 176 | 177 | img_path = os.path.join(rootPath, 'JPEGImages', '%s.jpg') 178 | devkit_imgpath = os.path.join(get_output_dir(devkit_imgpath), '%s.jpg') 179 | devkit_imgsetpath = os.path.join(get_output_dir(devkit_imgsetpath), '%s.txt') 180 | devkit_annopath = os.path.join(get_output_dir(devkit_annopath), '%s.xml') 181 | with open(devkit_imgsetpath % ('test'), 'w') as f: 182 | for obj in imgList_gauge.items(): 183 | name, img = obj 184 | image = cv2.imread(img_path % name) 185 | (h, w, c) = image.shape 186 | x1 = max(math.floor(img['x1']), 0) 187 | y1 = max(math.floor(img['y1']), 0) 188 | x2 = min(math.floor(img['x2']), w) 189 | y2 = min(math.floor(img['y2']), h) 190 | if DEBUG: 191 | cv2.rectangle(image, (x1, y1), (x2, y2), (255,0,0), 5) 192 | image = cv2.resize(image, (512, 512)) 193 | cv2.imshow('w1', image) 194 | cv2.waitKey() 195 | else: 196 | image = image[y1:y2, x1:x2] 197 | # cv2.imshow('w1', image) 198 | cv2.imwrite(devkit_imgpath % name, image, [100]) 199 | f.write(name + '\n') 200 | # cv2.waitKey() 201 | with open(devkit_annopath % (name), 'w') as f_a: 202 | f_a.write(xmlData(name, x2 - x1, y2 - y1, 'waterline')) 203 | 204 | dataset2 = customDetection(args.custom_root, [('test', set_type)], None, customAnnotationTransform(class_to_ind=dict(zip(CUSTOM_CLASSES_WATERLINE, range(len(CUSTOM_CLASSES_WATERLINE)))))) 205 | 206 | if args.cuda: 207 | net1 = net1.cuda() 208 | cudnn.benchmark = True 209 | 210 | # evaluation 211 | test_net(args.save_folder, net1, args.cuda, dataset2, 212 | BaseTransform(net.size, (104, 117, 123)), 213 | thresh=args.visual_threshold, labelmap=labelmap_waterline) 214 | 215 | imgList_waterline = {} 216 | with open(os.path.join(args.save_folder, 'result_waterline.txt'), 'r') as f: 217 | text_lines = f.readlines() 218 | for line in text_lines: 219 | info = line.split(" ") 220 | name, score, x1, y1, x2, y2 = info 221 | if name in imgList_waterline: 222 | if float(score) > imgList_waterline[name]['score']: 223 | imgList_waterline[name] = { 224 | 'score': float(score), 225 | 'x1': float(x1), 226 | 'y1': float(y1), 227 | 'x2': float(x2), 228 | 'y2': float(y2) 229 | } 230 | else: 231 | imgList_waterline[name] = { 232 | 'score': float(score), 233 | 'x1': float(x1), 234 | 'y1': float(y1), 235 | 'x2': float(x2), 236 | 'y2': 
float(y2) 237 | } 238 | 239 | imgList_mark = {} 240 | with open(os.path.join(args.save_folder, 'result_mark.txt'), 'r') as f: 241 | text_lines = f.readlines() 242 | for line in text_lines: 243 | info = line.split(" ") 244 | name, score, x1, y1, x2, y2 = info 245 | if name in imgList_mark: 246 | if float(score) > imgList_mark[name]['score']: 247 | imgList_mark[name] = { 248 | 'score': float(score), 249 | 'x1': float(x1), 250 | 'y1': float(y1), 251 | 'x2': float(x2), 252 | 'y2': float(y2) 253 | } 254 | else: 255 | imgList_mark[name] = { 256 | 'score': float(score), 257 | 'x1': float(x1), 258 | 'y1': float(y1), 259 | 'x2': float(x2), 260 | 'y2': float(y2) 261 | } 262 | 263 | cv2.namedWindow('w2',1) 264 | use_origin = True 265 | 266 | if not use_origin: 267 | img_path = os.path.join(rootPath_temp, 'JPEGImages', '%s.jpg') 268 | count = 0 269 | for name in imgList_gauge: 270 | img_gauge = imgList_gauge[name] 271 | img_waterline = imgList_waterline[name] 272 | img_mark = imgList_mark[name] 273 | 274 | if not use_origin: 275 | image = cv2.imread(img_path % name) 276 | (h, w, c) = image.shape 277 | 278 | x1_w = max(math.floor(img_waterline['x1']), 0) 279 | y1_w = max(math.floor(img_waterline['y1']), 0) 280 | x2_w = min(math.floor(img_waterline['x2']), w) 281 | y2_w = min(math.floor(img_waterline['y2']), h) 282 | 283 | x1_m = max(math.floor(img_mark['x1']), 0) 284 | y1_m = max(math.floor(img_mark['y1']), 0) 285 | x2_m = min(math.floor(img_mark['x2']), w) 286 | y2_m = min(math.floor(img_mark['y2']), h) 287 | 288 | cv2.rectangle(image, (x1_w, y1_w), (x2_w, y2_w), (255,0,0), 5) 289 | cv2.rectangle(image, (x1_m, y1_m), (x2_m, y2_m), (0,255,0), 5) 290 | image = cv2.resize(image, (512, 512)) 291 | cv2.imshow('w2', image) 292 | cv2.waitKey() 293 | else: 294 | image = cv2.imread(img_path % name) 295 | (h, w, c) = image.shape 296 | 297 | x1_g = math.floor(img_gauge['x1']) 298 | y1_g = math.floor(img_gauge['y1']) 299 | x2_g = math.floor(img_gauge['x2']) 300 | y2_g = math.floor(img_gauge['y2']) 301 | 302 | x1_w = max(math.floor(img_waterline['x1']), 0) 303 | y1_w = max(math.floor(img_waterline['y1']), 0) 304 | x2_w = min(math.floor(img_waterline['x2']), w) 305 | y2_w = min(math.floor(img_waterline['y2']), h) 306 | 307 | x1_m = max(math.floor(img_mark['x1']), 0) 308 | y1_m = max(math.floor(img_mark['y1']), 0) 309 | x2_m = min(math.floor(img_mark['x2']), w) 310 | y2_m = min(math.floor(img_mark['y2']), h) 311 | 312 | if (y1_w + y2_w) > (y1_m + y2_m): 313 | count += 1 314 | 315 | cv2.rectangle(image, (x1_g, y1_g), (x2_g, y2_g), (255,0,0), 5) 316 | cv2.rectangle(image, (x1_g + x1_w, y1_g + y1_w), (x1_g + x2_w, y1_g + y2_w), (0,255,0), 5) 317 | cv2.rectangle(image, (x1_g + x1_m, y1_g + y1_m), (x1_g + x2_m, y1_g + y2_m), (0,0,255), 5) 318 | 319 | image = cv2.resize(image, (512, 512)) 320 | cv2.putText(image, 'gauge: %.2f' % img_gauge['score'], (10, 40), cv2.FONT_HERSHEY_COMPLEX, 1.2, (255, 0, 0), 2) 321 | cv2.putText(image, 'waterline: %.2f' % img_waterline['score'], (10, 80), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 255, 0), 2) 322 | cv2.putText(image, 'mark: %.2f' % img_mark['score'], (10, 120), cv2.FONT_HERSHEY_COMPLEX, 1.2, (0, 0, 255), 2) 323 | cv2.imshow('w2', image) 324 | cv2.waitKey() 325 | print('correct count:', count) 326 | 327 | if __name__ == '__main__': 328 | test_custom() 329 | -------------------------------------------------------------------------------- /netModel/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | try: 
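# torch.hub.load_state_dict_from_url is only present in newer PyTorch releases; the except
# branch below falls back to torch.utils.model_zoo.load_url so the same file also works on
# older installs.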
4 | from torch.hub import load_state_dict_from_url 5 | except ImportError: 6 | from torch.utils.model_zoo import load_url as load_state_dict_from_url 7 | 8 | 9 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 11 | 'wide_resnet50_2', 'wide_resnet101_2'] 12 | 13 | 14 | model_urls = { 15 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 16 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 17 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 18 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 19 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 20 | 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 21 | 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 22 | 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 23 | 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', 24 | } 25 | 26 | 27 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 28 | """3x3 convolution with padding""" 29 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 30 | padding=dilation, groups=groups, bias=False, dilation=dilation) 31 | 32 | 33 | def conv1x1(in_planes, out_planes, stride=1): 34 | """1x1 convolution""" 35 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 36 | 37 | 38 | class BasicBlock(nn.Module): 39 | expansion = 1 40 | __constants__ = ['downsample'] 41 | 42 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 43 | base_width=64, dilation=1, norm_layer=None): 44 | super(BasicBlock, self).__init__() 45 | if norm_layer is None: 46 | norm_layer = nn.BatchNorm2d 47 | if groups != 1 or base_width != 64: 48 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 49 | if dilation > 1: 50 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 51 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 52 | self.conv1 = conv3x3(inplanes, planes, stride) 53 | self.bn1 = norm_layer(planes) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.conv2 = conv3x3(planes, planes) 56 | self.bn2 = norm_layer(planes) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | identity = x 62 | 63 | out = self.conv1(x) 64 | out = self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | 70 | if self.downsample is not None: 71 | identity = self.downsample(x) 72 | 73 | out += identity 74 | out = self.relu(out) 75 | 76 | return out 77 | 78 | 79 | class Bottleneck(nn.Module): 80 | expansion = 4 81 | __constants__ = ['downsample'] 82 | 83 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 84 | base_width=64, dilation=1, norm_layer=None): 85 | super(Bottleneck, self).__init__() 86 | if norm_layer is None: 87 | norm_layer = nn.BatchNorm2d 88 | width = int(planes * (base_width / 64.)) * groups 89 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 90 | self.conv1 = conv1x1(inplanes, width) 91 | self.bn1 = norm_layer(width) 92 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 93 | self.bn2 = norm_layer(width) 94 | self.conv3 = conv1x1(width, planes * self.expansion) 95 | self.bn3 = 
norm_layer(planes * self.expansion) 96 | self.relu = nn.ReLU(inplace=True) 97 | self.downsample = downsample 98 | self.stride = stride 99 | 100 | def forward(self, x): 101 | identity = x 102 | 103 | out = self.conv1(x) 104 | out = self.bn1(out) 105 | out = self.relu(out) 106 | 107 | out = self.conv2(out) 108 | out = self.bn2(out) 109 | out = self.relu(out) 110 | 111 | out = self.conv3(out) 112 | out = self.bn3(out) 113 | 114 | if self.downsample is not None: 115 | identity = self.downsample(x) 116 | 117 | out += identity 118 | out = self.relu(out) 119 | 120 | return out 121 | 122 | 123 | class ResNet(nn.Module): 124 | 125 | def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, 126 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 127 | norm_layer=None): 128 | super(ResNet, self).__init__() 129 | if norm_layer is None: 130 | norm_layer = nn.BatchNorm2d 131 | self._norm_layer = norm_layer 132 | 133 | self.inplanes = 64 134 | self.dilation = 1 135 | if replace_stride_with_dilation is None: 136 | # each element in the tuple indicates if we should replace 137 | # the 2x2 stride with a dilated convolution instead 138 | replace_stride_with_dilation = [False, False, False] 139 | if len(replace_stride_with_dilation) != 3: 140 | raise ValueError("replace_stride_with_dilation should be None " 141 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 142 | self.groups = groups 143 | self.base_width = width_per_group 144 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=1, padding=3, 145 | bias=False) 146 | self.bn1 = norm_layer(self.inplanes) 147 | self.relu = nn.ReLU(inplace=True) 148 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 149 | self.layer1 = self._make_layer(block, 64, layers[0]) 150 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 151 | dilate=replace_stride_with_dilation[0]) 152 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 153 | dilate=replace_stride_with_dilation[1]) 154 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 155 | dilate=replace_stride_with_dilation[2]) 156 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 157 | self.fc = nn.Linear(512 * block.expansion, num_classes) 158 | 159 | for m in self.modules(): 160 | if isinstance(m, nn.Conv2d): 161 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 162 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 163 | nn.init.constant_(m.weight, 1) 164 | nn.init.constant_(m.bias, 0) 165 | 166 | # Zero-initialize the last BN in each residual branch, 167 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
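# (This behaviour is opt-in: pass zero_init_residual=True through any of the factory
#  functions below, e.g. resnet101(zero_init_residual=True).)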
168 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 169 | if zero_init_residual: 170 | for m in self.modules(): 171 | if isinstance(m, Bottleneck): 172 | nn.init.constant_(m.bn3.weight, 0) 173 | elif isinstance(m, BasicBlock): 174 | nn.init.constant_(m.bn2.weight, 0) 175 | 176 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 177 | norm_layer = self._norm_layer 178 | downsample = None 179 | previous_dilation = self.dilation 180 | if dilate: 181 | self.dilation *= stride 182 | stride = 1 183 | if stride != 1 or self.inplanes != planes * block.expansion: 184 | downsample = nn.Sequential( 185 | conv1x1(self.inplanes, planes * block.expansion, stride), 186 | norm_layer(planes * block.expansion), 187 | ) 188 | 189 | layers = [] 190 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 191 | self.base_width, previous_dilation, norm_layer)) 192 | self.inplanes = planes * block.expansion 193 | for _ in range(1, blocks): 194 | layers.append(block(self.inplanes, planes, groups=self.groups, 195 | base_width=self.base_width, dilation=self.dilation, 196 | norm_layer=norm_layer)) 197 | 198 | return nn.Sequential(*layers) 199 | 200 | def _forward_impl(self, x): 201 | # See note [TorchScript super()] 202 | x = self.conv1(x) 203 | x = self.bn1(x) 204 | x = self.relu(x) 205 | x = self.maxpool(x) 206 | 207 | x = self.layer1(x) 208 | x = self.layer2(x) 209 | x = self.layer3(x) 210 | x = self.layer4(x) 211 | 212 | x = self.avgpool(x) 213 | x = torch.flatten(x, 1) 214 | x = self.fc(x) 215 | 216 | return x 217 | 218 | def forward(self, x): 219 | return self._forward_impl(x) 220 | 221 | 222 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 223 | model = ResNet(block, layers, **kwargs) 224 | if pretrained: 225 | state_dict = load_state_dict_from_url(model_urls[arch], 226 | progress=progress) 227 | model.load_state_dict(state_dict) 228 | return model 229 | 230 | 231 | def resnet18(pretrained=False, progress=True, **kwargs): 232 | r"""ResNet-18 model from 233 | `"Deep Residual Learning for Image Recognition" `_ 234 | Args: 235 | pretrained (bool): If True, returns a model pre-trained on ImageNet 236 | progress (bool): If True, displays a progress bar of the download to stderr 237 | """ 238 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 239 | **kwargs) 240 | 241 | 242 | def resnet34(pretrained=False, progress=True, **kwargs): 243 | r"""ResNet-34 model from 244 | `"Deep Residual Learning for Image Recognition" `_ 245 | Args: 246 | pretrained (bool): If True, returns a model pre-trained on ImageNet 247 | progress (bool): If True, displays a progress bar of the download to stderr 248 | """ 249 | return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, 250 | **kwargs) 251 | 252 | 253 | def resnet50(pretrained=False, progress=True, **kwargs): 254 | r"""ResNet-50 model from 255 | `"Deep Residual Learning for Image Recognition" `_ 256 | Args: 257 | pretrained (bool): If True, returns a model pre-trained on ImageNet 258 | progress (bool): If True, displays a progress bar of the download to stderr 259 | """ 260 | return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, 261 | **kwargs) 262 | 263 | 264 | def resnet101(pretrained=False, progress=True, **kwargs): 265 | r"""ResNet-101 model from 266 | `"Deep Residual Learning for Image Recognition" `_ 267 | Args: 268 | pretrained (bool): If True, returns a model pre-trained on ImageNet 269 | progress (bool): If 
True, displays a progress bar of the download to stderr 270 | """ 271 | return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, 272 | **kwargs) 273 | 274 | 275 | def resnet152(pretrained=False, progress=True, **kwargs): 276 | r"""ResNet-152 model from 277 | `"Deep Residual Learning for Image Recognition" `_ 278 | Args: 279 | pretrained (bool): If True, returns a model pre-trained on ImageNet 280 | progress (bool): If True, displays a progress bar of the download to stderr 281 | """ 282 | return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, 283 | **kwargs) 284 | 285 | 286 | def resnext50_32x4d(pretrained=False, progress=True, **kwargs): 287 | r"""ResNeXt-50 32x4d model from 288 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 289 | Args: 290 | pretrained (bool): If True, returns a model pre-trained on ImageNet 291 | progress (bool): If True, displays a progress bar of the download to stderr 292 | """ 293 | kwargs['groups'] = 32 294 | kwargs['width_per_group'] = 4 295 | return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], 296 | pretrained, progress, **kwargs) 297 | 298 | 299 | def resnext101_32x8d(pretrained=False, progress=True, **kwargs): 300 | r"""ResNeXt-101 32x8d model from 301 | `"Aggregated Residual Transformation for Deep Neural Networks" `_ 302 | Args: 303 | pretrained (bool): If True, returns a model pre-trained on ImageNet 304 | progress (bool): If True, displays a progress bar of the download to stderr 305 | """ 306 | kwargs['groups'] = 32 307 | kwargs['width_per_group'] = 8 308 | return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], 309 | pretrained, progress, **kwargs) 310 | 311 | 312 | def wide_resnet50_2(pretrained=False, progress=True, **kwargs): 313 | r"""Wide ResNet-50-2 model from 314 | `"Wide Residual Networks" `_ 315 | The model is the same as ResNet except for the bottleneck number of channels 316 | which is twice larger in every block. The number of channels in outer 1x1 317 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 318 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 319 | Args: 320 | pretrained (bool): If True, returns a model pre-trained on ImageNet 321 | progress (bool): If True, displays a progress bar of the download to stderr 322 | """ 323 | kwargs['width_per_group'] = 64 * 2 324 | return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], 325 | pretrained, progress, **kwargs) 326 | 327 | 328 | def wide_resnet101_2(pretrained=False, progress=True, **kwargs): 329 | r"""Wide ResNet-101-2 model from 330 | `"Wide Residual Networks" `_ 331 | The model is the same as ResNet except for the bottleneck number of channels 332 | which is twice larger in every block. The number of channels in outer 1x1 333 | convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 334 | channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
335 | Args: 336 | pretrained (bool): If True, returns a model pre-trained on ImageNet 337 | progress (bool): If True, displays a progress bar of the download to stderr 338 | """ 339 | kwargs['width_per_group'] = 64 * 2 340 | return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], 341 | pretrained, progress, **kwargs) -------------------------------------------------------------------------------- /utils/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 18 | is simply the intersection over union of two boxes. 19 | E.g.: 20 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 21 | Args: 22 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 23 | box_b: Single bounding box, Shape: [4] 24 | Return: 25 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 26 | """ 27 | inter = intersect(box_a, box_b) 28 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 29 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 30 | area_b = ((box_b[2]-box_b[0]) * 31 | (box_b[3]-box_b[1])) # [A,B] 32 | union = area_a + area_b - inter 33 | return inter / union # [A,B] 34 | 35 | 36 | class Compose(object): 37 | """Composes several augmentations together. 38 | Args: 39 | transforms (List[Transform]): list of transforms to compose. 40 | Example: 41 | >>> augmentations.Compose([ 42 | >>> transforms.CenterCrop(10), 43 | >>> transforms.ToTensor(), 44 | >>> ]) 45 | """ 46 | 47 | def __init__(self, transforms): 48 | self.transforms = transforms 49 | 50 | def __call__(self, img, boxes=None, labels=None): 51 | for t in self.transforms: 52 | img, boxes, labels = t(img, boxes, labels) 53 | return img, boxes, labels 54 | 55 | 56 | class Lambda(object): 57 | """Applies a lambda as a transform.""" 58 | 59 | def __init__(self, lambd): 60 | assert isinstance(lambd, types.LambdaType) 61 | self.lambd = lambd 62 | 63 | def __call__(self, img, boxes=None, labels=None): 64 | return self.lambd(img, boxes, labels) 65 | 66 | 67 | class ConvertFromInts(object): 68 | def __call__(self, image, boxes=None, labels=None): 69 | return image.astype(np.float32), boxes, labels 70 | 71 | 72 | class SubtractMeans(object): 73 | def __init__(self, mean): 74 | self.mean = np.array(mean, dtype=np.float32) 75 | 76 | def __call__(self, image, boxes=None, labels=None): 77 | image = image.astype(np.float32) 78 | image -= self.mean 79 | return image.astype(np.float32), boxes, labels 80 | 81 | 82 | class ToAbsoluteCoords(object): 83 | def __call__(self, image, boxes=None, labels=None): 84 | height, width, channels = image.shape 85 | boxes[:, 0] *= width 86 | boxes[:, 2] *= width 87 | boxes[:, 1] *= height 88 | boxes[:, 3] *= height 89 | 90 | return image, boxes, labels 91 | 92 | 93 | class ToPercentCoords(object): 94 | def __call__(self, image, boxes=None, labels=None): 95 | height, width, channels = image.shape 96 | boxes[:, 0] /= width 97 | boxes[:, 2] /= width 98 | boxes[:, 1] /= height 99 | boxes[:, 3] /= height 100 | 101 | return image, boxes, labels 102 | 103 | 104 | class Resize(object): 105 | def __init__(self, 
size=300): 106 | self.size = size 107 | 108 | def __call__(self, image, boxes=None, labels=None): 109 | image = cv2.resize(image, (self.size, 110 | self.size)) 111 | return image, boxes, labels 112 | 113 | 114 | class RandomSaturation(object): 115 | def __init__(self, lower=0.5, upper=1.5): 116 | self.lower = lower 117 | self.upper = upper 118 | assert self.upper >= self.lower, "contrast upper must be >= lower." 119 | assert self.lower >= 0, "contrast lower must be non-negative." 120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 
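# Note: `random` in this file is numpy.random (see the imports at the top), so
# random.randint(2) draws from {0, 1}; each photometric transform here uses it as a fair
# coin flip and applies its distortion only about half of the time.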
176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 226 | (0.1, None), 227 | (0.3, None), 228 | (0.7, None), 229 | (0.9, None), 230 | # randomly sample a patch 231 | (None, None), 232 | ) 233 | 234 | def __call__(self, image, boxes=None, labels=None): 235 | height, width, _ = image.shape 236 | while True: 237 | # randomly choose a mode 238 | mode = random.choice(self.sample_options) 239 | if mode is None: 240 | return image, boxes, labels 241 | 242 | min_iou, max_iou = mode 243 | if min_iou is None: 244 | min_iou = float('-inf') 245 | if max_iou is None: 246 | max_iou = float('inf') 247 | 248 | # max trails (50) 249 | for _ in range(50): 250 | current_image = image 251 | 252 | w = random.uniform(0.3 * width, width) 253 | h = random.uniform(0.3 * height, height) 254 | 255 | # aspect ratio constraint b/t .5 & 2 256 | if h / w < 0.5 or h / w > 2: 257 | continue 258 | 259 | left = random.uniform(width - w) 260 | top = random.uniform(height - h) 261 | 262 | # convert to integer rect x1,y1,x2,y2 263 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 264 | 265 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 266 | overlap = jaccard_numpy(boxes, rect) 267 | 268 | # is min and max overlap constraint satisfied? 
if not try again 269 | if overlap.min() < min_iou and max_iou < overlap.max(): 270 | continue 271 | 272 | # cut the crop from the image 273 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 274 | :] 275 | 276 | # keep overlap with gt box IF center in sampled patch 277 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 278 | 279 | # mask in all gt boxes that above and to the left of centers 280 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 281 | 282 | # mask in all gt boxes that under and to the right of centers 283 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 284 | 285 | # mask in that both m1 and m2 are true 286 | mask = m1 * m2 287 | 288 | # have any valid boxes? try again if not 289 | if not mask.any(): 290 | continue 291 | 292 | # take only matching gt boxes 293 | current_boxes = boxes[mask, :].copy() 294 | 295 | # take only matching gt labels 296 | current_labels = labels[mask] 297 | 298 | # should we use the box left and top corner or the crop's 299 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 300 | rect[:2]) 301 | # adjust to crop (by substracting crop's left,top) 302 | current_boxes[:, :2] -= rect[:2] 303 | 304 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 305 | rect[2:]) 306 | # adjust to crop (by substracting crop's left,top) 307 | current_boxes[:, 2:] -= rect[:2] 308 | 309 | return current_image, current_boxes, current_labels 310 | 311 | 312 | class Expand(object): 313 | def __init__(self, mean): 314 | self.mean = mean 315 | 316 | def __call__(self, image, boxes, labels): 317 | if random.randint(2): 318 | return image, boxes, labels 319 | 320 | height, width, depth = image.shape 321 | ratio = random.uniform(1, 4) 322 | left = random.uniform(0, width*ratio - width) 323 | top = random.uniform(0, height*ratio - height) 324 | 325 | expand_image = np.zeros( 326 | (int(height*ratio), int(width*ratio), depth), 327 | dtype=image.dtype) 328 | expand_image[:, :, :] = self.mean 329 | expand_image[int(top):int(top + height), 330 | int(left):int(left + width)] = image 331 | image = expand_image 332 | 333 | boxes = boxes.copy() 334 | boxes[:, :2] += (int(left), int(top)) 335 | boxes[:, 2:] += (int(left), int(top)) 336 | 337 | return image, boxes, labels 338 | 339 | 340 | class RandomMirror(object): 341 | def __call__(self, image, boxes, classes): 342 | _, width, _ = image.shape 343 | if random.randint(2): 344 | image = image[:, ::-1] 345 | boxes = boxes.copy() 346 | boxes[:, 0::2] = width - boxes[:, 2::-2] 347 | return image, boxes, classes 348 | 349 | 350 | class SwapChannels(object): 351 | """Transforms a tensorized image by swapping the channels in the order 352 | specified in the swap tuple. 
353 | Args: 354 | swaps (int triple): final order of channels 355 | eg: (2, 1, 0) 356 | """ 357 | 358 | def __init__(self, swaps): 359 | self.swaps = swaps 360 | 361 | def __call__(self, image): 362 | """ 363 | Args: 364 | image (Tensor): image tensor to be transformed 365 | Return: 366 | a tensor with channels swapped according to swap 367 | """ 368 | # if torch.is_tensor(image): 369 | # image = image.data.cpu().numpy() 370 | # else: 371 | # image = np.array(image) 372 | image = image[:, :, self.swaps] 373 | return image 374 | 375 | 376 | class PhotometricDistort(object): 377 | def __init__(self): 378 | self.pd = [ 379 | RandomContrast(), 380 | ConvertColor(transform='HSV'), 381 | RandomSaturation(), 382 | RandomHue(), 383 | ConvertColor(current='HSV', transform='BGR'), 384 | RandomContrast() 385 | ] 386 | self.rand_brightness = RandomBrightness() 387 | self.rand_light_noise = RandomLightingNoise() 388 | 389 | def __call__(self, image, boxes, labels): 390 | im = image.copy() 391 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 392 | if random.randint(2): 393 | distort = Compose(self.pd[:-1]) 394 | else: 395 | distort = Compose(self.pd[1:]) 396 | im, boxes, labels = distort(im, boxes, labels) 397 | return self.rand_light_noise(im, boxes, labels) 398 | 399 | 400 | class SSDAugmentation(object): 401 | def __init__(self, size=300, mean=(104, 117, 123)): 402 | self.mean = mean 403 | self.size = size 404 | self.augment = Compose([ 405 | ConvertFromInts(), 406 | ToAbsoluteCoords(), 407 | PhotometricDistort(), 408 | Expand(self.mean), 409 | RandomSampleCrop(), 410 | RandomMirror(), 411 | ToPercentCoords(), 412 | Resize(self.size), 413 | SubtractMeans(self.mean) 414 | ]) 415 | 416 | def __call__(self, img, boxes, labels): 417 | return self.augment(img, boxes, labels) 418 | -------------------------------------------------------------------------------- /实验 4.1/evalCustom_101.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from __future__ import print_function 8 | import torch 9 | import torch.nn as nn 10 | import torch.backends.cudnn as cudnn 11 | from torch.autograd import Variable 12 | # from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 13 | # from data import VOC_CLASSES as labelmap 14 | import torch.utils.data as data 15 | 16 | from data import BaseTransform 17 | from data.custom import CUSTOM_CLASSES as labelmap 18 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 19 | 20 | # from ssd import build_ssd 21 | from ssd_resnet_101 import build_ssd 22 | 23 | import sys 24 | import os 25 | import time 26 | import argparse 27 | import numpy as np 28 | import pickle 29 | import cv2 30 | 31 | if sys.version_info[0] == 2: 32 | import xml.etree.cElementTree as ET 33 | else: 34 | import xml.etree.ElementTree as ET 35 | 36 | 37 | def str2bool(v): 38 | return v.lower() in ("yes", "true", "t", "1") 39 | 40 | 41 | parser = argparse.ArgumentParser( 42 | description='Single Shot MultiBox Detector Evaluation') 43 | parser.add_argument('--trained_model', 44 | default='weights/CUSTOM.pth', type=str, 45 | help='Trained state_dict file path to open') 46 | parser.add_argument('--save_folder', default='eval/', type=str, 47 | help='File path to 
save results') 48 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 49 | help='Detection confidence threshold') 50 | parser.add_argument('--top_k', default=5, type=int, 51 | help='Further restrict the number of predictions to parse') 52 | parser.add_argument('--cuda', default=True, type=str2bool, 53 | help='Use cuda to train model') 54 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, 55 | help='Location of VOC root directory') 56 | parser.add_argument('--cleanup', default=True, type=str2bool, 57 | help='Cleanup and remove results files following eval') 58 | 59 | args = parser.parse_args() 60 | 61 | if not os.path.exists(args.save_folder): 62 | os.mkdir(args.save_folder) 63 | 64 | if torch.cuda.is_available(): 65 | if args.cuda: 66 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 67 | if not args.cuda: 68 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 69 | CUDA. Run with --cuda for optimal eval speed.") 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | else: 72 | torch.set_default_tensor_type('torch.FloatTensor') 73 | 74 | annopath = os.path.join(args.custom_root, 'shenhe', 'Annotations', '%s.xml') 75 | imgpath = os.path.join(args.custom_root, 'shenhe', 'JPEGImages', '%s.jpg') 76 | imgsetpath = os.path.join(args.custom_root, 'shenhe', 'ImageSets', 'Main', '%s.txt') 77 | 78 | devkit_path = args.custom_root + 'shenhe' 79 | dataset_mean = (104, 117, 123) 80 | set_type = 'test' 81 | 82 | 83 | class Timer(object): 84 | """A simple timer.""" 85 | def __init__(self): 86 | self.total_time = 0. 87 | self.calls = 0 88 | self.start_time = 0. 89 | self.diff = 0. 90 | self.average_time = 0. 91 | 92 | def tic(self): 93 | # using time.time instead of time.clock because time time.clock 94 | # does not normalize for multithreading 95 | self.start_time = time.time() 96 | 97 | def toc(self, average=True): 98 | self.diff = time.time() - self.start_time 99 | self.total_time += self.diff 100 | self.calls += 1 101 | self.average_time = self.total_time / self.calls 102 | if average: 103 | return self.average_time 104 | else: 105 | return self.diff 106 | 107 | 108 | def parse_rec(filename): 109 | """ Parse a PASCAL VOC xml file """ 110 | tree = ET.parse(filename) 111 | objects = [] 112 | for obj in tree.findall('object'): 113 | obj_struct = {} 114 | obj_struct['name'] = obj.find('name').text 115 | obj_struct['pose'] = obj.find('pose').text 116 | obj_struct['truncated'] = int(obj.find('truncated').text) 117 | obj_struct['difficult'] = int(obj.find('difficult').text) 118 | bbox = obj.find('bndbox') 119 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 120 | int(bbox.find('ymin').text) - 1, 121 | int(bbox.find('xmax').text) - 1, 122 | int(bbox.find('ymax').text) - 1] 123 | objects.append(obj_struct) 124 | 125 | return objects 126 | 127 | 128 | def get_output_dir(name, phase): 129 | """Return the directory where experimental artifacts are placed. 130 | If the directory does not exist, it is created. 131 | A canonical path is built using the name from an imdb and a network 132 | (if not None). 
133 | """ 134 | filedir = os.path.join(name, phase) 135 | if not os.path.exists(filedir): 136 | os.makedirs(filedir) 137 | return filedir 138 | 139 | 140 | def get_voc_results_file_template(image_set, cls): 141 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 142 | filename = 'det_' + image_set + '_%s.txt' % (cls) 143 | filedir = os.path.join(devkit_path, 'results') 144 | if not os.path.exists(filedir): 145 | os.makedirs(filedir) 146 | path = os.path.join(filedir, filename) 147 | return path 148 | 149 | 150 | def write_voc_results_file(all_boxes, dataset): 151 | for cls_ind, cls in enumerate(labelmap): 152 | print('Writing {:s} VOC results file'.format(cls)) 153 | filename = get_voc_results_file_template(set_type, cls) 154 | with open(filename, 'wt') as f: 155 | for im_ind, index in enumerate(dataset.ids): 156 | dets = all_boxes[cls_ind+1][im_ind] 157 | if dets == []: 158 | continue 159 | # the VOCdevkit expects 1-based indices 160 | for k in range(dets.shape[0]): 161 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 162 | format(index[1], dets[k, -1], 163 | dets[k, 0] + 1, dets[k, 1] + 1, 164 | dets[k, 2] + 1, dets[k, 3] + 1)) 165 | 166 | 167 | def do_python_eval(output_dir='output', use_07=True): 168 | cachedir = os.path.join(devkit_path, 'annotations_cache') 169 | aps = [] 170 | # The PASCAL VOC metric changed in 2010 171 | use_07_metric = use_07 172 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 173 | if not os.path.isdir(output_dir): 174 | os.mkdir(output_dir) 175 | for i, cls in enumerate(labelmap): 176 | filename = get_voc_results_file_template(set_type, cls) 177 | rec, prec, ap = voc_eval( 178 | filename, annopath, imgsetpath % (set_type), cls, cachedir, 179 | ovthresh=0.1, use_07_metric=use_07_metric) 180 | aps += [ap] 181 | print('AP for {} = {:.4f}'.format(cls, ap)) 182 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 183 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 184 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 185 | print('~~~~~~~~') 186 | print('Results:') 187 | for ap in aps: 188 | print('{:.3f}'.format(ap)) 189 | print('{:.3f}'.format(np.mean(aps))) 190 | print('~~~~~~~~') 191 | print('') 192 | print('--------------------------------------------------------------') 193 | print('Results computed with the **unofficial** Python eval code.') 194 | print('Results should be very close to the official MATLAB eval code.') 195 | print('--------------------------------------------------------------') 196 | 197 | 198 | def voc_ap(rec, prec, use_07_metric=True): 199 | """ ap = voc_ap(rec, prec, [use_07_metric]) 200 | Compute VOC AP given precision and recall. 201 | If use_07_metric is true, uses the 202 | VOC 07 11 point method (default:True). 203 | """ 204 | if use_07_metric: 205 | # 11 point metric 206 | ap = 0. 207 | for t in np.arange(0., 1.1, 0.1): 208 | if np.sum(rec >= t) == 0: 209 | p = 0 210 | else: 211 | p = np.max(prec[rec >= t]) 212 | ap = ap + p / 11. 
213 | else: 214 | # correct AP calculation 215 | # first append sentinel values at the end 216 | mrec = np.concatenate(([0.], rec, [1.])) 217 | mpre = np.concatenate(([0.], prec, [0.])) 218 | 219 | # compute the precision envelope 220 | for i in range(mpre.size - 1, 0, -1): 221 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 222 | 223 | # to calculate area under PR curve, look for points 224 | # where X axis (recall) changes value 225 | i = np.where(mrec[1:] != mrec[:-1])[0] 226 | 227 | # and sum (\Delta recall) * prec 228 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 229 | return ap 230 | 231 | 232 | def voc_eval(detpath, 233 | annopath, 234 | imagesetfile, 235 | classname, 236 | cachedir, 237 | ovthresh=0.5, 238 | use_07_metric=True): 239 | """rec, prec, ap = voc_eval(detpath, 240 | annopath, 241 | imagesetfile, 242 | classname, 243 | [ovthresh], 244 | [use_07_metric]) 245 | Top level function that does the PASCAL VOC evaluation. 246 | detpath: Path to detections 247 | detpath.format(classname) should produce the detection results file. 248 | annopath: Path to annotations 249 | annopath.format(imagename) should be the xml annotations file. 250 | imagesetfile: Text file containing the list of images, one image per line. 251 | classname: Category name (duh) 252 | cachedir: Directory for caching the annotations 253 | [ovthresh]: Overlap threshold (default = 0.5) 254 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 255 | (default True) 256 | """ 257 | # assumes detections are in detpath.format(classname) 258 | # assumes annotations are in annopath.format(imagename) 259 | # assumes imagesetfile is a text file with each line an image name 260 | # cachedir caches the annotations in a pickle file 261 | # first load gt 262 | if not os.path.isdir(cachedir): 263 | os.mkdir(cachedir) 264 | cachefile = os.path.join(cachedir, 'annots.pkl') 265 | # read list of images 266 | with open(imagesetfile, 'r') as f: 267 | lines = f.readlines() 268 | imagenames = [x.strip() for x in lines] 269 | if not os.path.isfile(cachefile): 270 | # load annots 271 | recs = {} 272 | for i, imagename in enumerate(imagenames): 273 | recs[imagename] = parse_rec(annopath % (imagename)) 274 | if i % 100 == 0: 275 | print('Reading annotation for {:d}/{:d}'.format( 276 | i + 1, len(imagenames))) 277 | # save 278 | print('Saving cached annotations to {:s}'.format(cachefile)) 279 | with open(cachefile, 'wb') as f: 280 | pickle.dump(recs, f) 281 | else: 282 | # load 283 | with open(cachefile, 'rb') as f: 284 | recs = pickle.load(f) 285 | 286 | # extract gt objects for this class 287 | class_recs = {} 288 | npos = 0 289 | for imagename in imagenames: 290 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 291 | bbox = np.array([x['bbox'] for x in R]) 292 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 293 | det = [False] * len(R) 294 | npos = npos + sum(~difficult) 295 | class_recs[imagename] = {'bbox': bbox, 296 | 'difficult': difficult, 297 | 'det': det} 298 | 299 | # read dets 300 | detfile = detpath.format(classname) 301 | with open(detfile, 'r') as f: 302 | lines = f.readlines() 303 | if any(lines) == 1: 304 | 305 | splitlines = [x.strip().split(' ') for x in lines] 306 | image_ids = [x[0] for x in splitlines] 307 | confidence = np.array([float(x[1]) for x in splitlines]) 308 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 309 | 310 | # sort by confidence 311 | sorted_ind = np.argsort(-confidence) 312 | sorted_scores = np.sort(-confidence) 313 | BB = 
BB[sorted_ind, :] 314 | image_ids = [image_ids[x] for x in sorted_ind] 315 | 316 | # go down dets and mark TPs and FPs 317 | nd = len(image_ids) 318 | tp = np.zeros(nd) 319 | fp = np.zeros(nd) 320 | for d in range(nd): 321 | R = class_recs[image_ids[d]] 322 | bb = BB[d, :].astype(float) 323 | ovmax = -np.inf 324 | BBGT = R['bbox'].astype(float) 325 | if BBGT.size > 0: 326 | # compute overlaps 327 | # intersection 328 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 329 | iymin = np.maximum(BBGT[:, 1], bb[1]) 330 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 331 | iymax = np.minimum(BBGT[:, 3], bb[3]) 332 | iw = np.maximum(ixmax - ixmin, 0.) 333 | ih = np.maximum(iymax - iymin, 0.) 334 | inters = iw * ih 335 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 336 | (BBGT[:, 2] - BBGT[:, 0]) * 337 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 338 | overlaps = inters / uni 339 | ovmax = np.max(overlaps) 340 | jmax = np.argmax(overlaps) 341 | 342 | if ovmax > ovthresh: 343 | if not R['difficult'][jmax]: 344 | if not R['det'][jmax]: 345 | tp[d] = 1. 346 | R['det'][jmax] = 1 347 | else: 348 | fp[d] = 1. 349 | else: 350 | fp[d] = 1. 351 | 352 | # compute precision recall 353 | fp = np.cumsum(fp) 354 | tp = np.cumsum(tp) 355 | rec = tp / float(npos) 356 | # avoid divide by zero in case the first detection matches a difficult 357 | # ground truth 358 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 359 | ap = voc_ap(rec, prec, use_07_metric) 360 | else: 361 | rec = -1. 362 | prec = -1. 363 | ap = -1. 364 | 365 | return rec, prec, ap 366 | 367 | 368 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 369 | im_size=300, thresh=0.05): 370 | num_images = len(dataset) 371 | # all detections are collected into: 372 | # all_boxes[cls][image] = N x 5 array of detections in 373 | # (x1, y1, x2, y2, score) 374 | all_boxes = [[[] for _ in range(num_images)] 375 | for _ in range(len(labelmap)+1)] 376 | 377 | # timers 378 | _t = {'im_detect': Timer(), 'misc': Timer()} 379 | output_dir = get_output_dir('ssd300_120000', set_type) 380 | det_file = os.path.join(output_dir, 'detections.pkl') 381 | 382 | for i in range(num_images): 383 | im, gt, h, w = dataset.pull_item(i) 384 | 385 | x = Variable(im.unsqueeze(0)) 386 | if args.cuda: 387 | x = x.cuda() 388 | _t['im_detect'].tic() 389 | detections = net(x).data 390 | detect_time = _t['im_detect'].toc(average=False) 391 | 392 | # skip j = 0, because it's the background class 393 | for j in range(1, detections.size(1)): 394 | dets = detections[0, j, :] 395 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 396 | dets = torch.masked_select(dets, mask).view(-1, 5) 397 | if dets.size(0) == 0: 398 | continue 399 | boxes = dets[:, 1:] 400 | boxes[:, 0] *= w 401 | boxes[:, 2] *= w 402 | boxes[:, 1] *= h 403 | boxes[:, 3] *= h 404 | scores = dets[:, 0].cpu().numpy() 405 | cls_dets = np.hstack((boxes.cpu().numpy(), 406 | scores[:, np.newaxis])).astype(np.float32, 407 | copy=False) 408 | all_boxes[j][i] = cls_dets 409 | 410 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 411 | num_images, detect_time)) 412 | 413 | with open(det_file, 'wb') as f: 414 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 415 | 416 | print('Evaluating detections') 417 | evaluate_detections(all_boxes, output_dir, dataset) 418 | 419 | 420 | def evaluate_detections(box_list, output_dir, dataset): 421 | write_voc_results_file(box_list, dataset) 422 | do_python_eval(output_dir) 423 | 424 | 425 | if __name__ == '__main__': 426 | # load net 427 | num_classes = len(labelmap) + 1 # +1 for background 428 
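# Note: num_classes includes the background class, and build_ssd('test', ...) (presumably
# following the upstream ssd.pytorch design this code is adapted from) returns a net whose
# forward pass already applies softmax and NMS, which is why test_net() above reads
# net(x).data directly as a [batch, num_classes, top_k, 5] tensor of
# (score, x1, y1, x2, y2) rows.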
| net = build_ssd('test', 300, num_classes) # initialize SSD 429 | net.load_state_dict(torch.load(args.trained_model)) 430 | net.eval() 431 | print('Finished loading model!') 432 | # load data 433 | dataset = customDetection(args.custom_root, [('shenhe', set_type)], 434 | BaseTransform(300, dataset_mean), 435 | customAnnotationTransform()) 436 | if args.cuda: 437 | net = net.cuda() 438 | cudnn.benchmark = True 439 | # evaluation 440 | test_net(args.save_folder, net, args.cuda, dataset, 441 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 442 | thresh=args.confidence_threshold) 443 | -------------------------------------------------------------------------------- /实验 4.2/evalCustom_18.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from __future__ import print_function 8 | import torch 9 | import torch.nn as nn 10 | import torch.backends.cudnn as cudnn 11 | from torch.autograd import Variable 12 | # from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 13 | # from data import VOC_CLASSES as labelmap 14 | import torch.utils.data as data 15 | 16 | from data import BaseTransform 17 | from data.custom import CUSTOM_CLASSES as labelmap 18 | from data.custom import customDetection, customAnnotationTransform, CUSTOM_CLASSES, CUSTOM_ROOT 19 | 20 | # from ssd import build_ssd 21 | from ssd_resnet_18 import build_ssd 22 | 23 | import sys 24 | import os 25 | import time 26 | import argparse 27 | import numpy as np 28 | import pickle 29 | import cv2 30 | 31 | if sys.version_info[0] == 2: 32 | import xml.etree.cElementTree as ET 33 | else: 34 | import xml.etree.ElementTree as ET 35 | 36 | 37 | def str2bool(v): 38 | return v.lower() in ("yes", "true", "t", "1") 39 | 40 | 41 | parser = argparse.ArgumentParser( 42 | description='Single Shot MultiBox Detector Evaluation') 43 | parser.add_argument('--trained_model', 44 | default='weights/CUSTOM.pth', type=str, 45 | help='Trained state_dict file path to open') 46 | parser.add_argument('--save_folder', default='eval/', type=str, 47 | help='File path to save results') 48 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 49 | help='Detection confidence threshold') 50 | parser.add_argument('--top_k', default=5, type=int, 51 | help='Further restrict the number of predictions to parse') 52 | parser.add_argument('--cuda', default=True, type=str2bool, 53 | help='Use cuda to train model') 54 | parser.add_argument('--custom_root', default=CUSTOM_ROOT, 55 | help='Location of VOC root directory') 56 | parser.add_argument('--cleanup', default=True, type=str2bool, 57 | help='Cleanup and remove results files following eval') 58 | 59 | args = parser.parse_args() 60 | 61 | if not os.path.exists(args.save_folder): 62 | os.mkdir(args.save_folder) 63 | 64 | if torch.cuda.is_available(): 65 | if args.cuda: 66 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 67 | if not args.cuda: 68 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 69 | CUDA. 
Run with --cuda for optimal eval speed.") 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | else: 72 | torch.set_default_tensor_type('torch.FloatTensor') 73 | 74 | annopath = os.path.join(args.custom_root, 'shenhe', 'Annotations', '%s.xml') 75 | imgpath = os.path.join(args.custom_root, 'shenhe', 'JPEGImages', '%s.jpg') 76 | imgsetpath = os.path.join(args.custom_root, 'shenhe', 'ImageSets', 'Main', '%s.txt') 77 | 78 | devkit_path = args.custom_root + 'shenhe' 79 | dataset_mean = (104, 117, 123) 80 | set_type = 'test' 81 | 82 | 83 | class Timer(object): 84 | """A simple timer.""" 85 | def __init__(self): 86 | self.total_time = 0. 87 | self.calls = 0 88 | self.start_time = 0. 89 | self.diff = 0. 90 | self.average_time = 0. 91 | 92 | def tic(self): 93 | # using time.time instead of time.clock because time.clock 94 | # does not normalize for multithreading 95 | self.start_time = time.time() 96 | 97 | def toc(self, average=True): 98 | self.diff = time.time() - self.start_time 99 | self.total_time += self.diff 100 | self.calls += 1 101 | self.average_time = self.total_time / self.calls 102 | if average: 103 | return self.average_time 104 | else: 105 | return self.diff 106 | 107 | 108 | def parse_rec(filename): 109 | """ Parse a PASCAL VOC xml file """ 110 | tree = ET.parse(filename) 111 | objects = [] 112 | for obj in tree.findall('object'): 113 | obj_struct = {} 114 | obj_struct['name'] = obj.find('name').text 115 | obj_struct['pose'] = obj.find('pose').text 116 | obj_struct['truncated'] = int(obj.find('truncated').text) 117 | obj_struct['difficult'] = int(obj.find('difficult').text) 118 | bbox = obj.find('bndbox') 119 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 120 | int(bbox.find('ymin').text) - 1, 121 | int(bbox.find('xmax').text) - 1, 122 | int(bbox.find('ymax').text) - 1] 123 | objects.append(obj_struct) 124 | 125 | return objects 126 | 127 | 128 | def get_output_dir(name, phase): 129 | """Return the directory where experimental artifacts are placed. 130 | If the directory does not exist, it is created. 131 | A canonical path is built using the name from an imdb and a network 132 | (if not None). 133 | """ 134 | filedir = os.path.join(name, phase) 135 | if not os.path.exists(filedir): 136 | os.makedirs(filedir) 137 | return filedir 138 | 139 | 140 | def get_voc_results_file_template(image_set, cls): 141 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 142 | filename = 'det_' + image_set + '_%s.txt' % (cls) 143 | filedir = os.path.join(devkit_path, 'results') 144 | if not os.path.exists(filedir): 145 | os.makedirs(filedir) 146 | path = os.path.join(filedir, filename) 147 | return path 148 | 149 | 150 | def write_voc_results_file(all_boxes, dataset): 151 | for cls_ind, cls in enumerate(labelmap): 152 | print('Writing {:s} VOC results file'.format(cls)) 153 | filename = get_voc_results_file_template(set_type, cls) 154 | with open(filename, 'wt') as f: 155 | for im_ind, index in enumerate(dataset.ids): 156 | dets = all_boxes[cls_ind+1][im_ind] 157 | if len(dets) == 0: 158 | continue 159 | # the VOCdevkit expects 1-based indices 160 | for k in range(dets.shape[0]): 161 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
162 | format(index[1], dets[k, -1], 163 | dets[k, 0] + 1, dets[k, 1] + 1, 164 | dets[k, 2] + 1, dets[k, 3] + 1)) 165 | 166 | 167 | def do_python_eval(output_dir='output', use_07=True): 168 | cachedir = os.path.join(devkit_path, 'annotations_cache') 169 | aps = [] 170 | # The PASCAL VOC metric changed in 2010 171 | use_07_metric = use_07 172 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 173 | if not os.path.isdir(output_dir): 174 | os.mkdir(output_dir) 175 | for i, cls in enumerate(labelmap): 176 | filename = get_voc_results_file_template(set_type, cls) 177 | rec, prec, ap = voc_eval( 178 | filename, annopath, imgsetpath % (set_type), cls, cachedir, 179 | ovthresh=0.1, use_07_metric=use_07_metric) 180 | aps += [ap] 181 | print('AP for {} = {:.4f}'.format(cls, ap)) 182 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 183 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 184 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 185 | print('~~~~~~~~') 186 | print('Results:') 187 | for ap in aps: 188 | print('{:.3f}'.format(ap)) 189 | print('{:.3f}'.format(np.mean(aps))) 190 | print('~~~~~~~~') 191 | print('') 192 | print('--------------------------------------------------------------') 193 | print('Results computed with the **unofficial** Python eval code.') 194 | print('Results should be very close to the official MATLAB eval code.') 195 | print('--------------------------------------------------------------') 196 | 197 | 198 | def voc_ap(rec, prec, use_07_metric=True): 199 | """ ap = voc_ap(rec, prec, [use_07_metric]) 200 | Compute VOC AP given precision and recall. 201 | If use_07_metric is true, uses the 202 | VOC 07 11 point method (default:True). 203 | """ 204 | if use_07_metric: 205 | # 11 point metric 206 | ap = 0. 207 | for t in np.arange(0., 1.1, 0.1): 208 | if np.sum(rec >= t) == 0: 209 | p = 0 210 | else: 211 | p = np.max(prec[rec >= t]) 212 | ap = ap + p / 11. 213 | else: 214 | # correct AP calculation 215 | # first append sentinel values at the end 216 | mrec = np.concatenate(([0.], rec, [1.])) 217 | mpre = np.concatenate(([0.], prec, [0.])) 218 | 219 | # compute the precision envelope 220 | for i in range(mpre.size - 1, 0, -1): 221 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 222 | 223 | # to calculate area under PR curve, look for points 224 | # where X axis (recall) changes value 225 | i = np.where(mrec[1:] != mrec[:-1])[0] 226 | 227 | # and sum (\Delta recall) * prec 228 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 229 | return ap 230 | 231 | 232 | def voc_eval(detpath, 233 | annopath, 234 | imagesetfile, 235 | classname, 236 | cachedir, 237 | ovthresh=0.5, 238 | use_07_metric=True): 239 | """rec, prec, ap = voc_eval(detpath, 240 | annopath, 241 | imagesetfile, 242 | classname, 243 | [ovthresh], 244 | [use_07_metric]) 245 | Top level function that does the PASCAL VOC evaluation. 246 | detpath: Path to detections 247 | detpath.format(classname) should produce the detection results file. 248 | annopath: Path to annotations 249 | annopath.format(imagename) should be the xml annotations file. 250 | imagesetfile: Text file containing the list of images, one image per line. 
251 | classname: Category name 252 | cachedir: Directory for caching the annotations 253 | [ovthresh]: Overlap threshold (default = 0.5) 254 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 255 | (default True) 256 | """ 257 | # assumes detections are in detpath.format(classname) 258 | # assumes annotations are in annopath.format(imagename) 259 | # assumes imagesetfile is a text file with each line an image name 260 | # cachedir caches the annotations in a pickle file 261 | # first load gt 262 | if not os.path.isdir(cachedir): 263 | os.mkdir(cachedir) 264 | cachefile = os.path.join(cachedir, 'annots.pkl') 265 | # read list of images 266 | with open(imagesetfile, 'r') as f: 267 | lines = f.readlines() 268 | imagenames = [x.strip() for x in lines] 269 | if not os.path.isfile(cachefile): 270 | # load annots 271 | recs = {} 272 | for i, imagename in enumerate(imagenames): 273 | recs[imagename] = parse_rec(annopath % (imagename)) 274 | if i % 100 == 0: 275 | print('Reading annotation for {:d}/{:d}'.format( 276 | i + 1, len(imagenames))) 277 | # save 278 | print('Saving cached annotations to {:s}'.format(cachefile)) 279 | with open(cachefile, 'wb') as f: 280 | pickle.dump(recs, f) 281 | else: 282 | # load 283 | with open(cachefile, 'rb') as f: 284 | recs = pickle.load(f) 285 | 286 | # extract gt objects for this class 287 | class_recs = {} 288 | npos = 0 289 | for imagename in imagenames: 290 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 291 | bbox = np.array([x['bbox'] for x in R]) 292 | difficult = np.array([x['difficult'] for x in R]).astype(bool) 293 | det = [False] * len(R) 294 | npos = npos + sum(~difficult) 295 | class_recs[imagename] = {'bbox': bbox, 296 | 'difficult': difficult, 297 | 'det': det} 298 | 299 | # read dets 300 | detfile = detpath.format(classname) 301 | with open(detfile, 'r') as f: 302 | lines = f.readlines() 303 | if any(lines): 304 | 305 | splitlines = [x.strip().split(' ') for x in lines] 306 | image_ids = [x[0] for x in splitlines] 307 | confidence = np.array([float(x[1]) for x in splitlines]) 308 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 309 | 310 | # sort by confidence 311 | sorted_ind = np.argsort(-confidence) 312 | sorted_scores = np.sort(-confidence) 313 | BB = BB[sorted_ind, :] 314 | image_ids = [image_ids[x] for x in sorted_ind] 315 | 316 | # go down dets and mark TPs and FPs 317 | nd = len(image_ids) 318 | tp = np.zeros(nd) 319 | fp = np.zeros(nd) 320 | for d in range(nd): 321 | R = class_recs[image_ids[d]] 322 | bb = BB[d, :].astype(float) 323 | ovmax = -np.inf 324 | BBGT = R['bbox'].astype(float) 325 | if BBGT.size > 0: 326 | # compute overlaps 327 | # intersection 328 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 329 | iymin = np.maximum(BBGT[:, 1], bb[1]) 330 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 331 | iymax = np.minimum(BBGT[:, 3], bb[3]) 332 | iw = np.maximum(ixmax - ixmin, 0.) 333 | ih = np.maximum(iymax - iymin, 0.) 334 | inters = iw * ih 335 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 336 | (BBGT[:, 2] - BBGT[:, 0]) * 337 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 338 | overlaps = inters / uni 339 | ovmax = np.max(overlaps) 340 | jmax = np.argmax(overlaps) 341 | 342 | if ovmax > ovthresh: 343 | if not R['difficult'][jmax]: 344 | if not R['det'][jmax]: 345 | tp[d] = 1. 346 | R['det'][jmax] = 1 347 | else: 348 | fp[d] = 1. 349 | else: 350 | fp[d] = 1. 
351 | 352 | # compute precision recall 353 | fp = np.cumsum(fp) 354 | tp = np.cumsum(tp) 355 | rec = tp / float(npos) 356 | # avoid divide by zero in case the first detection matches a difficult 357 | # ground truth 358 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 359 | ap = voc_ap(rec, prec, use_07_metric) 360 | else: 361 | rec = -1. 362 | prec = -1. 363 | ap = -1. 364 | 365 | return rec, prec, ap 366 | 367 | 368 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 369 | im_size=300, thresh=0.05): 370 | num_images = len(dataset) 371 | # all detections are collected into: 372 | # all_boxes[cls][image] = N x 5 array of detections in 373 | # (x1, y1, x2, y2, score) 374 | all_boxes = [[[] for _ in range(num_images)] 375 | for _ in range(len(labelmap)+1)] 376 | 377 | # timers 378 | _t = {'im_detect': Timer(), 'misc': Timer()} 379 | output_dir = get_output_dir('ssd300_120000', set_type) 380 | det_file = os.path.join(output_dir, 'detections.pkl') 381 | 382 | for i in range(num_images): 383 | im, gt, h, w = dataset.pull_item(i) 384 | 385 | x = Variable(im.unsqueeze(0)) 386 | if args.cuda: 387 | x = x.cuda() 388 | _t['im_detect'].tic() 389 | detections = net(x).data 390 | detect_time = _t['im_detect'].toc(average=False) 391 | 392 | # skip j = 0, because it's the background class 393 | for j in range(1, detections.size(1)): 394 | dets = detections[0, j, :] 395 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 396 | dets = torch.masked_select(dets, mask).view(-1, 5) 397 | if dets.size(0) == 0: 398 | continue 399 | boxes = dets[:, 1:] 400 | boxes[:, 0] *= w 401 | boxes[:, 2] *= w 402 | boxes[:, 1] *= h 403 | boxes[:, 3] *= h 404 | scores = dets[:, 0].cpu().numpy() 405 | cls_dets = np.hstack((boxes.cpu().numpy(), 406 | scores[:, np.newaxis])).astype(np.float32, 407 | copy=False) 408 | all_boxes[j][i] = cls_dets 409 | 410 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 411 | num_images, detect_time)) 412 | 413 | with open(det_file, 'wb') as f: 414 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 415 | 416 | print('Evaluating detections') 417 | evaluate_detections(all_boxes, output_dir, dataset) 418 | 419 | 420 | def evaluate_detections(box_list, output_dir, dataset): 421 | write_voc_results_file(box_list, dataset) 422 | do_python_eval(output_dir) 423 | 424 | 425 | if __name__ == '__main__': 426 | # load net 427 | num_classes = len(labelmap) + 1 # +1 for background 428 | net = build_ssd('test', 300, num_classes) # initialize SSD 429 | net.load_state_dict(torch.load(args.trained_model)) 430 | net.eval() 431 | print('Finished loading model!') 432 | # load data 433 | dataset = customDetection(args.custom_root, [('shenhe', set_type)], 434 | BaseTransform(300, dataset_mean), 435 | customAnnotationTransform()) 436 | if args.cuda: 437 | net = net.cuda() 438 | cudnn.benchmark = True 439 | # evaluation 440 | test_net(args.save_folder, net, args.cuda, dataset, 441 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 442 | thresh=args.confidence_threshold) 443 | --------------------------------------------------------------------------------
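
As a quick illustration of what the voc_ap function in the two eval scripts above computes, here is a minimal standalone sketch (not part of the repository; the rec/prec arrays are toy values chosen only for illustration) that runs both branches of that function, the VOC07 11-point interpolation and the post-2010 area-under-envelope variant, on the same inputs:

# Minimal sketch mirroring the two AP variants in voc_ap above.
# The recall/precision values below are toy numbers for illustration only.
import numpy as np

def ap_11_point(rec, prec):
    # VOC07 metric: average the best precision at recall >= t for t = 0.0, 0.1, ..., 1.0
    ap = 0.
    for t in np.arange(0., 1.1, 0.1):
        p = 0. if np.sum(rec >= t) == 0 else np.max(prec[rec >= t])
        ap += p / 11.
    return ap

def ap_area(rec, prec):
    # Post-2010 metric: area under the monotonically decreasing precision envelope
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    i = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])

if __name__ == '__main__':
    rec = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    prec = np.array([1.0, 0.9, 0.7, 0.6, 0.5])
    print('11-point AP: {:.4f}'.format(ap_11_point(rec, prec)))  # ~0.4273
    print('Area AP:     {:.4f}'.format(ap_area(rec, prec)))      # 0.3700

Running the sketch on the same toy curve makes the difference between the two metrics visible, which is why do_python_eval prints whether the VOC07 metric is in use before reporting per-class AP.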