├── .gitattributes ├── .gitignore ├── 2voctxt.py ├── LICENSE ├── README.md ├── bbox-regression.py ├── data ├── MYSELF.py ├── VOCdevkit │ └── VOC2007 │ │ └── ImageSets │ │ └── Main │ │ ├── test.txt │ │ ├── train.txt │ │ ├── trainval.txt │ │ └── val.txt ├── VOCdevkitVOC2007 │ ├── annotations_cache │ │ └── annots.pkl │ └── results │ │ ├── det_test_None.txt │ │ ├── det_test_ship.txt │ │ ├── det_train_None.txt │ │ ├── det_train_ship.txt │ │ ├── det_trainval_None.txt │ │ ├── det_trainval_ship.txt │ │ ├── det_val_None.txt │ │ └── det_val_ship.txt ├── __init__.py ├── config.py ├── example.jpg ├── scripts │ ├── COCO2014.sh │ ├── VOC2007.sh │ └── VOC2012.sh └── voc0712.py ├── demo ├── __init__.py ├── demo.ipynb └── live.py ├── doc ├── SSD.jpg ├── detection_example.png ├── detection_example2.png ├── detection_examples.png └── ssd.png ├── eval.py ├── focal_loss.py ├── layers ├── __init__.py ├── box_utils.py ├── functions │ ├── __init__.py │ ├── detection.py │ └── prior_box.py └── modules │ ├── __init__.py │ ├── l2norm.py │ └── multibox_loss.py ├── loc-txt.ipynb ├── ssd.py ├── test.py ├── train.py ├── utils ├── __init__.py └── augmentations.py ├── xml2regresstxt.py ├── 代码详解blog.txt ├── 保存权重 ├── train.py └── 代码详解blog.txt ├── 显示检测结果code.py └── 训练步骤.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # atom remote-sync package 92 | .remote-sync.json 93 | 94 | # weights 95 | weights/ 96 | 97 | #DS_Store 98 | .DS_Store 99 | 100 | # dev stuff 101 | eval/ 102 | eval.ipynb 103 | dev.ipynb 104 | .vscode/ 105 | 106 | # not ready 107 | videos/ 108 | templates/ 109 | data/ssd_dataloader.py 110 | data/datasets/ 111 | doc/visualize.py 112 | read_results.py 113 | ssd300_120000/ 114 | demos/live 115 | webdemo.py 116 | test_data_aug.py 117 | 118 | # attributes 119 | 120 | # pycharm 121 | .idea/ 122 | 123 | # temp checkout soln 124 | data/datasets/ 125 | data/ssd_dataloader.py 126 | 127 | # pylint 128 | .pylintrc -------------------------------------------------------------------------------- /2voctxt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @File : 2voctxt.py 5 | @Version : 1.0 6 | @Author : 2014Vee 7 | @Contact : 1976535998@qq.com 8 | @License : (C)Copyright 2014Vee From UESTC 9 | @Modify Time : 2020/4/17 15:37 10 | @Description : None 11 | ''' 12 | import os 13 | import random 14 | 15 | # https://blog.csdn.net/duanyajun987/article/details/81507656 16 | # * Careful here: the split ratios for the generated list files are set below 17 | # The NWPU dataset already ships with its own test set, so no extra data needs to be held out for testing here 18 | trainval_percent = 0.2 19 | train_percent = 0.8 20 | xmlfilepath = './data/VOCdevkit/VOC2007/Annotations' 21 | txtsavepath = './data/VOCdevkit/VOC2007/ImageSets/Main' 22 | total_xml = os.listdir(xmlfilepath) 23 | 24 | num = len(total_xml) 25 | indices = range(num) 26 | tv = int(num * trainval_percent) 27 | tr = int(tv * train_percent) 28 | trainval = random.sample(indices, tv) 29 | train = random.sample(trainval, tr) 30 | 31 | ftrainval = open(txtsavepath + '/trainval.txt', 'w') 32 | ftest = open(txtsavepath + '/test.txt', 'w') 33 | ftrain = open(txtsavepath + '/train.txt', 'w') 34 | fval = open(txtsavepath + '/val.txt', 'w') 35 | 36 | for i in indices: 37 | name = total_xml[i][:-4] + '\n' 38 | if i in trainval: 39 | ftrainval.write(name) 40 | if i in train: 41 | ftest.write(name) 42 | else: 43 | fval.write(name) 44 | else: 45 | ftrain.write(name) 46 | 47 | ftrainval.close() 48 | ftrain.close() 49 | fval.close() 50 | ftest.close() 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD: Single Shot MultiBox Object Detector, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C. Berg. The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd). 3 | 4 | 5 | 6 | 7 | ### Table of Contents 8 | - Installation 9 | - Datasets 10 | - Train 11 | - Evaluate 12 | - Performance 13 | - Demos 14 | - Future Work 15 | - Reference 16 | 17 |   18 |   19 |   20 |   21 | 22 | ## Installation 23 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 24 | - Clone this repository. 25 | * Note: We currently only support Python 3+. 26 | - Then download the dataset by following the [instructions](#datasets) below. 27 | - We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training! 28 | * To use Visdom in the browser: 29 | ```Shell 30 | # First install Python server and client 31 | pip install visdom 32 | # Start the server (probably in a screen or tmux) 33 | python -m visdom.server 34 | ``` 35 | * Then (during training) navigate to http://localhost:8097/ (see the Train section below for training details). 36 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 37 | 38 | ## Datasets 39 | To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 
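As a quick orientation before the download scripts below, here is a minimal sketch of how these loaders plug into a standard `torch.utils.data.DataLoader`. The exact `VOCDetection` constructor arguments are assumptions inferred from `data/MYSELF.py` and `data/__init__.py` (which defines `detection_collate` and exports `VOC_ROOT` and `MEANS`), not verbatim repo code:

```Python
# Hedged sketch: VOCDetection's argument names are assumed from data/MYSELF.py.
import torch.utils.data as data

from data import VOCDetection, VOC_ROOT, detection_collate, MEANS
from utils.augmentations import SSDAugmentation

dataset = VOCDetection(root=VOC_ROOT,
                       image_sets=[('2007', 'trainval')],      # (year, split) pairs
                       transform=SSDAugmentation(300, MEANS))  # SSD300 input size + BGR means

# detection_collate keeps a Python list of per-image box tensors, since each
# image can contain a different number of annotated objects.
loader = data.DataLoader(dataset, batch_size=32, shuffle=True,
                         collate_fn=detection_collate)

images, targets = next(iter(loader))
# images: (32, 3, 300, 300) tensor
# targets: list of 32 tensors, each (num_objects, 5) -> [xmin, ymin, xmax, ymax, label]
```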
40 | 41 | 42 | ### COCO 43 | Microsoft COCO: Common Objects in Context 44 | 45 | ##### Download COCO 2014 46 | ```Shell 47 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 48 | sh data/scripts/COCO2014.sh 49 | ``` 50 | 51 | ### VOC Dataset 52 | PASCAL VOC: Visual Object Classes 53 | 54 | ##### Download VOC2007 trainval & test 55 | ```Shell 56 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 57 | sh data/scripts/VOC2007.sh # <directory> 58 | ``` 59 | 60 | ##### Download VOC2012 trainval 61 | ```Shell 62 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 63 | sh data/scripts/VOC2012.sh # <directory> 64 | ``` 65 | 66 | ## Training SSD 67 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 68 | - By default, we assume you have downloaded the file in the `ssd.pytorch/weights` dir: 69 | 70 | ```Shell 71 | mkdir weights 72 | cd weights 73 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 74 | ``` 75 | 76 | - To train SSD using the train script, simply specify the parameters listed in `train.py` as flags or change them manually. 77 | 78 | ```Shell 79 | python train.py 80 | ``` 81 | 82 | - Note: 83 | * For training, an NVIDIA GPU is strongly recommended for speed. 84 | * For instructions on Visdom usage/installation, see the Installation section. 85 | * You can pick up training from a checkpoint by specifying the path as one of the training parameters (again, see `train.py` for options) 86 | 87 | ## Evaluation 88 | To evaluate a trained network: 89 | 90 | ```Shell 91 | python eval.py 92 | ``` 93 | 94 | You can specify the parameters listed in the `eval.py` file by flagging them or manually changing them. 95 | 96 | 97 | 98 | 99 | ## Performance 100 | 101 | #### VOC2007 Test 102 | 103 | ##### mAP 104 | 105 | | Original | Converted weiliu89 weights | From scratch w/o data aug | From scratch w/ data aug | 106 | |:-:|:-:|:-:|:-:| 107 | | 77.2 % | 77.26 % | 58.12 % | 77.43 % | 108 | 109 | ##### FPS 110 | **GTX 1060:** ~45.45 FPS 111 | 112 | ## Demos 113 | 114 | ### Use a pre-trained SSD network for detection 115 | 116 | #### Download a pre-trained network 117 | - We are trying to provide PyTorch `state_dicts` (dict of weight tensors) of the latest SSD model definitions trained on different datasets. 118 | - Currently, we provide the following PyTorch models: 119 | * SSD300 trained on VOC0712 (newest PyTorch weights) 120 | - https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth 121 | * SSD300 trained on VOC0712 (original Caffe weights) 122 | - https://s3.amazonaws.com/amdegroot-models/ssd_300_VOC0712.pth 123 | - Our goal is to reproduce this table from the [original paper](http://arxiv.org/abs/1512.02325) 124 |

125 | *SSD results on multiple datasets*

126 | 127 | ### Try the demo notebook 128 | - Make sure you have [jupyter notebook](http://jupyter.readthedocs.io/en/latest/install.html) installed. 129 | - Two alternatives for installing jupyter notebook: 130 | 1. If you installed PyTorch with [conda](https://www.continuum.io/downloads) (recommended), then you should already have it. (Just navigate to the ssd.pytorch cloned repo and run): 131 | `jupyter notebook` 132 | 133 | 2. If using [pip](https://pypi.python.org/pypi/pip): 134 | 135 | ```Shell 136 | # make sure pip is upgraded 137 | pip3 install --upgrade pip 138 | # install jupyter notebook 139 | pip install jupyter 140 | # Run this inside ssd.pytorch 141 | jupyter notebook 142 | ``` 143 | 144 | - Now navigate to `demo/demo.ipynb` at http://localhost:8888 (by default) and have at it! 145 | 146 | ### Try the webcam demo 147 | - Works on CPU (may have to tweak `cv2.waitKey` for optimal fps) or on an NVIDIA GPU 148 | - This demo currently requires opencv2+ w/ python bindings and an onboard webcam 149 | * You can change the default webcam in `demo/live.py` 150 | - Install the [imutils](https://github.com/jrosebr1/imutils) package to leverage multi-threading on CPU: 151 | * `pip install imutils` 152 | - Running `python -m demo.live` opens the webcam and begins detecting! 153 | 154 | ## TODO 155 | We have accumulated the following to-do list, which we hope to complete in the near future. 156 | - Still to come: 157 | * [x] Support for the MS COCO dataset 158 | * [ ] Support for SSD512 training and testing 159 | * [ ] Support for training on custom datasets 160 | 161 | ## Authors 162 | 163 | * [**Max deGroot**](https://github.com/amdegroot) 164 | * [**Ellis Brown**](http://github.com/ellisbrown) 165 | 166 | ***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible. 167 | 168 | ## References 169 | - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325). 170 | - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd) 171 | - A huge thank you to [Alex Koltun](https://github.com/alexkoltun) and his team at [Webyclip](http://www.webyclip.com) for their help in finishing the data augmentation portion. 
172 | - A list of other great SSD ports that were sources of inspiration (especially the Chainer repo): 173 | * [Chainer](https://github.com/Hakuyume/chainer-ssd), [Keras](https://github.com/rykov8/ssd_keras), [MXNet](https://github.com/zhreshold/mxnet-ssd), [Tensorflow](https://github.com/balancap/SSD-Tensorflow) 174 | -------------------------------------------------------------------------------- /bbox-regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @File : bbox-regression.py 5 | @Version : 1.0 6 | @Author : 2014Vee 7 | @Contact : 1976535998@qq.com 8 | @License : (C)Copyright 2014Vee From UESTC 9 | @Modify Time : 2020/4/14 10:37 10 | @Description : None 11 | ''' 12 | 13 | import cv2 14 | import numpy as np 15 | import xml.dom.minidom 16 | import tensorflow as tf 17 | import os 18 | import time 19 | from tensorflow.python.framework import graph_util 20 | 21 | slim = tf.contrib.slim 22 | 23 | # Read the txt list files 24 | train_txt = open('/data/lp/project/ssd.pytorch/txtsave/train.txt') 25 | val_txt = open('/data/lp/project/ssd.pytorch/txtsave/val.txt') 26 | train_content = train_txt.readlines() # contents of train.txt 27 | val_content = val_txt.readlines() # contents of val.txt 28 | # for linetr in train_content: 29 | # print ("train_content",linetr.rstrip('\n')) 30 | # for lineva in val_content: 31 | # print ("val_content",lineva.rstrip('\n')) 32 | 33 | # Read the images listed in the txt files, resize them to a fixed size, and keep the scaling ratios 34 | train_imgs=[] # resized images 35 | train_imgs_ratio=[] # (height scale, width scale) 36 | val_imgs=[] 37 | val_imgs_ratio=[] 38 | 39 | 40 | h=48 41 | w=192 # target (resized) size 42 | c=3 # channels 43 | 44 | 45 | for linetr in train_content: 46 | img_path='/data/lp/project/ssd.pytorch/oripic/'+linetr.rstrip('\n')+'.jpg' 47 | img = cv2.imread(img_path) # read the original image 48 | # print("image_name", str(linetr.rstrip('\n'))) 49 | # print("imgshape", img.shape) 50 | imgresize= cv2.resize(img,(w,h)) # resize the image 51 | ratio = np.array([imgresize.shape[0]/img.shape[0], imgresize.shape[1]/img.shape[1]],np.float32) # height scale, width scale 52 | train_imgs_ratio.append(ratio) 53 | train_imgs.append(imgresize) 54 | train_img_arr = np.asarray(train_imgs,np.float32) # array of the training image data, h w c 55 | print(len(train_img_arr),len(train_imgs_ratio)) 56 | 57 | for lineva in val_content: 58 | img_path='/data/lp/project/ssd.pytorch/oripic/'+lineva.rstrip('\n')+'.jpg' 59 | img = cv2.imread(img_path) # h w c 60 | imgresize= cv2.resize(img,(w,h)) # h w c 61 | ratio = np.array([imgresize.shape[0]/img.shape[0], imgresize.shape[1]/img.shape[1]],np.float32) # height scale, width scale 62 | val_imgs_ratio.append(ratio) 63 | val_imgs.append(imgresize) 64 | # print(imgresize.shape[0], imgresize.shape[1], imgresize.shape[2]) 65 | val_img_arr = np.asarray(val_imgs,np.float32) # array of the validation image data, h w c 66 | 67 | # print(len(val_img_arr),len(val_imgs_ratio)) 68 | 69 | # Read the xml for each entry in the txt files, extract the box coordinates (xmin, ymin, xmax, ymax) (x along width, y along height), and compute the rescaled coordinates 70 | train_xml = [] # ground-truth box coordinates 71 | train_xml_resize = [] # box coordinates rescaled with the same ratios used for image resizing 72 | val_xml = [] 73 | val_xml_resize = [] 74 | for linetr in train_content: 75 | xml_path = '/data/lp/project/ssd.pytorch/xml_zc_fz/' + linetr.rstrip( 76 | '\n') + '.xml' 77 | print(xml_path) 78 | xml_DomTree = xml.dom.minidom.parse(xml_path) 79 | xml_annotation = xml_DomTree.documentElement 80 | xml_object = xml_annotation.getElementsByTagName('object') 81 | xml_bndbox = xml_object[0].getElementsByTagName('bndbox') 82 | xmin_list = xml_bndbox[0].getElementsByTagName('xmin') 83 | xmin =
int(xmin_list[0].childNodes[0].data) 84 | ymin_list = xml_bndbox[0].getElementsByTagName('ymin') 85 | ymin = int(ymin_list[0].childNodes[0].data) 86 | xmax_list = xml_bndbox[0].getElementsByTagName('xmax') 87 | xmax = int(xmax_list[0].childNodes[0].data) 88 | ymax_list = xml_bndbox[0].getElementsByTagName('ymax') 89 | ymax = int(ymax_list[0].childNodes[0].data) 90 | coordinate = np.array([ymin, xmin, ymax, xmax], np.int) # h w h w 91 | train_xml.append(coordinate) # xml coordinates of the training images 92 | # print("bbox:", coordinate) 93 | # print(len(train_xml)) 94 | 95 | for lineva in val_content: 96 | xml_path = '/data/lp/project/ssd.pytorch/xml_zc_fz/' + lineva.rstrip( 97 | '\n') + '.xml' 98 | print(xml_path) 99 | xml_DomTree = xml.dom.minidom.parse(xml_path) 100 | xml_annotation = xml_DomTree.documentElement 101 | xml_object = xml_annotation.getElementsByTagName('object') 102 | xml_bndbox = xml_object[0].getElementsByTagName('bndbox') 103 | xmin_list = xml_bndbox[0].getElementsByTagName('xmin') 104 | xmin = int(xmin_list[0].childNodes[0].data) 105 | ymin_list = xml_bndbox[0].getElementsByTagName('ymin') 106 | ymin = int(ymin_list[0].childNodes[0].data) 107 | xmax_list = xml_bndbox[0].getElementsByTagName('xmax') 108 | xmax = int(xmax_list[0].childNodes[0].data) 109 | ymax_list = xml_bndbox[0].getElementsByTagName('ymax') 110 | ymax = int(ymax_list[0].childNodes[0].data) 111 | coordinate = np.array([ymin, xmin, ymax, xmax], np.int) 112 | val_xml.append(coordinate) # xml coordinates of the validation images 113 | # print(len(val_xml)) 114 | 115 | for i in range(0, len(train_imgs_ratio)): 116 | ymin_ratio = train_xml[i][0] * train_imgs_ratio[i][0] 117 | xmin_ratio = train_xml[i][1] * train_imgs_ratio[i][1] 118 | ymax_ratio = train_xml[i][2] * train_imgs_ratio[i][0] 119 | xmax_ratio = train_xml[i][3] * train_imgs_ratio[i][1] 120 | coordinate_ratio = np.array([ymin_ratio, xmin_ratio, ymax_ratio, xmax_ratio], np.float32) 121 | train_xml_resize.append(coordinate_ratio) # rescaled ground-truth coordinates for the training images 122 | 123 | for i in range(0, len(val_imgs_ratio)): 124 | ymin_ratio = val_xml[i][0] * val_imgs_ratio[i][0] 125 | xmin_ratio = val_xml[i][1] * val_imgs_ratio[i][1] 126 | ymax_ratio = val_xml[i][2] * val_imgs_ratio[i][0] 127 | xmax_ratio = val_xml[i][3] * val_imgs_ratio[i][1] 128 | coordinate_ratio = np.array([ymin_ratio, xmin_ratio, ymax_ratio, xmax_ratio], np.float32) 129 | val_xml_resize.append(coordinate_ratio) # rescaled ground-truth coordinates for the validation images 130 | 131 | 132 | # Fetch the data batch by batch (batch_size samples at a time) 133 | # inputs: resized image data 134 | # targets: rescaled xml coordinate data 135 | def getbatches(inputs=None, targets=None, batch_size=None, shuffle=False): 136 | assert len(inputs) == len(targets) 137 | if shuffle: 138 | indices = np.arange(len(inputs)) 139 | np.random.shuffle(indices) 140 | for start_idx in range(0, len(inputs) - batch_size + 1, batch_size): 141 | if shuffle: 142 | excerpt = indices[start_idx:start_idx + batch_size] # i.e. slice the data by batch_size 143 | else: 144 | excerpt = slice(start_idx, start_idx + batch_size) 145 | yield inputs[excerpt], targets[excerpt] # yield returns here much like the return keyword, 146 | # but the next call resumes right after the yield, so this function is a generator rather than an ordinary function 147 | 148 | 149 | # Loss function: smooth L1 norm 150 | def abs_smooth(x): 151 | """Smoothed absolute function. Useful to compute an L1 smooth error. 152 | 153 | Define as: 154 | x^2 / 2 if abs(x) < 1 155 | abs(x) - 0.5 if abs(x) > 1 156 | We use here a differentiable definition using min(x) and abs(x). Clearly 157 | not optimal, but good enough for our purpose!
158 | """ 159 | absx = tf.abs(x) 160 | minx = tf.minimum(absx, 1) 161 | r = 0.5 * ((absx - 1) * minx + absx) # expanding this product yields the squared term 162 | return r 163 | 164 | # Build the network 165 | 166 | input_data = tf.placeholder(tf.float32,shape=[None,h,w,c],name='x') # input image data (the resized images) 167 | input_bound = tf.placeholder(tf.float32,shape=[None,None],name='y') # input ground-truth box coordinates (the rescaled xml coordinates) 168 | prob=tf.placeholder(tf.float32, name='keep_prob') 169 | 170 | 171 | # First conv block (192 -> 96) (48 -> 24) 172 | #conv1 = slim.repeat(input_data, 2, slim.conv2d, 32, [3, 3], scope='conv1') 173 | conv1 = slim.conv2d(input_data, 32, [3, 3], scope='conv1') ## 32 is the number of filters, [3, 3] the kernel size; the default stride is [1, 1] 174 | pool1 = slim.max_pool2d(conv1, [2, 2], scope='pool1') # [2, 2] is the pooling stride 175 | 176 | # Second conv block (96 -> 48) (24 -> 12) 177 | #conv2 = slim.repeat(pool1, 2, slim.conv2d, 64, [3, 3], scope='conv2') 178 | conv2 = slim.conv2d(pool1, 64, [3, 3], scope='conv2') 179 | pool2 = slim.max_pool2d(conv2, [2, 2], scope='pool2') 180 | 181 | # Third conv block (48 -> 24) (12 -> 6) 182 | #conv3 = slim.repeat(pool2, 2, slim.conv2d, 128, [3, 3], scope='conv3') 183 | conv3 = slim.conv2d(pool2, 128, [3, 3], scope='conv3') 184 | pool3 = slim.max_pool2d(conv3, [2, 2], scope='pool3') 185 | 186 | # Fourth conv block (24) (6) 187 | conv4 = slim.conv2d(pool3, 256 ,[3, 3], scope='conv4') 188 | dropout = tf.layers.dropout(conv4, rate=prob, training=True) 189 | #dropout = tf.nn.dropout(conv4,keep_prob) 190 | #pool4 = slim.max_pool2d(conv4, [2, 2], scope='pool4') 191 | 192 | # Fifth conv block (24 -> 12) (6 -> 3) 193 | #conv5 = slim.repeat(dropout, 2, slim.conv2d, 128, [3, 3], scope='conv5') 194 | conv5 = slim.conv2d(dropout , 128, [3, 3], scope='conv5') 195 | pool5 = slim.max_pool2d(conv5, [2, 2], scope='pool5') 196 | 197 | # Sixth conv block (12 -> 6) (3 -> 1) 198 | #conv6 = slim.repeat(pool5, 2, slim.conv2d, 64, [3, 3], scope='conv6') 199 | conv6 = slim.conv2d(pool5, 64, [3, 3], scope='conv6') 200 | pool6 = slim.max_pool2d(conv6, [2, 2], scope='pool6') 201 | 202 | reshape = tf.reshape(pool6, [-1, 6 * 1 * 64]) 203 | # print(reshape.get_shape()) 204 | 205 | fc = slim.fully_connected(reshape, 4, scope='fc') 206 | # print(fc) 207 | # print(input_data) 208 | 209 | ''' 210 | # Seventh conv block (6 -> 3) (1 -> 1) 211 | conv7 = slim.conv2d(pool6, 32, [3, 3], scope='conv7') 212 | pool7 = slim.max_pool2d(conv7, [2, 2], scope='pool7') 213 | 214 | conv8 = slim.conv2d(pool7, 4, [3, 3], padding=None, activation_fn=None,scope='conv8') 215 | ''' 216 | 217 | 218 | n_epoch = 500 219 | batch_size = 32 220 | print (batch_size) 221 | 222 | 223 | weights = tf.expand_dims(1.
* 1., axis=-1) 224 | loss = abs_smooth(fc - input_bound) # difference between the fc output and the input labels; smooth L1 is used as the loss 225 | # print(loss) 226 | train_op=tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss) # Adam optimizer with learning rate 0.001 227 | 228 | #correct_prediction = tf.equal(fc, input_bound) 229 | #correct_prediction = tf.equal(tf.cast(fc,tf.int32), tf.cast(input_bound, tf.int32)) 230 | 231 | temp_acc = tf.abs(tf.cast(fc,tf.int32) - tf.cast(input_bound, tf.int32)) # absolute difference between the fc output and the labels 232 | compare_np = np.ones((batch_size,4), np.int32) # build a compare_np with batch_size rows and 4 columns 233 | compare_np[:] = 3 234 | print(compare_np) 235 | compare_tf = tf.convert_to_tensor(compare_np) # 236 | # print(compare_tf) 237 | correct_prediction = tf.less(temp_acc,compare_tf) ## True wherever an element of temp_acc is smaller than the matching element of compare_tf 238 | # print(correct_prediction) 239 | loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value') ## sums the tensor along a dimension, which reduces its rank 240 | tf.summary.scalar('loss',loss) # log the scalar for visualization 241 | # print(loss) 242 | accuracy= tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) ### tf.cast converts the dtype ### 243 | #tf.summary.scalar('accuracy',accuracy) # log the scalar for visualization 244 | # print(accuracy) 245 | 246 | 247 | # print(prob) 248 | 249 | # pb_file_path = '/data/liuan/jupyter/root/project/keras-retinanet-master/bbox_fz_zc_006000/bbox_pb_model/ocr_bboxregress_batch16_epoch10000.pb' 250 | pb_file_path = '/data/lp/project/ssd.pytorch/ocr_bbox_batch16_epoch' 251 | 252 | # Set the visible GPU 253 | gpu_no = '1' # or '1' 254 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_no 255 | # TensorFlow session configuration 256 | config = tf.ConfigProto() 257 | # GPU memory allocation policy 258 | config.gpu_options.allow_growth = True 259 | config.gpu_options.per_process_gpu_memory_fraction = 0.6 260 | # config.gpu_options.per_process_gpu_memory_fraction = 0.8 261 | 262 | 263 | sess = tf.InteractiveSession(config=config) 264 | 265 | # //////////////////////////////// 266 | # ckpt = tf.train.get_checkpoint_state('/home/data/wangchongjin/ad_image/model_save/') 267 | # saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path +'.meta') # load the graph structure saved in the .meta file 268 | # saver.restore(sess,ckpt.model_checkpoint_path) 269 | # ////////////////////////////////// 270 | sess.run(tf.global_variables_initializer()) 271 | 272 | merged = tf.summary.merge_all() 273 | writer = tf.summary.FileWriter( 274 | "/data/lp/project/ssd.pytorch/ocr_bbox_batch16_epoch/record_graph", sess.graph_def) 275 | 276 | # saver = tf.train.Saver() # declare tf.train.Saver to save the model 277 | 278 | 279 | for epoch in range(n_epoch): 280 | start_time = time.time() 281 | 282 | # training 283 | train_loss, train_acc, n_batch = 0, 0, 0 284 | for x_train_a, y_train_a in getbatches(train_img_arr, train_xml_resize, batch_size, shuffle=False): 285 | _, err, acc = sess.run([train_op, loss, accuracy], 286 | feed_dict={input_data: x_train_a, input_bound: y_train_a, prob: 0.5}) 287 | train_loss += err 288 | train_acc += acc 289 | n_batch += 1 290 | 291 | # print(epoch) 292 | # print(" train loss: %f" % (train_loss/ n_batch)) 293 | # print(" train acc: %f" % (train_acc/ n_batch)) 294 | 295 | # validation 296 | val_loss, val_acc, n_batch = 0, 0, 0 297 | for x_val_a, y_val_a in getbatches(val_img_arr, val_xml_resize, batch_size, shuffle=False): 298 | err, acc = sess.run([loss, accuracy], feed_dict={input_data: x_val_a, input_bound: y_val_a, prob: 0}) 299 | # print(err) 300 | val_loss += err 301 | val_acc += acc 302 | n_batch += 1 303 | 304 | rs = sess.run([merged], feed_dict={input_data: x_val_a, input_bound: y_val_a, prob: 0}) 305 | if n_batch == batch_size: 306 | writer.add_summary(rs[0], epoch) 307
| 308 | # print(" validation loss: %f" % (val_loss/ n_batch)) 309 | # print(" validation acc: %f" % (val_acc/ n_batch)) 310 | 311 | # saver.save(sess, "/home/data/wangchongjin/ad_image/model_save_new/ad.ckpt") 312 | constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, ['fc/Relu']) 313 | 314 | with tf.gfile.FastGFile(pb_file_path + '_' + str(epoch) + '.pb', mode='wb') as f: 315 | f.write(constant_graph.SerializeToString()) 316 | 317 | writer.close() 318 | sess.close() -------------------------------------------------------------------------------- /data/MYSELF.py: -------------------------------------------------------------------------------- 1 | # # import os.path as osp 2 | # # import sys 3 | # # import torch 4 | # # import torch.utils.data as data 5 | # # import cv2 6 | # # import numpy as np 7 | # # if sys.version_info[0] == 2: 8 | # # import xml.etree.cElementTree as ET 9 | # # else: 10 | # # import xml.etree.ElementTree as ET 11 | # # image_sets=['2007', 'trainval'],#,('2012', 'trainval') datasets to use 12 | # # root="D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/" 13 | # # ids = list() 14 | # # for (year, name) in image_sets: 15 | # # rootpath = osp.join(root, 'VOC' + year) 16 | # # for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 17 | # # ids.append((rootpath, line.strip())) 18 | # # print(ids[0]) 19 | # # 20 | # # img_id = ids[927] #('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007', '000001') 21 | # # anno = osp.join('%s', 'Annotations', '%s.xml') 22 | # # img = osp.join('%s', 'JPEGImages', '%s.jpg') 23 | # # target = ET.parse(anno % img_id).getroot() # parse the xml file 24 | # # img = cv2.imread(img % img_id) # load the image 25 | # # cv2.imshow('pwn',img) 26 | # # height, width, channels = img.shape 27 | # # print(height) 28 | # # print(width) 29 | # # print(channels) 30 | # # cv2.waitKey (0) 31 | # # 32 | # # VOC_CLASSES1 = ( # always index 0 33 | # # 'aeroplane', 'bicycle', 'bird', 'boat', 34 | # # 'bottle', 'bus', 'car', 'cat', 'chair', 35 | # # 'cow', 'diningtable', 'dog', 'horse', 36 | # # 'motorbike', 'person', 'pottedplant', 37 | # # 'sheep', 'sofa', 'train', 'tvmonitor') 38 | # # VOC_CLASSES2=('ship','pwn') 39 | # # 40 | # # what=dict(zip(VOC_CLASSES1, range(len(VOC_CLASSES1)))) 41 | # # what2=dict(zip(VOC_CLASSES2, range(len(VOC_CLASSES2)))) 42 | # # print(what) 43 | # # print(what2) 44 | # ####################################################################################################################### 45 | # # from __future__ import division 46 | # # from math import sqrt as sqrt 47 | # # from itertools import product as product 48 | # # import torch 49 | # # mean = [] 50 | # # clip=True 51 | # # for i, j in product(range(5), repeat=2): # generate the grid cell coordinates: i=[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] 52 | # # f_k = 300 / 64 #37.5 j=[0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] 53 | # # cx = (j + 0.5) / f_k # 54 | # # cy = (i + 0.5) / f_k # 55 | # # s_k =162 / 300#0.1 56 | # # mean += [cx, cy, s_k, s_k] 57 | # # # aspect_ratio: 1 58 | # # # rel size: sqrt(s_k * s_(k+1)) 59 | # # s_k_prime = sqrt(s_k * (213/300))#0.14 60 | # # mean += [cx, cy, s_k_prime, s_k_prime] 61 | # # 62 | # # # rest of aspect ratios 63 | # # for ar in [2,3]: # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 64 | # # mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 65 | # # mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 66 | # # 67 | # # output = torch.Tensor(mean).view(-1, 4) 68 | # # if clip: 69 | # # output.clamp_(max=1, min=0)
70 | # import torch as t 71 | # list1=[t.full([2,2,2],1),t.full([2,2,2],2)] 72 | # list2=[t.full([2,2,2],3),t.full([2,2,2],4)] 73 | # list3=[t.full([2,2,2],5),t.full([2,2,2],6)] 74 | # loc=[] 75 | # conf=[] 76 | # pwn=zip(list1,list2,list3) 77 | # print(pwn) 78 | # 79 | # # for (x,l,c) in zip(list1,list2,list3): 80 | # # loc.append(l(x)) 81 | # # conf.append(c(x)) 82 | # 83 | # # import torch 84 | # # x = torch.tensor([[1,2,3],[4,5,6]]) 85 | # # x.is_contiguous() # True 86 | # # print(x) 87 | # # print(x.transpose(0,1)) 88 | # # print(x.transpose(0, 1).is_contiguous()) # False 89 | # # print(x.transpose(0, 1).contiguous().is_contiguous()) # True 90 | 91 | from data import * 92 | from utils.augmentations import SSDAugmentation 93 | from layers.modules import MultiBoxLoss 94 | from ssd import build_ssd 95 | import os 96 | import time 97 | import torch 98 | from torch.autograd import Variable 99 | import torch.nn as nn 100 | import torch.optim as optim 101 | import torch.backends.cudnn as cudnn 102 | import torch.nn.init as init 103 | import torch.utils.data as data 104 | import argparse 105 | import visdom as viz 106 | 107 | list1=torch.arange(0,8) 108 | x = torch.Tensor([[1], [2], [3]]) 109 | y = x.expand(3, 4) 110 | print("x.size():", x.size()) 111 | print("y.size():", y.size()) 112 | 113 | print(x) 114 | print(y) -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/test.txt: -------------------------------------------------------------------------------- 1 | 000009 2 | 000010 3 | 000012 4 | 000013 5 | 000014 6 | 000015 7 | 000023 8 | 000024 9 | 000028 10 | 000029 11 | 000031 12 | 000032 13 | 000034 14 | 000040 15 | 000053 16 | 000059 17 | 000067 18 | 000074 19 | 000075 20 | 000076 21 | 000090 22 | 000101 23 | 000105 24 | 000107 25 | 000116 26 | 000120 27 | 000131 28 | 000133 29 | 000135 30 | 000136 31 | 000137 32 | 000139 33 | 000149 34 | 000157 35 | 000163 36 | 000164 37 | 000181 38 | 000183 39 | 000187 40 | 000192 41 | 000197 42 | 000203 43 | 000219 44 | 000220 45 | 000229 46 | 000230 47 | 000231 48 | 000235 49 | 000237 50 | 000241 51 | 000244 52 | 000248 53 | 000255 54 | 000256 55 | 000262 56 | 000263 57 | 000264 58 | 000266 59 | 000268 60 | 000269 61 | 000274 62 | 000275 63 | 000282 64 | 000290 65 | 000291 66 | 000303 67 | 000306 68 | 000319 69 | 000334 70 | 000340 71 | 000346 72 | 000351 73 | 000352 74 | 000360 75 | 000366 76 | 000369 77 | 000384 78 | 000390 79 | 000392 80 | 000398 81 | 000408 82 | 000409 83 | 000423 84 | 000430 85 | 000431 86 | 000440 87 | 000444 88 | 000450 89 | 000454 90 | 000462 91 | 000470 92 | 000472 93 | 000486 94 | 000487 95 | 000488 96 | 000489 97 | 000499 98 | 000501 99 | 000505 100 | 000506 101 | 000507 102 | 000513 103 | 000530 104 | 000533 105 | 000540 106 | 000542 107 | 000544 108 | 000552 109 | 000561 110 | 000564 111 | 000567 112 | 000568 113 | 000569 114 | 000571 115 | 000574 116 | 000576 117 | 000584 118 | 000590 119 | 000593 120 | 000598 121 | 000602 122 | 000604 123 | 000605 124 | 000619 125 | 000620 126 | 000631 127 | 000635 128 | 000649 129 | 000661 130 | 000672 131 | 000676 132 | 000694 133 | 000697 134 | 000708 135 | 000711 136 | 000712 137 | 000713 138 | 000717 139 | 000727 140 | 000729 141 | 000731 142 | 000732 143 | 000739 144 | 000743 145 | 000746 146 | 000748 147 | 000753 148 | 000754 149 | 000756 150 | 000764 151 | 000769 152 | 000774 153 | 000775 154 | 000786 155 | 000793 156 | 000796 157 | 000802 158 | 000808 159 | 000809 160 | 000814 161 | 000819 162 | 000821 163 | 
000823 164 | 000846 165 | 000848 166 | 000858 167 | 000863 168 | 000867 169 | 000869 170 | 000875 171 | 000880 172 | 000883 173 | 000884 174 | 000888 175 | 000893 176 | 000894 177 | 000897 178 | 000898 179 | 000900 180 | 000910 181 | 000912 182 | 000918 183 | 000919 184 | 000920 185 | 000921 186 | 000922 187 | 000926 188 | 000946 189 | 000947 190 | 000954 191 | 000960 192 | 000961 193 | 000967 194 | 000971 195 | 000977 196 | 000978 197 | 000982 198 | 000984 199 | 000986 200 | 000988 201 | 000989 202 | 000996 203 | 000997 204 | 001006 205 | 001007 206 | 001020 207 | 001029 208 | 001036 209 | 001043 210 | 001044 211 | 001047 212 | 001057 213 | 001059 214 | 001062 215 | 001068 216 | 001069 217 | 001075 218 | 001076 219 | 001078 220 | 001080 221 | 001084 222 | 001091 223 | 001096 224 | 001099 225 | 001104 226 | 001114 227 | 001127 228 | 001129 229 | 001130 230 | 001139 231 | 001146 232 | 001147 233 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/train.txt: -------------------------------------------------------------------------------- 1 | 000001 2 | 000002 3 | 000003 4 | 000004 5 | 000005 6 | 000006 7 | 000007 8 | 000008 9 | 000011 10 | 000016 11 | 000018 12 | 000019 13 | 000020 14 | 000021 15 | 000025 16 | 000026 17 | 000027 18 | 000030 19 | 000033 20 | 000035 21 | 000036 22 | 000037 23 | 000038 24 | 000039 25 | 000041 26 | 000042 27 | 000043 28 | 000044 29 | 000045 30 | 000046 31 | 000047 32 | 000048 33 | 000049 34 | 000050 35 | 000051 36 | 000052 37 | 000054 38 | 000056 39 | 000057 40 | 000058 41 | 000060 42 | 000061 43 | 000062 44 | 000063 45 | 000064 46 | 000065 47 | 000066 48 | 000069 49 | 000070 50 | 000071 51 | 000072 52 | 000073 53 | 000077 54 | 000078 55 | 000079 56 | 000080 57 | 000081 58 | 000082 59 | 000083 60 | 000084 61 | 000085 62 | 000086 63 | 000087 64 | 000088 65 | 000089 66 | 000091 67 | 000093 68 | 000094 69 | 000095 70 | 000096 71 | 000097 72 | 000099 73 | 000100 74 | 000102 75 | 000103 76 | 000104 77 | 000106 78 | 000108 79 | 000109 80 | 000110 81 | 000111 82 | 000112 83 | 000113 84 | 000114 85 | 000115 86 | 000117 87 | 000121 88 | 000122 89 | 000123 90 | 000125 91 | 000126 92 | 000127 93 | 000128 94 | 000129 95 | 000130 96 | 000132 97 | 000134 98 | 000138 99 | 000140 100 | 000142 101 | 000143 102 | 000144 103 | 000145 104 | 000146 105 | 000147 106 | 000150 107 | 000151 108 | 000152 109 | 000153 110 | 000154 111 | 000156 112 | 000158 113 | 000159 114 | 000160 115 | 000162 116 | 000165 117 | 000166 118 | 000167 119 | 000168 120 | 000169 121 | 000170 122 | 000171 123 | 000172 124 | 000173 125 | 000174 126 | 000175 127 | 000176 128 | 000177 129 | 000178 130 | 000179 131 | 000180 132 | 000182 133 | 000184 134 | 000185 135 | 000186 136 | 000188 137 | 000189 138 | 000190 139 | 000191 140 | 000193 141 | 000194 142 | 000195 143 | 000196 144 | 000198 145 | 000199 146 | 000200 147 | 000201 148 | 000202 149 | 000204 150 | 000205 151 | 000206 152 | 000207 153 | 000208 154 | 000209 155 | 000210 156 | 000213 157 | 000215 158 | 000216 159 | 000217 160 | 000218 161 | 000221 162 | 000223 163 | 000225 164 | 000226 165 | 000227 166 | 000228 167 | 000232 168 | 000233 169 | 000234 170 | 000236 171 | 000238 172 | 000239 173 | 000240 174 | 000242 175 | 000243 176 | 000245 177 | 000246 178 | 000247 179 | 000249 180 | 000250 181 | 000251 182 | 000252 183 | 000253 184 | 000257 185 | 000258 186 | 000260 187 | 000261 188 | 000265 189 | 000267 190 | 000270 191 | 000271 192 | 000272 193 | 000273 194 | 000276 195 | 000277 196 | 
000278 197 | 000279 198 | 000280 199 | 000281 200 | 000283 201 | 000284 202 | 000285 203 | 000286 204 | 000287 205 | 000288 206 | 000289 207 | 000292 208 | 000293 209 | 000294 210 | 000295 211 | 000296 212 | 000298 213 | 000299 214 | 000301 215 | 000302 216 | 000309 217 | 000310 218 | 000311 219 | 000312 220 | 000313 221 | 000314 222 | 000316 223 | 000317 224 | 000318 225 | 000320 226 | 000321 227 | 000322 228 | 000323 229 | 000324 230 | 000325 231 | 000326 232 | 000327 233 | 000328 234 | 000329 235 | 000331 236 | 000332 237 | 000335 238 | 000336 239 | 000337 240 | 000338 241 | 000339 242 | 000341 243 | 000342 244 | 000343 245 | 000344 246 | 000345 247 | 000347 248 | 000348 249 | 000349 250 | 000350 251 | 000353 252 | 000354 253 | 000355 254 | 000356 255 | 000357 256 | 000358 257 | 000359 258 | 000361 259 | 000362 260 | 000363 261 | 000364 262 | 000365 263 | 000367 264 | 000368 265 | 000370 266 | 000371 267 | 000372 268 | 000373 269 | 000374 270 | 000376 271 | 000377 272 | 000379 273 | 000380 274 | 000381 275 | 000383 276 | 000385 277 | 000386 278 | 000387 279 | 000388 280 | 000389 281 | 000391 282 | 000393 283 | 000394 284 | 000395 285 | 000396 286 | 000397 287 | 000399 288 | 000400 289 | 000401 290 | 000402 291 | 000403 292 | 000405 293 | 000406 294 | 000407 295 | 000410 296 | 000411 297 | 000412 298 | 000413 299 | 000414 300 | 000415 301 | 000416 302 | 000417 303 | 000418 304 | 000419 305 | 000420 306 | 000421 307 | 000422 308 | 000424 309 | 000426 310 | 000427 311 | 000428 312 | 000429 313 | 000432 314 | 000433 315 | 000434 316 | 000435 317 | 000438 318 | 000439 319 | 000441 320 | 000443 321 | 000445 322 | 000446 323 | 000447 324 | 000448 325 | 000449 326 | 000451 327 | 000452 328 | 000453 329 | 000455 330 | 000456 331 | 000457 332 | 000458 333 | 000459 334 | 000460 335 | 000463 336 | 000464 337 | 000465 338 | 000466 339 | 000467 340 | 000468 341 | 000469 342 | 000471 343 | 000473 344 | 000474 345 | 000475 346 | 000476 347 | 000477 348 | 000478 349 | 000479 350 | 000480 351 | 000482 352 | 000483 353 | 000484 354 | 000485 355 | 000490 356 | 000492 357 | 000493 358 | 000494 359 | 000495 360 | 000496 361 | 000497 362 | 000498 363 | 000500 364 | 000502 365 | 000503 366 | 000509 367 | 000510 368 | 000511 369 | 000512 370 | 000514 371 | 000515 372 | 000516 373 | 000517 374 | 000518 375 | 000520 376 | 000521 377 | 000522 378 | 000523 379 | 000525 380 | 000527 381 | 000528 382 | 000529 383 | 000531 384 | 000532 385 | 000534 386 | 000535 387 | 000536 388 | 000537 389 | 000538 390 | 000539 391 | 000541 392 | 000543 393 | 000545 394 | 000546 395 | 000547 396 | 000548 397 | 000549 398 | 000550 399 | 000551 400 | 000553 401 | 000554 402 | 000555 403 | 000556 404 | 000557 405 | 000558 406 | 000559 407 | 000560 408 | 000562 409 | 000563 410 | 000565 411 | 000566 412 | 000570 413 | 000572 414 | 000573 415 | 000575 416 | 000577 417 | 000578 418 | 000579 419 | 000580 420 | 000582 421 | 000583 422 | 000585 423 | 000586 424 | 000587 425 | 000588 426 | 000589 427 | 000591 428 | 000592 429 | 000594 430 | 000595 431 | 000596 432 | 000597 433 | 000599 434 | 000600 435 | 000601 436 | 000606 437 | 000607 438 | 000608 439 | 000609 440 | 000610 441 | 000611 442 | 000612 443 | 000613 444 | 000614 445 | 000615 446 | 000616 447 | 000617 448 | 000618 449 | 000621 450 | 000622 451 | 000623 452 | 000624 453 | 000625 454 | 000626 455 | 000627 456 | 000628 457 | 000630 458 | 000632 459 | 000633 460 | 000634 461 | 000636 462 | 000637 463 | 000638 464 | 000641 465 | 000642 466 | 000643 467 | 000645 468 | 000647 469 | 
000648 470 | 000650 471 | 000651 472 | 000652 473 | 000653 474 | 000654 475 | 000656 476 | 000657 477 | 000658 478 | 000659 479 | 000663 480 | 000664 481 | 000665 482 | 000666 483 | 000667 484 | 000668 485 | 000669 486 | 000670 487 | 000671 488 | 000673 489 | 000674 490 | 000675 491 | 000677 492 | 000678 493 | 000679 494 | 000680 495 | 000681 496 | 000682 497 | 000683 498 | 000684 499 | 000685 500 | 000686 501 | 000687 502 | 000688 503 | 000689 504 | 000690 505 | 000691 506 | 000692 507 | 000693 508 | 000696 509 | 000698 510 | 000699 511 | 000700 512 | 000701 513 | 000702 514 | 000703 515 | 000704 516 | 000705 517 | 000706 518 | 000707 519 | 000709 520 | 000710 521 | 000714 522 | 000715 523 | 000718 524 | 000719 525 | 000720 526 | 000721 527 | 000722 528 | 000723 529 | 000724 530 | 000725 531 | 000726 532 | 000728 533 | 000730 534 | 000733 535 | 000734 536 | 000736 537 | 000738 538 | 000740 539 | 000741 540 | 000742 541 | 000744 542 | 000745 543 | 000747 544 | 000749 545 | 000750 546 | 000751 547 | 000755 548 | 000757 549 | 000758 550 | 000759 551 | 000760 552 | 000761 553 | 000762 554 | 000763 555 | 000765 556 | 000766 557 | 000767 558 | 000768 559 | 000770 560 | 000772 561 | 000776 562 | 000777 563 | 000778 564 | 000779 565 | 000780 566 | 000781 567 | 000782 568 | 000783 569 | 000784 570 | 000785 571 | 000787 572 | 000788 573 | 000789 574 | 000790 575 | 000791 576 | 000792 577 | 000794 578 | 000795 579 | 000797 580 | 000798 581 | 000799 582 | 000800 583 | 000801 584 | 000803 585 | 000804 586 | 000805 587 | 000806 588 | 000811 589 | 000813 590 | 000815 591 | 000816 592 | 000817 593 | 000818 594 | 000822 595 | 000824 596 | 000825 597 | 000826 598 | 000828 599 | 000829 600 | 000830 601 | 000831 602 | 000832 603 | 000833 604 | 000834 605 | 000835 606 | 000836 607 | 000837 608 | 000838 609 | 000839 610 | 000840 611 | 000841 612 | 000842 613 | 000843 614 | 000844 615 | 000845 616 | 000847 617 | 000849 618 | 000850 619 | 000851 620 | 000852 621 | 000853 622 | 000854 623 | 000855 624 | 000856 625 | 000857 626 | 000859 627 | 000860 628 | 000861 629 | 000862 630 | 000864 631 | 000865 632 | 000866 633 | 000868 634 | 000871 635 | 000872 636 | 000873 637 | 000874 638 | 000876 639 | 000877 640 | 000881 641 | 000885 642 | 000886 643 | 000887 644 | 000889 645 | 000890 646 | 000892 647 | 000895 648 | 000896 649 | 000899 650 | 000901 651 | 000902 652 | 000903 653 | 000904 654 | 000905 655 | 000907 656 | 000908 657 | 000913 658 | 000914 659 | 000915 660 | 000916 661 | 000917 662 | 000923 663 | 000924 664 | 000927 665 | 000929 666 | 000930 667 | 000931 668 | 000932 669 | 000933 670 | 000934 671 | 000935 672 | 000936 673 | 000937 674 | 000938 675 | 000939 676 | 000940 677 | 000941 678 | 000942 679 | 000943 680 | 000944 681 | 000945 682 | 000948 683 | 000949 684 | 000950 685 | 000951 686 | 000952 687 | 000953 688 | 000955 689 | 000956 690 | 000957 691 | 000958 692 | 000959 693 | 000962 694 | 000963 695 | 000964 696 | 000966 697 | 000969 698 | 000970 699 | 000972 700 | 000973 701 | 000974 702 | 000975 703 | 000976 704 | 000979 705 | 000980 706 | 000981 707 | 000983 708 | 000985 709 | 000987 710 | 000990 711 | 000992 712 | 000993 713 | 000994 714 | 000995 715 | 000998 716 | 001000 717 | 001001 718 | 001002 719 | 001003 720 | 001004 721 | 001005 722 | 001008 723 | 001009 724 | 001010 725 | 001012 726 | 001013 727 | 001014 728 | 001015 729 | 001016 730 | 001017 731 | 001018 732 | 001019 733 | 001021 734 | 001022 735 | 001024 736 | 001025 737 | 001026 738 | 001027 739 | 001028 740 | 001030 741 | 001032 742 | 
001033 743 | 001034 744 | 001035 745 | 001037 746 | 001038 747 | 001039 748 | 001040 749 | 001041 750 | 001042 751 | 001046 752 | 001048 753 | 001049 754 | 001051 755 | 001052 756 | 001053 757 | 001055 758 | 001056 759 | 001058 760 | 001060 761 | 001061 762 | 001063 763 | 001064 764 | 001065 765 | 001066 766 | 001067 767 | 001070 768 | 001071 769 | 001072 770 | 001074 771 | 001077 772 | 001079 773 | 001081 774 | 001082 775 | 001083 776 | 001085 777 | 001086 778 | 001087 779 | 001088 780 | 001089 781 | 001090 782 | 001092 783 | 001093 784 | 001094 785 | 001095 786 | 001097 787 | 001098 788 | 001101 789 | 001102 790 | 001103 791 | 001105 792 | 001107 793 | 001108 794 | 001109 795 | 001110 796 | 001111 797 | 001112 798 | 001113 799 | 001115 800 | 001116 801 | 001117 802 | 001118 803 | 001119 804 | 001120 805 | 001121 806 | 001122 807 | 001123 808 | 001124 809 | 001125 810 | 001126 811 | 001128 812 | 001131 813 | 001132 814 | 001133 815 | 001134 816 | 001135 817 | 001136 818 | 001137 819 | 001138 820 | 001140 821 | 001141 822 | 001142 823 | 001143 824 | 001144 825 | 001145 826 | 001148 827 | 001149 828 | 001150 829 | 001151 830 | 001152 831 | 001154 832 | 001156 833 | 001158 834 | 001159 835 | 001160 836 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt: -------------------------------------------------------------------------------- 1 | 000001 2 | 000002 3 | 000003 4 | 000004 5 | 000005 6 | 000006 7 | 000007 8 | 000008 9 | 000011 10 | 000016 11 | 000017 12 | 000018 13 | 000019 14 | 000020 15 | 000021 16 | 000022 17 | 000025 18 | 000026 19 | 000027 20 | 000030 21 | 000033 22 | 000035 23 | 000036 24 | 000037 25 | 000038 26 | 000039 27 | 000041 28 | 000042 29 | 000043 30 | 000044 31 | 000045 32 | 000046 33 | 000047 34 | 000048 35 | 000049 36 | 000050 37 | 000051 38 | 000052 39 | 000054 40 | 000055 41 | 000056 42 | 000057 43 | 000058 44 | 000060 45 | 000061 46 | 000062 47 | 000063 48 | 000064 49 | 000065 50 | 000066 51 | 000068 52 | 000069 53 | 000070 54 | 000071 55 | 000072 56 | 000073 57 | 000077 58 | 000078 59 | 000079 60 | 000080 61 | 000081 62 | 000082 63 | 000083 64 | 000084 65 | 000085 66 | 000086 67 | 000087 68 | 000088 69 | 000089 70 | 000091 71 | 000092 72 | 000093 73 | 000094 74 | 000095 75 | 000096 76 | 000097 77 | 000098 78 | 000099 79 | 000100 80 | 000102 81 | 000103 82 | 000104 83 | 000106 84 | 000108 85 | 000109 86 | 000110 87 | 000111 88 | 000112 89 | 000113 90 | 000114 91 | 000115 92 | 000117 93 | 000118 94 | 000119 95 | 000121 96 | 000122 97 | 000123 98 | 000124 99 | 000125 100 | 000126 101 | 000127 102 | 000128 103 | 000129 104 | 000130 105 | 000132 106 | 000134 107 | 000138 108 | 000140 109 | 000141 110 | 000142 111 | 000143 112 | 000144 113 | 000145 114 | 000146 115 | 000147 116 | 000148 117 | 000150 118 | 000151 119 | 000152 120 | 000153 121 | 000154 122 | 000155 123 | 000156 124 | 000158 125 | 000159 126 | 000160 127 | 000161 128 | 000162 129 | 000165 130 | 000166 131 | 000167 132 | 000168 133 | 000169 134 | 000170 135 | 000171 136 | 000172 137 | 000173 138 | 000174 139 | 000175 140 | 000176 141 | 000177 142 | 000178 143 | 000179 144 | 000180 145 | 000182 146 | 000184 147 | 000185 148 | 000186 149 | 000188 150 | 000189 151 | 000190 152 | 000191 153 | 000193 154 | 000194 155 | 000195 156 | 000196 157 | 000198 158 | 000199 159 | 000200 160 | 000201 161 | 000202 162 | 000204 163 | 000205 164 | 000206 165 | 000207 166 | 000208 167 | 000209 168 | 000210 169 | 000211 170 | 000212 171 | 000213 
172 | 000214 173 | 000215 174 | 000216 175 | 000217 176 | 000218 177 | 000221 178 | 000222 179 | 000223 180 | 000224 181 | 000225 182 | 000226 183 | 000227 184 | 000228 185 | 000232 186 | 000233 187 | 000234 188 | 000236 189 | 000238 190 | 000239 191 | 000240 192 | 000242 193 | 000243 194 | 000245 195 | 000246 196 | 000247 197 | 000249 198 | 000250 199 | 000251 200 | 000252 201 | 000253 202 | 000254 203 | 000257 204 | 000258 205 | 000259 206 | 000260 207 | 000261 208 | 000265 209 | 000267 210 | 000270 211 | 000271 212 | 000272 213 | 000273 214 | 000276 215 | 000277 216 | 000278 217 | 000279 218 | 000280 219 | 000281 220 | 000283 221 | 000284 222 | 000285 223 | 000286 224 | 000287 225 | 000288 226 | 000289 227 | 000292 228 | 000293 229 | 000294 230 | 000295 231 | 000296 232 | 000297 233 | 000298 234 | 000299 235 | 000300 236 | 000301 237 | 000302 238 | 000304 239 | 000305 240 | 000307 241 | 000308 242 | 000309 243 | 000310 244 | 000311 245 | 000312 246 | 000313 247 | 000314 248 | 000315 249 | 000316 250 | 000317 251 | 000318 252 | 000320 253 | 000321 254 | 000322 255 | 000323 256 | 000324 257 | 000325 258 | 000326 259 | 000327 260 | 000328 261 | 000329 262 | 000330 263 | 000331 264 | 000332 265 | 000333 266 | 000335 267 | 000336 268 | 000337 269 | 000338 270 | 000339 271 | 000341 272 | 000342 273 | 000343 274 | 000344 275 | 000345 276 | 000347 277 | 000348 278 | 000349 279 | 000350 280 | 000353 281 | 000354 282 | 000355 283 | 000356 284 | 000357 285 | 000358 286 | 000359 287 | 000361 288 | 000362 289 | 000363 290 | 000364 291 | 000365 292 | 000367 293 | 000368 294 | 000370 295 | 000371 296 | 000372 297 | 000373 298 | 000374 299 | 000375 300 | 000376 301 | 000377 302 | 000378 303 | 000379 304 | 000380 305 | 000381 306 | 000382 307 | 000383 308 | 000385 309 | 000386 310 | 000387 311 | 000388 312 | 000389 313 | 000391 314 | 000393 315 | 000394 316 | 000395 317 | 000396 318 | 000397 319 | 000399 320 | 000400 321 | 000401 322 | 000402 323 | 000403 324 | 000404 325 | 000405 326 | 000406 327 | 000407 328 | 000410 329 | 000411 330 | 000412 331 | 000413 332 | 000414 333 | 000415 334 | 000416 335 | 000417 336 | 000418 337 | 000419 338 | 000420 339 | 000421 340 | 000422 341 | 000424 342 | 000425 343 | 000426 344 | 000427 345 | 000428 346 | 000429 347 | 000432 348 | 000433 349 | 000434 350 | 000435 351 | 000436 352 | 000437 353 | 000438 354 | 000439 355 | 000441 356 | 000442 357 | 000443 358 | 000445 359 | 000446 360 | 000447 361 | 000448 362 | 000449 363 | 000451 364 | 000452 365 | 000453 366 | 000455 367 | 000456 368 | 000457 369 | 000458 370 | 000459 371 | 000460 372 | 000461 373 | 000463 374 | 000464 375 | 000465 376 | 000466 377 | 000467 378 | 000468 379 | 000469 380 | 000471 381 | 000473 382 | 000474 383 | 000475 384 | 000476 385 | 000477 386 | 000478 387 | 000479 388 | 000480 389 | 000481 390 | 000482 391 | 000483 392 | 000484 393 | 000485 394 | 000490 395 | 000491 396 | 000492 397 | 000493 398 | 000494 399 | 000495 400 | 000496 401 | 000497 402 | 000498 403 | 000500 404 | 000502 405 | 000503 406 | 000504 407 | 000508 408 | 000509 409 | 000510 410 | 000511 411 | 000512 412 | 000514 413 | 000515 414 | 000516 415 | 000517 416 | 000518 417 | 000519 418 | 000520 419 | 000521 420 | 000522 421 | 000523 422 | 000524 423 | 000525 424 | 000526 425 | 000527 426 | 000528 427 | 000529 428 | 000531 429 | 000532 430 | 000534 431 | 000535 432 | 000536 433 | 000537 434 | 000538 435 | 000539 436 | 000541 437 | 000543 438 | 000545 439 | 000546 440 | 000547 441 | 000548 442 | 000549 443 | 000550 444 | 000551 445 | 
000553 446 | 000554 447 | 000555 448 | 000556 449 | 000557 450 | 000558 451 | 000559 452 | 000560 453 | 000562 454 | 000563 455 | 000565 456 | 000566 457 | 000570 458 | 000572 459 | 000573 460 | 000575 461 | 000577 462 | 000578 463 | 000579 464 | 000580 465 | 000581 466 | 000582 467 | 000583 468 | 000585 469 | 000586 470 | 000587 471 | 000588 472 | 000589 473 | 000591 474 | 000592 475 | 000594 476 | 000595 477 | 000596 478 | 000597 479 | 000599 480 | 000600 481 | 000601 482 | 000603 483 | 000606 484 | 000607 485 | 000608 486 | 000609 487 | 000610 488 | 000611 489 | 000612 490 | 000613 491 | 000614 492 | 000615 493 | 000616 494 | 000617 495 | 000618 496 | 000621 497 | 000622 498 | 000623 499 | 000624 500 | 000625 501 | 000626 502 | 000627 503 | 000628 504 | 000629 505 | 000630 506 | 000632 507 | 000633 508 | 000634 509 | 000636 510 | 000637 511 | 000638 512 | 000639 513 | 000640 514 | 000641 515 | 000642 516 | 000643 517 | 000644 518 | 000645 519 | 000646 520 | 000647 521 | 000648 522 | 000650 523 | 000651 524 | 000652 525 | 000653 526 | 000654 527 | 000655 528 | 000656 529 | 000657 530 | 000658 531 | 000659 532 | 000660 533 | 000662 534 | 000663 535 | 000664 536 | 000665 537 | 000666 538 | 000667 539 | 000668 540 | 000669 541 | 000670 542 | 000671 543 | 000673 544 | 000674 545 | 000675 546 | 000677 547 | 000678 548 | 000679 549 | 000680 550 | 000681 551 | 000682 552 | 000683 553 | 000684 554 | 000685 555 | 000686 556 | 000687 557 | 000688 558 | 000689 559 | 000690 560 | 000691 561 | 000692 562 | 000693 563 | 000695 564 | 000696 565 | 000698 566 | 000699 567 | 000700 568 | 000701 569 | 000702 570 | 000703 571 | 000704 572 | 000705 573 | 000706 574 | 000707 575 | 000709 576 | 000710 577 | 000714 578 | 000715 579 | 000716 580 | 000718 581 | 000719 582 | 000720 583 | 000721 584 | 000722 585 | 000723 586 | 000724 587 | 000725 588 | 000726 589 | 000728 590 | 000730 591 | 000733 592 | 000734 593 | 000735 594 | 000736 595 | 000737 596 | 000738 597 | 000740 598 | 000741 599 | 000742 600 | 000744 601 | 000745 602 | 000747 603 | 000749 604 | 000750 605 | 000751 606 | 000752 607 | 000755 608 | 000757 609 | 000758 610 | 000759 611 | 000760 612 | 000761 613 | 000762 614 | 000763 615 | 000765 616 | 000766 617 | 000767 618 | 000768 619 | 000770 620 | 000771 621 | 000772 622 | 000773 623 | 000776 624 | 000777 625 | 000778 626 | 000779 627 | 000780 628 | 000781 629 | 000782 630 | 000783 631 | 000784 632 | 000785 633 | 000787 634 | 000788 635 | 000789 636 | 000790 637 | 000791 638 | 000792 639 | 000794 640 | 000795 641 | 000797 642 | 000798 643 | 000799 644 | 000800 645 | 000801 646 | 000803 647 | 000804 648 | 000805 649 | 000806 650 | 000807 651 | 000810 652 | 000811 653 | 000812 654 | 000813 655 | 000815 656 | 000816 657 | 000817 658 | 000818 659 | 000820 660 | 000822 661 | 000824 662 | 000825 663 | 000826 664 | 000827 665 | 000828 666 | 000829 667 | 000830 668 | 000831 669 | 000832 670 | 000833 671 | 000834 672 | 000835 673 | 000836 674 | 000837 675 | 000838 676 | 000839 677 | 000840 678 | 000841 679 | 000842 680 | 000843 681 | 000844 682 | 000845 683 | 000847 684 | 000849 685 | 000850 686 | 000851 687 | 000852 688 | 000853 689 | 000854 690 | 000855 691 | 000856 692 | 000857 693 | 000859 694 | 000860 695 | 000861 696 | 000862 697 | 000864 698 | 000865 699 | 000866 700 | 000868 701 | 000870 702 | 000871 703 | 000872 704 | 000873 705 | 000874 706 | 000876 707 | 000877 708 | 000878 709 | 000879 710 | 000881 711 | 000882 712 | 000885 713 | 000886 714 | 000887 715 | 000889 716 | 000890 717 | 000891 718 | 
000892 719 | 000895 720 | 000896 721 | 000899 722 | 000901 723 | 000902 724 | 000903 725 | 000904 726 | 000905 727 | 000906 728 | 000907 729 | 000908 730 | 000909 731 | 000911 732 | 000913 733 | 000914 734 | 000915 735 | 000916 736 | 000917 737 | 000923 738 | 000924 739 | 000925 740 | 000927 741 | 000928 742 | 000929 743 | 000930 744 | 000931 745 | 000932 746 | 000933 747 | 000934 748 | 000935 749 | 000936 750 | 000937 751 | 000938 752 | 000939 753 | 000940 754 | 000941 755 | 000942 756 | 000943 757 | 000944 758 | 000945 759 | 000948 760 | 000949 761 | 000950 762 | 000951 763 | 000952 764 | 000953 765 | 000955 766 | 000956 767 | 000957 768 | 000958 769 | 000959 770 | 000962 771 | 000963 772 | 000964 773 | 000965 774 | 000966 775 | 000968 776 | 000969 777 | 000970 778 | 000972 779 | 000973 780 | 000974 781 | 000975 782 | 000976 783 | 000979 784 | 000980 785 | 000981 786 | 000983 787 | 000985 788 | 000987 789 | 000990 790 | 000991 791 | 000992 792 | 000993 793 | 000994 794 | 000995 795 | 000998 796 | 000999 797 | 001000 798 | 001001 799 | 001002 800 | 001003 801 | 001004 802 | 001005 803 | 001008 804 | 001009 805 | 001010 806 | 001011 807 | 001012 808 | 001013 809 | 001014 810 | 001015 811 | 001016 812 | 001017 813 | 001018 814 | 001019 815 | 001021 816 | 001022 817 | 001023 818 | 001024 819 | 001025 820 | 001026 821 | 001027 822 | 001028 823 | 001030 824 | 001031 825 | 001032 826 | 001033 827 | 001034 828 | 001035 829 | 001037 830 | 001038 831 | 001039 832 | 001040 833 | 001041 834 | 001042 835 | 001045 836 | 001046 837 | 001048 838 | 001049 839 | 001050 840 | 001051 841 | 001052 842 | 001053 843 | 001054 844 | 001055 845 | 001056 846 | 001058 847 | 001060 848 | 001061 849 | 001063 850 | 001064 851 | 001065 852 | 001066 853 | 001067 854 | 001070 855 | 001071 856 | 001072 857 | 001073 858 | 001074 859 | 001077 860 | 001079 861 | 001081 862 | 001082 863 | 001083 864 | 001085 865 | 001086 866 | 001087 867 | 001088 868 | 001089 869 | 001090 870 | 001092 871 | 001093 872 | 001094 873 | 001095 874 | 001097 875 | 001098 876 | 001100 877 | 001101 878 | 001102 879 | 001103 880 | 001105 881 | 001106 882 | 001107 883 | 001108 884 | 001109 885 | 001110 886 | 001111 887 | 001112 888 | 001113 889 | 001115 890 | 001116 891 | 001117 892 | 001118 893 | 001119 894 | 001120 895 | 001121 896 | 001122 897 | 001123 898 | 001124 899 | 001125 900 | 001126 901 | 001128 902 | 001131 903 | 001132 904 | 001133 905 | 001134 906 | 001135 907 | 001136 908 | 001137 909 | 001138 910 | 001140 911 | 001141 912 | 001142 913 | 001143 914 | 001144 915 | 001145 916 | 001148 917 | 001149 918 | 001150 919 | 001151 920 | 001152 921 | 001153 922 | 001154 923 | 001155 924 | 001156 925 | 001157 926 | 001158 927 | 001159 928 | 001160 929 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/val.txt: -------------------------------------------------------------------------------- 1 | 000017 2 | 000022 3 | 000055 4 | 000068 5 | 000092 6 | 000098 7 | 000118 8 | 000119 9 | 000124 10 | 000141 11 | 000148 12 | 000155 13 | 000161 14 | 000211 15 | 000212 16 | 000214 17 | 000222 18 | 000224 19 | 000254 20 | 000259 21 | 000297 22 | 000300 23 | 000304 24 | 000305 25 | 000307 26 | 000308 27 | 000315 28 | 000330 29 | 000333 30 | 000375 31 | 000378 32 | 000382 33 | 000404 34 | 000425 35 | 000436 36 | 000437 37 | 000442 38 | 000461 39 | 000481 40 | 000491 41 | 000504 42 | 000508 43 | 000519 44 | 000524 45 | 000526 46 | 000581 47 | 000603 48 | 000629 49 | 000639 50 | 000640 51 | 
000644 52 | 000646 53 | 000655 54 | 000660 55 | 000662 56 | 000695 57 | 000716 58 | 000735 59 | 000737 60 | 000752 61 | 000771 62 | 000773 63 | 000807 64 | 000810 65 | 000812 66 | 000820 67 | 000827 68 | 000870 69 | 000878 70 | 000879 71 | 000882 72 | 000891 73 | 000906 74 | 000909 75 | 000911 76 | 000925 77 | 000928 78 | 000965 79 | 000968 80 | 000991 81 | 000999 82 | 001011 83 | 001023 84 | 001031 85 | 001045 86 | 001050 87 | 001054 88 | 001073 89 | 001100 90 | 001106 91 | 001153 92 | 001155 93 | 001157 94 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/annotations_cache/annots.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/data/VOCdevkitVOC2007/annotations_cache/annots.pkl -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_train_None.txt: -------------------------------------------------------------------------------- 1 | 000221 0.014 6.2 39.9 631.2 464.1 2 | 000223 0.014 6.0 28.5 555.4 374.2 3 | 000236 0.012 4.9 31.2 472.5 370.8 4 | 000238 0.012 3.9 29.1 471.7 363.1 5 | 000744 0.013 -222.6 234.6 258.1 341.3 6 | 001112 0.012 417.1 -44.6 629.0 65.2 7 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_trainval_None.txt: -------------------------------------------------------------------------------- 1 | 000221 0.014 6.2 39.9 631.2 464.1 2 | 000223 0.014 6.0 28.5 555.4 374.2 3 | 000224 0.040 109.3 24.8 385.1 322.4 4 | 000224 0.017 268.0 195.9 583.6 433.1 5 | 000236 0.012 4.9 31.2 472.5 370.8 6 | 000238 0.012 3.9 29.1 471.7 363.1 7 | 000744 0.013 -222.6 234.6 258.1 341.3 8 | 001112 0.012 417.1 -44.6 629.0 65.2 9 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_val_None.txt: -------------------------------------------------------------------------------- 1 | 000224 0.040 109.3 24.8 385.1 322.4 2 | 000224 0.017 268.0 195.9 583.6 433.1 3 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 2 | 3 | #from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 4 | from .config import * 5 | import torch 6 | import cv2 7 | import numpy as np 8 | 9 | def detection_collate(batch): 10 | """Custom collate fn for dealing with batches of images that have a different 11 | number of associated object annotations (bounding boxes). 
12 | 13 | Arguments: 14 | batch: (tuple) A tuple of tensor images and lists of annotations 15 | 16 | Return: 17 | A tuple containing: 18 | 1) (tensor) batch of images stacked on their 0 dim 19 | 2) (list of tensors) annotations for a given image are stacked on 20 | 0 dim 21 | """ 22 | targets = [] 23 | imgs = [] 24 | for sample in batch: 25 | imgs.append(sample[0]) 26 | targets.append(torch.FloatTensor(sample[1])) 27 | return torch.stack(imgs, 0), targets 28 | 29 | 30 | def base_transform(image, size, mean): 31 | x = cv2.resize(image, (size, size)).astype(np.float32) 32 | x -= mean 33 | x = x.astype(np.float32) 34 | return x 35 | 36 | 37 | class BaseTransform: 38 | def __init__(self, size, mean): 39 | self.size = size 40 | self.mean = np.array(mean, dtype=np.float32) 41 | 42 | def __call__(self, image, boxes=None, labels=None): 43 | return base_transform(image, self.size, self.mean), boxes, labels 44 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | HOME = os.path.expanduser("~") 6 | 7 | # for making bounding boxes pretty 8 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 9 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 10 | 11 | MEANS = (104, 117, 123) 12 | 13 | # SSD300 CONFIGS 14 | voc = { 15 | 'num_classes': 3, # [set this to your own number of classes + 1 for background] originally there was only the single 'ship' class, so this used to be 2 16 | 'lr_steps': (60000, 90000, 120000), # ## originally (80000, 100000, 120000); modified 17 | 'max_iter': 120000, # [set this to your own number of training iterations] 18 | 'feature_maps': [38, 19, 10, 5, 3, 1], 19 | 'min_dim': 300, 20 | 'steps': [8, 16, 32, 64, 100, 300], 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 'max_sizes': [60, 111, 162, 213, 264, 315], 23 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 24 | 'variance': [0.1, 0.2], 25 | 'clip': True, 26 | 'name': 'VOC', 27 | } 28 | 29 | # coco = { 30 | # 'num_classes': 91, 31 | # 'lr_steps': (280000, 360000, 400000), 32 | # 'max_iter': 400000, 33 | # 'feature_maps': [38, 19, 10, 5, 3, 1], 34 | # 'min_dim': 300, 35 | # 'steps': [8, 16, 32, 64, 100, 300], 36 | # 'min_sizes': [21, 45, 99, 153, 207, 261], 37 | # 'max_sizes': [45, 99, 153, 207, 261, 315], 38 | # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 39 | # 'variance': [0.1, 0.2], 40 | # 'clip': True, 41 | # 'name': 'COCO', 42 | # } 43 | 
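A quick cross-check of the prior-box settings above: the per-layer counts implied by 'feature_maps' and 'aspect_ratios' should sum to the standard SSD300 total of 8732 priors. A minimal standalone sketch (names mirror the config keys; see layers/functions/prior_box.py for the real generator):

# Each feature-map cell gets 2 priors at aspect ratio 1 (scales s_k and
# sqrt(s_k * s_{k+1})) plus 2 priors per extra aspect ratio (ar and 1/ar).
feature_maps = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
total = sum(f * f * (2 + 2 * len(ars)) for f, ars in zip(feature_maps, aspect_ratios))
print(total)  # 8732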
-------------------------------------------------------------------------------- /data/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/data/example.jpg -------------------------------------------------------------------------------- /data/scripts/COCO2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2014.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2014.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations ] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2014.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2014.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2014.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2014.zip 60 | rm ../images/val2014.zip 61 | rm ./annotations_trainval2014.zip 62 | 63 | echo "Creating trainval35k dataset..." 64 | 65 | # Download annotations json 66 | echo "Downloading trainval35k annotations from S3" 67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip 68 | 69 | # combine train and val 70 | echo "Combining train and val images" 71 | mkdir ../images/trainval35k 72 | cd ../images/train2014 73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp 74 | cd ../val2014 75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + 76 | 77 | 78 | end=`date +%s` 79 | runtime=$((end-start)) 80 | 81 | echo "Completed in " $runtime " seconds" 82 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds"
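Both VOC download scripts take an optional target directory and fall back to ~/data when none is given. A hedged usage note (the path is a placeholder):

# sh data/scripts/VOC2007.sh /path/to/data   # downloads the trainval + test tars, extracts them, then removes the tars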
-------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME, MEANS, voc, COLORS # ################################ 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | # VOC_CLASSES = ( # always index 0 21 | # 'aeroplane', 'bicycle', 'bird', 'boat', 22 | # 'bottle', 'bus', 'car', 'cat', 'chair', 23 | # 'cow', 'diningtable', 'dog', 'horse', 24 | # 'motorbike', 'person', 'pottedplant', 25 | # 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # ##VOC_CLASSES = ('None', 'ship')  # writing just one class name here causes a serious problem 28 | # ##VOC_CLASSES = ('ship')  # this fails badly: a single-element tuple needs a trailing comma after 'ship' 29 | VOC_CLASSES=('face', 30 | 'face_mask') # ##************************************************************** 31 | # note: if you used our download scripts, this should be right 32 | VOC_ROOT = "/data/lp/project/ssd.pytorch/data/VOCdevkit/" # personally I think you should set this path yourself 33 | # VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") # ### removed HOME ### I think this should point to your own data location [this is the default read path for the data] 34 | 35 | 36 | class VOCAnnotationTransform(object): 37 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 38 | Initialized with a dictionary lookup of classnames to indexes 39 | 40 | Arguments: 41 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 42 | (default: alphabetic indexing of VOC's 20 classes) 43 | keep_difficult (bool, optional): keep difficult instances or not 44 | (default: False) 45 | height (int): height 46 | width (int): width 47 | """ 48 | 49 | def __init__(self, class_to_ind=None, keep_difficult=False): 50 | self.class_to_ind = class_to_ind or dict( # class_to_ind maps class names to indices, e.g. {'aeroplane': 0, 'bicycle': 1} 51 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 52 | self.keep_difficult = keep_difficult 53 | 54 | def __call__(self, target, width, height): 55 | """ 56 | Arguments: 57 | target (annotation) : the target annotation to be made usable 58 | will be an ET.Element 59 | Returns: 60 | a list containing lists of bounding boxes [bbox coords, class name] 61 | """ 62 | res = [] 63 | for obj in target.iter('object'): 64 | difficult = int(obj.find('difficult').text) == 1 65 | if not self.keep_difficult and difficult: 66 | continue 67 | name = obj.find('name').text.lower().strip() # ## strip() returns a new string with leading/trailing characters removed # strips surrounding whitespace 68 | bbox = obj.find('bndbox') 69 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 70 | bndbox = [] 71 | for i, pt in enumerate(pts): 72 | cur_pt = int(bbox.find(pt).text) - 1 73 | # scale height or width 74 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height # relative position of the object within the image, in [0, 1] 75 | bndbox.append(cur_pt) 76 | label_idx = self.class_to_ind[name] 77 | bndbox.append(label_idx) 78 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 79 | # img_id = target.find('filename').text[:-4] 80 | 81 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ]
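# A hedged worked example of the transform above (numbers invented): a 'face'
# object with bndbox (50, 80, 150, 280) in a 600x400 image becomes
#   [(50-1)/600, (80-1)/400, (150-1)/600, (280-1)/400, class_to_ind['face']]
#   -> [[0.0817, 0.1975, 0.2483, 0.6975, 0]]
# i.e. x-coordinates are scaled by the width and y-coordinates by the height.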
82 | 83 | 84 | class VOCDetection(data.Dataset): 85 | 86 | """VOC Detection Dataset Object 87 | 88 | input is image, target is annotation 89 | 90 | Arguments: 91 | root (string): filepath to VOCdevkit folder. 92 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 93 | transform (callable, optional): transformation to perform on the 94 | input image 95 | target_transform (callable, optional): transformation to perform on the 96 | target `annotation` 97 | (eg: take in caption string, return tensor of word indices) 98 | dataset_name (string, optional): which dataset to load 99 | (default: 'VOC2007') 100 | """ 101 | 102 | def __init__(self, 103 | root, # root directory of the VOCdevkit folder 104 | image_sets=[('2007', 'trainval')], # datasets to use, given as string pairs, e.g. ('2012', 'trainval') 105 | transform=None, # image preprocessing method 106 | target_transform=VOCAnnotationTransform(), # annotation preprocessing method 107 | dataset_name='VOC0712'): # name of the dataset 108 | self.root = root # root of the VOCdevkit folder 109 | self.image_set = image_sets # which image sets to use 110 | self.transform = transform # image transform 111 | self.target_transform = target_transform # annotation transform 112 | self.name = dataset_name # dataset name 113 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') # annotation location, with two '%s' slots left unfilled 114 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') # image location, with two '%s' slots left unfilled 115 | self.ids = list() # names of all images in the dataset, without file extensions 116 | for (year, name) in image_sets: # [image_sets] holds entries like ('2007', 'trainval') naming the sets to load 117 | # read the set's image names; together with _annopath and _imgpath they locate each image and its annotation file 118 | rootpath = osp.join(self.root, 'VOC' + year) # ...../VOC2007 119 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 120 | # ...../VOC2007/ImageSets/Main/(test val train).txt 121 | self.ids.append((rootpath, line.strip())) 122 | # open the set's txt file and read it line by line, stripping surrounding whitespace 123 | # [ids holds one entry per image: a tuple of (rootpath, image name without .jpg)] ids is a list of such tuples 124 | 125 | def __getitem__(self, index): 126 | im, gt, h, w = self.pull_item(index) 127 | 128 | return im, gt 129 | 130 | def __len__(self): 131 | return len(self.ids) 132 | 133 | def pull_item(self, index): 134 | img_id = self.ids[index] # ('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007', '000001') 135 | 136 | target = ET.parse(self._annopath % img_id).getroot() # fill the two '%s' slots of self._annopath with img_id to get the xml object to parse 137 | img = cv2.imread(self._imgpath % img_id) # fill the two '%s' slots of self._imgpath with img_id to load the corresponding image 138 | height, width, channels = img.shape # image height, width and channel count 139 | 140 | if self.target_transform is not None: # preprocess the annotation 141 | target = self.target_transform(target, width, height) # returns [xmin, ymin, xmax, ymax, label] 142 | if self.transform is not None: # the default transform for the test set is None 143 | target = np.array(target) # the transform below is hard to follow! 144 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) # input image, positions (4 values), labels (1 value) 145 | # to rgb 146 | img = img[:, :, (2, 1, 0)] # opencv reads images as BGR; this converts them to RGB 147 | # img = img.transpose(2, 0, 1) 148 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) # reshape labels from (x,) to (x, 1), then stack back into the original layout 149 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width # permute moves channels first, the layout pytorch expects 150 | # channels-first keeps the subsequent torch training code uniform. 151 | # ###############################################################################################################################################
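# A hedged sketch of how this dataset is usually consumed together with
# detection_collate from data/__init__.py (batch size and root are placeholders):
#
#   dataset = VOCDetection(VOC_ROOT, [('2007', 'trainval')],
#                          transform=BaseTransform(300, MEANS))
#   loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True,
#                                        collate_fn=detection_collate)
#   images, targets = next(iter(loader))  # images: [4, 3, 300, 300]; targets: list of 4 tensors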
152 | 153 | def pull_image(self, index): 154 | """Returns the original image object at index in PIL form 155 | 156 | Note: not using self.__getitem__(), as any transformations passed in 157 | could mess up this functionality. 158 | 159 | Argument: 160 | index (int): index of img to show 161 | Return: 162 | PIL img 163 | """ 164 | img_id = self.ids[index] 165 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) # read the image directly from its path 166 | 167 | def pull_anno(self, index): 168 | """Returns the original annotation of image at index 169 | 170 | Note: not using self.__getitem__(), as any transformations passed in 171 | could mess up this functionality. 172 | 173 | Argument: 174 | index (int): index of img to get annotation of 175 | Return: 176 | list: [img_id, [(label, bbox coords),...]] 177 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 178 | """ 179 | img_id = self.ids[index] 180 | anno = ET.parse(self._annopath % img_id).getroot() 181 | gt = self.target_transform(anno, 1, 1) # ## the trailing 1, 1 fill the width and height arguments of VOCAnnotationTransform, so the coordinates are left un-scaled (divided by 1) 182 | return img_id[1], gt # returns the image name (without extension) and the ground truth [[xmin, ymin, xmax, ymax, label_ind], ... ] 183 | 
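# Hedged example of pull_anno output (annotation values invented): with
# width=height=1 the coordinates stay in absolute pixels, e.g.
#   ('000017', [[47.0, 79.0, 371.0, 299.0, 1]])   # one 'face_mask' instance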
184 | def pull_tensor(self, index): 185 | """Returns the original image at an index in tensor form 186 | 187 | Note: not using self.__getitem__(), as any transformations passed in 188 | could mess up this functionality. 189 | 190 | Argument: 191 | index (int): index of img to show 192 | Return: 193 | tensorized version of img, squeezed 194 | """ 195 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 196 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/demo/__init__.py -------------------------------------------------------------------------------- /demo/live.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | from torch.autograd import Variable 4 | import cv2 5 | import time 6 | from imutils.video import FPS, WebcamVideoStream 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 10 | parser.add_argument('--weights', default='weights/ssd_300_VOC0712.pth', 11 | type=str, help='Trained state_dict file path') 12 | parser.add_argument('--cuda', default=False, type=bool, 13 | help='Use cuda in live demo') 14 | args = parser.parse_args() 15 | 16 | COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 17 | FONT = cv2.FONT_HERSHEY_SIMPLEX 18 | 19 | 20 | def cv2_demo(net, transform): 21 | def predict(frame): 22 | height, width = frame.shape[:2] 23 | x = torch.from_numpy(transform(frame)[0]).permute(2, 0, 1) 24 | x = Variable(x.unsqueeze(0)) 25 | y = net(x) # forward pass 26 | detections = y.data 27 | # scale each detection back up to the image 28 | scale = torch.Tensor([width, height, width, height]) 29 | for i in range(detections.size(1)): 30 | j = 0 31 | while detections[0, i, j, 0] >= 0.6: 32 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 33 | cv2.rectangle(frame, 34 | (int(pt[0]), int(pt[1])), 35 | (int(pt[2]), int(pt[3])), 36 | COLORS[i % 3], 2) 37 | cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), 38 | FONT, 2, (255, 255, 255), 2, cv2.LINE_AA) 39 | j += 1 40 | return frame 41 | 42 | # start video stream thread, allow buffer to fill 43 | print("[INFO] starting threaded video stream...") 44 | stream = WebcamVideoStream(src=0).start() # default camera 45 | time.sleep(1.0) 46 | # start fps timer 47 | # loop over frames from the video file stream 48 | while True: 49 | # grab next frame 50 | frame = stream.read() 51 | key = cv2.waitKey(1) & 0xFF 52 | 53 | # update FPS counter 54 | fps.update() 55 | frame = predict(frame) 56 | 57 | # keybindings for display 58 | if key == ord('p'): # pause 59 | while True: 60 | key2 = cv2.waitKey(1) or 0xff 61 | cv2.imshow('frame', frame) 62 | if key2 == ord('p'): # resume 63 | break 64 | cv2.imshow('frame', frame) 65 | if key == 27: # exit 66 | break 67 | 68 | 69 | if __name__ == '__main__': 70 | import sys 71 | from os import path 72 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 73 | 74 | from data import BaseTransform, VOC_CLASSES as labelmap 75 | from ssd import build_ssd 76 | 77 | net = build_ssd('test', 300, 21) # initialize SSD 78 | net.load_state_dict(torch.load(args.weights)) 79 | transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0)) 80 | 81 | fps = FPS().start() 82 | cv2_demo(net.eval(), transform) 83 | # stop the timer and display FPS information 84 | fps.stop() 85 | 86 | print("[INFO] elapsed time: {:.2f}".format(fps.elapsed())) 87 | print("[INFO] approx.
FPS: {:.2f}".format(fps.fps())) 88 | 89 | # cleanup 90 | cv2.destroyAllWindows() 91 | stream.stop() 92 | -------------------------------------------------------------------------------- /doc/SSD.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/SSD.jpg -------------------------------------------------------------------------------- /doc/detection_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_example.png -------------------------------------------------------------------------------- /doc/detection_example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_example2.png -------------------------------------------------------------------------------- /doc/detection_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_examples.png -------------------------------------------------------------------------------- /doc/ssd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/ssd.png -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """Adapted from: 4 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 5 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 6 | Licensed under The MIT License [see LICENSE for details] 7 | """ 8 | 9 | from __future__ import print_function 10 | import torch 11 | import torch.nn as nn 12 | import torch.backends.cudnn as cudnn 13 | from torch.autograd import Variable 14 | from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 15 | from data import VOC_CLASSES as labelmap 16 | import torch.utils.data as data 17 | 18 | from ssd import build_ssd 19 | 20 | import sys 21 | import os 22 | import time 23 | import argparse 24 | import numpy as np 25 | import pickle 26 | import cv2 27 | 28 | if sys.version_info[0] == 2: 29 | import xml.etree.cElementTree as ET 30 | else: 31 | import xml.etree.ElementTree as ET 32 | 33 | 34 | def str2bool(v): 35 | return v.lower() in ("yes", "true", "t", "1") 36 | 37 | 38 | parser = argparse.ArgumentParser( 39 | description='Single Shot MultiBox Detector Evaluation') 40 | parser.add_argument('--trained_model', 41 | default='weights/ssd300_VOC_10000.pth', type=str, 42 | help='Trained state_dict file path to open') 43 | parser.add_argument('--save_folder', default='eval/', type=str, 44 | help='File path to save results') 45 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 46 | help='Detection confidence threshold') 47 | parser.add_argument('--top_k', default=5, type=int, 48 | help='Further restrict the number of predictions to parse') 49 | parser.add_argument('--cuda', default=True, type=str2bool, 50 | help='Use cuda to train model') 51 | parser.add_argument('--voc_root', 
default=VOC_ROOT, 52 | help='Location of VOC root directory') 53 | parser.add_argument('--cleanup', default=True, type=str2bool, 54 | help='Cleanup and remove results files following eval') 55 | 56 | args = parser.parse_args() 57 | 58 | if not os.path.exists(args.save_folder): 59 | os.mkdir(args.save_folder) 60 | 61 | if torch.cuda.is_available(): 62 | if args.cuda: 63 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 64 | if not args.cuda: 65 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 66 | CUDA. Run with --cuda for optimal eval speed.") 67 | torch.set_default_tensor_type('torch.FloatTensor') 68 | else: 69 | torch.set_default_tensor_type('torch.FloatTensor') 70 | 71 | annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml') 72 | imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg') 73 | imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 74 | # 'Main', '{:s}.txt') 75 | 'Main', '{}.txt') 76 | YEAR = '2007' 77 | devkit_path = args.voc_root + 'VOC' + YEAR 78 | dataset_mean = (104, 117, 123) 79 | set_type = 'test' 80 | 81 | 82 | class Timer(object): 83 | """A simple timer.""" 84 | def __init__(self): 85 | self.total_time = 0. 86 | self.calls = 0 87 | self.start_time = 0. 88 | self.diff = 0. 89 | self.average_time = 0. 90 | 91 | def tic(self): 92 | # using time.time instead of time.clock because time.clock 93 | # does not normalize for multithreading 94 | self.start_time = time.time() 95 | 96 | def toc(self, average=True): 97 | self.diff = time.time() - self.start_time 98 | self.total_time += self.diff 99 | self.calls += 1 100 | self.average_time = self.total_time / self.calls 101 | if average: 102 | return self.average_time 103 | else: 104 | return self.diff 105 | 106 | 107 | def parse_rec(filename): 108 | """ Parse a PASCAL VOC xml file """ 109 | tree = ET.parse(filename) 110 | objects = [] 111 | for obj in tree.findall('object'): 112 | obj_struct = {} 113 | obj_struct['name'] = obj.find('name').text 114 | # for some reason the fields below could not be read, so default values are assigned directly 115 | # obj_struct['pose'] = obj.find('pose').text 116 | # obj_struct['truncated'] = int(obj.find('truncated').text) 117 | # obj_struct['difficult'] = int(obj.find('difficult').text) 118 | obj_struct['pose'] = 'Unspecified' 119 | obj_struct['truncated'] = 0 120 | obj_struct['difficult'] = 0 121 | bbox = obj.find('bndbox') 122 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 123 | int(bbox.find('ymin').text) - 1, 124 | int(bbox.find('xmax').text) - 1, 125 | int(bbox.find('ymax').text) - 1] 126 | objects.append(obj_struct) 127 | 128 | return objects 129 | 130 | 131 | def get_output_dir(name, phase): 132 | """Return the directory where experimental artifacts are placed. 133 | If the directory does not exist, it is created. 134 | A canonical path is built using the name from an imdb and a network 135 | (if not None).
136 | """ 137 | filedir = os.path.join(name, phase) 138 | if not os.path.exists(filedir): 139 | os.makedirs(filedir) 140 | return filedir 141 | 142 | 143 | def get_voc_results_file_template(image_set, cls): 144 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 145 | filename = 'det_' + image_set + '_%s.txt' % (cls) 146 | filedir = os.path.join(devkit_path, 'results') 147 | if not os.path.exists(filedir): 148 | os.makedirs(filedir) 149 | path = os.path.join(filedir, filename) 150 | return path 151 | 152 | 153 | def write_voc_results_file(all_boxes, dataset): 154 | for cls_ind, cls in enumerate(labelmap): 155 | print('Writing {:s} VOC results file'.format(cls)) 156 | filename = get_voc_results_file_template(set_type, cls) 157 | with open(filename, 'wt') as f: 158 | for im_ind, index in enumerate(dataset.ids): 159 | dets = all_boxes[cls_ind+1][im_ind] 160 | if dets == []: 161 | continue 162 | # the VOCdevkit expects 1-based indices 163 | for k in range(dets.shape[0]): 164 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 165 | format(index[1], dets[k, -1], 166 | dets[k, 0] + 1, dets[k, 1] + 1, 167 | dets[k, 2] + 1, dets[k, 3] + 1)) 168 | 169 | 170 | def do_python_eval(output_dir='output', use_07=True): 171 | cachedir = os.path.join(devkit_path, 'annotations_cache') 172 | aps = [] 173 | # The PASCAL VOC metric changed in 2010 174 | use_07_metric = use_07 175 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 176 | if not os.path.isdir(output_dir): 177 | os.mkdir(output_dir) 178 | for i, cls in enumerate(labelmap): 179 | filename = get_voc_results_file_template(set_type, cls) 180 | rec, prec, ap = voc_eval( 181 | filename, annopath, imgsetpath.format(set_type), cls, cachedir, 182 | ovthresh=0.5, use_07_metric=use_07_metric) 183 | aps += [ap] 184 | print('AP for {} = {:.4f}'.format(cls, ap)) 185 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 186 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 187 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 188 | print('~~~~~~~~') 189 | print('Results:') 190 | for ap in aps: 191 | print('{:.3f}'.format(ap)) 192 | print('{:.3f}'.format(np.mean(aps))) 193 | print('~~~~~~~~') 194 | print('') 195 | print('--------------------------------------------------------------') 196 | print('Results computed with the **unofficial** Python eval code.') 197 | print('Results should be very close to the official MATLAB eval code.') 198 | print('--------------------------------------------------------------') 199 | 200 | 201 | def voc_ap(rec, prec, use_07_metric=True): 202 | """ ap = voc_ap(rec, prec, [use_07_metric]) 203 | Compute VOC AP given precision and recall. 204 | If use_07_metric is true, uses the 205 | VOC 07 11 point method (default:True). 206 | """ 207 | if use_07_metric: 208 | # 11 point metric 209 | ap = 0. 210 | for t in np.arange(0., 1.1, 0.1): 211 | if np.sum(rec >= t) == 0: 212 | p = 0 213 | else: 214 | p = np.max(prec[rec >= t]) 215 | ap = ap + p / 11. 
216 | else: 217 | # correct AP calculation 218 | # first append sentinel values at the end 219 | mrec = np.concatenate(([0.], rec, [1.])) 220 | mpre = np.concatenate(([0.], prec, [0.])) 221 | 222 | # compute the precision envelope 223 | for i in range(mpre.size - 1, 0, -1): 224 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 225 | 226 | # to calculate area under PR curve, look for points 227 | # where X axis (recall) changes value 228 | i = np.where(mrec[1:] != mrec[:-1])[0] 229 | 230 | # and sum (\Delta recall) * prec 231 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 232 | return ap 233 | 234 | 235 | def voc_eval(detpath, 236 | annopath, 237 | imagesetfile, 238 | classname, 239 | cachedir, 240 | ovthresh=0.5, 241 | use_07_metric=True): 242 | """rec, prec, ap = voc_eval(detpath, 243 | annopath, 244 | imagesetfile, 245 | classname, 246 | [ovthresh], 247 | [use_07_metric]) 248 | Top level function that does the PASCAL VOC evaluation. 249 | detpath: Path to detections 250 | detpath.format(classname) should produce the detection results file. 251 | annopath: Path to annotations 252 | annopath.format(imagename) should be the xml annotations file. 253 | imagesetfile: Text file containing the list of images, one image per line. 254 | classname: Category name (duh) 255 | cachedir: Directory for caching the annotations 256 | [ovthresh]: Overlap threshold (default = 0.5) 257 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 258 | (default True) 259 | """ 260 | # assumes detections are in detpath.format(classname) 261 | # assumes annotations are in annopath.format(imagename) 262 | # assumes imagesetfile is a text file with each line an image name 263 | # cachedir caches the annotations in a pickle file 264 | # first load gt 265 | if not os.path.isdir(cachedir): 266 | os.mkdir(cachedir) 267 | cachefile = os.path.join(cachedir, 'annots.pkl') 268 | # read list of images 269 | with open(imagesetfile, 'r') as f: 270 | lines = f.readlines() 271 | imagenames = [x.strip() for x in lines] 272 | if not os.path.isfile(cachefile): 273 | # load annots 274 | recs = {} 275 | for i, imagename in enumerate(imagenames): 276 | recs[imagename] = parse_rec(annopath % (imagename)) 277 | if i % 100 == 0: 278 | print('Reading annotation for {:d}/{:d}'.format( 279 | i + 1, len(imagenames))) 280 | # save 281 | print('Saving cached annotations to {:s}'.format(cachefile)) 282 | with open(cachefile, 'wb') as f: 283 | pickle.dump(recs, f) 284 | else: 285 | # load 286 | with open(cachefile, 'rb') as f: 287 | recs = pickle.load(f) 288 | 289 | # extract gt objects for this class 290 | class_recs = {} 291 | npos = 0 292 | for imagename in imagenames: 293 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 294 | bbox = np.array([x['bbox'] for x in R]) 295 | # difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 296 | difficult = np.array([None for x in R]).astype(np.bool) # the difficult field could not be used here for some reason, so it is replaced with an all-False list 297 | det = [False] * len(R) 298 | npos = npos + sum(~difficult) 299 | class_recs[imagename] = {'bbox': bbox, 300 | 'difficult': difficult, 301 | 'det': det} 302 | 303 | # read dets 304 | detfile = detpath.format(classname) 305 | with open(detfile, 'r') as f: 306 | lines = f.readlines() 307 | if any(lines): 308 | 309 | splitlines = [x.strip().split(' ') for x in lines] 310 | image_ids = [x[0] for x in splitlines] 311 | confidence = np.array([float(x[1]) for x in splitlines]) 312 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 313 | 314 | # sort by
confidence 315 | sorted_ind = np.argsort(-confidence) 316 | sorted_scores = np.sort(-confidence) 317 | BB = BB[sorted_ind, :] 318 | image_ids = [image_ids[x] for x in sorted_ind] 319 | 320 | # go down dets and mark TPs and FPs 321 | nd = len(image_ids) 322 | tp = np.zeros(nd) 323 | fp = np.zeros(nd) 324 | for d in range(nd): 325 | R = class_recs[image_ids[d]] 326 | bb = BB[d, :].astype(float) 327 | ovmax = -np.inf 328 | BBGT = R['bbox'].astype(float) 329 | if BBGT.size > 0: 330 | # compute overlaps 331 | # intersection 332 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 333 | iymin = np.maximum(BBGT[:, 1], bb[1]) 334 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 335 | iymax = np.minimum(BBGT[:, 3], bb[3]) 336 | iw = np.maximum(ixmax - ixmin, 0.) 337 | ih = np.maximum(iymax - iymin, 0.) 338 | inters = iw * ih 339 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 340 | (BBGT[:, 2] - BBGT[:, 0]) * 341 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 342 | overlaps = inters / uni 343 | ovmax = np.max(overlaps) 344 | jmax = np.argmax(overlaps) 345 | 346 | if ovmax > ovthresh: 347 | if not R['difficult'][jmax]: 348 | if not R['det'][jmax]: 349 | tp[d] = 1. 350 | R['det'][jmax] = 1 351 | else: 352 | fp[d] = 1. 353 | else: 354 | fp[d] = 1. 355 | 356 | # compute precision recall 357 | fp = np.cumsum(fp) 358 | tp = np.cumsum(tp) 359 | rec = tp / float(npos) 360 | # avoid divide by zero in case the first detection matches a difficult 361 | # ground truth 362 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 363 | ap = voc_ap(rec, prec, use_07_metric) 364 | else: 365 | rec = -1. 366 | prec = -1. 367 | ap = -1. 368 | 369 | return rec, prec, ap 370 | 371 | 372 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 373 | im_size=300, thresh=0.05): 374 | num_images = len(dataset) 375 | # all detections are collected into: 376 | # all_boxes[cls][image] = N x 5 array of detections in 377 | # (x1, y1, x2, y2, score) 378 | all_boxes = [[[] for _ in range(num_images)] 379 | for _ in range(len(labelmap)+1)] 380 | 381 | # timers 382 | _t = {'im_detect': Timer(), 'misc': Timer()} 383 | output_dir = get_output_dir('ssd300_120000', set_type) 384 | det_file = os.path.join(output_dir, 'detections.pkl') 385 | 386 | for i in range(num_images): 387 | im, gt, h, w = dataset.pull_item(i) 388 | 389 | x = Variable(im.unsqueeze(0)) 390 | if args.cuda: 391 | x = x.cuda() 392 | _t['im_detect'].tic() 393 | detections = net(x).data 394 | detect_time = _t['im_detect'].toc(average=False) 395 | 396 | # skip j = 0, because it's the background class 397 | for j in range(1, detections.size(1)): 398 | dets = detections[0, j, :] 399 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 400 | dets = torch.masked_select(dets, mask).view(-1, 5) 401 | if dets.size(0) == 0: 402 | continue 403 | boxes = dets[:, 1:] 404 | boxes[:, 0] *= w 405 | boxes[:, 2] *= w 406 | boxes[:, 1] *= h 407 | boxes[:, 3] *= h 408 | scores = dets[:, 0].cpu().numpy() 409 | cls_dets = np.hstack((boxes.cpu().numpy(), 410 | scores[:, np.newaxis])).astype(np.float32, 411 | copy=False) 412 | all_boxes[j][i] = cls_dets 413 | 414 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 415 | num_images, detect_time)) 416 | 417 | with open(det_file, 'wb') as f: 418 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 419 | 420 | print('Evaluating detections') 421 | evaluate_detections(all_boxes, output_dir, dataset) 422 | 423 | 424 | def evaluate_detections(box_list, output_dir, dataset): 425 | write_voc_results_file(box_list, dataset) 426 | do_python_eval(output_dir) 427 | 428 | 429 
| if __name__ == '__main__': 430 | # load net 431 | num_classes = len(labelmap) + 1 # +1 for background 432 | net = build_ssd('test', 300, num_classes) # initialize SSD 433 | net.load_state_dict(torch.load(args.trained_model)) 434 | net.eval() 435 | print('Finished loading model!') 436 | # load data 437 | dataset = VOCDetection(args.voc_root, [('2007', set_type)], 438 | BaseTransform(300, dataset_mean), 439 | VOCAnnotationTransform()) 440 | if args.cuda: 441 | net = net.cuda() 442 | cudnn.benchmark = True 443 | # evaluation 444 | test_net(args.save_folder, net, args.cuda, dataset, 445 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 446 | thresh=args.confidence_threshold) 447 | -------------------------------------------------------------------------------- /focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | 7 | class FocalLoss(nn.Module): 8 | r""" 9 | This criterion is an implementation of Focal Loss, which is proposed in 10 | Focal Loss for Dense Object Detection. 11 | 12 | Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) 13 | 14 | The losses are averaged across observations for each minibatch. 15 | 16 | Args: 17 | alpha(1D Tensor, Variable) : the scalar factor for this criterion 18 | gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), 19 | putting more focus on hard, misclassified examples 20 | size_average(bool): By default, the losses are averaged over observations for each minibatch. 21 | However, if the field size_average is set to False, the losses are 22 | instead summed for each minibatch. 23 | 24 | 25 | """ 26 | def __init__(self, alpha, gamma=2, class_num=5, size_average=False): 27 | super(FocalLoss, self).__init__() 28 | if alpha is None: 29 | self.alpha = Variable(torch.ones(class_num, 1)) 30 | else: 31 | if isinstance(alpha, Variable): 32 | self.alpha = alpha 33 | else: 34 | self.alpha = Variable(alpha) 35 | 36 | self.gamma = gamma 37 | 38 | # self.class_num = class_num 39 | self.size_average = size_average 40 | 41 | def forward(self, inputs, targets): 42 | N = inputs.size(0) # batch_size 43 | C = inputs.size(1) # channels 44 | P = F.softmax(inputs, dim=1) 45 | 46 | class_mask = inputs.data.new(N, C).fill_(0) 47 | class_mask = Variable(class_mask) 48 | ids = targets.view(-1, 1) 49 | class_mask.scatter_(1, ids.data, 1.) 50 | # print(class_mask) 51 | 52 | if inputs.is_cuda and not self.alpha.is_cuda: 53 | self.alpha = self.alpha.cuda() 54 | alpha = self.alpha[ids.data.view(-1)] 55 | 56 | probs = (P*class_mask).sum(1).view(-1, 1) 57 | 58 | log_p = probs.log() 59 | # print('probs size= {}'.format(probs.size())) 60 | # print(probs) 61 | 62 | batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 63 | 64 | # print('-----batch_loss------') 65 | # print(batch_loss) 66 | 67 | if self.size_average: 68 | loss = batch_loss.mean() 69 | else: 70 | loss = batch_loss.sum() 71 | return loss 72 | 
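A hedged usage sketch for the criterion above (shapes are placeholders; alpha=None falls back to uniform class weights per the constructor):

import torch
from focal_loss import FocalLoss

criterion = FocalLoss(alpha=None, gamma=2, class_num=5, size_average=True)
logits = torch.randn(8, 5)               # raw class scores for 8 samples
targets = torch.randint(0, 5, (8,))      # ground-truth class indices
loss = criterion(logits, targets)        # mean focal loss over the batch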
-------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | # incoming boxes are in center-size form: (cx, cy, w, h) 14 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 15 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 16 | 17 | 18 | def center_size(boxes): 19 | """ Convert prior_boxes to (cx, cy, w, h) 20 | representation for comparison to center-size form ground truth data. 21 | Args: 22 | boxes: (tensor) point_form boxes 23 | Return: 24 | boxes: (tensor) Converted cx, cy, w, h form of boxes. 25 | """ 26 | return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 27 | boxes[:, 2:] - boxes[:, :2]), 1) # w, h 28 | 29 | 30 | def intersect(box_a, box_b): 31 | """ We resize both tensors to [A,B,2] without new malloc: 32 | [A,2] -> [A,1,2] -> [A,B,2] 33 | [B,2] -> [1,B,2] -> [A,B,2] 34 | Then we compute the area of intersect between box_a and box_b. 35 | Args: 36 | box_a: (tensor) bounding boxes, Shape: [A,4]. 37 | box_b: (tensor) bounding boxes, Shape: [B,4]. 38 | Return: 39 | (tensor) intersection area, Shape: [A,B]. 40 | """ 41 | A = box_a.size(0) # number of rows in box_a [the ground-truth boxes], a small number 42 | B = box_b.size(0) # number of rows in box_b [all generated prior boxes], 8732 43 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 44 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 45 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 46 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 47 | inter = torch.clamp((max_xy - min_xy), min=0) 48 | return inter[:, :, 0] * inter[:, :, 1] # intersection areas of [A: ground-truth boxes] x [B: prior boxes], as an [A rows, B columns (8732)] grid 49 | 50 | 
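# A hedged round-trip check for the two conversions above (values invented):
#   b = torch.tensor([[0.5, 0.5, 0.2, 0.4]])     # (cx, cy, w, h)
#   point_form(b)                                 # tensor([[0.4, 0.3, 0.6, 0.7]])
#   center_size(point_form(b))                    # recovers b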
51 | def jaccard(box_a, box_b): 52 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 53 | is simply the intersection over union of two boxes. Here we operate on 54 | ground truth boxes and default boxes. 55 | E.g.: 56 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 57 | Args: 58 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] = [number of objects, 4] 59 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] = [number of priors (8732), 4] 60 | Return: 61 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 62 | """ 63 | inter = intersect(box_a, box_b) # intersection areas as an [A rows: objects, B columns: 8732 priors] grid 64 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 65 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 66 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 67 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 68 | union = area_a + area_b - inter 69 | return inter / union # [A,B] [IoU between each object and every prior] 70 | 
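# Hedged example (values invented): one ground truth against two priors in
# point form gives a [1, 2] IoU matrix:
#   jaccard(torch.tensor([[0.0, 0.0, 0.5, 0.5]]),
#           torch.tensor([[0.0, 0.0, 0.5, 0.5], [0.25, 0.25, 0.75, 0.75]]))
#   -> tensor([[1.0000, 0.1429]])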
89 | """ 90 | # jaccard index 91 | overlaps = jaccard( #输入为真实的ground_truth的框truths 和 生成的所有预测框priors 92 | truths, 93 | point_form(priors) #priors【生成框记录的中点信息和宽高信息】point_form函数将其转化为【左上角,右下角信息】 94 | )#返回overlaps目标和所有prior的交并比 【A行:目标数,B列:prior数8732】的相交区域网格形式 95 | # (Bipartite Matching) 96 | # [1,num_objects] best prior for each ground truth 97 | # 【返回每一行 ground_truth对应的最高的prior的IOU,返回该IOU对应的index】 98 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) #返回的均为列向量【行:目标数,列:1】 99 | # [1,num_priors] best ground truth for each prior 100 | # 【返回每一个prior对应的ground_truth的最大IOU,返回对应的IUO的index】 101 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) #返回的均为行向量【行:1,列:priors数目8732】 102 | #下面四行全部压缩成向量 103 | best_truth_idx.squeeze_(0) #将第一个维度数为1的删除 变成 torch.Size([8732]) 注意squeeze后面有'_'这代表有在本身上进行修改 104 | best_truth_overlap.squeeze_(0) #将第一个维度数为1的删除 变成 torch.Size([8732]) 105 | best_prior_idx.squeeze_(1) #将第二个维度为1的删除,变成 torch.Size([目标数]) #里面包含的是ground_truth对应的最大的prior_box的index,该index的范围是【0-8731】 106 | best_prior_overlap.squeeze_(1) #将第二个维度为1的删除,变成 torch.Size([目标数]) 107 | #下面这行说明,每个ground_truth对应的prior一定是positive正样本,不论其IOU大小是否与阈值的大小 108 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior##在0的这个维度上,在best_prior_idx这些位置,填充为2 109 | #输出的best_truth_overlap仍然是8732长度的向量,每个GT匹配的最好的prior_box的面积为‘2’ 110 | # TODO refactor: index best_prior_idx with long tensor 111 | # ensure every gt matches with its prior of max overlap 112 | for j in range(best_prior_idx.size(0)): #目标数的循环 113 | best_truth_idx[best_prior_idx[j]] = j # 给每个prior标记上对应的最好的ground truth的 标签 输出8732向量 114 | matches = truths[best_truth_idx] # Shape: [num_priors(8732),4] 将每个GT的标签复制到8732份数(重复) 115 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors(8732)] +1是因为0作为背景 将每个GT的标签重复复制8732份数 116 | '''通过上面两行代码,8732个prior_box,每个都对应一个GT,若没有重合的区域,那么就对应最后个GT''' 117 | conf[best_truth_overlap < threshold] = 0 # # label 0 as background 118 | loc = encode(matches, priors, variances) # # [g_cxcy, g_wh] 输入【ground_truth信息 先验框信息】 输出【编码后的位置信息】【8732,4】 119 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 【idx是batch里面的每一张图片】 120 | conf_t[idx] = conf # [num_priors] top class label for each prior 121 | 122 | 123 | def encode(matched, priors, variances): 124 | """Encode the variances from the priorbox layers into the ground truth boxes 125 | we have matched (based on jaccard overlap) with the prior boxes. 126 | Args: 127 | matched: (tensor) Coords of ground truth for each prior in point-form 128 | Shape: [num_priors, 4]. 129 | priors: (tensor) Prior boxes in center-offset form 130 | Shape: [num_priors,4]. 131 | variances: (list[float]) Variances of priorboxes 132 | Return: 133 | encoded boxes (tensor), Shape: [num_priors, 4] 134 | """ 135 | 136 | # dist b/t match center and prior's center 137 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 138 | # encode variance 139 | g_cxcy /= (variances[0] * priors[:, 2:]) 140 | # match wh / prior wh 141 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 142 | g_wh = torch.log(g_wh) / variances[1] 143 | # return target for smooth_l1_loss 144 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 145 | 146 | 147 | # Adapted from https://github.com/Hakuyume/chainer-ssd 148 | def decode(loc, priors, variances): 149 | """Decode locations from predictions using priors to undo 150 | the encoding we did for offset regression at train time. 
151 | Args: 152 | loc (tensor): location predictions for loc layers, 153 | Shape: [num_priors,4] 154 | priors (tensor): Prior boxes in center-offset form. 155 | Shape: [num_priors,4]. 156 | variances: (list[float]) Variances of priorboxes 157 | Return: 158 | decoded bounding box predictions 159 | """ 160 | 161 | boxes = torch.cat(( 162 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 163 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 164 | boxes[:, :2] -= boxes[:, 2:] / 2 165 | boxes[:, 2:] += boxes[:, :2] 166 | return boxes 167 | 168 | 169 | def log_sum_exp(x): 170 | """Utility function for computing log_sum_exp while determining 171 | This will be used to determine unaveraged confidence loss across 172 | all examples in a batch. 173 | Args: 174 | x (Variable(tensor)): conf_preds from conf layers 175 | """ 176 | x_max = x.data.max() 177 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 178 | 179 | 180 | # Original author: Francisco Massa: 181 | # https://github.com/fmassa/object-detection.torch 182 | # Ported to PyTorch by Max deGroot (02/01/2017) 183 | def nms(boxes, scores, overlap=0.5, top_k=200): 184 | """Apply non-maximum suppression at test time to avoid detecting too many 185 | overlapping bounding boxes for a given object. 186 | Args: 187 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 188 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 189 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 190 | top_k: (int) The Maximum number of box preds to consider. 191 | Return: 192 | The indices of the kept boxes with respect to num_priors. 193 | """ 194 | 195 | keep = scores.new(scores.size(0)).zero_().long() 196 | if boxes.numel() == 0: 197 | return keep 198 | x1 = boxes[:, 0] 199 | y1 = boxes[:, 1] 200 | x2 = boxes[:, 2] 201 | y2 = boxes[:, 3] 202 | area = torch.mul(x2 - x1, y2 - y1) 203 | v, idx = scores.sort(0) # sort in ascending order 204 | # I = I[v >= 0.01] 205 | idx = idx[-top_k:] # indices of the top-k largest vals 206 | xx1 = boxes.new() 207 | yy1 = boxes.new() 208 | xx2 = boxes.new() 209 | yy2 = boxes.new() 210 | w = boxes.new() 211 | h = boxes.new() 212 | 213 | # keep = torch.Tensor() 214 | count = 0 215 | while idx.numel() > 0: 216 | i = idx[-1] # index of current largest val 217 | # keep.append(i) 218 | keep[count] = i 219 | count += 1 220 | if idx.size(0) == 1: 221 | break 222 | idx = idx[:-1] # remove kept element from view 223 | # load bboxes of next highest vals 224 | torch.index_select(x1, 0, idx, out=xx1) 225 | torch.index_select(y1, 0, idx, out=yy1) 226 | torch.index_select(x2, 0, idx, out=xx2) 227 | torch.index_select(y2, 0, idx, out=yy2) 228 | # store element-wise max with next highest score 229 | xx1 = torch.clamp(xx1, min=x1[i]) 230 | yy1 = torch.clamp(yy1, min=y1[i]) 231 | xx2 = torch.clamp(xx2, max=x2[i]) 232 | yy2 = torch.clamp(yy2, max=y2[i]) 233 | w.resize_as_(xx2) 234 | h.resize_as_(yy2) 235 | w = xx2 - xx1 236 | h = yy2 - yy1 237 | # check sizes of xx1 and xx2.. 
after each iteration 238 | w = torch.clamp(w, min=0.0) 239 | h = torch.clamp(h, min=0.0) 240 | inter = w*h 241 | # IoU = i / (area(a) + area(b) - i) 242 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 243 | union = (rem_areas - inter) + area[i] 244 | IoU = inter/union # store result in iou 245 | # keep only elements with an IoU <= overlap 246 | idx = idx[IoU.le(overlap)] 247 | return keep, count 248 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from ..box_utils import decode, nms 4 | from data import voc as cfg 5 | 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 14 | self.num_classes = num_classes 15 | self.background_label = bkg_label 16 | self.top_k = top_k 17 | # Parameters used in nms. 18 | self.nms_thresh = nms_thresh 19 | if nms_thresh <= 0: 20 | raise ValueError('nms_threshold must be non negative.') 21 | self.conf_thresh = conf_thresh 22 | self.variance = cfg['variance'] 23 | 24 | def forward(self, loc_data, conf_data, prior_data): 25 | """ 26 | Args: 27 | loc_data: (tensor) Loc preds from loc layers 28 | Shape: [batch,num_priors*4] 29 | conf_data: (tensor) Shape: Conf preds from conf layers 30 | Shape: [batch*num_priors,num_classes] 31 | prior_data: (tensor) Prior boxes and variances from priorbox layers 32 | Shape: [1,num_priors,4] 33 | """ 34 | num = loc_data.size(0) # batch size 35 | num_priors = prior_data.size(0) 36 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 37 | conf_preds = conf_data.view(num, num_priors, 38 | self.num_classes).transpose(2, 1) 39 | 40 | # Decode predictions into bboxes. 
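# A hedged walk-through of the decode step below (values invented): a prior at
# (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.2) with predicted offsets (0, 0, 0, 0)
# decodes to itself in point form, (xmin, ymin, xmax, ymax) = (0.4, 0.4, 0.6, 0.6);
# nonzero offsets move the center by loc[:2] * 0.1 * (w, h) and rescale the
# size by exp(loc[2:] * 0.2), inverting encode() in layers/box_utils.py.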
41 | for i in range(num): 42 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 43 | # For each class, perform nms 44 | conf_scores = conf_preds[i].clone() 45 | 46 | for cl in range(1, self.num_classes): 47 | c_mask = conf_scores[cl].gt(self.conf_thresh) 48 | scores = conf_scores[cl][c_mask] 49 | if scores.size(0) == 0: 50 | continue 51 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 52 | boxes = decoded_boxes[l_mask].view(-1, 4) 53 | # idx of highest scoring and non-overlapping boxes per class 54 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 55 | output[i, cl, :count] = \ 56 | torch.cat((scores[ids[:count]].unsqueeze(1), 57 | boxes[ids[:count]]), 1) 58 | flt = output.contiguous().view(num, -1, 5) 59 | _, idx = flt[:, :, 0].sort(1, descending=True) 60 | _, rank = idx.sort(1) 61 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 62 | return output 63 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | import torch 5 | 6 | 7 | class PriorBox(object): 8 | """Compute priorbox coordinates in center-offset form for each source 9 | feature map. 10 | """ 11 | """cfg= voc = { 12 | 'num_classes': 2, # [set to your own number of classes] 13 | 'lr_steps': (80000, 100000, 120000), 14 | 'max_iter': 120000, # [set to your own number of iterations] 15 | 'feature_maps': [38, 19, 10, 5, 3, 1], 16 | 'min_dim': 300, 17 | 'steps': [8, 16, 32, 64, 100, 300], 18 | 'min_sizes': [30, 60, 111, 162, 213, 264], 19 | 'max_sizes': [60, 111, 162, 213, 264, 315], 20 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 21 | 'variance': [0.1, 0.2], 22 | 'clip': True, 23 | 'name': 'VOC',} 24 | """ 25 | def __init__(self, cfg): 26 | super(PriorBox, self).__init__() 27 | self.image_size = cfg['min_dim'] 28 | # number of priors for feature map location (either 4 or 6) 29 | self.num_priors = len(cfg['aspect_ratios']) # 6 30 | self.variance = cfg['variance'] or [0.1] 31 | self.feature_maps = cfg['feature_maps'] 32 | self.min_sizes = cfg['min_sizes'] 33 | self.max_sizes = cfg['max_sizes'] 34 | self.steps = cfg['steps'] 35 | self.aspect_ratios = cfg['aspect_ratios'] 36 | self.clip = cfg['clip'] 37 | self.version = cfg['name'] 38 | for v in self.variance: 39 | if v <= 0: 40 | raise ValueError('Variances must be greater than 0') 41 | 42 | def forward(self): 43 | mean = [] 44 | for k, f in enumerate(self.feature_maps): # size of each feature layer: 'feature_maps': [38, 19, 10, 5, 3, 1], 45 | for i, j in product(range(f), repeat=2): # generate the planar grid coordinates 46 | f_k = self.image_size / self.steps[k] # 300/[8, 16, 32, 64, 100, 300] f_k=[37.5, 18.75, 9.375, 4.6875, 3, 1] 47 | # unit center x,y 48 | cx = (j + 0.5) / f_k 49 | cy = (i + 0.5) / f_k 50 | 51 | # aspect_ratio: 1 52 | # rel size: min_size 53 | s_k = self.min_sizes[k]/self.image_size # 'min_sizes': [30, 60, 111, 162, 213, 264]/300=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88]=s_k 54 | mean += [cx, cy, s_k, s_k] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) # 'max_sizes': [60, 111, 162, 213, 264, 315]/300=[0.2 0.37 0.54 0.71 0.88 1.05] 59 | mean += [cx, cy, s_k_prime, s_k_prime] 60 | 61 | # rest of aspect ratios 62 | for ar in self.aspect_ratios[k]: # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 63 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 64 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 65 | # back to torch land 66 | output = torch.Tensor(mean).view(-1, 4) 67 | if self.clip: 68 | output.clamp_(max=1, min=0) # clamp out-of-range center coordinates and prior widths/heights to [0, 1] 69 | return output # output holds the positions and sizes of all generated anchor boxes 70 | 
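A hedged usage sketch (the 8732 total assumes the SSD300 settings in data/config.py):

from data import voc
from layers import PriorBox

priors = PriorBox(voc).forward()   # tensor [8732, 4] in (cx, cy, w, h), clipped to [0, 1]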
--------------------------------------------------------------------------------
/layers/functions/prior_box.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from math import sqrt as sqrt
3 | from itertools import product as product
4 | import torch
5 | 
6 | 
7 | class PriorBox(object):
8 |     """Compute priorbox coordinates in center-offset form for each source
9 |     feature map.
10 |     """
11 |     """cfg = voc = {
12 |         'num_classes': 2,  # [set to your own number of classes]
13 |         'lr_steps': (80000, 100000, 120000),
14 |         'max_iter': 120000,  # [set to your own iteration budget]
15 |         'feature_maps': [38, 19, 10, 5, 3, 1],
16 |         'min_dim': 300,
17 |         'steps': [8, 16, 32, 64, 100, 300],
18 |         'min_sizes': [30, 60, 111, 162, 213, 264],
19 |         'max_sizes': [60, 111, 162, 213, 264, 315],
20 |         'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
21 |         'variance': [0.1, 0.2],
22 |         'clip': True,
23 |         'name': 'VOC',}
24 |     """
25 |     def __init__(self, cfg):
26 |         super(PriorBox, self).__init__()
27 |         self.image_size = cfg['min_dim']
28 |         # number of priors for feature map location (either 4 or 6)
29 |         self.num_priors = len(cfg['aspect_ratios'])  # 6
30 |         self.variance = cfg['variance'] or [0.1]
31 |         self.feature_maps = cfg['feature_maps']
32 |         self.min_sizes = cfg['min_sizes']
33 |         self.max_sizes = cfg['max_sizes']
34 |         self.steps = cfg['steps']
35 |         self.aspect_ratios = cfg['aspect_ratios']
36 |         self.clip = cfg['clip']
37 |         self.version = cfg['name']
38 |         for v in self.variance:
39 |             if v <= 0:
40 |                 raise ValueError('Variances must be greater than 0')
41 | 
42 |     def forward(self):
43 |         mean = []
44 |         for k, f in enumerate(self.feature_maps):  # per-layer grid sizes: 'feature_maps': [38, 19, 10, 5, 3, 1]
45 |             for i, j in product(range(f), repeat=2):  # every (row, col) cell of the k-th grid
46 |                 f_k = self.image_size / self.steps[k]  # 300 / step: f_k = [37.5, 18.75, 9.375, 4.6875, 3, 1]
47 |                 # unit center x,y
48 |                 cx = (j + 0.5) / f_k
49 |                 cy = (i + 0.5) / f_k
50 | 
51 |                 # aspect_ratio: 1
52 |                 # rel size: min_size
53 |                 s_k = self.min_sizes[k] / self.image_size  # 'min_sizes' / 300 = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88] = s_k
54 |                 mean += [cx, cy, s_k, s_k]
55 | 
56 |                 # aspect_ratio: 1
57 |                 # rel size: sqrt(s_k * s_(k+1))
58 |                 s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size))  # 'max_sizes' / 300 = [0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
59 |                 mean += [cx, cy, s_k_prime, s_k_prime]
60 | 
61 |                 # rest of aspect ratios: 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
62 |                 for ar in self.aspect_ratios[k]:
63 |                     mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)]
64 |                     mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)]
65 |         # back to torch land
66 |         output = torch.Tensor(mean).view(-1, 4)
67 |         if self.clip:
68 |             output.clamp_(max=1, min=0)  # clamp prior centers and widths/heights into [0, 1]
69 |         return output  # output holds the (cx, cy, w, h) of every generated prior/anchor
70 | 
--------------------------------------------------------------------------------
/layers/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .l2norm import L2Norm
2 | from .multibox_loss import MultiBoxLoss
3 | 
4 | __all__ = ['L2Norm', 'MultiBoxLoss']
5 | 
--------------------------------------------------------------------------------
/layers/modules/l2norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Function
4 | from torch.autograd import Variable
5 | import torch.nn.init as init
6 | 
7 | class L2Norm(nn.Module):
8 |     def __init__(self, n_channels, scale):
9 |         super(L2Norm, self).__init__()
10 |         self.n_channels = n_channels
11 |         self.gamma = scale or None
12 |         self.eps = 1e-10
13 |         self.weight = nn.Parameter(torch.Tensor(self.n_channels))
14 |         self.reset_parameters()
15 | 
16 |     def reset_parameters(self):
17 |         init.constant_(self.weight, self.gamma)
18 | 
19 |     def forward(self, x):
20 |         norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
21 |         # L2-normalize along the channel dim, then rescale by the learned per-channel weight
22 |         x = torch.div(x, norm)
23 |         out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
24 |         return out
25 | 
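Each grid cell therefore gets 2 square priors plus 2 per extra aspect ratio, i.e. [4, 6, 6, 6, 4, 4] boxes across the six maps. A quick standalone sanity check that this reproduces the 8732 priors quoted throughout the repo:

feature_maps = [38, 19, 10, 5, 3, 1]
boxes_per_cell = [2 + 2 * len(ars) for ars in [[2], [2, 3], [2, 3], [2, 3], [2], [2]]]
print(boxes_per_cell)                                                # [4, 6, 6, 6, 4, 4]
print(sum(f * f * b for f, b in zip(feature_maps, boxes_per_cell)))  # 8732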
--------------------------------------------------------------------------------
/layers/modules/multibox_loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | from data import config as cfg  # ## config switched from the original coco to voc0712
7 | from ..box_utils import match, log_sum_exp
8 | import focal_loss  # ## FocalLoss is used in place of the plain cross-entropy term
9 | 
10 | 
11 | class MultiBoxLoss(nn.Module):
12 |     """SSD Weighted Loss Function
13 |     Compute Targets:
14 |         1) Produce Confidence Target Indices by matching ground truth boxes
15 |            with (default) 'priorboxes' that have jaccard index > threshold parameter
16 |            (default threshold: 0.5).
17 |            ### a prior whose IoU with a ground-truth box exceeds 0.5 is treated as positive
18 |         2) Produce localization target by 'encoding' variance into offsets of ground
19 |            truth boxes and their matched 'priorboxes'.
20 |            ### the encoding step
21 |         3) Hard negative mining to filter the excessive number of negative examples
22 |            that comes with using a large number of default bounding boxes.
23 |            (default negative:positive ratio 3:1)
24 |            ### hard negative mining keeps positives:negatives at 1:3
25 |     Objective Loss:
26 |         L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
27 |         ### two terms: classification cross-entropy and Smooth L1 localization loss
28 |         Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
29 |         weighted by α which is set to 1 by cross val.
30 |         Args:
31 |             c: class confidences (predicted class scores)
32 |             l: predicted boxes (predicted box regressions)
33 |             g: ground truth boxes
34 |             N: number of matched default boxes
35 |         See: https://arxiv.org/pdf/1512.02325.pdf for more details.
36 |     """
37 |     # Args: num_classes, IoU threshold, use priors for matching (bool), background label, hard negative mining (bool), negative:positive ratio, IoU ceiling for a hard negative, encode targets (bool), use GPU (default True)
38 |     def __init__(self, num_classes, overlap_thresh, prior_for_matching,
39 |                  bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, use_gpu=True):
40 |         super(MultiBoxLoss, self).__init__()
41 |         self.use_gpu = use_gpu
42 |         self.num_classes = num_classes
43 |         self.threshold = overlap_thresh  # ## IoU matching threshold
44 |         self.background_label = bkg_label  # ## background label value (appears unused below)
45 |         self.encode_target = encode_target  # ## also appears unused below
46 |         self.use_prior_for_matching = prior_for_matching  # ## match against the prior boxes (bool)
47 |         self.do_neg_mining = neg_mining  # ## hard negative mining (bool)
48 |         self.negpos_ratio = neg_pos  # ## negative:positive ratio
49 |         self.neg_overlap = neg_overlap  # ## IoU ceiling for a hard negative
50 |         self.variance = cfg.voc['variance']
51 | 
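The "encoding" named in step 2 turns a matched ground-truth box into the offsets the localization head regresses: center deltas scaled by variance[0] and the prior size, log size ratios scaled by variance[1]. A standalone sketch with boxes already in (cx, cy, w, h) center form (the repo's encode() in box_utils first converts matched corner-form boxes to center form; values here are made up):

import torch

variances = [0.1, 0.2]
prior = torch.tensor([0.50, 0.50, 0.20, 0.20])   # (cx, cy, w, h)
gt    = torch.tensor([0.55, 0.48, 0.25, 0.15])   # matched ground truth, center form

g_cxcy = (gt[:2] - prior[:2]) / (variances[0] * prior[2:])  # scaled center offsets
g_wh = torch.log(gt[2:] / prior[2:]) / variances[1]         # scaled log size ratios
print(torch.cat([g_cxcy, g_wh]))  # the 4-vector stored in loc_t for this prior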
52 |     # ## forward also performs the [hard negative mining]
53 |     # ## arg 1: the net output, predictions = (loc, conf, priors)
54 |     # ## arg 2: targets, the ground-truth boxes and labels
55 |     def forward(self, predictions, targets):
56 |         """Multibox Loss
57 |         Args:
58 |             predictions (tuple): A tuple containing loc preds, conf preds,
59 |                 and prior boxes from SSD net.
60 |                 conf shape: torch.size(batch_size, num_priors, num_classes)
61 |                 loc shape: torch.size(batch_size, num_priors, 4)
62 |                 priors shape: torch.size(num_priors, 4)
63 | 
64 |             targets (tensor): Ground truth boxes and labels for a batch,
65 |                 shape: [batch_size, num_objs, 5] (last idx is the label).
66 |         """
67 |         loc_data, conf_data, priors = predictions  # predicted offsets, predicted class scores, all prior boxes
68 |         num = loc_data.size(0)  # batch size
69 |         priors = priors[:loc_data.size(1), :]  # all priors, [8732, 4] (the slice is effectively a no-op)
70 |         num_priors = (priors.size(0))  # 8732 anchors
71 |         num_classes = self.num_classes
72 | 
73 |         # match priors (default boxes) and ground truth boxes
74 |         # ## loc_t and conf_t start out uninitialized and are filled in by match()
75 |         loc_t = torch.Tensor(num, num_priors, 4)  # [batch_size, 8732, 4]: encoded box target per prior
76 |         # records the class of every default box; class 0 means negative/background
77 |         conf_t = torch.LongTensor(num, num_priors)  # [batch_size, 8732]: class target per prior
78 |         for idx in range(num):  # loop over the images in the batch
79 |             # targets are 5-dim per object: 4 coordinates, then the label
80 |             truths = targets[idx][:, :-1].data  # ground-truth box coordinates
81 |             labels = targets[idx][:, -1].data  # ground-truth class labels
82 |             defaults = priors.data  # [8732, 4]; priors are the same for every image in the batch
83 | 
84 |             # [MATCH] args: threshold, ground truth, priors, variances, labels, loc buffer, conf buffer, batch index
85 |             match(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx)
86 |             # match() assigns the best prior to each ground truth and the best ground truth to each prior
87 |             # the encoded offsets ([g_cx, g_cy, g_w, g_h]) go to loc_t, the top class label per prior to conf_t
88 |             # i.e. match() fills loc_t and conf_t in place (encoded locations and class targets)
89 |             # loc_t:  [batch_size, 8732, 4]
90 |             # conf_t: [batch_size, 8732]
91 |         if self.use_gpu:  # move the encoded targets to the GPU
92 |             loc_t = loc_t.cuda()  # [batch_size, 8732, 4]: per-image encoded box targets
93 |             conf_t = conf_t.cuda()  # [batch_size, 8732]
94 |         # wrap targets
95 |         loc_t = Variable(loc_t, requires_grad=False)  # [batch_size, 8732, 4] encoded offsets to learn
96 |         conf_t = Variable(conf_t, requires_grad=False)
97 |         # [batch_size, 8732] top class label for each prior; conf_t holds the targets
98 | 
99 |         pos = conf_t > 0  # bool [batch_size, 8732]: True where a prior matched an object rather than background
100 |         num_pos = pos.sum(dim=1, keepdim=True)  # number of positive priors per image
101 | 
102 |         # Localization Loss (Smooth L1)
103 |         # Shape: [batch, num_priors, 4]
104 |         # loc_loss only considers positives; loc_data is the predicted tensor
105 |         # ## pos_idx is bool [batch_size, 8732, 4]: True at object priors, False at background
106 |         pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
107 |         # append a trailing dim to pos, then expand the bool mask [batch_size, 8732] -> [batch_size, 8732, 4]
108 |         loc_p = loc_data[pos_idx].view(-1, 4)  # ## predicted offsets at positive priors (p = positive): [num_positives, 4]
109 |         loc_t = loc_t[pos_idx].view(-1, 4)  # ## the GT-encoded offsets at the same priors
110 |         # Smooth L1 between the network's predicted offsets and the encoded ground truth
111 |         '''
112 |         [loss_l] is the localization loss
113 |         '''
114 |         loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)  # ## size_average is deprecated; newer PyTorch prefers reduction='sum'
115 |         # ############################################################################################################################################
116 |         # ############################################################################################################################################
117 |         '''[Hard Negative Mining]'''
118 |         # conf_data: torch.size(batch_size, num_priors, num_classes)
119 |         batch_conf = conf_data.view(-1, self.num_classes)  # [batch_size*8732 rows, num_classes cols]: every prior in the batch
120 |         # [per-prior confidence loss, following the paper]
121 |         # ## conf_t.view(-1, 1): [batch_size*8732, 1], the matched class target per prior
122 |         # ## batch_conf: [batch_size*8732, num_classes], per-class scores per prior
123 |         loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))  # ## per-prior cross-entropy: logsumexp(scores) - score[target] (see the check below)
124 |         # loss_c: torch.Size([batch_size*8732, 1])
125 | 
126 |         # [Hard Negative Mining]
127 |         # loss_c[pos.view(-1, 1)] = 0  ### replaced by the reordered pair below
128 |         loss_c = loss_c.view(num, -1)  # ## reshaped first (swapped with the line below): [batch_size, 8732]
129 |         loss_c[pos] = 0  # ## zero the positives' loss; only background priors keep a nonzero loss (pos is bool [batch_size, 8732])
130 |         _, loss_idx = loss_c.sort(1, descending=True)  # _ holds each row sorted descending; loss_idx the original index of each sorted element
131 |         _, idx_rank = loss_idx.sort(1)  # ## idx_rank: [batch_size, 8732]
132 |         # ## first sort: indices in descending-loss order; the second sort turns them into each prior's rank in that order (the same double-argsort trick shown after detection.py)
133 |         # ## summary: positives come from IoU matching against the defaults; negatives are the highest-loss background priors
134 |         # ## cast pos from bool to long first; num_pos: [batch_size, 1], the count of object priors per image
135 |         num_pos = pos.long().sum(1, keepdim=True)
136 |         # ## max=pos.size(1)-1 caps the negatives at the number of priors per image
137 |         # ## negpos_ratio*num_pos: three negatives for every positive
138 |         num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)  # num_neg: torch.Size([batch_size, 1])
139 |         # ## num_pos and num_neg are both [batch_size, 1]: per-image positive and negative counts at a 1:3 ratio
140 | 
141 |         # ## neg (bool) [batch_size, 8732]: True for the num_neg highest-loss background priors per image
142 |         neg = idx_rank < num_neg.expand_as(idx_rank)
143 |         # the confidence loss covers both positives and the mined negatives
144 |         # pos and neg are bool, so pos_idx and neg_idx are bool as well
145 |         # ## pos_idx and neg_idx: [batch_size, 8732, num_classes]
146 |         pos_idx = pos.unsqueeze(2).expand_as(conf_data)
147 |         neg_idx = neg.unsqueeze(2).expand_as(conf_data)
148 | 
149 |         # ## conf_p: [selected priors, num_classes]
150 |         # ## conf_p covers positives and mined negatives alike
151 |         conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
152 |         # ## the net's per-class predictions at the selected priors
153 |         targets_weighted = conf_t[(pos+neg).gt(0)]  # ## the matching ground-truth labels
154 |         '''
155 |         [loss_c] is the classification loss
156 |         '''
157 |         # ## arg 1: conf_p, the net's per-class predictions at the selected priors
158 |         # ## arg 2: targets_weighted, the target labels (long)
159 |         # ## FocalLoss targets the classification term: (1) positive/negative imbalance, (2) easy and hard examples contribute unequally to the loss
160 |         # ## -------------------------------------------------------------------------------------------------
161 |         compute_c_loss = focal_loss.FocalLoss(alpha=None, gamma=2, class_num=num_classes, size_average=False)
162 |         loss_c = compute_c_loss(conf_p, targets_weighted)
163 |         # ## original loss below; keep it commented out while FocalLoss is in use
164 |         # loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
165 |         # ## Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
166 |         # ## -------------------------------------------------------------------------------------------------
167 | 
168 |         N = num_pos.data.sum()  # ## N: total number of matched (positive) priors in the batch
169 |         N = N.double()
170 |         loss_l = loss_l.double()  # cast both loss terms to double to match N
171 |         loss_c = loss_c.double()
172 |         loss_l /= N
173 |         loss_c /= N
174 |         return loss_l, loss_c
175 | 
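The mining criterion on line 123 looks opaque, but it is just the per-prior cross-entropy written out: -log softmax(x)[target] = logsumexp(x) - x[target]. A standalone check with made-up scores (torch.logsumexp stands in for the repo's numerically-stabilized log_sum_exp helper):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch_conf = torch.randn(6, 3)             # 6 priors, 3 classes (illustrative)
conf_t = torch.tensor([0, 2, 1, 0, 1, 2])  # matched class per prior

loss_c = torch.logsumexp(batch_conf, 1, keepdim=True) - batch_conf.gather(1, conf_t.view(-1, 1))
ref = F.cross_entropy(batch_conf, conf_t, reduction='none').unsqueeze(1)
print(torch.allclose(loss_c, ref))         # True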
--------------------------------------------------------------------------------
/loc-txt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [
8 |     {
9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "train and val size 616\n",
13 |       "train size 554\n"
14 |      ]
15 |     }
16 |    ],
17 |    "source": [
18 |     "import os \n",
19 |     "import random \n",
20 |     " \n",
21 |     "xmlfilepath=r'/data/lp/project/ssd.pytorch/xml_zc_fz' \n",
22 |     "saveBasePath=r'/data/lp/project/ssd.pytorch/txtsave'\n",
23 |     " \n",
24 |     "trainval_percent=1.0\n",
25 |     "train_percent=0.9\n",
26 |     "total_xml = os.listdir(xmlfilepath) \n",
27 |     "num=len(total_xml) \n",
28 |     "list=range(num) \n",
29 |     "tv=int(num*trainval_percent) \n",
30 |     "tr=int(tv*train_percent) \n",
31 |     "trainval= random.sample(list,tv) \n",
32 |     "train=random.sample(trainval,tr) \n",
33 |     " \n",
34 |     "print(\"train and val size\",tv) \n",
35 |     "print(\"train size\",tr) \n",
36 |     "ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w') \n",
37 |     "ftest = open(os.path.join(saveBasePath,'test.txt'), 'w') \n",
38 |     "ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w') \n",
39 |     "fval = open(os.path.join(saveBasePath,'val.txt'), 'w') \n",
40 |     " \n",
41 |     "for i in list: \n",
42 |     "    name=total_xml[i][:-4]+'\\n' \n",
43 |     "    if i in trainval: \n",
44 |     "        ftrainval.write(name) \n",
45 |     "        if i in train: \n",
46 |     "            ftrain.write(name) \n",
47 |     "        else: \n",
48 |     "            fval.write(name) \n",
49 |     "    else: \n",
50 |     "        ftest.write(name) \n",
51 |     " \n",
52 |     "ftrainval.close() \n",
53 |     "ftrain.close() \n",
54 |     "fval.close() \n",
55 |     "ftest.close() "
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {
62 |     "collapsed": true
63 |    },
64 |    "outputs": [],
65 |    "source": []
66 |   }
67 |  ],
68 |  "metadata": {
69 |   "kernelspec": {
70 |    "display_name": "Python 3",
71 |    "language": "python",
72 |    "name": "python3"
73 |   },
74 |   "language_info": {
75 |    "codemirror_mode": {
76 |     "name": "ipython",
77 |     "version": 3
78 |    },
79 |    "file_extension": ".py",
80 |    "mimetype": "text/x-python",
81 |    "name": "python",
82 |    "nbconvert_exporter": "python",
83 |    "pygments_lexer": "ipython3",
84 |    "version": "3.6.10"
85 |   }
86 |  },
87 |  "nbformat": 4,
88 |  "nbformat_minor": 2
89 | }
90 | 
--------------------------------------------------------------------------------
/ssd.py:
--------------------------------------------------------------------------------
1 | # import torch
2 | # import torch.nn as nn
3 | # import torch.nn.functional as F
4 | # from torch.autograd import 
Variable 5 | # from layers import * 6 | # from data import voc #coco 7 | # import os 8 | # 9 | # 10 | # class SSD(nn.Module): 11 | # """Single Shot Multibox Architecture 12 | # The network is composed of a base VGG network followed by the 13 | # added multibox conv layers. Each multibox layer branches into 14 | # 1) conv2d for class conf scores 15 | # 2) conv2d for localization predictions 16 | # 3) associated priorbox layer to produce default bounding 17 | # Args: 18 | # phase: (string) Can be "test" or "train" 19 | # size: input image size 20 | # base: VGG16 layers for input, size of either 300 or 500 21 | # extras: extra layers that feed to multibox loc and conf layers 22 | # head: "multibox head" consists of loc and conf conv layers 23 | # """ 24 | # def __init__(self, phase, size, base, extras, head, num_classes): 25 | # super(SSD, self).__init__() 26 | # self.phase = phase#训练的状态是train还是test 27 | # self.num_classes = num_classes 28 | # self.cfg =voc #(coco, )[num_classes == 2]#voc和coco都是字典型 找到num_classes键 对应为值为21的模型,这里返回【voc】 29 | # self.priorbox = PriorBox(self.cfg) #实例化一个类PriorBox,类实现的功能是生成所有的先验框 prior anchors 30 | # self.priors = Variable(self.priorbox.forward(), volatile=True)#结合上面一句话执行生成先验框的操作,priors保存的是【tensor 8760行4列】 31 | # self.size = size #图片大小 32 | # 33 | # # SSD network 34 | # self.vgg = nn.ModuleList(base)#####SSD前面的VGG16层 35 | # # Layer learns to scale the l2 normalized features from conv4_3 36 | # #conv4-3需要做L2归一化 37 | # self.L2Norm = L2Norm(512, 20) 38 | # self.extras = nn.ModuleList(extras)#SSD后面添加的额外层 39 | # #head包括两个list【第一个list是位置预测,第二个list是类别预测】 40 | # self.loc = nn.ModuleList(head[0]) 41 | # self.conf = nn.ModuleList(head[1]) 42 | # 43 | # if phase == 'test': #看train步骤的时候别看 44 | # self.softmax = nn.Softmax(dim=-1) #最后一个维度是预测的类别信息,要经过softmax 45 | # self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 46 | # 47 | # def forward(self, x): 48 | # """Applies network layers and ops on input image(s) x. 49 | # 50 | # Args: 51 | # x: input image or batch of images. Shape: [batch,3,300,300]. 52 | # 53 | # Return: 54 | # Depending on phase: 55 | # test: 56 | # Variable(tensor) of output class label predictions, 57 | # confidence score, and corresponding location predictions for 58 | # each object detected. 
Shape: [batch,topk,7] 59 | # 60 | # train: 61 | # list of concat outputs from: 62 | # 1: confidence layers, Shape: [batch*num_priors,num_classes] 63 | # 2: localization layers, Shape: [batch,num_priors*4] 64 | # 3: priorbox layers, Shape: [2,num_priors*4] 65 | # """ 66 | # sources = list() 67 | # loc = list() 68 | # conf = list() 69 | # 70 | # # apply vgg up to 【conv4_3 relu激活后再L2Norm操作后的输出tensor】 71 | # for k in range(23): 72 | # x = self.vgg[k](x) 73 | # s = self.L2Norm(x) 74 | # sources.append(s) 75 | # 76 | # # apply vgg up to 【fc7 也就是vgg基础层最后一层 relu激活层操作后的输出tensor】 77 | # for k in range(23, len(self.vgg)): 78 | # x = self.vgg[k](x) 79 | # sources.append(x) 80 | # 81 | # # apply extra layers and cache source layer outputs【将额外添加的4个tensor提取出来】 82 | # for k, v in enumerate(self.extras): 83 | # x = F.relu(v(x), inplace=True) 84 | # if k % 2 == 1: 85 | # sources.append(x) 86 | # #到此为止 【sources里面包括了6个特征层】 87 | # # apply multibox head to source layers 88 | # for (x, l, c) in zip(sources, self.loc, self.conf):#【sources loc conf都是具有六个元素的list】 89 | # # [b, C, H, W]——[b, H, W, C],因为我们最后要在通道这个维度上做softmax 90 | # loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 91 | # conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 92 | # 93 | # loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 94 | # conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)###########没有看特别明白 95 | # 96 | # if self.phase == "test": 97 | # output = self.detect( 98 | # loc.view(loc.size(0), -1, 4),# loc preds 99 | # self.softmax(conf.view(conf.size(0), -1,self.num_classes)), # conf preds 100 | # self.priors.type(type(x.data))# default boxes 101 | # ) 102 | # else: 103 | # output = ( 104 | # loc.view(loc.size(0), -1, 4), 105 | # conf.view(conf.size(0), -1, self.num_classes), 106 | # self.priors 107 | # ) 108 | # return output 109 | # 110 | # def load_weights(self, base_file): 111 | # other, ext = os.path.splitext(base_file) 112 | # if ext == '.pkl' or '.pth': 113 | # print('Loading weights into state dict...') 114 | # self.load_state_dict(torch.load(base_file, 115 | # map_location=lambda storage, loc: storage)) 116 | # print('Finished!') 117 | # else: 118 | # print('Sorry only .pth and .pkl files supported.') 119 | # 120 | # 121 | # # This function is derived from torchvision VGG make_layers() 122 | # # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 123 | # def vgg(cfg, i, batch_norm=False): 124 | # layers = [] # 用于存放vgg网络的list 125 | # in_channels = i# 最前面那层的维度--300*300*3,因此i=3 我的理解是输入时候的维度 126 | # for v in cfg: # 代码厉害的地方,循环建立多层,数据信息存放在一个字典中 127 | # if v == 'M': #'M'代表Maxpooling ceil_mode=False # maxpooling 时边缘不补 128 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 129 | # elif v == 'C':#'C'代表Maxpooling ceil_mode=True # maxpooling 时边缘补NAN 130 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 131 | # else: 132 | # conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 133 | # if batch_norm: 134 | # layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 135 | # else: 136 | # layers += [conv2d, nn.ReLU(inplace=True)] 137 | # in_channels = v 138 | # pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 139 | # conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 140 | # # dilation=卷积核元素之间的间距,扩大卷积感受野的范围,没有增加卷积size 141 | # conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 142 | # layers += [pool5, conv6, 143 | # nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 144 | # return layers #返回的是vgg的结构 145 | # 146 | # 147 | # def add_extras(cfg, i, 
batch_norm=False): 148 | # # Extra layers added to VGG for feature scaling 149 | # layers = [] 150 | # in_channels = i 151 | # flag = False 152 | # for k, v in enumerate(cfg): 153 | # if in_channels != 'S':# S代表stride,为2时候就相当于缩小feature map 154 | # if v == 'S': 155 | # layers += [nn.Conv2d(in_channels, cfg[k + 1], 156 | # kernel_size=(1, 3)[flag], stride=2, padding=1)] 157 | # else: 158 | # layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 159 | # flag = not flag 160 | # in_channels = v 161 | # return layers 162 | # 163 | # 164 | # def multibox(vgg, extra_layers, cfg, num_classes): 165 | # loc_layers = [] # loc_layers的输出维度是default box的种类(4or6)*4 166 | # conf_layers = [] # conf_layers的输出维度是default box的种类(4or6)*num_class 167 | # vgg_source = [21, -2] #第21层和倒数第二层 168 | # for k, v in enumerate(vgg_source): 169 | # loc_layers += [nn.Conv2d(vgg[v].out_channels, 170 | # cfg[k] * 4, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*4】 171 | # conf_layers += [nn.Conv2d(vgg[v].out_channels, 172 | # cfg[k] * num_classes, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*num_classes】 173 | # for k, v in enumerate(extra_layers[1::2], 2): 174 | # loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 175 | # * 4, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*4】 176 | # conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 177 | # * num_classes, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*num_classes】 178 | # return vgg, extra_layers, (loc_layers, conf_layers) 179 | # #返回的是vgg,extra_layers的结构 以及六个特征层提取的【位置回归特征图,类别回归特征图】 180 | # 181 | # 182 | # base = { 183 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 184 | # 512, 512, 512], 185 | # '512': [], 186 | # } 187 | # extras = { 188 | # '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 189 | # '512': [], 190 | # } 191 | # mbox = { 192 | # '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 193 | # '512': [], 194 | # } 195 | # 196 | # 197 | # def build_ssd(phase, size=300, num_classes=21):#阶段【train or test】 输入图片尺寸大小 类别数 198 | # if phase != "test" and phase != "train":#分成训练和测试两个阶段 199 | # print("ERROR: Phase: " + phase + " not recognized") 200 | # return 201 | # if size != 300: 202 | # print("ERROR: You specified size " + repr(size) + ". However, " + 203 | # "currently only SSD300 (size=300) is supported!") 204 | # return 205 | # base_, extras_, head_ = multibox(vgg(base[str(size)], 3), #网络结构是先经过vgg+add_extras 这里的vgg输出通道是1024,add_extras输入为1024 206 | # add_extras(extras[str(size)], 1024), 207 | # mbox[str(size)], num_classes) 208 | # #返回的head_是个元组,里面包括两个list【第一个list是位置预测,第二个list是类别预测】,每个list 6个元素,每个元素是个特征层tensor 209 | # return SSD(phase, size, base_, extras_, head_, num_classes) 210 | ######################版本二################################## 211 | # import torch 212 | # import torch.nn as nn 213 | # import torch.nn.functional as F 214 | # from torch.autograd import Variable 215 | # # from SSD_pytorch.models import * 216 | # # from SSD_pytorch.utils.config import opt 217 | # import os 218 | # from torch.autograd import Variable 219 | # from layers import * 220 | # from data import voc #coco 221 | # 222 | # 223 | # class SSD(nn.Module): 224 | # """Single Shot Multibox Architecture 225 | # The network is composed of a base VGG network followed by the 226 | # added multibox conv layers. 
Each multibox layer branches into 227 | # 1) conv2d for class conf scores 228 | # 2) conv2d for localization predictions 229 | # 3) associated priorbox layer to produce default bounding 230 | # boxes specific to the layer's feature map size. 231 | # SSD模型由去掉全连接层的vgg网络为基础组成。在之后添加了多盒转化层。 232 | # 每个多盒层分支是: 233 | # 1)conv2d 获取分类置信度 234 | # 2)conv2d进行坐标位置预测 235 | # 3)相关层去产生特定于该层特征图大小的默认的预测框bounding boxes 236 | # 237 | # 238 | # 239 | # See: https://arxiv.org/pdf/1512.02325.pdf for more details. 240 | # 241 | # Args: 242 | # phase: (string) Can be "test" or "train" 243 | # size: input image size 输入的图像尺寸 244 | # base: VGG16 layers for input, size of either 300 or 500 经过修改的vgg网络 245 | # extras: extra layers that feed to multibox loc and conf layers 246 | # 提供多盒定位的格外层 和 分类置信层(vgg网络后面新增的额外层) 247 | # head: "multibox head" consists of loc and conf conv layers 248 | # 由定位和分类卷积层组成的multibox head 249 | # (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 250 | # """ 251 | # 252 | # def __init__(self, phase, size, base, extras, head, num_classes): 253 | # super(SSD, self).__init__() 254 | # self.phase = phase 255 | # self.num_classes = num_classes 256 | # self.cfg = voc 257 | # # 新定义一个类,该类的功能:对于每个feature map,生成预测框(中心坐标及偏移量) 258 | # self.priorbox = PriorBox(self.cfg) 259 | # # 调用forward,返回生成的预测框结果 260 | # # 对于所有预测的feature map,存储着生成的不同长宽比的默认框(可以理解为anchor) 261 | # self.priors = Variable(self.priorbox.forward(), volatile=True) 262 | # #300 263 | # self.size = size 264 | # 265 | # # SSD network范围 266 | # # 经过修改的vgg网络 267 | # self.vgg = nn.ModuleList(base)################################################ 268 | # # Layer learns to scale the l2 normalized features from conv4_3 269 | # # Layer层从conv4_3学习去缩放l2正则化特征 270 | # # 论文中conv4_3 相比较于其他的layers,有着不同的 feature scale,我们使用 ParseNet 中的 L2 normalization 技术 271 | # # 将conv4_3 feature map 中每一个位置的 feature norm scale 到 20,并且在 back-propagation 中学习这个 scale 272 | # self.L2Norm = L2Norm(512, 20) 273 | # # vgg网络后面新增的额外层 274 | # self.extras = nn.ModuleList(extras)#################################################### 275 | # # vgg与extras中进行分类和回归的层 276 | # self.loc = nn.ModuleList(head[0]) 277 | # self.conf = nn.ModuleList(head[1]) 278 | # 279 | # # 如果网络用于测试,则加入softmax和检测 280 | # if phase == 'test': 281 | # self.softmax = nn.Softmax(dim=-1) 282 | # self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 283 | # 284 | # #=====bobo新增================== 285 | # # pool2到conv4_3 扩张卷积,尺度少一半 286 | # self.DilationConv_128_128= nn.Conv2d(in_channels=128,out_channels= 128, kernel_size=3, padding=2, dilation=2,stride=2) 287 | # # conv4_3到conv4_3 尺度不变 288 | # self.conv_512_256 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 289 | # # fc7 到 conv4_3 反卷积上采样,尺度大一倍 290 | # self.DeConv_1024_128 = nn.ConvTranspose2d(in_channels=1024,out_channels=128,kernel_size=2,stride=2) 291 | # 292 | # # conv4_3 到FC7 扩张卷积,尺度少一半 293 | # self.DilationConv_512_128 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2,stride=2) 294 | # # FC7到FC7 尺度不变 295 | # self.conv_1024_256 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=3, padding=1, stride=1) 296 | # # conv8_2 到 FC7 反卷积上采样,尺度大一倍 10->19 297 | # self.DeConv_512_128 = nn.ConvTranspose2d(in_channels=512, out_channels=128, kernel_size=3, stride=2,padding=1) 298 | # 299 | # 300 | # # conv5_3到conv8_2 301 | # self.DilationConv_512_128_2 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, stride=2) 302 | # # conv8_2到conv8_2 尺度不变 303 | # self.conv_512_256_2 = 
nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 304 | # # conv9_2到conv8_2 305 | # self.DeConv_256_128_2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2) 306 | # 307 | # # 平滑层 308 | # self.smooth = nn.Conv2d(512, 512, kernel_size = 3, padding = 1, stride = 1) 309 | # 310 | # # 通道数BN层的参数是输出通道数out_channels 311 | # self.bn = nn.BatchNorm2d(128) 312 | # def forward(self, x): 313 | # """Applies network layers and ops on input image(s) x. 314 | # 前向传播 315 | # 316 | # Args: 317 | # x: input image or batch of images. Shape: [batch,3,300,300]. 318 | # 319 | # Return: 320 | # Depending on phase: 321 | # test测试集: 322 | # Variable(tensor) of output class label predictions, 323 | # confidence score, and corresponding location predictions for 324 | # each object detected. Shape: [batch,topk,7] 325 | # 326 | # train训练集: 327 | # list of concat outputs from: 328 | # 1: 分类层confidence layers, Shape: [batch*num_priors,num_classes] 329 | # 2: 回归定位层localization layers, Shape: [batch,num_priors*4] 330 | # 3: priorbox layers, Shape: [2,num_priors*4] 331 | # """ 332 | # # sources保存 网络生成的不同层feature map结果,以便使用这些feature map来进行分类与回归 333 | # sources = list() 334 | # # 保存预测层不同feature map通过回归和分类网络的输出结果 335 | # loc = list() 336 | # conf = list() 337 | # 338 | # # 原论文中vgg的conv4_3,relu之后加入L2 Normalization正则化,然后保存feature map 339 | # # apply vgg up to conv4_3 relu 340 | # # 将vgg层的feature map保存 341 | # # k的范围为0-22 342 | # 343 | # #=========开始保存 所需的所有中间信息 344 | # 345 | # 346 | # # 保存pool2(pool下标从1开始)的结果 347 | # # 经过maxpool,所以不需要L2Norm正则化 348 | # for k in range(10): 349 | # x = self.vgg[k](x) 350 | # sources.append(x) 351 | # 352 | # # 保存conv4_3结果 353 | # for k in range(10,23): 354 | # x = self.vgg[k](x) 355 | # s = self.L2Norm(x) 356 | # sources.append(s) 357 | # 358 | # # 保存conv5_3结果 类似conv4_3原仓库一样,加入L2Norm 359 | # for k in range(23, 30): 360 | # x = self.vgg[k](x) 361 | # s = self.L2Norm(x) 362 | # sources.append(s) 363 | # 364 | # # 保存 原fc7的输出结果 365 | # # apply vgg up to fc7,即将原fc7层更改为卷积层输出的结果,经过relu之后保存结果 366 | # # k的范围为23 - 结束 367 | # for k in range(30, len(self.vgg)): 368 | # x = self.vgg[k](x) 369 | # sources.append(x) 370 | # 371 | # # 将新加的额外层 conv8_2、conv9_2、conv10_2、conv11_2结果保存 372 | # # apply extra layers and cache source layer outputs 373 | # # 将新增层的feature map保存 374 | # for k, v in enumerate(self.extras): 375 | # # 每经过一个conv卷积,都relu一下 376 | # x = F.relu(v(x), inplace=True) 377 | # # 论文中隔一个conv保存一个结果 378 | # if k % 2 == 1: 379 | # sources.append(x) 380 | # 381 | # # 此时sources保存了所有中间结果,论文中的pool2、conv4_3、conv5_3、fc7、conv8_2、conv9_2、conv10_2、conv11_2 382 | # 383 | # # sources_final保存各层融合之后的最终结果 384 | # sources_final=list() 385 | # 386 | # # con4_3层融合结果 self.bn1(self.conv1(x)) 在通道维度上融合 387 | # conv4_fp=torch.cat((F.relu(self.bn(self.DilationConv_128_128(sources[0])),inplace=True), F.relu(self.conv_512_256(sources[1]),inplace=True), F.relu(self.DeConv_1024_128(sources[3]),inplace=True)),1) 388 | # sources_final.append(F.relu( self.smooth(conv4_fp) , inplace=True)) 389 | # # FC7层融合结果 390 | # fc7_fp = torch.cat((F.relu( self.bn(self.DilationConv_512_128(sources[1])) ,inplace=True),F.relu( self.conv_1024_256(sources[3]),inplace=True) ,F.relu( self.DeConv_512_128(sources[4]),inplace=True)),1) 391 | # sources_final.append(F.relu( self.smooth(fc7_fp) , inplace=True)) 392 | # # conv8_2层融合结果 393 | # conv8_fp= torch.cat(( F.relu( self.bn(self.DilationConv_512_128_2(sources[2])),inplace=True) ,F.relu(self.conv_512_256_2(sources[4]) ,inplace=True) ,F.relu( 
self.DeConv_256_128_2(sources[5]),inplace=True) ),1) 394 | # sources_final.append( F.relu( self.smooth(conv8_fp) , inplace=True) ) 395 | # 396 | # 397 | # # 保存 conv9_2、conv10_2、conv11_2 398 | # sources_final.append(sources[5]) 399 | # sources_final.append(sources[6]) 400 | # sources_final.append(sources[7]) 401 | # 402 | # 403 | # # apply multibox head to source layers 404 | # # permute 将tensor的维度换位 参数为换位顺序 405 | # #contiguous 返回一个内存连续的有相同数据的tensor 406 | # 407 | # #source保存的是每个预测层的网络输出,即feature maps 408 | # #loc 通过使用feature map去预测回归 409 | # #conf通过使用feature map去预测分类 410 | # for (x, l, c) in zip(sources_final, self.loc, self.conf): 411 | # loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 412 | # conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 413 | # # 在给定维度上对输入的张量序列seq 进行连接操作 dimension=1表示在列上连接 414 | # loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 415 | # conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 416 | # # 测试集上的输出 417 | # if self.phase == "test": 418 | # output = self.detect( 419 | # loc.view(loc.size(0), -1, 4), # loc preds 定位的预测 420 | # self.softmax(conf.view(conf.size(0), -1, 421 | # self.num_classes)), # conf preds 分类的预测 422 | # self.priors.type(type(x.data)) # default boxes 预测框 423 | # ) 424 | # else: 425 | # # 训练集上的输出 426 | # output = ( 427 | # loc.view(loc.size(0), -1, 4), # loc preds [32,8732,4] 通过网络输出的定位的预测 428 | # conf.view(conf.size(0), -1, self.num_classes), #conf preds [32,8732,21] 通过网络输出的分类的预测 429 | # self.priors # 不同feature map根据公式生成的锚结果 [8732,4] 内容为 中心点坐标和宽高 430 | # ) 431 | # return output 432 | # 433 | # 434 | # def load_weights(self, base_file): 435 | # other, ext = os.path.splitext(base_file) 436 | # if ext == '.pkl' or '.pth': 437 | # print('Loading weights into state dict...') 438 | # self.load_state_dict(torch.load(base_file, 439 | # map_location=lambda storage, loc: storage)) 440 | # print('Finished!') 441 | # else: 442 | # print('Sorry only .pth and .pkl files supported.') 443 | # 444 | # 445 | # 446 | # 447 | # # This function is derived from torchvision VGG make_layers() 448 | # # 此方法源自torchvision VGG make_layers() 449 | # # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 450 | # def vgg(cfg, i, batch_norm=False): 451 | # ''' 452 | # vgg的结构 453 | # cfg: vgg的结构 454 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 455 | # 512, 512, 512], 456 | # i: 3 输入图像通道数 457 | # batch_norm 为False。若为True,则网络中加入batch_norm 458 | # 459 | # 返回没有全连接层的vgg网络 460 | # ''' 461 | # #保存vgg所有层 462 | # layers = [] 463 | # #输入图像通道数 464 | # in_channels = i 465 | # for v in cfg: #M与C会导致生成的feature map大小出现变化 466 | # if v == 'M': #最大池化层 默认floor模式 467 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 468 | # elif v == 'C': #最大池化层 ceil模式 两种不同的maxpool方式 参考https://blog.csdn.net/GZHermit/article/details/79351803 469 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 470 | # else: 471 | # # 卷积 472 | # conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 473 | # if batch_norm: 474 | # layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 475 | # else: 476 | # layers += [conv2d, nn.ReLU(inplace=True)] 477 | # in_channels = v 478 | # # 论文将 Pool5 layer 的参数,从 卷积核2×2步长为2 转变成 卷积核3×3 步长为1 外加一个 pad 479 | # pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 480 | # # 论文中将VGG的FC6 layer、FC7 layer 转成为 卷积层conv6,conv7 并从模型的FC6、FC7 上的参数,进行采样得到这两个卷积层的 参数 481 | # #输入通道512 输出通道为1024 卷积核为3 padding为6 dilation为卷积核中元素之间的空洞大小 482 | # # 修改Pool5 layer参数,导致感受野大小改变。所以conv6采用 atrous 算法,即孔填充算法。 483 | # # 
孔填充算法将卷积 weights 膨胀扩大,即原来卷积核是 3x3,膨胀后,可能变成 7x7 了,这样 receptive field 变大了,而 score map 也很大,即输出变成 dense 484 | # #这么做的好处是,输出的 score map 变大了,即是 dense 的输出了,而且 receptive field 不会变小,而且可以变大。这对做分割、检测等工作非常重要。 485 | # conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 486 | # #输入通道512 输出通道为1024 卷积核为3 487 | # conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 488 | # #将 修改的层也加入到vgg网络中 489 | # layers += [pool5, conv6, 490 | # nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 491 | # return layers 492 | # 493 | # 494 | # def add_extras(cfg, i, batch_norm=False): 495 | # ''' 496 | # vgg网络后面新增的额外层 497 | # :param cfg: '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 498 | # :param i: 1024 输入通道数 499 | # :param batch_norm: flase 500 | # :return: 501 | # ''' 502 | # # 添加到VGG的额外图层用于特征缩放 503 | # layers = [] 504 | # #1024 输入通道数 505 | # in_channels = i 506 | # # 控制卷积核尺寸,一维数组选前一个数还是后一个数。在每次循环时flag都改变,导致网络的卷积核尺寸为1,3,1,3交替 507 | # # False 为1,True为3 508 | # # SSD网络图中s1指步长为1,s2指步长为2 509 | # # 在该代码中,S代表步长为2,无S代表默认,即步长为1,所以cfg与论文网络结构完全匹配 510 | # flag = False 511 | # # enumerate枚举 k为下标 v为值 512 | # for k, v in enumerate(cfg): 513 | # if in_channels != 'S': 514 | # if v == 'S': 515 | # layers += [nn.Conv2d(in_channels, cfg[k + 1], 516 | # kernel_size=(1, 3)[flag], stride=2, padding=1)] 517 | # else: 518 | # layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 519 | # flag = not flag 520 | # in_channels = v 521 | # return layers 522 | # 523 | # 524 | # def multibox(vgg, extra_layers, cfg, num_classes): 525 | # ''' 526 | # :param vgg: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 527 | # :param extra_layers: vgg网络后面新增的额外层 528 | # :param cfg: '300': [4, 6, 6, 6, 4, 4], 不同部分的feature map上一个网格预测多少框 529 | # :param num_classes: 20分类+1背景,共21类 530 | # :return: 531 | # ''' 532 | # # 保存所有参与预测的网络层 533 | # loc_layers = [] 534 | # conf_layers = [] 535 | # # 传入的修改过的vgg网络用于预测的网络是21层以及 倒数第二层 536 | # vgg_source = [21, -2] 537 | # for k, v in enumerate(vgg_source): 538 | # # 按照fp-ssd论文,将1024改为512通道 539 | # if k==1: 540 | # in_channels=512 541 | # else: 542 | # in_channels=vgg[v].out_channels 543 | # #4是回归的坐标参数 cfg代表该层feature map上一个网格预测多少框 544 | # loc_layers += [nn.Conv2d(in_channels, 545 | # cfg[k] * 4, kernel_size=3, padding=1)] 546 | # #num_classes是类别数 cfg代表该层feature map上一个网格预测多少框 547 | # conf_layers += [nn.Conv2d(in_channels, 548 | # cfg[k] * num_classes, kernel_size=3, padding=1)] 549 | # # [x::y] 从下标x开始,每隔y取值 550 | # #论文中新增层也是每隔一个层添加一个预测层 551 | # # 将新增的额外层中的预测层也添加上 start=2:下标起始位置 552 | # for k, v in enumerate(extra_layers[1::2], 2): 553 | # loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 554 | # * 4, kernel_size=3, padding=1)] 555 | # conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 556 | # * num_classes, kernel_size=3, padding=1)] 557 | # return vgg, extra_layers, (loc_layers, conf_layers) 558 | # 559 | # 560 | # base = { 561 | # # 数字为每层feature map的层数 M代表最大池化层(默认floor模式) C代表最大池化层(ceil模式) (去掉vgg16的最后的 maxpool、fc、fc、fc、softmax) 562 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 563 | # 512, 512, 512], 564 | # '512': [], 565 | # } 566 | # extras = { 567 | # # 每个特征图都是由 两个conv 组成, conv1x1 和conv3x3 568 | # '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 569 | # '512': [], 570 | # } 571 | # mbox = { 572 | # '300': [4, 6, 6, 6, 4, 4], # 不同部分的feature map上一个网格预测多少框 573 | # '512': [], 574 | # } 575 | # 576 | # 577 | # def build_ssd(phase, size=300, num_classes=21): 578 | # ''' 579 | # 新建SSD模型 580 | # ''' 581 | # # 训练或测试 582 | # if phase != "test" and phase != "train": 583 | # 
print("ERROR: Phase: " + phase + " not recognized") 584 | # return 585 | # #当前SSD300只支持大小300×300的数据集训练 586 | # if size != 300: 587 | # print("ERROR: You specified size " + repr(size) + ". However, " + 588 | # "currently only SSD300 (size=300) is supported!") 589 | # return 590 | # 591 | # #base_: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 592 | # #extras_: vgg网络后面新增的额外层 593 | # # head_ : (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 594 | # base_, extras_, head_ = multibox(vgg(base[str(size)], 3), #vgg方法返回 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 595 | # add_extras(extras[str(size)], 1024), #vgg网络后面新增的额外层 596 | # mbox[str(size)], #mbox指不同部分的feature map上一个网格预测多少框 597 | # num_classes) 598 | # # phase:'train' size:300 num_classes: 21 类别数(20类+1背景) 599 | # return SSD(phase, size, base_, extras_, head_, num_classes) 600 | import torch 601 | import torch.nn as nn 602 | import torch.nn.functional as F 603 | from torch.autograd import Variable 604 | import os 605 | from layers import * 606 | from data import voc # coco 607 | ############################################################################### 608 | # 【通道显著性模块】 609 | class ChannelAttention(nn.Module): 610 | def __init__(self, in_planes, ratio=16): 611 | super(ChannelAttention, self).__init__() 612 | # 特征图先经过最大池化和平均池化 结果是1*1*通道数的tensor【最大池化,平均池化】 613 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 614 | self.max_pool = nn.AdaptiveMaxPool2d(1) 615 | # 在经过全连接层先降低维度再升高维度,进行特征融合【MLP】 616 | self.fc1 = nn.Conv2d(in_planes, in_planes // 16, 1, bias=False) 617 | self.relu1 = nn.ReLU() 618 | self.fc2 = nn.Conv2d(in_planes // 16, in_planes, 1, bias=False) 619 | # 【激活层】 620 | self.sigmoid = nn.Sigmoid() 621 | 622 | def forward(self, x): 623 | avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) 624 | max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) 625 | out = avg_out + max_out # 相加之后每个像素点的位置元素相加 626 | return self.sigmoid(out) 627 | 628 | # 【空间显著性模块】 629 | class SpatialAttention(nn.Module): 630 | def __init__(self, kernel_size=7): 631 | super(SpatialAttention, self).__init__() 632 | assert kernel_size in (3, 7), 'kernel size must be 3 or 7' # 这里设定kernal_size必须是3,7 633 | padding = 3 if kernel_size == 7 else 1 634 | 635 | self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) 636 | self.sigmoid = nn.Sigmoid() 637 | 638 | def forward(self, x): 639 | avg_out = torch.mean(x, dim=1, keepdim=True) 640 | max_out, _ = torch.max(x, dim=1, keepdim=True) # 会返回结果元素的值 和 对应的位置index 641 | x = torch.cat([avg_out, max_out], dim=1) 642 | x = self.conv1(x) 643 | return self.sigmoid(x) 644 | 645 | # 【Bottleneck将特征图先经过 通道显著性模块,再经过 空间显著性模块】 646 | class Bottleneck(nn.Module): # 将通道显著性和空间显著性模块相连接 647 | def __init__(self, inplanes, stride=1, downsample=None): 648 | super(Bottleneck, self).__init__() 649 | self.ca = ChannelAttention(inplanes) 650 | self.sa = SpatialAttention() 651 | self.stride = stride 652 | self.relu = nn.ReLU(inplace=True) 653 | 654 | def forward(self, x): 655 | save = x # 先将原本的特征图保存下来 656 | out = self.ca(x) * x # 先经过通道显著性模块 657 | out = self.sa(out) * out # 再经过空间显著性模块 658 | out += save ###这里不应该是相乘吗?????为啥变成了相加 659 | out = self.relu(out) # 最后再经过relu激活函数 660 | return out # 输出结果尺寸不变,但是通道数变成了【planes * 4】这就是残差模块 661 | 662 | #############################【SSD中融合特征显著性模块CBAM】###################### 663 | class SSD(nn.Module): 664 | """Single Shot Multibox Architecture 665 | The network is composed of a base VGG network followed by the 666 | added multibox conv layers. 
Each multibox layer branches into 667 | 1) conv2d for class conf scores 668 | 2) conv2d for localization predictions 669 | 3) associated priorbox layer to produce default bounding 670 | boxes specific to the layer's feature map size. 671 | SSD模型由去掉全连接层的vgg网络为基础组成。在之后添加了多盒转化层。 672 | 每个多盒层分支是: 673 | 1)conv2d 获取分类置信度 674 | 2)conv2d进行坐标位置预测 675 | 3)相关层去产生特定于该层特征图大小的默认的预测框bounding boxes 676 | 677 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 678 | 679 | Args: 680 | phase: (string) Can be "test" or "train" 681 | size: input image size 输入的图像尺寸 682 | base: VGG16 layers for input, size of either 300 or 500 经过修改的vgg网络 683 | extras: extra layers that feed to multibox loc and conf layers 684 | 提供多盒定位的格外层 和 分类置信层(vgg网络后面新增的额外层) 685 | head: "multibox head" consists of loc and conf conv layers 686 | 由定位和分类卷积层组成的multibox head 687 | (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 688 | """ 689 | 690 | def __init__(self, phase, size, base, extras, head, num_classes): 691 | super(SSD, self).__init__() 692 | self.phase = phase 693 | self.num_classes = num_classes 694 | self.cfg = voc 695 | # 新定义一个类,该类的功能:对于每个feature map,生成预测框(中心坐标及偏移量) 696 | self.priorbox = PriorBox(self.cfg) 697 | # 调用forward,返回生成的预测框结果 698 | # 对于所有预测的feature map,存储着生成的不同长宽比的默认框(可以理解为anchor) 699 | self.priors = Variable(self.priorbox.forward(), volatile=True) 700 | # 300 701 | self.size = size 702 | 703 | # SSD network范围 704 | # 经过修改的vgg网络 705 | self.vgg = nn.ModuleList(base) ################################################ 706 | # Layer learns to scale the l2 normalized features from conv4_3 707 | # Layer层从conv4_3学习去缩放l2正则化特征 708 | # 论文中conv4_3 相比较于其他的layers,有着不同的 feature scale,我们使用 ParseNet 中的 L2 normalization 技术 709 | # 将conv4_3 feature map 中每一个位置的 feature norm scale 到 20,并且在 back-propagation 中学习这个 scale 710 | self.L2Norm = L2Norm(512, 20) 711 | # vgg网络后面新增的额外层 712 | self.extras = nn.ModuleList(extras) 713 | # vgg与extras中进行分类和回归的层 714 | self.loc = nn.ModuleList(head[0]) 715 | self.conf = nn.ModuleList(head[1]) 716 | 717 | # 如果网络用于测试,则加入softmax和检测 718 | if phase == 'test': 719 | self.softmax = nn.Softmax(dim=-1) 720 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 721 | 722 | # =====bobo新增================== 723 | # pool2到conv4_3 扩张卷积,尺度少一半 724 | self.DilationConv_128_128 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=2, dilation=2, 725 | stride=2) 726 | # conv4_3到conv4_3 尺度不变 727 | self.conv_512_256 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 728 | # fc7 到 conv4_3 反卷积上采样,尺度大一倍 729 | self.DeConv_1024_128 = nn.ConvTranspose2d(in_channels=1024, out_channels=128, kernel_size=2, stride=2) 730 | 731 | # conv4_3 到FC7 扩张卷积,尺度少一半 732 | self.DilationConv_512_128 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, 733 | stride=2) 734 | # FC7到FC7 尺度不变 735 | self.conv_1024_256 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=3, padding=1, stride=1) 736 | # conv8_2 到 FC7 反卷积上采样,尺度大一倍 10->19 737 | self.DeConv_512_128 = nn.ConvTranspose2d(in_channels=512, out_channels=128, kernel_size=3, stride=2, padding=1) 738 | 739 | # conv5_3到conv8_2 740 | self.DilationConv_512_128_2 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, 741 | stride=2) 742 | # conv8_2到conv8_2 尺度不变 743 | self.conv_512_256_2 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 744 | # conv9_2到conv8_2 745 | self.DeConv_256_128_2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, 
stride=2) 746 | 747 | # 平滑层 748 | self.smooth = nn.Conv2d(512, 512, kernel_size=3, padding=1, stride=1) 749 | 750 | # 通道数BN层的参数是输出通道数out_channels 751 | self.bn = nn.BatchNorm2d(128) 752 | 753 | # CBAM模块【6个特征层:512 512 512 256 256 256 】 754 | self.CBAM1 = Bottleneck(512) 755 | self.CBAM2 = Bottleneck(512) 756 | self.CBAM3 = Bottleneck(512) 757 | self.CBAM4 = Bottleneck(256) 758 | self.CBAM5 = Bottleneck(256) 759 | self.CBAM6 = Bottleneck(256) 760 | 761 | def forward(self, x): 762 | """Applies network layers and ops on input image(s) x. 763 | 前向传播 764 | Args: 765 | x: input image or batch of images. Shape: [batch,3,300,300]. 766 | 767 | Return: 768 | Depending on phase: 769 | test测试集: 770 | Variable(tensor) of output class label predictions, 771 | confidence score, and corresponding location predictions for 772 | each object detected. Shape: [batch,topk,7] 773 | 774 | train训练集: 775 | list of concat outputs from: 776 | 1: 分类层confidence layers, Shape: [batch*num_priors,num_classes] 777 | 2: 回归定位层localization layers, Shape: [batch,num_priors*4] 778 | 3: priorbox layers, Shape: [2,num_priors*4] 779 | """ 780 | # sources保存 网络生成的不同层feature map结果,以便使用这些feature map来进行分类与回归 781 | sources = list() 782 | # 保存预测层不同feature map通过回归和分类网络的输出结果 783 | loc = list() 784 | conf = list() 785 | 786 | # 原论文中vgg的conv4_3,relu之后加入L2 Normalization正则化,然后保存feature map 787 | # apply vgg up to conv4_3 relu 788 | # 将vgg层的feature map保存 789 | # k的范围为0-22 790 | # =========开始保存 所需的所有中间信息 791 | 792 | # 保存pool2(pool下标从1开始)的结果 793 | # 经过maxpool,所以不需要L2Norm正则化 794 | for k in range(10): 795 | x = self.vgg[k](x) 796 | sources.append(x) 797 | 798 | # 保存conv4_3结果 799 | for k in range(10, 23): 800 | x = self.vgg[k](x) 801 | s = self.L2Norm(x) 802 | sources.append(s) 803 | 804 | # 保存conv5_3结果 类似conv4_3原仓库一样,加入L2Norm 805 | for k in range(23, 30): 806 | x = self.vgg[k](x) 807 | s = self.L2Norm(x) 808 | sources.append(s) 809 | 810 | # 保存 原fc7的输出结果 811 | # apply vgg up to fc7,即将原fc7层更改为卷积层输出的结果,经过relu之后保存结果 812 | # k的范围为23 - 结束 813 | for k in range(30, len(self.vgg)): 814 | x = self.vgg[k](x) 815 | sources.append(x) 816 | 817 | # 将新加的额外层 conv8_2、conv9_2、conv10_2、conv11_2结果保存 818 | # apply extra layers and cache source layer outputs 819 | # 将新增层的feature map保存 820 | for k, v in enumerate(self.extras): 821 | # 每经过一个conv卷积,都relu一下 822 | x = F.relu(v(x), inplace=True) 823 | # 论文中隔一个conv保存一个结果 824 | if k % 2 == 1: 825 | sources.append(x) 826 | 827 | # 此时sources保存了所有中间结果,论文中的pool2、conv4_3、conv5_3、fc7、conv8_2、conv9_2、conv10_2、conv11_2 828 | # sources_final保存各层融合之后的最终结果 829 | sources_final = list() 830 | # con4_3层融合结果 self.bn1(self.conv1(x)) 在通道维度上融合 831 | conv4_fp = torch.cat((F.relu(self.bn(self.DilationConv_128_128(sources[0])), inplace=True), 832 | F.relu(self.conv_512_256(sources[1]), inplace=True), 833 | F.relu(self.DeConv_1024_128(sources[3]), inplace=True)), 1) 834 | # sources_final.append(F.relu( self.smooth(conv4_fp) , inplace=True)) 835 | conv4_fp = F.relu(self.smooth(conv4_fp), inplace=True) 836 | sources_final.append(self.CBAM1(conv4_fp)) 837 | # FC7层融合结果 838 | fc7_fp = torch.cat((F.relu(self.bn(self.DilationConv_512_128(sources[1])), inplace=True), 839 | F.relu(self.conv_1024_256(sources[3]), inplace=True), 840 | F.relu(self.DeConv_512_128(sources[4]), inplace=True)), 1) 841 | # sources_final.append(F.relu( self.smooth(fc7_fp) , inplace=True)) 842 | fc7_fp = F.relu(self.smooth(fc7_fp), inplace=True) 843 | sources_final.append(self.CBAM2(fc7_fp)) 844 | # conv8_2层融合结果 845 | conv8_fp = 
torch.cat((F.relu(self.bn(self.DilationConv_512_128_2(sources[2])), inplace=True), 846 | F.relu(self.conv_512_256_2(sources[4]), inplace=True), 847 | F.relu(self.DeConv_256_128_2(sources[5]), inplace=True)), 1) 848 | # sources_final.append(F.relu( self.smooth(conv8_fp) , inplace=True)) 849 | conv8_fp = F.relu(self.smooth(conv8_fp), inplace=True) 850 | sources_final.append(self.CBAM3(conv8_fp)) 851 | 852 | # 保存 conv9_2、conv10_2、conv11_2 853 | sources_final.append(self.CBAM4(sources[5])) 854 | sources_final.append(self.CBAM5(sources[6])) 855 | sources_final.append(self.CBAM6(sources[7])) 856 | 857 | # apply multibox head to source layers 858 | # permute 将tensor的维度换位 参数为换位顺序 859 | # contiguous 返回一个内存连续的有相同数据的tensor 860 | 861 | # source保存的是每个预测层的网络输出,即feature maps 862 | # loc 通过使用feature map去预测回归 863 | # conf通过使用feature map去预测分类 864 | for (x, l, c) in zip(sources_final, self.loc, self.conf): 865 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 866 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 867 | # 在给定维度上对输入的张量序列seq 进行连接操作 dimension=1表示在列上连接 868 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 869 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 870 | # 测试集上的输出 871 | if self.phase == "test": 872 | output = self.detect( 873 | loc.view(loc.size(0), -1, 4), # loc preds 定位的预测 874 | self.softmax(conf.view(conf.size(0), -1, 875 | self.num_classes)), # conf preds 分类的预测 876 | self.priors.type(type(x.data)) # default boxes 预测框 877 | ) 878 | else: 879 | # 训练集上的输出 880 | output = ( 881 | loc.view(loc.size(0), -1, 4), # loc preds [32,8732,4] 通过网络输出的定位的预测 882 | conf.view(conf.size(0), -1, self.num_classes), # conf preds [32,8732,21] 通过网络输出的分类的预测 883 | self.priors # 不同feature map根据公式生成的锚结果 [8732,4] 内容为 中心点坐标和宽高 884 | ) 885 | return output 886 | 887 | def load_weights(self, base_file): 888 | other, ext = os.path.splitext(base_file) 889 | if ext == '.pkl' or '.pth': 890 | print('Loading weights into state dict...') 891 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 892 | print('Finished!') 893 | else: 894 | print('Sorry only .pth and .pkl files supported.') 895 | 896 | 897 | # This function is derived from torchvision VGG make_layers() 898 | # 此方法源自torchvision VGG make_layers() 899 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 900 | def vgg(cfg, i, batch_norm=False): 901 | ''' 902 | vgg的结构 903 | cfg: vgg的结构 904 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 905 | 512, 512, 512], 906 | i: 3 输入图像通道数 907 | batch_norm 为False。若为True,则网络中加入batch_norm 908 | 909 | 返回没有全连接层的vgg网络 910 | ''' 911 | # 保存vgg所有层 912 | layers = [] 913 | # 输入图像通道数 914 | in_channels = i 915 | for v in cfg: # M与C会导致生成的feature map大小出现变化 916 | if v == 'M': # 最大池化层 默认floor模式 917 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 918 | elif v == 'C': # 最大池化层 ceil模式 两种不同的maxpool方式 参考https://blog.csdn.net/GZHermit/article/details/79351803 919 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 920 | else: 921 | # 卷积 922 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 923 | if batch_norm: 924 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 925 | else: 926 | layers += [conv2d, nn.ReLU(inplace=True)] 927 | in_channels = v 928 | # 论文将 Pool5 layer 的参数,从 卷积核2×2步长为2 转变成 卷积核3×3 步长为1 外加一个 pad 929 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 930 | # 论文中将VGG的FC6 layer、FC7 layer 转成为 卷积层conv6,conv7 并从模型的FC6、FC7 上的参数,进行采样得到这两个卷积层的 参数 931 | # 输入通道512 输出通道为1024 卷积核为3 
padding为6 dilation为卷积核中元素之间的空洞大小 932 | # 修改Pool5 layer参数,导致感受野大小改变。所以conv6采用 atrous 算法,即孔填充算法。 933 | # 孔填充算法将卷积 weights 膨胀扩大,即原来卷积核是 3x3,膨胀后,可能变成 7x7 了,这样 receptive field 变大了,而 score map 也很大,即输出变成 dense 934 | # 这么做的好处是,输出的 score map 变大了,即是 dense 的输出了,而且 receptive field 不会变小,而且可以变大。这对做分割、检测等工作非常重要。 935 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 936 | # 输入通道512 输出通道为1024 卷积核为3 937 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 938 | # 将 修改的层也加入到vgg网络中 939 | layers += [pool5, conv6, 940 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 941 | return layers 942 | 943 | 944 | def add_extras(cfg, i, batch_norm=False): 945 | ''' 946 | vgg网络后面新增的额外层 947 | :param cfg: '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 948 | :param i: 1024 输入通道数 949 | :param batch_norm: flase 950 | :return: 951 | ''' 952 | # 添加到VGG的额外图层用于特征缩放 953 | layers = [] 954 | # 1024 输入通道数 955 | in_channels = i 956 | # 控制卷积核尺寸,一维数组选前一个数还是后一个数。在每次循环时flag都改变,导致网络的卷积核尺寸为1,3,1,3交替 957 | # False 为1,True为3 958 | # SSD网络图中s1指步长为1,s2指步长为2 959 | # 在该代码中,S代表步长为2,无S代表默认,即步长为1,所以cfg与论文网络结构完全匹配 960 | flag = False 961 | # enumerate枚举 k为下标 v为值 962 | for k, v in enumerate(cfg): 963 | if in_channels != 'S': 964 | if v == 'S': 965 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 966 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 967 | else: 968 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 969 | flag = not flag 970 | in_channels = v 971 | return layers 972 | 973 | 974 | def multibox(vgg, extra_layers, cfg, num_classes): 975 | ''' 976 | :param vgg: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 977 | :param extra_layers: vgg网络后面新增的额外层 978 | :param cfg: '300': [4, 6, 6, 6, 4, 4], 不同部分的feature map上一个网格预测多少框 979 | :param num_classes: 20分类+1背景,共21类 980 | :return: 981 | ''' 982 | # 保存所有参与预测的网络层 983 | loc_layers = [] 984 | conf_layers = [] 985 | # 传入的修改过的vgg网络用于预测的网络是21层以及 倒数第二层 986 | vgg_source = [21, -2] 987 | for k, v in enumerate(vgg_source): 988 | # 按照fp-ssd论文,将1024改为512通道 989 | if k == 1: 990 | in_channels = 512 991 | else: 992 | in_channels = vgg[v].out_channels 993 | # 4是回归的坐标参数 cfg代表该层feature map上一个网格预测多少框 994 | loc_layers += [nn.Conv2d(in_channels, 995 | cfg[k] * 4, kernel_size=3, padding=1)] 996 | # num_classes是类别数 cfg代表该层feature map上一个网格预测多少框 997 | conf_layers += [nn.Conv2d(in_channels, 998 | cfg[k] * num_classes, kernel_size=3, padding=1)] 999 | # [x::y] 从下标x开始,每隔y取值 1000 | # 论文中新增层也是每隔一个层添加一个预测层 1001 | # 将新增的额外层中的预测层也添加上 start=2:下标起始位置 1002 | for k, v in enumerate(extra_layers[1::2], 2): 1003 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 1004 | * 4, kernel_size=3, padding=1)] 1005 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 1006 | * num_classes, kernel_size=3, padding=1)] 1007 | return vgg, extra_layers, (loc_layers, conf_layers) 1008 | 1009 | 1010 | base = { 1011 | # 数字为每层feature map的层数 M代表最大池化层(默认floor模式) C代表最大池化层(ceil模式) (去掉vgg16的最后的 maxpool、fc、fc、fc、softmax) 1012 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 1013 | 512, 512, 512], 1014 | '512': [], 1015 | } 1016 | extras = { 1017 | # 每个特征图都是由 两个conv 组成, conv1x1 和conv3x3 1018 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 1019 | '512': [], 1020 | } 1021 | mbox = { 1022 | '300': [4, 6, 6, 6, 4, 4], # 不同部分的feature map上一个网格预测多少框 1023 | '512': [], 1024 | } 1025 | 1026 | 1027 | def build_ssd(phase, size=300, num_classes=21): 1028 | ''' 1029 | 新建SSD模型 1030 | ''' 1031 | # 训练或测试 1032 | if phase != "test" and phase != "train": 1033 | print("ERROR: Phase: " + phase + " not recognized") 1034 
| return 1035 | # 当前SSD300只支持大小300×300的数据集训练 1036 | if size != 300: 1037 | print("ERROR: You specified size " + repr(size) + ". However, " + 1038 | "currently only SSD300 (size=300) is supported!") 1039 | return 1040 | 1041 | # base_: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 1042 | # extras_: vgg网络后面新增的额外层 1043 | # head_ : (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 1044 | base_, extras_, head_ = multibox(vgg(base[str(size)], 3), 1045 | add_extras(extras[str(size)], 1024), 1046 | mbox[str(size)], 1047 | num_classes) 1048 | # phase:'train' size:300 num_classes: 21 类别数(20类+1背景) 1049 | return SSD(phase, size, base_, extras_, head_, num_classes) 1050 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # test.py 用于测试单张图片的效果 2 | from __future__ import print_function 3 | import sys 4 | import os 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import torch.backends.cudnn as cudnn 9 | import torchvision.transforms as transforms 10 | from torch.autograd import Variable 11 | from data import VOC_ROOT, VOC_CLASSES as labelmap 12 | from PIL import Image 13 | from data import VOCAnnotationTransform, VOCDetection, BaseTransform, VOC_CLASSES 14 | import torch.utils.data as data 15 | from ssd import build_ssd 16 | 17 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 18 | parser.add_argument('--trained_model', default='weights/ssd300_VOC_10000.pth', # #########修改检测模型的路径 19 | type=str, help='Trained state_dict file path to open') 20 | parser.add_argument('--save_folder', default='eval/', type=str, 21 | help='Dir to save results') 22 | parser.add_argument('--visual_threshold', default=0.6, type=float, 23 | help='Final confidence threshold') 24 | parser.add_argument('--cuda', default=True, type=bool, 25 | help='Use cuda to train model') 26 | parser.add_argument('--voc_root', default='data/VOCdevkit', help='Location of VOC root directory') # ###修改读取图片的路径【VOC_ROOT】 27 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 28 | args = parser.parse_args() 29 | 30 | if args.cuda and torch.cuda.is_available(): 31 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 32 | else: 33 | torch.set_default_tensor_type('torch.FloatTensor') 34 | 35 | if not os.path.exists(args.save_folder): 36 | os.mkdir(args.save_folder) 37 | 38 | 39 | # 输入参数:【VOC数据集root,网络,cuda,输入的测试数据,预处理函数, 阈值】 40 | def test_net(save_folder, net, cuda, testset, transform, thresh): 41 | # dump predictions and assoc. 
42 |     filename = save_folder+'test1.txt'  # name of the saved txt file
43 |     num_images = len(testset)  # number of test images
44 |     for i in range(num_images):  # iterate over every image
45 |         print('Testing image {:d}/{:d}....'.format(i+1, num_images))  # display is 1-based, hence i+1
46 | 
47 |         img = testset.pull_image(i)  # pull_image reads one image with cv2.imread
48 |         img_id, annotation = testset.pull_anno(i)  # pull_anno reads the image id and annotation
49 | 
50 |         x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1)  # transform returns a tuple; take the image and move the channel axis to the front
51 |         x = Variable(x.unsqueeze(0))  # add a batch dimension back to get a 4-D tensor
52 | 
53 |         with open(filename, mode='a') as f:  # first write the ground-truth boxes
54 |             f.write('\nGROUND TRUTH FOR: '+img_id+'\n')
55 |             for box in annotation:
56 |                 f.write('label: '+' || '.join(str(b) for b in box)+'\n')
57 |         if cuda:
58 |             x = x.cuda()  # move the single image onto the GPU
59 | 
60 |         y = net(x)  # forward pass
61 |         detections = y.data
62 |         # scale each detection back up to the image
63 |         scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
64 |         pred_num = 0
65 |         for cls in range(detections.size(1)):  # loop over classes (renamed from i, which shadowed the image index)
66 |             j = 0
67 |             while j < detections.size(2) and detections[0, cls, j, 0] >= thresh:  # was a hardcoded 0.6; use the thresh argument and bound j
68 |                 if pred_num == 0:
69 |                     with open(filename, mode='a') as f:
70 |                         f.write('PREDICTIONS: '+'\n')
71 |                 score = detections[0, cls, j, 0]
72 |                 label_name = labelmap[cls-1]
73 |                 pt = (detections[0, cls, j, 1:]*scale).cpu().numpy()
74 |                 coords = (pt[0], pt[1], pt[2], pt[3])
75 |                 pred_num += 1
76 |                 with open(filename, mode='a') as f:
77 |                     f.write(str(pred_num)+' label: '+label_name+' score: ' +
78 |                             str(score) + ' '+' || '.join(str(c) for c in coords) + '\n')
79 |                 j += 1
80 | 
81 | 
82 | def test_voc():
83 |     # load net
84 |     num_classes = len(VOC_CLASSES) + 1                   # +1 background [the author doubts this +1 is needed]
85 |     net = build_ssd('test', 300, num_classes)            # initialize SSD
86 |     net.load_state_dict(torch.load(args.trained_model))  # load the trained weights into the freshly built network
87 |     net.eval()  # switch to eval() mode
88 |     print('Finished loading model!')
89 |     # load data
90 |     testset = VOCDetection(args.voc_root, [('2007', 'test')], None, VOCAnnotationTransform())  # set the second argument to the image set you want to evaluate
91 |     if args.cuda:
92 |         net = net.cuda()
93 |         cudnn.benchmark = True
94 |     # evaluation
95 |     # arguments: [results save folder, network, cuda, test dataset, preprocessing transform, threshold]
96 |     test_net(args.save_folder,  # results save folder
97 |              net,  # network
98 |              args.cuda,  # cuda
99 |              testset,  # test dataset
100 |              BaseTransform(net.size, (104, 117, 123)),  # preprocessing transform
101 |              thresh=args.visual_threshold  # threshold
102 |              )
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     test_voc()
107 | 
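For reference, the detections tensor unpacked above has shape [batch, num_classes, top_k, 5], each row holding (score, x1, y1, x2, y2) with coordinates relative to the input size. A minimal standalone sketch of the same decoding step test_net performs; the helper name and thresh default are mine, and it assumes rows come back score-sorted, as this repo's Detect layer produces:

import torch

def decode_detections(detections, img_shape, labelmap, thresh=0.6):
    """Collect (label, score, [x1, y1, x2, y2]) triples above thresh for one image."""
    h, w = img_shape[:2]
    scale = torch.tensor([w, h, w, h], dtype=torch.float32, device=detections.device)
    results = []
    for cls in range(1, detections.size(1)):      # class 0 is background, skip it
        for j in range(detections.size(2)):
            score = detections[0, cls, j, 0].item()
            if score < thresh:
                break                             # rows are sorted by score, so stop early
            box = (detections[0, cls, j, 1:] * scale).tolist()
            results.append((labelmap[cls - 1], score, box))
    return results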
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from data import *
3 | from utils.augmentations import SSDAugmentation
4 | from layers.modules import MultiBoxLoss
5 | from ssd import build_ssd
6 | import time
7 | import torch
8 | from torch.autograd import Variable
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.backends.cudnn as cudnn
12 | import torch.nn.init as init
13 | import torch.utils.data as data
14 | import argparse
15 | import visdom as viz
16 | import os
17 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # pick the GPU used for training
18 | 
19 | 
20 | def str2bool(v):
21 |     return v.lower() in ("yes", "true", "t", "1")
22 | 
23 | 
24 | parser = argparse.ArgumentParser(
25 |     description='Single Shot MultiBox Detector Training With Pytorch')
26 | train_set = parser.add_mutually_exclusive_group()
27 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
28 |                     type=str, help='VOC or COCO')
29 | parser.add_argument('--dataset_root', default="data/VOCdevkit/",  # change [dataset_root] as needed
30 |                     help='Dataset root directory path')
31 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth',  # [pretrained base weights]; save_folder is prepended on load, so no 'weights/' prefix here
32 |                     help='Pretrained base model')
33 | parser.add_argument('--batch_size', default=16, type=int,  # [adjust batch_size]
34 |                     help='Batch size for training')
35 | parser.add_argument('--resume', default='weights/ssd300_VOC_500.pth', type=str,  # [resume from a checkpoint]; None trains from scratch
36 |                     help='Checkpoint state_dict file to resume training from')
37 | parser.add_argument('--start_iter', default=501, type=int,
38 |                     help='Resume training at this iter')
39 | parser.add_argument('--num_workers', default=2, type=int,  # [num_workers]
40 |                     help='Number of workers used in dataloading')
41 | parser.add_argument('--cuda', default=True, type=str2bool,
42 |                     help='Use CUDA to train model')
43 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,  # [adjust learning rate]
44 |                     help='initial learning rate')
45 | parser.add_argument('--momentum', default=0.9, type=float,
46 |                     help='Momentum value for optim')
47 | parser.add_argument('--weight_decay', default=5e-4, type=float,
48 |                     help='Weight decay for SGD')
49 | parser.add_argument('--gamma', default=0.1, type=float,
50 |                     help='Gamma update for SGD')
51 | parser.add_argument('--visdom', default=False, type=str2bool,  # loss visualization, disabled for this run
52 |                     help='Use visdom for loss visualization')
53 | parser.add_argument('--save_folder', default='weights/',
54 |                     help='Directory for saving checkpoint models')
55 | args = parser.parse_args()
56 | 
57 | if torch.cuda.is_available():
58 |     if args.cuda:
59 |         torch.set_default_tensor_type('torch.cuda.FloatTensor')
60 |     if not args.cuda:
61 |         print("WARNING: It looks like you have a CUDA device, but aren't " +
62 |               "using CUDA.\nRun with --cuda for optimal training speed.")
63 |         torch.set_default_tensor_type('torch.FloatTensor')
64 | else:
65 |     torch.set_default_tensor_type('torch.FloatTensor')
66 | 
67 | if not os.path.exists(args.save_folder):
68 |     os.mkdir(args.save_folder)
69 | 
70 | 
71 | def train():
72 |     cfg = voc  # voc is a dict holding the network's configuration
73 |     dataset = VOCDetection(  # the VOC dataset class
74 |         root=args.dataset_root,  # dataset root directory
75 |         transform=SSDAugmentation(cfg['min_dim'], MEANS))  # image preprocessing (input size and means); the class default is None, and I think the trailing MEANS could be dropped
76 | 
77 |     if args.visdom:  # visualization tooling, safe to ignore ###################
78 |         import visdom
79 |         viz = visdom.Visdom()
80 | 
81 |     ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
82 |     # phase [train or test], input image size, number of classes
83 |     # build_ssd is a function defined in ssd.py
84 |     # it returns an instance of class SSD(nn.Module), so ssd_net is an SSD object
85 |     # ssd_net carries everything SSD inherits from nn.Module plus the author's added methods
86 |     # the SSD class defines the base (VGG16 with the FC layers reworked), the extras (the paper's multi-scale feature maps), and the head
87 |     # convolving the 6 selected feature maps yields, for every default box, per-class confidences and box coordinates
88 |     net = ssd_net  # at this point SSD has only run __init__(), not forward(); net is the same object
89 | 
90 |     if args.cuda:  # whether to spread the model over multiple GPUs {for my task I would not use multi-GPU}
91 |         net = torch.nn.DataParallel(ssd_net)
92 |         cudnn.benchmark = True
93 |     if args.resume:  # resume defaults to None, i.e. not continuing from a checkpoint [ideally the checkpoint would also store the optimizer state]
94 |         # [model_state_dict, optimizer_state_dict, epoch]
95 |         print('Resuming training, loading {}...'.format(args.resume))
96 |         ssd_net.load_weights(args.resume)
97 |     else:  # otherwise load the pretrained VGG base weights straight from the weights folder
98 |         vgg_weights = torch.load(args.save_folder + args.basenet)  # weights of the VGG base inside ssd_net
99 |         print('Loading base network...')
100 |         ssd_net.vgg.load_state_dict(vgg_weights)  # only the vgg module of ssd_net gets pretrained weights; the extras, feature-fusion and CBAM modules start from scratch
101 |     if args.cuda:  # move the model onto the GPU for training
102 |         net = net.cuda()
103 |     if not args.resume:  # ######################################################################
104 |         print('Initializing weights...')  # when not resuming from a checkpoint, the remaining extras/loc/conf layers get xavier initialization
105 |         # initialize newly added layers' weights with xavier method
106 |         ssd_net.extras.apply(weights_init)  # extras module: weights and biases initialized by the xavier helper
107 |         ssd_net.loc.apply(weights_init)  # loc module: weights and biases initialized by the xavier helper
108 |         ssd_net.conf.apply(weights_init)  # conf module: weights and biases initialized by the xavier helper
109 | 
110 |     # [optimizer] net.parameters() are the model parameters; plus learning rate, momentum and weight decay
111 |     optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
112 |     # loss definition [MultiBoxLoss is a class computing the network loss; criterion is an instance of it]
113 |     # [loss] the key part: criterion is an nn.Module whose output has two parts, loss_c and loss_l
114 |     criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, False, args.cuda)
115 |     # switch to training mode
116 |     net.train()
117 |     # loss counters
118 |     loc_loss = 0
119 |     conf_loss = 0
120 |     epoch = 0
121 |     print('Loading the dataset...')
122 |     epoch_size = len(dataset) // args.batch_size  # number of batches per epoch
123 |     print('Training SSD on:', dataset.name)
124 |     print('Using the specified args:')
125 |     print(args)  # print the configured arguments
126 | 
127 |     step_index = 0
128 |     # visualization
129 |     if args.visdom:  # defaults to False
130 |         vis_title = 'SSD.PyTorch on ' + dataset.name
131 |         vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
132 |         iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
133 |         epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)
134 | 
135 |     data_loader = data.DataLoader(dataset, args.batch_size,
136 |                                   num_workers=args.num_workers,  # I changed the default to 0
137 |                                   shuffle=True,
138 |                                   collate_fn=detection_collate,  # detection_collate merges batch_size images into one batch
139 |                                   pin_memory=True)
140 |     batch_iterator = iter(data_loader)  # batch iterator, steps through the batches
141 |     for iteration in range(args.start_iter, cfg['max_iter']):  # iterate up to the maximum iteration count
142 |         if args.visdom and iteration != 0 and (iteration % epoch_size == 0):  # never runs while args.visdom stays False
143 |             update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size)
144 |             # reset epoch loss counters
145 |             loc_loss = 0
146 |             conf_loss = 0
147 |             epoch += 1
148 | 
149 |         if iteration in cfg['lr_steps']:  # decay the learning rate at the scheduled iterations
150 |             step_index += 1
151 |             adjust_learning_rate(optimizer, args.gamma, step_index)
152 | 
153 |         # load train data
154 |         try:
155 |             images, targets = next(batch_iterator)
156 |             # images and targets are the loaded training data
157 |         except StopIteration:
158 |             batch_iterator = iter(data_loader)  # re-create the shared iterator (fixes the 'bath_iterator' typo, which left batch_iterator exhausted)
159 |             images, targets = next(batch_iterator)
160 |         # images: [batch_size, 3, 300, 300]
161 |         # targets: [batch_size, num_objects, 5]
162 |         # num_objects is the number of ground truths in one image; 5 = four coordinates plus one label
163 |         if args.cuda:  # move the data onto cuda
164 |             images = Variable(images.cuda())
165 |             targets = [Variable(ann.cuda(), volatile=True) for ann in targets]
166 |         else:
167 |             images = Variable(images)
168 |             targets = [Variable(ann, volatile=True) for ann in targets]
169 |         # forward
170 |         t0 = time.time()
171 |         # out is the forward output of net: a tuple with 3 parts [loc, conf, priors]
172 |         out = net(images)
173 |         # backprop: zero the optimizer's gradients
174 |         optimizer.zero_grad()
175 |         # criterion is an nn.Module; the call below runs its forward pass [worth a close read, it contains the hard negative mining]
176 |         # ###################################[[[the training-stage loss!!!]]]######################################
177 |         # input 1: the network output out: [loc, conf, priors]
178 |         # input 2: targets: the ground-truth box and label values
179 |         loss_l, loss_c = criterion(out, targets)  # criterion is the MultiBoxLoss instance; its forward pass returns [loss_l, loss_c]
180 |         loss = loss_l + loss_c  # total loss
181 |         loss.backward()
182 |         optimizer.step()
183 |         t1 = time.time()
184 |         # the two counters below do not seem to be used elsewhere
185 |         loc_loss += loss_l.data  # ### unclear whether to use .item() or .data (on recent PyTorch, .item() is the safe choice)
186 |         conf_loss += loss_c.data  # ### same .item() vs .data question
187 | 
188 |         if iteration % 10 == 0:
189 |             print('timer: %.4f sec.' % (t1 - t0))
190 |             print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % loss.data, end=' ')  # same .item() vs .data question
191 | 
192 |         if args.visdom:
193 |             update_vis_plot(iteration, loss_l.data, loss_c.data, iter_plot, epoch_plot, 'append')
194 | 
195 |         if iteration != 0 and iteration % 2000 == 0:
196 |             # how often to checkpoint; while experimenting, lower this (e.g. to 500) to save time
197 |             print('Saving state, iter:', iteration)  # the saved checkpoint
198 |             torch.save(ssd_net.state_dict(), 'weights/ssd300_VOC_' + repr(iteration) + '.pth')  # checkpoint path
199 |     torch.save(ssd_net.state_dict(), args.save_folder + '' + args.dataset + '.pth')  # final save: only the parameters, not the whole model
200 | 
201 | 
202 | def adjust_learning_rate(optimizer, gamma, step):
203 |     """Sets the learning rate to the initial LR decayed by 10 at every
204 |        specified step
205 |     """
206 |     lr = args.lr * (gamma ** step)
207 |     for param_group in optimizer.param_groups:
208 |         param_group['lr'] = lr
209 | 
210 | 
211 | def xavier(param):
212 |     init.xavier_uniform_(param)  # the in-place underscore variant; plain xavier_uniform is deprecated
213 | 
214 | 
215 | def weights_init(m):
216 |     if isinstance(m, nn.Conv2d):
217 |         xavier(m.weight.data)
218 |         m.bias.data.zero_()
219 | 
220 | 
221 | def create_vis_plot(_xlabel, _ylabel, _title, _legend):
222 |     return viz.line(
223 |         X=torch.zeros((1,)).cpu(),
224 |         Y=torch.zeros((1, 3)).cpu(),
225 |         opts=dict(
226 |             xlabel=_xlabel,
227 |             ylabel=_ylabel,
228 |             title=_title,
229 |             legend=_legend
230 |         )
231 |     )
232 | 
233 | 
234 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
235 |                     epoch_size=1):
236 |     viz.line(
237 |         X=torch.ones((1, 3)).cpu() * iteration,
238 |         Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size,
239 |         win=window1,
240 |         update=update_type
241 |     )
242 |     # initialize epoch plot on first iteration
243 |     if iteration == 0:
244 |         viz.line(
245 |             X=torch.zeros((1, 3)).cpu(),
246 |             Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(),
247 |             win=window2,
248 |             update=True
249 |         )
250 | 
251 | 
252 | if __name__ == '__main__':
253 |     train()
254 | 
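adjust_learning_rate above implements plain step decay, lr = args.lr * gamma ** step, so with the defaults (lr=1e-4, gamma=0.1) each entry in cfg['lr_steps'] divides the rate by ten. A two-line sketch of the arithmetic:

base_lr, gamma = 1e-4, 0.1
for step in range(3):
    print(step, base_lr * gamma ** step)  # 0 1e-04, 1 1e-05, 2 1e-06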
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .augmentations import SSDAugmentation
--------------------------------------------------------------------------------
/utils/augmentations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchvision import transforms
3 | import cv2
4 | import numpy as np
5 | import types
6 | from numpy import random
7 | 
8 | 
9 | def intersect(box_a, box_b):
10 |     max_xy = np.minimum(box_a[:, 2:], box_b[2:])
11 |     min_xy = np.maximum(box_a[:, :2], box_b[:2])
12 |     inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
13 |     return inter[:, 0] * inter[:, 1]
14 | 
15 | 
16 | def jaccard_numpy(box_a, box_b):
17 |     """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
18 |     is simply the intersection over union of two boxes.
19 |     E.g.:
20 |         A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
21 |     Args:
22 |         box_a: Multiple bounding boxes, Shape: [num_boxes,4]
23 |         box_b: Single bounding box, Shape: [4]
24 |     Return:
25 |         jaccard overlap: Shape: [box_a.shape[0]]
26 |     """
27 |     inter = intersect(box_a, box_b)
28 |     area_a = ((box_a[:, 2]-box_a[:, 0]) *
29 |               (box_a[:, 3]-box_a[:, 1]))  # [A]
30 |     area_b = ((box_b[2]-box_b[0]) *
31 |               (box_b[3]-box_b[1]))  # scalar
32 |     union = area_a + area_b - inter
33 |     return inter / union  # [A]
34 | 
35 | 
36 | class Compose(object):
37 |     """Composes several augmentations together.
38 |     Args:
39 |         transforms (List[Transform]): list of transforms to compose.
40 |     Example:
41 |         >>> augmentations.Compose([
42 |         >>>     transforms.CenterCrop(10),
43 |         >>>     transforms.ToTensor(),
44 |         >>> ])
45 |     """
46 | 
47 |     def __init__(self, transforms):
48 |         self.transforms = transforms
49 | 
50 |     def __call__(self, img, boxes=None, labels=None):
51 |         for t in self.transforms:
52 |             img, boxes, labels = t(img, boxes, labels)
53 |         return img, boxes, labels
54 | 
55 | 
56 | class Lambda(object):
57 |     """Applies a lambda as a transform."""
58 | 
59 |     def __init__(self, lambd):
60 |         assert isinstance(lambd, types.LambdaType)
61 |         self.lambd = lambd
62 | 
63 |     def __call__(self, img, boxes=None, labels=None):
64 |         return self.lambd(img, boxes, labels)
65 | 
66 | 
67 | class ConvertFromInts(object):
68 |     def __call__(self, image, boxes=None, labels=None):
69 |         return image.astype(np.float32), boxes, labels
70 | 
71 | 
72 | class SubtractMeans(object):
73 |     def __init__(self, mean):
74 |         self.mean = np.array(mean, dtype=np.float32)
75 | 
76 |     def __call__(self, image, boxes=None, labels=None):
77 |         image = image.astype(np.float32)
78 |         image -= self.mean
79 |         return image.astype(np.float32), boxes, labels
80 | 
81 | 
82 | class ToAbsoluteCoords(object):
83 |     def __call__(self, image, boxes=None, labels=None):
84 |         height, width, channels = image.shape
85 |         boxes[:, 0] *= width
86 |         boxes[:, 2] *= width
87 |         boxes[:, 1] *= height
88 |         boxes[:, 3] *= height
89 | 
90 |         return image, boxes, labels
91 | 
92 | 
93 | class ToPercentCoords(object):
94 |     def __call__(self, image, boxes=None, labels=None):
95 |         height, width, channels = image.shape
96 |         boxes[:, 0] /= width
97 |         boxes[:, 2] /= width
98 |         boxes[:, 1] /= height
99 |         boxes[:, 3] /= height
100 | 
101 |         return image, boxes, labels
102 | 
103 | 
104 | class Resize(object):
105 |     def __init__(self, size=300):
106 |         self.size = size
107 | 
108 |     def __call__(self, image, boxes=None, labels=None):
109 |         image = cv2.resize(image, (self.size,
110 |                                    self.size))
111 |         return image, boxes, labels
112 | 
113 | 
114 | class RandomSaturation(object):
115 |     def __init__(self, lower=0.5, upper=1.5):
116 |         self.lower = lower
117 |         self.upper = upper
118 |         assert self.upper >= self.lower, "saturation upper must be >= lower."
119 |         assert self.lower >= 0, "saturation lower must be non-negative."
120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9
226 |             (0.1, None),
227 |             (0.3, None),
228 |             (0.7, None),
229 |             (0.9, None),
230 |             # randomly sample a patch
231 |             (None, None),
232 |         )
233 | 
234 |     def __call__(self, image, boxes=None, labels=None):
235 |         height, width, _ = image.shape
236 |         while True:
237 |             # randomly choose a mode
238 |             mode = self.sample_options[random.randint(len(self.sample_options))]  # index with randint; np.random.choice rejects this ragged options tuple on newer NumPy
239 |             if mode is None:
240 |                 return image, boxes, labels
241 | 
242 |             min_iou, max_iou = mode
243 |             if min_iou is None:
244 |                 min_iou = float('-inf')
245 |             if max_iou is None:
246 |                 max_iou = float('inf')
247 | 
248 |             # max trials (50)
249 |             for _ in range(50):
250 |                 current_image = image
251 | 
252 |                 w = random.uniform(0.3 * width, width)
253 |                 h = random.uniform(0.3 * height, height)
254 | 
255 |                 # aspect ratio constraint b/t .5 & 2
256 |                 if h / w < 0.5 or h / w > 2:
257 |                     continue
258 | 
259 |                 left = random.uniform(width - w)
260 |                 top = random.uniform(height - h)
261 | 
262 |                 # convert to integer rect x1,y1,x2,y2
263 |                 rect = np.array([int(left), int(top), int(left+w), int(top+h)])
264 | 
265 |                 # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
266 |                 overlap = jaccard_numpy(boxes, rect)
267 | 
268 |                 # is min and max overlap constraint satisfied? if not try again
269 |                 if overlap.min() < min_iou and max_iou < overlap.max():
270 |                     continue
271 | 
272 |                 # cut the crop from the image
273 |                 current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
274 |                                               :]
275 | 
276 |                 # keep overlap with gt box IF center in sampled patch
277 |                 centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
278 | 
279 |                 # mask in all gt boxes that are above and to the left of centers
280 |                 m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
281 | 
282 |                 # mask in all gt boxes that are under and to the right of centers
283 |                 m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
284 | 
285 |                 # mask in that both m1 and m2 are true
286 |                 mask = m1 * m2
287 | 
288 |                 # have any valid boxes? try again if not
289 |                 if not mask.any():
290 |                     continue
291 | 
292 |                 # take only matching gt boxes
293 |                 current_boxes = boxes[mask, :].copy()
294 | 
295 |                 # take only matching gt labels
296 |                 current_labels = labels[mask]
297 | 
298 |                 # should we use the box left and top corner or the crop's
299 |                 current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
300 |                                                   rect[:2])
301 |                 # adjust to crop (by subtracting crop's left,top)
302 |                 current_boxes[:, :2] -= rect[:2]
303 | 
304 |                 current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
305 |                                                   rect[2:])
306 |                 # adjust to crop (by subtracting crop's left,top)
307 |                 current_boxes[:, 2:] -= rect[:2]
308 | 
309 |                 return current_image, current_boxes, current_labels
310 | 
311 | 
312 | class Expand(object):
313 |     def __init__(self, mean):
314 |         self.mean = mean
315 | 
316 |     def __call__(self, image, boxes, labels):
317 |         if random.randint(0, 2):  # ### I changed this line myself
318 |             return image, boxes, labels
319 | 
320 |         height, width, depth = image.shape
321 |         ratio = random.uniform(1, 4)  # draw a random real from (1, 4)
322 |         left = random.uniform(0, width*ratio - width)
323 |         top = random.uniform(0, height*ratio - height)
324 | 
325 |         expand_image = np.zeros(
326 |             (int(height*ratio), int(width*ratio), depth),
327 |             dtype=image.dtype)
328 |         expand_image[:, :, :] = self.mean
329 |         expand_image[int(top):int(top + height),
330 |                      int(left):int(left + width)] = image
331 |         image = expand_image
332 | 
333 |         boxes = boxes.copy()
334 |         boxes[:, :2] += (int(left), int(top))
335 |         boxes[:, 2:] += (int(left), int(top))
336 | 
337 |         return image, boxes, labels
338 | 
339 | 
340 | class RandomMirror(object):
341 |     def __call__(self, image, boxes, classes):
342 |         _, width, _ = image.shape
343 |         if random.randint(2):
344 |             image = image[:, ::-1]
345 |             boxes = boxes.copy()
346 |             boxes[:, 0::2] = width - boxes[:, 2::-2]
347 |         return image, boxes, classes
348 | 
349 | 
350 | class SwapChannels(object):
351 |     """Transforms a tensorized image by swapping the channels in the order
352 |     specified in the swap tuple.
353 |     Args:
354 |         swaps (int triple): final order of channels
355 |             eg: (2, 1, 0)
356 |     """
357 | 
358 |     def __init__(self, swaps):
359 |         self.swaps = swaps
360 | 
361 |     def __call__(self, image):
362 |         """
363 |         Args:
364 |             image (Tensor): image tensor to be transformed
365 |         Return:
366 |             a tensor with channels swapped according to swap
367 |         """
368 |         # if torch.is_tensor(image):
369 |         #     image = image.data.cpu().numpy()
370 |         # else:
371 |         #     image = np.array(image)
372 |         image = image[:, :, self.swaps]
373 |         return image
374 | 
375 | 
376 | class PhotometricDistort(object):
377 |     def __init__(self):
378 |         self.pd = [
379 |             RandomContrast(),
380 |             ConvertColor(transform='HSV'),
381 |             RandomSaturation(),
382 |             RandomHue(),
383 |             ConvertColor(current='HSV', transform='BGR'),
384 |             RandomContrast()
385 |         ]
386 |         self.rand_brightness = RandomBrightness()
387 |         self.rand_light_noise = RandomLightingNoise()
388 | 
389 |     def __call__(self, image, boxes, labels):
390 |         im = image.copy()
391 |         im, boxes, labels = self.rand_brightness(im, boxes, labels)
392 |         if random.randint(2):
393 |             distort = Compose(self.pd[:-1])
394 |         else:
395 |             distort = Compose(self.pd[1:])
396 |         im, boxes, labels = distort(im, boxes, labels)
397 |         return self.rand_light_noise(im, boxes, labels)
398 | 
399 | 
400 | class SSDAugmentation(object):
401 |     def __init__(self, size=300, mean=(104, 117, 123)):
402 |         self.mean = mean
403 |         self.size = size
404 |         self.augment = Compose([  # the processing pipeline applied to each input image
405 |             ConvertFromInts(),
406 |             ToAbsoluteCoords(),
407 |             PhotometricDistort(),
408 |             Expand(self.mean),
409 |             RandomSampleCrop(),
410 |             RandomMirror(),
411 |             ToPercentCoords(),
412 |             Resize(self.size),
413 |             SubtractMeans(self.mean)
414 |         ])
415 | 
416 |     def __call__(self, img, boxes, labels):
417 |         return self.augment(img, boxes, labels)
418 | 
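The whole pipeline above is driven through SSDAugmentation.__call__. A minimal sketch of exercising it on dummy data; the image, the relative-coordinate box and the label are fabricated for illustration, and it assumes the classes above are in scope (VOCDetection supplies the same three arguments during training):

import numpy as np

# fabricated 500x500 BGR image with one box in relative [x1, y1, x2, y2] form
img = np.random.randint(0, 256, (500, 500, 3), dtype=np.uint8)
boxes = np.array([[0.25, 0.25, 0.75, 0.75]], dtype=np.float32)
labels = np.array([1])

aug = SSDAugmentation(size=300, mean=(104, 117, 123))
out_img, out_boxes, out_labels = aug(img, boxes, labels)
print(out_img.shape, out_boxes, out_labels)  # (300, 300, 3) float32, jittered relative boxes, surviving labels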
--------------------------------------------------------------------------------
/xml2regresstxt.py:
--------------------------------------------------------------------------------
1 | # #!/usr/bin/env python
2 | # # -*- encoding: utf-8 -*-
3 | # '''
4 | # @File : bbox-regress.py
5 | # @Version : 1.0
6 | # @Author : 2014Vee
7 | # @Contact : 1976535998@qq.com
8 | # @License : (C)Copyright 2014Vee From UESTC
9 | # @Modify Time : 2020/4/14 9:44
10 | # @Description : generates the data files for bounding-box regression training
11 | # '''
12 | # import os
13 | # import random
14 | #
15 | # xmlfilepath = r'/data/lp/project/ssd.pytorch/xml_zc_fz'
16 | # saveBasePath = r'/data/lp/project/ssd.pytorch/txtsave'
17 | #
18 | # trainval_percent = 1.0
19 | # train_percent = 0.9
20 | # total_xml = os.listdir(xmlfilepath)
21 | # num = len(total_xml)
22 | # list = range(num)
23 | # tv = int(num * trainval_percent)
24 | # tr = int(tv * train_percent)
25 | # trainval = random.sample(list, tv)
26 | # train = random.sample(trainval, tr)
27 | #
28 | # print("train and val size", tv)
29 | # print("train size", tr)
30 | # ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
31 | # ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
32 | # ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
33 | # fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
34 | #
35 | # for i in list:
36 | #     name = total_xml[i][:-4] + '\n'
37 | #     if i in trainval:
38 | #         ftrainval.write(name)
39 | #         if i in train:
40 | #             ftrain.write(name)
41 | #         else:
42 | #             fval.write(name)
43 | #     else:
44 | #         ftest.write(name)
45 | #
46 | # ftrainval.close()
47 | # ftrain.close()
48 | # fval.close()
49 | # ftest.close()
50 | # # test
51 | 
52 | 
53 | tensors_list = [[[[1,2],[3,4],[5,6]],[[7,8],[9,10],[11,12]],[[13,14],[15,16],[17,18]],[[19,20],[21,22],[23,24]]], [[[25,26],[27,28],[29,30]],[[31,32],[33,34],[35,36]],[[37,38],[39,40],[41,42]],[[43,44],[45,46],[47,48]]], [[[49,50],[51,52],[53,54]],[[55,56],[57,58],[59,60]],[[61,62],[63,64],[65,66]],[[67,68],[69,70],[71,72]]], [[[73,74],[75,76],[77,78]],[[79,80],[81,82],[83,84]],[[85,86],[87,88],[89,90]],[[91,92],[93,94],[95,96]]], [[[97,98],[99,100],[101,102]],[[103,104],[105,106],[107,108]],[[109,110],[111,112],[113,114]],[[115,116],[117,118],[119,120]]]]  # leftover shape experiment: a 5x4x3x2 nested list
54 | print(tensors_list)
--------------------------------------------------------------------------------
/代码详解blog.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/代码详解blog.txt
--------------------------------------------------------------------------------
/保存权重/train.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from data import *
3 | from utils.augmentations import SSDAugmentation
4 | from layers.modules import MultiBoxLoss
5 | from ssd import build_ssd
6 | import time
7 | import torch
8 | from torch.autograd import Variable
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.backends.cudnn as cudnn
12 | import torch.nn.init as init
13 | import torch.utils.data as data
14 | import argparse
15 | import visdom as viz
16 | import os
17 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" # 指定GPU做训练
18 | 
19 | 
20 | def str2bool(v):
21 |     return v.lower() in ("yes", "true", "t", "1")
22 | 
23 | 
24 | parser = argparse.ArgumentParser(
25 |     description='Single Shot MultiBox Detector Training With Pytorch')
26 | train_set = parser.add_mutually_exclusive_group()
27 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
28 |                     type=str, help='VOC or COCO')
29 | parser.add_argument('--dataset_root', default="data/VOCdevkit/", # 修改【dataset_root】
30 |                     help='Dataset root directory path')
31 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', # 【预训练好的权重系数】
32 |                     help='Pretrained base model')
33 | parser.add_argument('--batch_size', default=4, type=int, # 【修改batch_size】
34 |                     help='Batch size for training')
35 | parser.add_argument('--resume', default=None, type=str, # 【是否从某节点开始训练】没有就是None
36 |                     help='Checkpoint state_dict file to resume training from')
37 | parser.add_argument('--start_iter', default=0, type=int,
38 |                     help='Resume training at this iter')
39 | parser.add_argument('--num_workers', default=2, type=int, # 【num_workers】
40 |                     help='Number of workers used in dataloading')
41 | parser.add_argument('--cuda', default=True, type=str2bool,
42 |                     help='Use CUDA to train model')
43 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, # 【修改学习率】
44 |                     help='initial learning rate')
45 | parser.add_argument('--momentum', default=0.9, type=float,
46 |                     help='Momentum value for optim')
47 | parser.add_argument('--weight_decay', default=5e-4, type=float,
48 |                     help='Weight decay for SGD')
49 | parser.add_argument('--gamma', default=0.1, type=float,
50 |                     help='Gamma update for SGD')
51 | parser.add_argument('--visdom', default=False, type=str2bool, # 可视化 这次设置为【【】可视化】】】
52 |                     help='Use visdom for loss visualization')
53 | parser.add_argument('--save_folder', default='weights/',
54 |                     help='Directory for saving checkpoint models')
55 | args = parser.parse_args()
56 | 
57 | if torch.cuda.is_available():
58 |     if args.cuda:
59 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 60 | if not args.cuda: 61 | print("WARNING: It looks like you have a CUDA device, but aren't " + 62 | "using CUDA.\nRun with --cuda for optimal training speed.") 63 | torch.set_default_tensor_type('torch.FloatTensor') 64 | else: 65 | torch.set_default_tensor_type('torch.FloatTensor') 66 | 67 | if not os.path.exists(args.save_folder): 68 | os.mkdir(args.save_folder) 69 | 70 | 71 | def train(): 72 | cfg = voc # voc是一个字典 里面包括网络的一系列参数信息 73 | dataset = VOCDetection( # 是一个VOC数据的类 74 | root=args.dataset_root, # 数据集的根目录 75 | transform=SSDAugmentation(cfg['min_dim'], MEANS)) # 图片的预处理方法(输入图片的尺寸和均值) 原本类中定义为None 后面的MEANS我人为可以删除 76 | 77 | if args.visdom: # 这里是可视化工具,不用管################### 78 | import visdom 79 | viz = visdom.Visdom() 80 | 81 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 82 | # 阶段【train or test】 输入图片尺寸大小 类别数 83 | # build_ssd是一个放在ssd.py的函数 84 | # return是一个类的对象,也就是class SSD(nn.Module),ssd_net也就是SSD类的一个对象 85 | # ssd_net拥有所有class SSD继承于nn.Module以及作者增加方法的所有属性 86 | # 在SSD这个类中就定义了网络的base部分(修改全连接层后的VGG16)和extras部分(论文作者加入的多尺度feature map)和head部分 87 | # 对选定的6个尺度下的feature map进行卷积操作得到的每个default box 的每一个分类类别的confidence以及位置坐标的信息 88 | net = ssd_net # 到这里class类SSD只完成了__init__()并没有执行__forward__() net是一个类 89 | 90 | if args.cuda: # 是否将模型放到多个个GPU上运行{我认为在我的任务中不要放在多线程GPU中} 91 | net = torch.nn.DataParallel(ssd_net) 92 | cudnn.benchmark = True 93 | if args.resume: # 【resume】的默认值是None,表示不是接着某个断点来继续训练这个模型 【其实checkpoint里面最好还要加上优化器的保存】 94 | # 【model_state_dict,optimizer_state_dict,epoch】 见深度之眼 95 | print('Resuming training, loading {}...'.format(args.resume)) 96 | ssd_net.load_weights(args.resume) 97 | else: # 那么就从weights文件夹下面直接加载预训练好vgg基础网络预训练权重 98 | vgg_weights = torch.load(args.save_folder + args.basenet) # 整个ssd_net中vgg基础网络的权重 99 | print('Loading base network...') 100 | ssd_net.vgg.load_state_dict(vgg_weights) # 只在整个ssd_net中的vgg模块中加载预训练好的权重,其余的extra,特征融合,CBAM模块没有加载预训练权重 101 | if args.cuda: # 将模型结构放在GPU上训练 102 | net = net.cuda() 103 | if not args.resume: # ###################################################################### 104 | print('Initializing weights...') # 如果不是接着某个断点接着训练,那么其余extras loc con都会xavier方法初始化 105 | # initialize newly added layers' weights with xavier method 106 | ssd_net.extras.apply(weights_init) # extras 模块由 xavier 方法默认初始化data和bias 107 | ssd_net.loc.apply(weights_init) # loc 模块由 xavier 方法默认初始化data和bias 108 | ssd_net.conf.apply(weights_init) # conf 模块由 xavier 方法默认初始化data和bias 109 | 110 | # 【优化器】net.parameters()是网络结构中的参数,学习率,动量,权重衰减率 111 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 112 | # 定义损失函数部分【MultiBoxesLoss是一个类用于计算网络的损失,criterion是一个对象】 113 | # 【损失函数】 关键!!! 
criterion是个nn.Moudule的形式 里面包括两部分loss_c 和 loss_l 114 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, False, args.cuda) 115 | # 前向传播 116 | net.train() 117 | # loss counters 118 | loc_loss = 0 119 | conf_loss = 0 120 | epoch = 0 121 | print('Loading the dataset...') 122 | epoch_size = len(dataset) // args.batch_size # 每个epoch中有多少个batch 123 | print('Training SSD on:', dataset.name) 124 | print('Using the specified args:') 125 | print(args) # 讲设定的参数打印出来 126 | 127 | step_index = 0 128 | # 可视化部分 129 | if args.visdom: # 默认值为False 130 | vis_title = 'SSD.PyTorch on ' + dataset.name 131 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 132 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 133 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 134 | 135 | data_loader = data.DataLoader(dataset, args.batch_size, 136 | num_workers=args.num_workers, # 默认值我修改成了0 137 | shuffle=True, 138 | collate_fn=detection_collate, # collate_fn将一个batch_size数目的图片进行合并成batch 139 | pin_memory=True) 140 | batch_iterator = iter(data_loader) # batch迭代器 依次迭代batch 141 | for iteration in range(args.start_iter, cfg['max_iter']): # 由最大迭代次数来迭代训练 142 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): # 因为args.visdom一直设置为False因此没有被调用 143 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size) 144 | # reset epoch loss counters 145 | loc_loss = 0 146 | conf_loss = 0 147 | epoch += 1 148 | 149 | if iteration in cfg['lr_steps']: # 通过多少次epoch调节一次学习率 150 | step_index += 1 151 | adjust_learning_rate(optimizer, args.gamma, step_index) 152 | 153 | # load train data 154 | try: 155 | images, targets = next(batch_iterator) 156 | # targets 和image都是读取的训练数据 157 | except StopIteration: 158 | bath_iterator = iter(data_loader) 159 | images, targets = next(bath_iterator) 160 | # images=【batch_size,3,300,300】 161 | # targets=【batch_size,num_object,5】 162 | # num_object代表一张图里面有几个ground truth,5代表四个位置信息和一个label 163 | if args.cuda: # 将数据放在cuda上 164 | images = Variable(images.cuda()) 165 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 166 | else: 167 | images = Variable(images) 168 | targets = [Variable(ann, volatile=True) for ann in targets] 169 | # forward 170 | t0 = time.time() 171 | # ##out是netforward的输出:是个元组,里面包括3个部分[loc conf priors] 172 | out = net(images) 173 | # ## backprop 优化器梯度清零 174 | optimizer.zero_grad() 175 | # ## criterion是nn.Module形式,下面是调用它的forward模式【重点看,里面包括难例挖掘的内容】 176 | # ###################################【【【训练阶段的损失!!!】】】###################################### 177 | # ##输入参数1:网络结构net输出的out:[loc conf priors] 178 | # ##输入参数2:targets:真实目标的位置标签值 179 | loss_l, loss_c = criterion(out, targets) # criterion就是MultiBoxLoss类定义的对象,forward前传播返回的结果是【loss_l, loss_c】 180 | loss = loss_l + loss_c # 总loss 181 | loss.backward() 182 | optimizer.step() 183 | t1 = time.time() 184 | # 下面两行好像没有使用 185 | loc_loss += loss_l.data # ###到底是改成item()还是data 186 | conf_loss += loss_c.data # ###到底是改成item()还是data 187 | 188 | if iteration % 10 == 0: 189 | print('timer: %.4f sec.' 
% (t1 - t0))
190 |             print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % loss.data, end=' ') # 到底是改成item()还是data
191 | 
192 |         if args.visdom:
193 |             update_vis_plot(iteration, loss_l.data, loss_c.data, iter_plot, epoch_plot, 'append')
194 | 
195 |         if iteration != 0 and iteration % 10000 == 0:
196 |             # 迭代多少次保存一次模型。 在尝试阶段,为了节省时间,建议将根据迭代次数保存模型的参数调低,例如调节到500
197 |             print('Saving state, iter:', iteration) # 保存的checkpoint
198 |             torch.save(ssd_net.state_dict(), 'weights/ssd300_VOC_' + repr(iteration) + '.pth') # 保存模型的路径
199 |     torch.save(ssd_net.state_dict(), args.save_folder + '' + args.dataset + '.pth') # 最后的保存:不是保存整个模型,只是保存了参数
200 | 
201 | 
202 | def adjust_learning_rate(optimizer, gamma, step):
203 |     """Sets the learning rate to the initial LR decayed by 10 at every
204 |        specified step
205 |     """
206 |     lr = args.lr * (gamma ** step)
207 |     for param_group in optimizer.param_groups:
208 |         param_group['lr'] = lr
209 | 
210 | 
211 | def xavier(param):
212 |     init.xavier_uniform(param)
213 | 
214 | 
215 | def weights_init(m):
216 |     if isinstance(m, nn.Conv2d):
217 |         xavier(m.weight.data)
218 |         m.bias.data.zero_()
219 | 
220 | 
221 | def create_vis_plot(_xlabel, _ylabel, _title, _legend):
222 |     return viz.line(
223 |         X=torch.zeros((1,)).cpu(),
224 |         Y=torch.zeros((1, 3)).cpu(),
225 |         opts=dict(
226 |             xlabel=_xlabel,
227 |             ylabel=_ylabel,
228 |             title=_title,
229 |             legend=_legend
230 |         )
231 |     )
232 | 
233 | 
234 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
235 |                     epoch_size=1):
236 |     viz.line(
237 |         X=torch.ones((1, 3)).cpu() * iteration,
238 |         Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size,
239 |         win=window1,
240 |         update=update_type
241 |     )
242 |     # initialize epoch plot on first iteration
243 |     if iteration == 0:
244 |         viz.line(
245 |             X=torch.zeros((1, 3)).cpu(),
246 |             Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(),
247 |             win=window2,
248 |             update=True
249 |         )
250 | 
251 | 
252 | if __name__ == '__main__':
253 |     train()
254 | 
--------------------------------------------------------------------------------
/保存权重/代码详解blog.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/保存权重/代码详解blog.txt
--------------------------------------------------------------------------------
/显示检测结果code.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | with open('D:/Deep_learning/ssd.pytorch-master/eval/test1.txt','r') as f:
3 |     line=f.readline()
4 |     pic=list()
5 |     loc=list()
6 |     match={}
7 |     while line:
8 |         if 'GROUND TRUTH FOR:' in line:
9 |             pic.append(line[-7:-1])
10 |         if 'ship score' in line:
11 |             location=line.split(' ')[5:12:2]
12 |             location=[float(x) for x in location]
13 |             loc.append(location)
14 |         if len(line)==1:
15 |             match[pic[0]]=loc
16 |             pic=list()
17 |             loc=list()
18 |         line=f.readline()
19 |     f.close()  # redundant: the with-block already closes the file
20 | for i in match.keys():
21 |     #print('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007/ground_truth/'+i+'.jpg.jpg')
22 |     img=cv2.imread('/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/ground_truth/'+i+'.jpg.jpg')
23 |     #print(match[i], 'number of boxes in this image: ', len(match[i]))
24 |     for num in range(len(match[i])):
25 |         x1=int(match[i][num][0])
26 |         y1=int(match[i][num][1])
27 |         x2=int(match[i][num][2])
28 |         y2=int(match[i][num][3])
29 |         cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=2)
30 |     cv2.imwrite("/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/PREDECTION/"+i+'.jpg.jpg', img)
31 | 
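The viewer above leans on fixed character offsets: line[-7:-1] assumes six-character image ids, and split(' ')[5:12:2] assumes the exact token layout of a 'ship' prediction line. A minimal, more defensive sketch of the same parse; the helper name is mine, and the format is the one test_net writes above:

def parse_results(path):
    """Map image id -> list of [x1, y1, x2, y2] predicted boxes from a test_net dump."""
    boxes_by_img, img_id = {}, None
    with open(path) as f:
        for line in f:
            if 'GROUND TRUTH FOR:' in line:
                img_id = line.split(':')[-1].strip()
                boxes_by_img.setdefault(img_id, [])
            elif 'score:' in line and img_id is not None:
                parts = line.split('score:')[1].split('||')  # " <score> x1 ", " y1 ", " x2 ", " y2"
                coords = [float(parts[0].split()[-1])] + [float(p) for p in parts[1:]]
                boxes_by_img[img_id].append(coords)
    return boxes_by_img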
cv2.imwrite("/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/PREDECTION/"+i+'.jpg.jpg', img) 31 | -------------------------------------------------------------------------------- /训练步骤.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/训练步骤.txt --------------------------------------------------------------------------------