├── .gitattributes ├── .gitignore ├── 2voctxt.py ├── LICENSE ├── README.md ├── bbox-regression.py ├── data ├── MYSELF.py ├── VOCdevkit │ └── VOC2007 │ │ └── ImageSets │ │ └── Main │ │ ├── test.txt │ │ ├── train.txt │ │ ├── trainval.txt │ │ └── val.txt ├── VOCdevkitVOC2007 │ ├── annotations_cache │ │ └── annots.pkl │ └── results │ │ ├── det_test_None.txt │ │ ├── det_test_ship.txt │ │ ├── det_train_None.txt │ │ ├── det_train_ship.txt │ │ ├── det_trainval_None.txt │ │ ├── det_trainval_ship.txt │ │ ├── det_val_None.txt │ │ └── det_val_ship.txt ├── __init__.py ├── config.py ├── example.jpg ├── scripts │ ├── COCO2014.sh │ ├── VOC2007.sh │ └── VOC2012.sh └── voc0712.py ├── demo ├── __init__.py ├── demo.ipynb └── live.py ├── doc ├── SSD.jpg ├── detection_example.png ├── detection_example2.png ├── detection_examples.png └── ssd.png ├── eval.py ├── focal_loss.py ├── layers ├── __init__.py ├── box_utils.py ├── functions │ ├── __init__.py │ ├── detection.py │ └── prior_box.py └── modules │ ├── __init__.py │ ├── l2norm.py │ └── multibox_loss.py ├── loc-txt.ipynb ├── ssd.py ├── test.py ├── train.py ├── utils ├── __init__.py └── augmentations.py ├── xml2regresstxt.py ├── 代码详解blog.txt ├── 保存权重 ├── train.py └── 代码详解blog.txt ├── 显示检测结果code.py └── 训练步骤.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | .ipynb_checkpoints/* linguist-documentation 3 | dev.ipynb linguist-documentation 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # atom remote-sync package 92 | .remote-sync.json 93 | 94 | # weights 95 | weights/ 96 | 97 | #DS_Store 98 | .DS_Store 99 | 100 | # dev stuff 101 | eval/ 102 | eval.ipynb 103 | dev.ipynb 104 | .vscode/ 105 | 106 | # not ready 107 | videos/ 108 | templates/ 109 | data/ssd_dataloader.py 110 | data/datasets/ 111 | doc/visualize.py 112 | read_results.py 113 | ssd300_120000/ 114 | demos/live 115 | webdemo.py 116 | test_data_aug.py 117 | 118 | # attributes 119 | 120 | # pycharm 121 | .idea/ 122 | 123 | # temp checkout soln 124 | data/datasets/ 125 | data/ssd_dataloader.py 126 | 127 | # pylint 128 | .pylintrc -------------------------------------------------------------------------------- /2voctxt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @File : 2voctxt.py 5 | @Version : 1.0 6 | @Author : 2014Vee 7 | @Contact : 1976535998@qq.com 8 | @License : (C)Copyright 2014Vee From UESTC 9 | @Modify Time : 2020/4/17 15:37 10 | @Description : None 11 | ''' 12 | import os 13 | import random 14 | 15 | # https://blog.csdn.net/duanyajun987/article/details/81507656 16 | # * Careful here: the split ratios for the generated list files are set below 17 | # The NWPU dataset already ships with its own test set, so no extra data needs to be held out for testing here 18 | trainval_percent = 0.2 19 | train_percent = 0.8 20 | xmlfilepath = './data/VOCdevkit/VOC2007/Annotations' 21 | txtsavepath = './data/VOCdevkit/VOC2007/ImageSets/Main' 22 | total_xml = os.listdir(xmlfilepath) 23 | 24 | num = len(total_xml) 25 | indices = range(num) 26 | tv = int(num * trainval_percent) 27 | tr = int(tv * train_percent) 28 | trainval = random.sample(indices, tv) 29 | train = random.sample(trainval, tr) 30 | 31 | ftrainval = open(txtsavepath + '/trainval.txt', 'w') 32 | ftest = open(txtsavepath + '/test.txt', 'w') 33 | ftrain = open(txtsavepath + '/train.txt', 'w') 34 | fval = open(txtsavepath + '/val.txt', 'w') 35 | 36 | for i in indices: 37 | name = total_xml[i][:-4] + '\n' 38 | if i in trainval: 39 | ftrainval.write(name) 40 | if i in train: 41 | ftest.write(name) 42 | else: 43 | fval.write(name) 44 | else: 45 | ftrain.write(name) 46 | 47 | ftrainval.close() 48 | ftrain.close() 49 | fval.close() 50 | ftest.close() 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Max deGroot, Ellis Brown 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD: Single Shot MultiBox Object Detector, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [Single Shot MultiBox Detector](http://arxiv.org/abs/1512.02325) from the 2016 paper by Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, and Alexander C. Berg. The official and original Caffe code can be found [here](https://github.com/weiliu89/caffe/tree/ssd). 3 | 4 | 5 | 6 | 7 | ### Table of Contents 8 | - Installation 9 | - Datasets 10 | - Train 11 | - Evaluate 12 | - Performance 13 | - Demos 14 | - Future Work 15 | - Reference 16 | 17 |   18 |   19 |   20 |   21 | 22 | ## Installation 23 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 24 | - Clone this repository. 25 | * Note: We currently only support Python 3+. 26 | - Then download the dataset by following the [instructions](#datasets) below. 27 | - We now support [Visdom](https://github.com/facebookresearch/visdom) for real-time loss visualization during training! 28 | * To use Visdom in the browser: 29 | ```Shell 30 | # First install Python server and client 31 | pip install visdom 32 | # Start the server (probably in a screen or tmux) 33 | python -m visdom.server 34 | ``` 35 | * Then (during training) navigate to http://localhost:8097/ (see the Train section below for training details). 36 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 37 | 38 | ## Datasets 39 | To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 
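As a quick orientation before the download scripts below, here is a minimal sketch of how these loaders plug into a standard `torch.utils.data.DataLoader`. The exact `VOCDetection` constructor arguments are assumptions inferred from `data/MYSELF.py` and `data/__init__.py` (which defines `detection_collate` and exports `VOC_ROOT` and `MEANS`), not verbatim repo code:

```Python
# Hedged sketch: VOCDetection's argument names are assumed from data/MYSELF.py.
import torch.utils.data as data

from data import VOCDetection, VOC_ROOT, detection_collate, MEANS
from utils.augmentations import SSDAugmentation

dataset = VOCDetection(root=VOC_ROOT,
                       image_sets=[('2007', 'trainval')],      # (year, split) pairs
                       transform=SSDAugmentation(300, MEANS))  # SSD300 input size + BGR means

# detection_collate keeps a Python list of per-image box tensors, since each
# image can contain a different number of annotated objects.
loader = data.DataLoader(dataset, batch_size=32, shuffle=True,
                         collate_fn=detection_collate)

images, targets = next(iter(loader))
# images: (32, 3, 300, 300) tensor
# targets: list of 32 tensors, each (num_objects, 5) -> [xmin, ymin, xmax, ymax, label]
```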
40 | 41 | 42 | ### COCO 43 | Microsoft COCO: Common Objects in Context 44 | 45 | ##### Download COCO 2014 46 | ```Shell 47 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 48 | sh data/scripts/COCO2014.sh 49 | ``` 50 | 51 | ### VOC Dataset 52 | PASCAL VOC: Visual Object Classes 53 | 54 | ##### Download VOC2007 trainval & test 55 | ```Shell 56 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 57 | sh data/scripts/VOC2007.sh # <directory> 58 | ``` 59 | 60 | ##### Download VOC2012 trainval 61 | ```Shell 62 | # specify a directory for dataset to be downloaded into, else default is ~/data/ 63 | sh data/scripts/VOC2012.sh # <directory> 64 | ``` 65 | 66 | ## Training SSD 67 | - First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 68 | - By default, we assume you have downloaded the file in the `ssd.pytorch/weights` dir: 69 | 70 | ```Shell 71 | mkdir weights 72 | cd weights 73 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 74 | ``` 75 | 76 | - To train SSD using the train script, simply specify the parameters listed in `train.py` as flags or change them manually. 77 | 78 | ```Shell 79 | python train.py 80 | ``` 81 | 82 | - Note: 83 | * For training, an NVIDIA GPU is strongly recommended for speed. 84 | * For instructions on Visdom usage/installation, see the Installation section. 85 | * You can pick up training from a checkpoint by specifying the path as one of the training parameters (again, see `train.py` for options) 86 | 87 | ## Evaluation 88 | To evaluate a trained network: 89 | 90 | ```Shell 91 | python eval.py 92 | ``` 93 | 94 | You can specify the parameters listed in the `eval.py` file by flagging them or manually changing them. 95 | 96 | 97 | 98 | 99 | ## Performance 100 | 101 | #### VOC2007 Test 102 | 103 | ##### mAP 104 | 105 | | Original | Converted weiliu89 weights | From scratch w/o data aug | From scratch w/ data aug | 106 | |:-:|:-:|:-:|:-:| 107 | | 77.2 % | 77.26 % | 58.12 % | 77.43 % | 108 | 109 | ##### FPS 110 | **GTX 1060:** ~45.45 FPS 111 | 112 | ## Demos 113 | 114 | ### Use a pre-trained SSD network for detection 115 | 116 | #### Download a pre-trained network 117 | - We are trying to provide PyTorch `state_dicts` (dict of weight tensors) of the latest SSD model definitions trained on different datasets. 118 | - Currently, we provide the following PyTorch models: 119 | * SSD300 trained on VOC0712 (newest PyTorch weights) 120 | - https://s3.amazonaws.com/amdegroot-models/ssd300_mAP_77.43_v2.pth 121 | * SSD300 trained on VOC0712 (original Caffe weights) 122 | - https://s3.amazonaws.com/amdegroot-models/ssd_300_VOC0712.pth 123 | - Our goal is to reproduce this table from the [original paper](http://arxiv.org/abs/1512.02325) 124 |

125 | *SSD results on multiple datasets*

126 | 127 | ### Try the demo notebook 128 | - Make sure you have [jupyter notebook](http://jupyter.readthedocs.io/en/latest/install.html) installed. 129 | - Two alternatives for installing jupyter notebook: 130 | 1. If you installed PyTorch with [conda](https://www.continuum.io/downloads) (recommended), then you should already have it. (Just navigate to the ssd.pytorch cloned repo and run): 131 | `jupyter notebook` 132 | 133 | 2. If using [pip](https://pypi.python.org/pypi/pip): 134 | 135 | ```Shell 136 | # make sure pip is upgraded 137 | pip3 install --upgrade pip 138 | # install jupyter notebook 139 | pip install jupyter 140 | # Run this inside ssd.pytorch 141 | jupyter notebook 142 | ``` 143 | 144 | - Now navigate to `demo/demo.ipynb` at http://localhost:8888 (by default) and have at it! 145 | 146 | ### Try the webcam demo 147 | - Works on CPU (may have to tweak `cv2.waitKey` for optimal fps) or on an NVIDIA GPU 148 | - This demo currently requires opencv2+ w/ python bindings and an onboard webcam 149 | * You can change the default webcam in `demo/live.py` 150 | - Install the [imutils](https://github.com/jrosebr1/imutils) package to leverage multi-threading on CPU: 151 | * `pip install imutils` 152 | - Running `python -m demo.live` opens the webcam and begins detecting! 153 | 154 | ## TODO 155 | We have accumulated the following to-do list, which we hope to complete in the near future. 156 | - Still to come: 157 | * [x] Support for the MS COCO dataset 158 | * [ ] Support for SSD512 training and testing 159 | * [ ] Support for training on custom datasets 160 | 161 | ## Authors 162 | 163 | * [**Max deGroot**](https://github.com/amdegroot) 164 | * [**Ellis Brown**](http://github.com/ellisbrown) 165 | 166 | ***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible. 167 | 168 | ## References 169 | - Wei Liu, et al. "SSD: Single Shot MultiBox Detector." [ECCV2016](http://arxiv.org/abs/1512.02325). 170 | - [Original Implementation (CAFFE)](https://github.com/weiliu89/caffe/tree/ssd) 171 | - A huge thank you to [Alex Koltun](https://github.com/alexkoltun) and his team at [Webyclip](http://www.webyclip.com) for their help in finishing the data augmentation portion. 
172 | - A list of other great SSD ports that were sources of inspiration (especially the Chainer repo): 173 | * [Chainer](https://github.com/Hakuyume/chainer-ssd), [Keras](https://github.com/rykov8/ssd_keras), [MXNet](https://github.com/zhreshold/mxnet-ssd), [Tensorflow](https://github.com/balancap/SSD-Tensorflow) 174 | -------------------------------------------------------------------------------- /bbox-regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | ''' 4 | @File : bbox-regression.py 5 | @Version : 1.0 6 | @Author : 2014Vee 7 | @Contact : 1976535998@qq.com 8 | @License : (C)Copyright 2014Vee From UESTC 9 | @Modify Time : 2020/4/14 10:37 10 | @Description : None 11 | ''' 12 | 13 | import cv2 14 | import numpy as np 15 | import xml.dom.minidom 16 | import tensorflow as tf 17 | import os 18 | import time 19 | from tensorflow.python.framework import graph_util 20 | 21 | slim = tf.contrib.slim 22 | 23 | # Read the txt list files 24 | train_txt = open('/data/lp/project/ssd.pytorch/txtsave/train.txt') 25 | val_txt = open('/data/lp/project/ssd.pytorch/txtsave/val.txt') 26 | train_content = train_txt.readlines() # contents of train.txt 27 | val_content = val_txt.readlines() # contents of val.txt 28 | # for linetr in train_content: 29 | # print ("train_content",linetr.rstrip('\n')) 30 | # for lineva in val_content: 31 | # print ("val_content",lineva.rstrip('\n')) 32 | 33 | # Read the images listed in the txt files, resize them to a fixed size, and keep the scaling ratios 34 | train_imgs=[] # resized images 35 | train_imgs_ratio=[] # (height scale, width scale) 36 | val_imgs=[] 37 | val_imgs_ratio=[] 38 | 39 | 40 | h=48 41 | w=192 # target (resized) size 42 | c=3 # channels 43 | 44 | 45 | for linetr in train_content: 46 | img_path='/data/lp/project/ssd.pytorch/oripic/'+linetr.rstrip('\n')+'.jpg' 47 | img = cv2.imread(img_path) # read the original image 48 | # print("image_name", str(linetr.rstrip('\n'))) 49 | # print("imgshape", img.shape) 50 | imgresize= cv2.resize(img,(w,h)) # resize the image 51 | ratio = np.array([imgresize.shape[0]/img.shape[0], imgresize.shape[1]/img.shape[1]],np.float32) # height scale, width scale 52 | train_imgs_ratio.append(ratio) 53 | train_imgs.append(imgresize) 54 | train_img_arr = np.asarray(train_imgs,np.float32) # array of the training image data, h w c 55 | print(len(train_img_arr),len(train_imgs_ratio)) 56 | 57 | for lineva in val_content: 58 | img_path='/data/lp/project/ssd.pytorch/oripic/'+lineva.rstrip('\n')+'.jpg' 59 | img = cv2.imread(img_path) # h w c 60 | imgresize= cv2.resize(img,(w,h)) # h w c 61 | ratio = np.array([imgresize.shape[0]/img.shape[0], imgresize.shape[1]/img.shape[1]],np.float32) # height scale, width scale 62 | val_imgs_ratio.append(ratio) 63 | val_imgs.append(imgresize) 64 | # print(imgresize.shape[0], imgresize.shape[1], imgresize.shape[2]) 65 | val_img_arr = np.asarray(val_imgs,np.float32) # array of the validation image data, h w c 66 | 67 | # print(len(val_img_arr),len(val_imgs_ratio)) 68 | 69 | # Read the xml for each entry in the txt files, extract the box coordinates (xmin, ymin, xmax, ymax) (x along width, y along height), and compute the rescaled coordinates 70 | train_xml = [] # ground-truth box coordinates 71 | train_xml_resize = [] # box coordinates rescaled with the same ratios used for image resizing 72 | val_xml = [] 73 | val_xml_resize = [] 74 | for linetr in train_content: 75 | xml_path = '/data/lp/project/ssd.pytorch/xml_zc_fz/' + linetr.rstrip( 76 | '\n') + '.xml' 77 | print(xml_path) 78 | xml_DomTree = xml.dom.minidom.parse(xml_path) 79 | xml_annotation = xml_DomTree.documentElement 80 | xml_object = xml_annotation.getElementsByTagName('object') 81 | xml_bndbox = xml_object[0].getElementsByTagName('bndbox') 82 | xmin_list = xml_bndbox[0].getElementsByTagName('xmin') 83 | xmin =
int(xmin_list[0].childNodes[0].data) 84 | ymin_list = xml_bndbox[0].getElementsByTagName('ymin') 85 | ymin = int(ymin_list[0].childNodes[0].data) 86 | xmax_list = xml_bndbox[0].getElementsByTagName('xmax') 87 | xmax = int(xmax_list[0].childNodes[0].data) 88 | ymax_list = xml_bndbox[0].getElementsByTagName('ymax') 89 | ymax = int(ymax_list[0].childNodes[0].data) 90 | coordinate = np.array([ymin, xmin, ymax, xmax], np.int) # h w h w 91 | train_xml.append(coordinate) # xml coordinates of the training images 92 | # print("bbox:", coordinate) 93 | # print(len(train_xml)) 94 | 95 | for lineva in val_content: 96 | xml_path = '/data/lp/project/ssd.pytorch/xml_zc_fz/' + lineva.rstrip( 97 | '\n') + '.xml' 98 | print(xml_path) 99 | xml_DomTree = xml.dom.minidom.parse(xml_path) 100 | xml_annotation = xml_DomTree.documentElement 101 | xml_object = xml_annotation.getElementsByTagName('object') 102 | xml_bndbox = xml_object[0].getElementsByTagName('bndbox') 103 | xmin_list = xml_bndbox[0].getElementsByTagName('xmin') 104 | xmin = int(xmin_list[0].childNodes[0].data) 105 | ymin_list = xml_bndbox[0].getElementsByTagName('ymin') 106 | ymin = int(ymin_list[0].childNodes[0].data) 107 | xmax_list = xml_bndbox[0].getElementsByTagName('xmax') 108 | xmax = int(xmax_list[0].childNodes[0].data) 109 | ymax_list = xml_bndbox[0].getElementsByTagName('ymax') 110 | ymax = int(ymax_list[0].childNodes[0].data) 111 | coordinate = np.array([ymin, xmin, ymax, xmax], np.int) 112 | val_xml.append(coordinate) # xml coordinates of the validation images 113 | # print(len(val_xml)) 114 | 115 | for i in range(0, len(train_imgs_ratio)): 116 | ymin_ratio = train_xml[i][0] * train_imgs_ratio[i][0] 117 | xmin_ratio = train_xml[i][1] * train_imgs_ratio[i][1] 118 | ymax_ratio = train_xml[i][2] * train_imgs_ratio[i][0] 119 | xmax_ratio = train_xml[i][3] * train_imgs_ratio[i][1] 120 | coordinate_ratio = np.array([ymin_ratio, xmin_ratio, ymax_ratio, xmax_ratio], np.float32) 121 | train_xml_resize.append(coordinate_ratio) # rescaled ground-truth coordinates for the training images 122 | 123 | for i in range(0, len(val_imgs_ratio)): 124 | ymin_ratio = val_xml[i][0] * val_imgs_ratio[i][0] 125 | xmin_ratio = val_xml[i][1] * val_imgs_ratio[i][1] 126 | ymax_ratio = val_xml[i][2] * val_imgs_ratio[i][0] 127 | xmax_ratio = val_xml[i][3] * val_imgs_ratio[i][1] 128 | coordinate_ratio = np.array([ymin_ratio, xmin_ratio, ymax_ratio, xmax_ratio], np.float32) 129 | val_xml_resize.append(coordinate_ratio) # rescaled ground-truth coordinates for the validation images 130 | 131 | 132 | # Fetch the data batch by batch (batch_size samples at a time) 133 | # inputs: resized image data 134 | # targets: rescaled xml coordinate data 135 | def getbatches(inputs=None, targets=None, batch_size=None, shuffle=False): 136 | assert len(inputs) == len(targets) 137 | if shuffle: 138 | indices = np.arange(len(inputs)) 139 | np.random.shuffle(indices) 140 | for start_idx in range(0, len(inputs) - batch_size + 1, batch_size): 141 | if shuffle: 142 | excerpt = indices[start_idx:start_idx + batch_size] # i.e. slice the data by batch_size 143 | else: 144 | excerpt = slice(start_idx, start_idx + batch_size) 145 | yield inputs[excerpt], targets[excerpt] # yield returns here much like the return keyword, 146 | # but the next call resumes right after the yield, so this function is a generator rather than an ordinary function 147 | 148 | 149 | # Loss function: smooth L1 norm 150 | def abs_smooth(x): 151 | """Smoothed absolute function. Useful to compute an L1 smooth error. 152 | 153 | Define as: 154 | x^2 / 2 if abs(x) < 1 155 | abs(x) - 0.5 if abs(x) > 1 156 | We use here a differentiable definition using min(x) and abs(x). Clearly 157 | not optimal, but good enough for our purpose!
158 | """ 159 | absx = tf.abs(x) 160 | minx = tf.minimum(absx, 1) 161 | r = 0.5 * ((absx - 1) * minx + absx) # expanding this product yields the squared term 162 | return r 163 | 164 | # Build the network 165 | 166 | input_data = tf.placeholder(tf.float32,shape=[None,h,w,c],name='x') # input image data (the resized images) 167 | input_bound = tf.placeholder(tf.float32,shape=[None,None],name='y') # input ground-truth box coordinates (the rescaled xml coordinates) 168 | prob=tf.placeholder(tf.float32, name='keep_prob') 169 | 170 | 171 | # First conv block (192 -> 96) (48 -> 24) 172 | #conv1 = slim.repeat(input_data, 2, slim.conv2d, 32, [3, 3], scope='conv1') 173 | conv1 = slim.conv2d(input_data, 32, [3, 3], scope='conv1') ## 32 is the number of filters, [3, 3] the kernel size; the default stride is [1, 1] 174 | pool1 = slim.max_pool2d(conv1, [2, 2], scope='pool1') # [2, 2] is the pooling stride 175 | 176 | # Second conv block (96 -> 48) (24 -> 12) 177 | #conv2 = slim.repeat(pool1, 2, slim.conv2d, 64, [3, 3], scope='conv2') 178 | conv2 = slim.conv2d(pool1, 64, [3, 3], scope='conv2') 179 | pool2 = slim.max_pool2d(conv2, [2, 2], scope='pool2') 180 | 181 | # Third conv block (48 -> 24) (12 -> 6) 182 | #conv3 = slim.repeat(pool2, 2, slim.conv2d, 128, [3, 3], scope='conv3') 183 | conv3 = slim.conv2d(pool2, 128, [3, 3], scope='conv3') 184 | pool3 = slim.max_pool2d(conv3, [2, 2], scope='pool3') 185 | 186 | # Fourth conv block (24) (6) 187 | conv4 = slim.conv2d(pool3, 256 ,[3, 3], scope='conv4') 188 | dropout = tf.layers.dropout(conv4, rate=prob, training=True) 189 | #dropout = tf.nn.dropout(conv4,keep_prob) 190 | #pool4 = slim.max_pool2d(conv4, [2, 2], scope='pool4') 191 | 192 | # Fifth conv block (24 -> 12) (6 -> 3) 193 | #conv5 = slim.repeat(dropout, 2, slim.conv2d, 128, [3, 3], scope='conv5') 194 | conv5 = slim.conv2d(dropout , 128, [3, 3], scope='conv5') 195 | pool5 = slim.max_pool2d(conv5, [2, 2], scope='pool5') 196 | 197 | # Sixth conv block (12 -> 6) (3 -> 1) 198 | #conv6 = slim.repeat(pool5, 2, slim.conv2d, 64, [3, 3], scope='conv6') 199 | conv6 = slim.conv2d(pool5, 64, [3, 3], scope='conv6') 200 | pool6 = slim.max_pool2d(conv6, [2, 2], scope='pool6') 201 | 202 | reshape = tf.reshape(pool6, [-1, 6 * 1 * 64]) 203 | # print(reshape.get_shape()) 204 | 205 | fc = slim.fully_connected(reshape, 4, scope='fc') 206 | # print(fc) 207 | # print(input_data) 208 | 209 | ''' 210 | # Seventh conv block (6 -> 3) (1 -> 1) 211 | conv7 = slim.conv2d(pool6, 32, [3, 3], scope='conv7') 212 | pool7 = slim.max_pool2d(conv7, [2, 2], scope='pool7') 213 | 214 | conv8 = slim.conv2d(pool7, 4, [3, 3], padding=None, activation_fn=None,scope='conv8') 215 | ''' 216 | 217 | 218 | n_epoch = 500 219 | batch_size = 32 220 | print (batch_size) 221 | 222 | 223 | weights = tf.expand_dims(1.
* 1., axis=-1) 224 | loss = abs_smooth(fc - input_bound) # difference between the fc output and the input labels; smooth L1 is used as the loss 225 | # print(loss) 226 | train_op=tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss) # Adam optimizer with learning rate 0.001 227 | 228 | #correct_prediction = tf.equal(fc, input_bound) 229 | #correct_prediction = tf.equal(tf.cast(fc,tf.int32), tf.cast(input_bound, tf.int32)) 230 | 231 | temp_acc = tf.abs(tf.cast(fc,tf.int32) - tf.cast(input_bound, tf.int32)) # absolute difference between the fc output and the labels 232 | compare_np = np.ones((batch_size,4), np.int32) # build a compare_np with batch_size rows and 4 columns 233 | compare_np[:] = 3 234 | print(compare_np) 235 | compare_tf = tf.convert_to_tensor(compare_np) # 236 | # print(compare_tf) 237 | correct_prediction = tf.less(temp_acc,compare_tf) ## True wherever an element of temp_acc is smaller than the matching element of compare_tf 238 | # print(correct_prediction) 239 | loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value') ## sums the tensor along a dimension, which reduces its rank 240 | tf.summary.scalar('loss',loss) # log the scalar for visualization 241 | # print(loss) 242 | accuracy= tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) ### tf.cast converts the dtype ### 243 | #tf.summary.scalar('accuracy',accuracy) # log the scalar for visualization 244 | # print(accuracy) 245 | 246 | 247 | # print(prob) 248 | 249 | # pb_file_path = '/data/liuan/jupyter/root/project/keras-retinanet-master/bbox_fz_zc_006000/bbox_pb_model/ocr_bboxregress_batch16_epoch10000.pb' 250 | pb_file_path = '/data/lp/project/ssd.pytorch/ocr_bbox_batch16_epoch' 251 | 252 | # Set the visible GPU 253 | gpu_no = '1' # or '1' 254 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_no 255 | # TensorFlow session configuration 256 | config = tf.ConfigProto() 257 | # GPU memory allocation policy 258 | config.gpu_options.allow_growth = True 259 | config.gpu_options.per_process_gpu_memory_fraction = 0.6 260 | # config.gpu_options.per_process_gpu_memory_fraction = 0.8 261 | 262 | 263 | sess = tf.InteractiveSession(config=config) 264 | 265 | # //////////////////////////////// 266 | # ckpt = tf.train.get_checkpoint_state('/home/data/wangchongjin/ad_image/model_save/') 267 | # saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path +'.meta') # load the graph structure saved in the .meta file 268 | # saver.restore(sess,ckpt.model_checkpoint_path) 269 | # ////////////////////////////////// 270 | sess.run(tf.global_variables_initializer()) 271 | 272 | merged = tf.summary.merge_all() 273 | writer = tf.summary.FileWriter( 274 | "/data/lp/project/ssd.pytorch/ocr_bbox_batch16_epoch/record_graph", sess.graph_def) 275 | 276 | # saver = tf.train.Saver() # declare tf.train.Saver to save the model 277 | 278 | 279 | for epoch in range(n_epoch): 280 | start_time = time.time() 281 | 282 | # training 283 | train_loss, train_acc, n_batch = 0, 0, 0 284 | for x_train_a, y_train_a in getbatches(train_img_arr, train_xml_resize, batch_size, shuffle=False): 285 | _, err, acc = sess.run([train_op, loss, accuracy], 286 | feed_dict={input_data: x_train_a, input_bound: y_train_a, prob: 0.5}) 287 | train_loss += err 288 | train_acc += acc 289 | n_batch += 1 290 | 291 | # print(epoch) 292 | # print(" train loss: %f" % (train_loss/ n_batch)) 293 | # print(" train acc: %f" % (train_acc/ n_batch)) 294 | 295 | # validation 296 | val_loss, val_acc, n_batch = 0, 0, 0 297 | for x_val_a, y_val_a in getbatches(val_img_arr, val_xml_resize, batch_size, shuffle=False): 298 | err, acc = sess.run([loss, accuracy], feed_dict={input_data: x_val_a, input_bound: y_val_a, prob: 0}) 299 | # print(err) 300 | val_loss += err 301 | val_acc += acc 302 | n_batch += 1 303 | 304 | rs = sess.run([merged], feed_dict={input_data: x_val_a, input_bound: y_val_a, prob: 0}) 305 | if n_batch == batch_size: 306 | writer.add_summary(rs[0], epoch) 307
| 308 | # print(" validation loss: %f" % (val_loss/ n_batch)) 309 | # print(" validation acc: %f" % (val_acc/ n_batch)) 310 | 311 | # saver.save(sess, "/home/data/wangchongjin/ad_image/model_save_new/ad.ckpt") 312 | constant_graph = graph_util.convert_variables_to_constants(sess, sess.graph_def, ['fc/Relu']) 313 | 314 | with tf.gfile.FastGFile(pb_file_path + '_' + str(epoch) + '.pb', mode='wb') as f: 315 | f.write(constant_graph.SerializeToString()) 316 | 317 | writer.close() 318 | sess.close() -------------------------------------------------------------------------------- /data/MYSELF.py: -------------------------------------------------------------------------------- 1 | # # import os.path as osp 2 | # # import sys 3 | # # import torch 4 | # # import torch.utils.data as data 5 | # # import cv2 6 | # # import numpy as np 7 | # # if sys.version_info[0] == 2: 8 | # # import xml.etree.cElementTree as ET 9 | # # else: 10 | # # import xml.etree.ElementTree as ET 11 | # # image_sets=['2007', 'trainval'],#,('2012', 'trainval') datasets to use 12 | # # root="D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/" 13 | # # ids = list() 14 | # # for (year, name) in image_sets: 15 | # # rootpath = osp.join(root, 'VOC' + year) 16 | # # for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 17 | # # ids.append((rootpath, line.strip())) 18 | # # print(ids[0]) 19 | # # 20 | # # img_id = ids[927] #('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007', '000001') 21 | # # anno = osp.join('%s', 'Annotations', '%s.xml') 22 | # # img = osp.join('%s', 'JPEGImages', '%s.jpg') 23 | # # target = ET.parse(anno % img_id).getroot() # parse the xml file 24 | # # img = cv2.imread(img % img_id) # load the image 25 | # # cv2.imshow('pwn',img) 26 | # # height, width, channels = img.shape 27 | # # print(height) 28 | # # print(width) 29 | # # print(channels) 30 | # # cv2.waitKey (0) 31 | # # 32 | # # VOC_CLASSES1 = ( # always index 0 33 | # # 'aeroplane', 'bicycle', 'bird', 'boat', 34 | # # 'bottle', 'bus', 'car', 'cat', 'chair', 35 | # # 'cow', 'diningtable', 'dog', 'horse', 36 | # # 'motorbike', 'person', 'pottedplant', 37 | # # 'sheep', 'sofa', 'train', 'tvmonitor') 38 | # # VOC_CLASSES2=('ship','pwn') 39 | # # 40 | # # what=dict(zip(VOC_CLASSES1, range(len(VOC_CLASSES1)))) 41 | # # what2=dict(zip(VOC_CLASSES2, range(len(VOC_CLASSES2)))) 42 | # # print(what) 43 | # # print(what2) 44 | # ####################################################################################################################### 45 | # # from __future__ import division 46 | # # from math import sqrt as sqrt 47 | # # from itertools import product as product 48 | # # import torch 49 | # # mean = [] 50 | # # clip=True 51 | # # for i, j in product(range(5), repeat=2): # generate the grid cell coordinates: i=[0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4] 52 | # # f_k = 300 / 64 #37.5 j=[0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4] 53 | # # cx = (j + 0.5) / f_k # 54 | # # cy = (i + 0.5) / f_k # 55 | # # s_k =162 / 300#0.1 56 | # # mean += [cx, cy, s_k, s_k] 57 | # # # aspect_ratio: 1 58 | # # # rel size: sqrt(s_k * s_(k+1)) 59 | # # s_k_prime = sqrt(s_k * (213/300))#0.14 60 | # # mean += [cx, cy, s_k_prime, s_k_prime] 61 | # # 62 | # # # rest of aspect ratios 63 | # # for ar in [2,3]: # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 64 | # # mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)] 65 | # # mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)] 66 | # # 67 | # # output = torch.Tensor(mean).view(-1, 4) 68 | # # if clip: 69 | # # output.clamp_(max=1, min=0)
70 | # import torch as t 71 | # list1=[t.full([2,2,2],1),t.full([2,2,2],2)] 72 | # list2=[t.full([2,2,2],3),t.full([2,2,2],4)] 73 | # list3=[t.full([2,2,2],5),t.full([2,2,2],6)] 74 | # loc=[] 75 | # conf=[] 76 | # pwn=zip(list1,list2,list3) 77 | # print(pwn) 78 | # 79 | # # for (x,l,c) in zip(list1,list2,list3): 80 | # # loc.append(l(x)) 81 | # # conf.append(c(x)) 82 | # 83 | # # import torch 84 | # # x = torch.tensor([[1,2,3],[4,5,6]]) 85 | # # x.is_contiguous() # True 86 | # # print(x) 87 | # # print(x.transpose(0,1)) 88 | # # print(x.transpose(0, 1).is_contiguous()) # False 89 | # # print(x.transpose(0, 1).contiguous().is_contiguous()) # True 90 | 91 | from data import * 92 | from utils.augmentations import SSDAugmentation 93 | from layers.modules import MultiBoxLoss 94 | from ssd import build_ssd 95 | import os 96 | import time 97 | import torch 98 | from torch.autograd import Variable 99 | import torch.nn as nn 100 | import torch.optim as optim 101 | import torch.backends.cudnn as cudnn 102 | import torch.nn.init as init 103 | import torch.utils.data as data 104 | import argparse 105 | import visdom as viz 106 | 107 | list1=torch.arange(0,8) 108 | x = torch.Tensor([[1], [2], [3]]) 109 | y = x.expand(3, 4) 110 | print("x.size():", x.size()) 111 | print("y.size():", y.size()) 112 | 113 | print(x) 114 | print(y) -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/test.txt: -------------------------------------------------------------------------------- 1 | 000009 2 | 000010 3 | 000012 4 | 000013 5 | 000014 6 | 000015 7 | 000023 8 | 000024 9 | 000028 10 | 000029 11 | 000031 12 | 000032 13 | 000034 14 | 000040 15 | 000053 16 | 000059 17 | 000067 18 | 000074 19 | 000075 20 | 000076 21 | 000090 22 | 000101 23 | 000105 24 | 000107 25 | 000116 26 | 000120 27 | 000131 28 | 000133 29 | 000135 30 | 000136 31 | 000137 32 | 000139 33 | 000149 34 | 000157 35 | 000163 36 | 000164 37 | 000181 38 | 000183 39 | 000187 40 | 000192 41 | 000197 42 | 000203 43 | 000219 44 | 000220 45 | 000229 46 | 000230 47 | 000231 48 | 000235 49 | 000237 50 | 000241 51 | 000244 52 | 000248 53 | 000255 54 | 000256 55 | 000262 56 | 000263 57 | 000264 58 | 000266 59 | 000268 60 | 000269 61 | 000274 62 | 000275 63 | 000282 64 | 000290 65 | 000291 66 | 000303 67 | 000306 68 | 000319 69 | 000334 70 | 000340 71 | 000346 72 | 000351 73 | 000352 74 | 000360 75 | 000366 76 | 000369 77 | 000384 78 | 000390 79 | 000392 80 | 000398 81 | 000408 82 | 000409 83 | 000423 84 | 000430 85 | 000431 86 | 000440 87 | 000444 88 | 000450 89 | 000454 90 | 000462 91 | 000470 92 | 000472 93 | 000486 94 | 000487 95 | 000488 96 | 000489 97 | 000499 98 | 000501 99 | 000505 100 | 000506 101 | 000507 102 | 000513 103 | 000530 104 | 000533 105 | 000540 106 | 000542 107 | 000544 108 | 000552 109 | 000561 110 | 000564 111 | 000567 112 | 000568 113 | 000569 114 | 000571 115 | 000574 116 | 000576 117 | 000584 118 | 000590 119 | 000593 120 | 000598 121 | 000602 122 | 000604 123 | 000605 124 | 000619 125 | 000620 126 | 000631 127 | 000635 128 | 000649 129 | 000661 130 | 000672 131 | 000676 132 | 000694 133 | 000697 134 | 000708 135 | 000711 136 | 000712 137 | 000713 138 | 000717 139 | 000727 140 | 000729 141 | 000731 142 | 000732 143 | 000739 144 | 000743 145 | 000746 146 | 000748 147 | 000753 148 | 000754 149 | 000756 150 | 000764 151 | 000769 152 | 000774 153 | 000775 154 | 000786 155 | 000793 156 | 000796 157 | 000802 158 | 000808 159 | 000809 160 | 000814 161 | 000819 162 | 000821 163 | 
000823 164 | 000846 165 | 000848 166 | 000858 167 | 000863 168 | 000867 169 | 000869 170 | 000875 171 | 000880 172 | 000883 173 | 000884 174 | 000888 175 | 000893 176 | 000894 177 | 000897 178 | 000898 179 | 000900 180 | 000910 181 | 000912 182 | 000918 183 | 000919 184 | 000920 185 | 000921 186 | 000922 187 | 000926 188 | 000946 189 | 000947 190 | 000954 191 | 000960 192 | 000961 193 | 000967 194 | 000971 195 | 000977 196 | 000978 197 | 000982 198 | 000984 199 | 000986 200 | 000988 201 | 000989 202 | 000996 203 | 000997 204 | 001006 205 | 001007 206 | 001020 207 | 001029 208 | 001036 209 | 001043 210 | 001044 211 | 001047 212 | 001057 213 | 001059 214 | 001062 215 | 001068 216 | 001069 217 | 001075 218 | 001076 219 | 001078 220 | 001080 221 | 001084 222 | 001091 223 | 001096 224 | 001099 225 | 001104 226 | 001114 227 | 001127 228 | 001129 229 | 001130 230 | 001139 231 | 001146 232 | 001147 233 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/train.txt: -------------------------------------------------------------------------------- 1 | 000001 2 | 000002 3 | 000003 4 | 000004 5 | 000005 6 | 000006 7 | 000007 8 | 000008 9 | 000011 10 | 000016 11 | 000018 12 | 000019 13 | 000020 14 | 000021 15 | 000025 16 | 000026 17 | 000027 18 | 000030 19 | 000033 20 | 000035 21 | 000036 22 | 000037 23 | 000038 24 | 000039 25 | 000041 26 | 000042 27 | 000043 28 | 000044 29 | 000045 30 | 000046 31 | 000047 32 | 000048 33 | 000049 34 | 000050 35 | 000051 36 | 000052 37 | 000054 38 | 000056 39 | 000057 40 | 000058 41 | 000060 42 | 000061 43 | 000062 44 | 000063 45 | 000064 46 | 000065 47 | 000066 48 | 000069 49 | 000070 50 | 000071 51 | 000072 52 | 000073 53 | 000077 54 | 000078 55 | 000079 56 | 000080 57 | 000081 58 | 000082 59 | 000083 60 | 000084 61 | 000085 62 | 000086 63 | 000087 64 | 000088 65 | 000089 66 | 000091 67 | 000093 68 | 000094 69 | 000095 70 | 000096 71 | 000097 72 | 000099 73 | 000100 74 | 000102 75 | 000103 76 | 000104 77 | 000106 78 | 000108 79 | 000109 80 | 000110 81 | 000111 82 | 000112 83 | 000113 84 | 000114 85 | 000115 86 | 000117 87 | 000121 88 | 000122 89 | 000123 90 | 000125 91 | 000126 92 | 000127 93 | 000128 94 | 000129 95 | 000130 96 | 000132 97 | 000134 98 | 000138 99 | 000140 100 | 000142 101 | 000143 102 | 000144 103 | 000145 104 | 000146 105 | 000147 106 | 000150 107 | 000151 108 | 000152 109 | 000153 110 | 000154 111 | 000156 112 | 000158 113 | 000159 114 | 000160 115 | 000162 116 | 000165 117 | 000166 118 | 000167 119 | 000168 120 | 000169 121 | 000170 122 | 000171 123 | 000172 124 | 000173 125 | 000174 126 | 000175 127 | 000176 128 | 000177 129 | 000178 130 | 000179 131 | 000180 132 | 000182 133 | 000184 134 | 000185 135 | 000186 136 | 000188 137 | 000189 138 | 000190 139 | 000191 140 | 000193 141 | 000194 142 | 000195 143 | 000196 144 | 000198 145 | 000199 146 | 000200 147 | 000201 148 | 000202 149 | 000204 150 | 000205 151 | 000206 152 | 000207 153 | 000208 154 | 000209 155 | 000210 156 | 000213 157 | 000215 158 | 000216 159 | 000217 160 | 000218 161 | 000221 162 | 000223 163 | 000225 164 | 000226 165 | 000227 166 | 000228 167 | 000232 168 | 000233 169 | 000234 170 | 000236 171 | 000238 172 | 000239 173 | 000240 174 | 000242 175 | 000243 176 | 000245 177 | 000246 178 | 000247 179 | 000249 180 | 000250 181 | 000251 182 | 000252 183 | 000253 184 | 000257 185 | 000258 186 | 000260 187 | 000261 188 | 000265 189 | 000267 190 | 000270 191 | 000271 192 | 000272 193 | 000273 194 | 000276 195 | 000277 196 | 
000278 197 | 000279 198 | 000280 199 | 000281 200 | 000283 201 | 000284 202 | 000285 203 | 000286 204 | 000287 205 | 000288 206 | 000289 207 | 000292 208 | 000293 209 | 000294 210 | 000295 211 | 000296 212 | 000298 213 | 000299 214 | 000301 215 | 000302 216 | 000309 217 | 000310 218 | 000311 219 | 000312 220 | 000313 221 | 000314 222 | 000316 223 | 000317 224 | 000318 225 | 000320 226 | 000321 227 | 000322 228 | 000323 229 | 000324 230 | 000325 231 | 000326 232 | 000327 233 | 000328 234 | 000329 235 | 000331 236 | 000332 237 | 000335 238 | 000336 239 | 000337 240 | 000338 241 | 000339 242 | 000341 243 | 000342 244 | 000343 245 | 000344 246 | 000345 247 | 000347 248 | 000348 249 | 000349 250 | 000350 251 | 000353 252 | 000354 253 | 000355 254 | 000356 255 | 000357 256 | 000358 257 | 000359 258 | 000361 259 | 000362 260 | 000363 261 | 000364 262 | 000365 263 | 000367 264 | 000368 265 | 000370 266 | 000371 267 | 000372 268 | 000373 269 | 000374 270 | 000376 271 | 000377 272 | 000379 273 | 000380 274 | 000381 275 | 000383 276 | 000385 277 | 000386 278 | 000387 279 | 000388 280 | 000389 281 | 000391 282 | 000393 283 | 000394 284 | 000395 285 | 000396 286 | 000397 287 | 000399 288 | 000400 289 | 000401 290 | 000402 291 | 000403 292 | 000405 293 | 000406 294 | 000407 295 | 000410 296 | 000411 297 | 000412 298 | 000413 299 | 000414 300 | 000415 301 | 000416 302 | 000417 303 | 000418 304 | 000419 305 | 000420 306 | 000421 307 | 000422 308 | 000424 309 | 000426 310 | 000427 311 | 000428 312 | 000429 313 | 000432 314 | 000433 315 | 000434 316 | 000435 317 | 000438 318 | 000439 319 | 000441 320 | 000443 321 | 000445 322 | 000446 323 | 000447 324 | 000448 325 | 000449 326 | 000451 327 | 000452 328 | 000453 329 | 000455 330 | 000456 331 | 000457 332 | 000458 333 | 000459 334 | 000460 335 | 000463 336 | 000464 337 | 000465 338 | 000466 339 | 000467 340 | 000468 341 | 000469 342 | 000471 343 | 000473 344 | 000474 345 | 000475 346 | 000476 347 | 000477 348 | 000478 349 | 000479 350 | 000480 351 | 000482 352 | 000483 353 | 000484 354 | 000485 355 | 000490 356 | 000492 357 | 000493 358 | 000494 359 | 000495 360 | 000496 361 | 000497 362 | 000498 363 | 000500 364 | 000502 365 | 000503 366 | 000509 367 | 000510 368 | 000511 369 | 000512 370 | 000514 371 | 000515 372 | 000516 373 | 000517 374 | 000518 375 | 000520 376 | 000521 377 | 000522 378 | 000523 379 | 000525 380 | 000527 381 | 000528 382 | 000529 383 | 000531 384 | 000532 385 | 000534 386 | 000535 387 | 000536 388 | 000537 389 | 000538 390 | 000539 391 | 000541 392 | 000543 393 | 000545 394 | 000546 395 | 000547 396 | 000548 397 | 000549 398 | 000550 399 | 000551 400 | 000553 401 | 000554 402 | 000555 403 | 000556 404 | 000557 405 | 000558 406 | 000559 407 | 000560 408 | 000562 409 | 000563 410 | 000565 411 | 000566 412 | 000570 413 | 000572 414 | 000573 415 | 000575 416 | 000577 417 | 000578 418 | 000579 419 | 000580 420 | 000582 421 | 000583 422 | 000585 423 | 000586 424 | 000587 425 | 000588 426 | 000589 427 | 000591 428 | 000592 429 | 000594 430 | 000595 431 | 000596 432 | 000597 433 | 000599 434 | 000600 435 | 000601 436 | 000606 437 | 000607 438 | 000608 439 | 000609 440 | 000610 441 | 000611 442 | 000612 443 | 000613 444 | 000614 445 | 000615 446 | 000616 447 | 000617 448 | 000618 449 | 000621 450 | 000622 451 | 000623 452 | 000624 453 | 000625 454 | 000626 455 | 000627 456 | 000628 457 | 000630 458 | 000632 459 | 000633 460 | 000634 461 | 000636 462 | 000637 463 | 000638 464 | 000641 465 | 000642 466 | 000643 467 | 000645 468 | 000647 469 | 
000648 470 | 000650 471 | 000651 472 | 000652 473 | 000653 474 | 000654 475 | 000656 476 | 000657 477 | 000658 478 | 000659 479 | 000663 480 | 000664 481 | 000665 482 | 000666 483 | 000667 484 | 000668 485 | 000669 486 | 000670 487 | 000671 488 | 000673 489 | 000674 490 | 000675 491 | 000677 492 | 000678 493 | 000679 494 | 000680 495 | 000681 496 | 000682 497 | 000683 498 | 000684 499 | 000685 500 | 000686 501 | 000687 502 | 000688 503 | 000689 504 | 000690 505 | 000691 506 | 000692 507 | 000693 508 | 000696 509 | 000698 510 | 000699 511 | 000700 512 | 000701 513 | 000702 514 | 000703 515 | 000704 516 | 000705 517 | 000706 518 | 000707 519 | 000709 520 | 000710 521 | 000714 522 | 000715 523 | 000718 524 | 000719 525 | 000720 526 | 000721 527 | 000722 528 | 000723 529 | 000724 530 | 000725 531 | 000726 532 | 000728 533 | 000730 534 | 000733 535 | 000734 536 | 000736 537 | 000738 538 | 000740 539 | 000741 540 | 000742 541 | 000744 542 | 000745 543 | 000747 544 | 000749 545 | 000750 546 | 000751 547 | 000755 548 | 000757 549 | 000758 550 | 000759 551 | 000760 552 | 000761 553 | 000762 554 | 000763 555 | 000765 556 | 000766 557 | 000767 558 | 000768 559 | 000770 560 | 000772 561 | 000776 562 | 000777 563 | 000778 564 | 000779 565 | 000780 566 | 000781 567 | 000782 568 | 000783 569 | 000784 570 | 000785 571 | 000787 572 | 000788 573 | 000789 574 | 000790 575 | 000791 576 | 000792 577 | 000794 578 | 000795 579 | 000797 580 | 000798 581 | 000799 582 | 000800 583 | 000801 584 | 000803 585 | 000804 586 | 000805 587 | 000806 588 | 000811 589 | 000813 590 | 000815 591 | 000816 592 | 000817 593 | 000818 594 | 000822 595 | 000824 596 | 000825 597 | 000826 598 | 000828 599 | 000829 600 | 000830 601 | 000831 602 | 000832 603 | 000833 604 | 000834 605 | 000835 606 | 000836 607 | 000837 608 | 000838 609 | 000839 610 | 000840 611 | 000841 612 | 000842 613 | 000843 614 | 000844 615 | 000845 616 | 000847 617 | 000849 618 | 000850 619 | 000851 620 | 000852 621 | 000853 622 | 000854 623 | 000855 624 | 000856 625 | 000857 626 | 000859 627 | 000860 628 | 000861 629 | 000862 630 | 000864 631 | 000865 632 | 000866 633 | 000868 634 | 000871 635 | 000872 636 | 000873 637 | 000874 638 | 000876 639 | 000877 640 | 000881 641 | 000885 642 | 000886 643 | 000887 644 | 000889 645 | 000890 646 | 000892 647 | 000895 648 | 000896 649 | 000899 650 | 000901 651 | 000902 652 | 000903 653 | 000904 654 | 000905 655 | 000907 656 | 000908 657 | 000913 658 | 000914 659 | 000915 660 | 000916 661 | 000917 662 | 000923 663 | 000924 664 | 000927 665 | 000929 666 | 000930 667 | 000931 668 | 000932 669 | 000933 670 | 000934 671 | 000935 672 | 000936 673 | 000937 674 | 000938 675 | 000939 676 | 000940 677 | 000941 678 | 000942 679 | 000943 680 | 000944 681 | 000945 682 | 000948 683 | 000949 684 | 000950 685 | 000951 686 | 000952 687 | 000953 688 | 000955 689 | 000956 690 | 000957 691 | 000958 692 | 000959 693 | 000962 694 | 000963 695 | 000964 696 | 000966 697 | 000969 698 | 000970 699 | 000972 700 | 000973 701 | 000974 702 | 000975 703 | 000976 704 | 000979 705 | 000980 706 | 000981 707 | 000983 708 | 000985 709 | 000987 710 | 000990 711 | 000992 712 | 000993 713 | 000994 714 | 000995 715 | 000998 716 | 001000 717 | 001001 718 | 001002 719 | 001003 720 | 001004 721 | 001005 722 | 001008 723 | 001009 724 | 001010 725 | 001012 726 | 001013 727 | 001014 728 | 001015 729 | 001016 730 | 001017 731 | 001018 732 | 001019 733 | 001021 734 | 001022 735 | 001024 736 | 001025 737 | 001026 738 | 001027 739 | 001028 740 | 001030 741 | 001032 742 | 
001033 743 | 001034 744 | 001035 745 | 001037 746 | 001038 747 | 001039 748 | 001040 749 | 001041 750 | 001042 751 | 001046 752 | 001048 753 | 001049 754 | 001051 755 | 001052 756 | 001053 757 | 001055 758 | 001056 759 | 001058 760 | 001060 761 | 001061 762 | 001063 763 | 001064 764 | 001065 765 | 001066 766 | 001067 767 | 001070 768 | 001071 769 | 001072 770 | 001074 771 | 001077 772 | 001079 773 | 001081 774 | 001082 775 | 001083 776 | 001085 777 | 001086 778 | 001087 779 | 001088 780 | 001089 781 | 001090 782 | 001092 783 | 001093 784 | 001094 785 | 001095 786 | 001097 787 | 001098 788 | 001101 789 | 001102 790 | 001103 791 | 001105 792 | 001107 793 | 001108 794 | 001109 795 | 001110 796 | 001111 797 | 001112 798 | 001113 799 | 001115 800 | 001116 801 | 001117 802 | 001118 803 | 001119 804 | 001120 805 | 001121 806 | 001122 807 | 001123 808 | 001124 809 | 001125 810 | 001126 811 | 001128 812 | 001131 813 | 001132 814 | 001133 815 | 001134 816 | 001135 817 | 001136 818 | 001137 819 | 001138 820 | 001140 821 | 001141 822 | 001142 823 | 001143 824 | 001144 825 | 001145 826 | 001148 827 | 001149 828 | 001150 829 | 001151 830 | 001152 831 | 001154 832 | 001156 833 | 001158 834 | 001159 835 | 001160 836 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt: -------------------------------------------------------------------------------- 1 | 000001 2 | 000002 3 | 000003 4 | 000004 5 | 000005 6 | 000006 7 | 000007 8 | 000008 9 | 000011 10 | 000016 11 | 000017 12 | 000018 13 | 000019 14 | 000020 15 | 000021 16 | 000022 17 | 000025 18 | 000026 19 | 000027 20 | 000030 21 | 000033 22 | 000035 23 | 000036 24 | 000037 25 | 000038 26 | 000039 27 | 000041 28 | 000042 29 | 000043 30 | 000044 31 | 000045 32 | 000046 33 | 000047 34 | 000048 35 | 000049 36 | 000050 37 | 000051 38 | 000052 39 | 000054 40 | 000055 41 | 000056 42 | 000057 43 | 000058 44 | 000060 45 | 000061 46 | 000062 47 | 000063 48 | 000064 49 | 000065 50 | 000066 51 | 000068 52 | 000069 53 | 000070 54 | 000071 55 | 000072 56 | 000073 57 | 000077 58 | 000078 59 | 000079 60 | 000080 61 | 000081 62 | 000082 63 | 000083 64 | 000084 65 | 000085 66 | 000086 67 | 000087 68 | 000088 69 | 000089 70 | 000091 71 | 000092 72 | 000093 73 | 000094 74 | 000095 75 | 000096 76 | 000097 77 | 000098 78 | 000099 79 | 000100 80 | 000102 81 | 000103 82 | 000104 83 | 000106 84 | 000108 85 | 000109 86 | 000110 87 | 000111 88 | 000112 89 | 000113 90 | 000114 91 | 000115 92 | 000117 93 | 000118 94 | 000119 95 | 000121 96 | 000122 97 | 000123 98 | 000124 99 | 000125 100 | 000126 101 | 000127 102 | 000128 103 | 000129 104 | 000130 105 | 000132 106 | 000134 107 | 000138 108 | 000140 109 | 000141 110 | 000142 111 | 000143 112 | 000144 113 | 000145 114 | 000146 115 | 000147 116 | 000148 117 | 000150 118 | 000151 119 | 000152 120 | 000153 121 | 000154 122 | 000155 123 | 000156 124 | 000158 125 | 000159 126 | 000160 127 | 000161 128 | 000162 129 | 000165 130 | 000166 131 | 000167 132 | 000168 133 | 000169 134 | 000170 135 | 000171 136 | 000172 137 | 000173 138 | 000174 139 | 000175 140 | 000176 141 | 000177 142 | 000178 143 | 000179 144 | 000180 145 | 000182 146 | 000184 147 | 000185 148 | 000186 149 | 000188 150 | 000189 151 | 000190 152 | 000191 153 | 000193 154 | 000194 155 | 000195 156 | 000196 157 | 000198 158 | 000199 159 | 000200 160 | 000201 161 | 000202 162 | 000204 163 | 000205 164 | 000206 165 | 000207 166 | 000208 167 | 000209 168 | 000210 169 | 000211 170 | 000212 171 | 000213 
172 | 000214 173 | 000215 174 | 000216 175 | 000217 176 | 000218 177 | 000221 178 | 000222 179 | 000223 180 | 000224 181 | 000225 182 | 000226 183 | 000227 184 | 000228 185 | 000232 186 | 000233 187 | 000234 188 | 000236 189 | 000238 190 | 000239 191 | 000240 192 | 000242 193 | 000243 194 | 000245 195 | 000246 196 | 000247 197 | 000249 198 | 000250 199 | 000251 200 | 000252 201 | 000253 202 | 000254 203 | 000257 204 | 000258 205 | 000259 206 | 000260 207 | 000261 208 | 000265 209 | 000267 210 | 000270 211 | 000271 212 | 000272 213 | 000273 214 | 000276 215 | 000277 216 | 000278 217 | 000279 218 | 000280 219 | 000281 220 | 000283 221 | 000284 222 | 000285 223 | 000286 224 | 000287 225 | 000288 226 | 000289 227 | 000292 228 | 000293 229 | 000294 230 | 000295 231 | 000296 232 | 000297 233 | 000298 234 | 000299 235 | 000300 236 | 000301 237 | 000302 238 | 000304 239 | 000305 240 | 000307 241 | 000308 242 | 000309 243 | 000310 244 | 000311 245 | 000312 246 | 000313 247 | 000314 248 | 000315 249 | 000316 250 | 000317 251 | 000318 252 | 000320 253 | 000321 254 | 000322 255 | 000323 256 | 000324 257 | 000325 258 | 000326 259 | 000327 260 | 000328 261 | 000329 262 | 000330 263 | 000331 264 | 000332 265 | 000333 266 | 000335 267 | 000336 268 | 000337 269 | 000338 270 | 000339 271 | 000341 272 | 000342 273 | 000343 274 | 000344 275 | 000345 276 | 000347 277 | 000348 278 | 000349 279 | 000350 280 | 000353 281 | 000354 282 | 000355 283 | 000356 284 | 000357 285 | 000358 286 | 000359 287 | 000361 288 | 000362 289 | 000363 290 | 000364 291 | 000365 292 | 000367 293 | 000368 294 | 000370 295 | 000371 296 | 000372 297 | 000373 298 | 000374 299 | 000375 300 | 000376 301 | 000377 302 | 000378 303 | 000379 304 | 000380 305 | 000381 306 | 000382 307 | 000383 308 | 000385 309 | 000386 310 | 000387 311 | 000388 312 | 000389 313 | 000391 314 | 000393 315 | 000394 316 | 000395 317 | 000396 318 | 000397 319 | 000399 320 | 000400 321 | 000401 322 | 000402 323 | 000403 324 | 000404 325 | 000405 326 | 000406 327 | 000407 328 | 000410 329 | 000411 330 | 000412 331 | 000413 332 | 000414 333 | 000415 334 | 000416 335 | 000417 336 | 000418 337 | 000419 338 | 000420 339 | 000421 340 | 000422 341 | 000424 342 | 000425 343 | 000426 344 | 000427 345 | 000428 346 | 000429 347 | 000432 348 | 000433 349 | 000434 350 | 000435 351 | 000436 352 | 000437 353 | 000438 354 | 000439 355 | 000441 356 | 000442 357 | 000443 358 | 000445 359 | 000446 360 | 000447 361 | 000448 362 | 000449 363 | 000451 364 | 000452 365 | 000453 366 | 000455 367 | 000456 368 | 000457 369 | 000458 370 | 000459 371 | 000460 372 | 000461 373 | 000463 374 | 000464 375 | 000465 376 | 000466 377 | 000467 378 | 000468 379 | 000469 380 | 000471 381 | 000473 382 | 000474 383 | 000475 384 | 000476 385 | 000477 386 | 000478 387 | 000479 388 | 000480 389 | 000481 390 | 000482 391 | 000483 392 | 000484 393 | 000485 394 | 000490 395 | 000491 396 | 000492 397 | 000493 398 | 000494 399 | 000495 400 | 000496 401 | 000497 402 | 000498 403 | 000500 404 | 000502 405 | 000503 406 | 000504 407 | 000508 408 | 000509 409 | 000510 410 | 000511 411 | 000512 412 | 000514 413 | 000515 414 | 000516 415 | 000517 416 | 000518 417 | 000519 418 | 000520 419 | 000521 420 | 000522 421 | 000523 422 | 000524 423 | 000525 424 | 000526 425 | 000527 426 | 000528 427 | 000529 428 | 000531 429 | 000532 430 | 000534 431 | 000535 432 | 000536 433 | 000537 434 | 000538 435 | 000539 436 | 000541 437 | 000543 438 | 000545 439 | 000546 440 | 000547 441 | 000548 442 | 000549 443 | 000550 444 | 000551 445 | 
000553 446 | 000554 447 | 000555 448 | 000556 449 | 000557 450 | 000558 451 | 000559 452 | 000560 453 | 000562 454 | 000563 455 | 000565 456 | 000566 457 | 000570 458 | 000572 459 | 000573 460 | 000575 461 | 000577 462 | 000578 463 | 000579 464 | 000580 465 | 000581 466 | 000582 467 | 000583 468 | 000585 469 | 000586 470 | 000587 471 | 000588 472 | 000589 473 | 000591 474 | 000592 475 | 000594 476 | 000595 477 | 000596 478 | 000597 479 | 000599 480 | 000600 481 | 000601 482 | 000603 483 | 000606 484 | 000607 485 | 000608 486 | 000609 487 | 000610 488 | 000611 489 | 000612 490 | 000613 491 | 000614 492 | 000615 493 | 000616 494 | 000617 495 | 000618 496 | 000621 497 | 000622 498 | 000623 499 | 000624 500 | 000625 501 | 000626 502 | 000627 503 | 000628 504 | 000629 505 | 000630 506 | 000632 507 | 000633 508 | 000634 509 | 000636 510 | 000637 511 | 000638 512 | 000639 513 | 000640 514 | 000641 515 | 000642 516 | 000643 517 | 000644 518 | 000645 519 | 000646 520 | 000647 521 | 000648 522 | 000650 523 | 000651 524 | 000652 525 | 000653 526 | 000654 527 | 000655 528 | 000656 529 | 000657 530 | 000658 531 | 000659 532 | 000660 533 | 000662 534 | 000663 535 | 000664 536 | 000665 537 | 000666 538 | 000667 539 | 000668 540 | 000669 541 | 000670 542 | 000671 543 | 000673 544 | 000674 545 | 000675 546 | 000677 547 | 000678 548 | 000679 549 | 000680 550 | 000681 551 | 000682 552 | 000683 553 | 000684 554 | 000685 555 | 000686 556 | 000687 557 | 000688 558 | 000689 559 | 000690 560 | 000691 561 | 000692 562 | 000693 563 | 000695 564 | 000696 565 | 000698 566 | 000699 567 | 000700 568 | 000701 569 | 000702 570 | 000703 571 | 000704 572 | 000705 573 | 000706 574 | 000707 575 | 000709 576 | 000710 577 | 000714 578 | 000715 579 | 000716 580 | 000718 581 | 000719 582 | 000720 583 | 000721 584 | 000722 585 | 000723 586 | 000724 587 | 000725 588 | 000726 589 | 000728 590 | 000730 591 | 000733 592 | 000734 593 | 000735 594 | 000736 595 | 000737 596 | 000738 597 | 000740 598 | 000741 599 | 000742 600 | 000744 601 | 000745 602 | 000747 603 | 000749 604 | 000750 605 | 000751 606 | 000752 607 | 000755 608 | 000757 609 | 000758 610 | 000759 611 | 000760 612 | 000761 613 | 000762 614 | 000763 615 | 000765 616 | 000766 617 | 000767 618 | 000768 619 | 000770 620 | 000771 621 | 000772 622 | 000773 623 | 000776 624 | 000777 625 | 000778 626 | 000779 627 | 000780 628 | 000781 629 | 000782 630 | 000783 631 | 000784 632 | 000785 633 | 000787 634 | 000788 635 | 000789 636 | 000790 637 | 000791 638 | 000792 639 | 000794 640 | 000795 641 | 000797 642 | 000798 643 | 000799 644 | 000800 645 | 000801 646 | 000803 647 | 000804 648 | 000805 649 | 000806 650 | 000807 651 | 000810 652 | 000811 653 | 000812 654 | 000813 655 | 000815 656 | 000816 657 | 000817 658 | 000818 659 | 000820 660 | 000822 661 | 000824 662 | 000825 663 | 000826 664 | 000827 665 | 000828 666 | 000829 667 | 000830 668 | 000831 669 | 000832 670 | 000833 671 | 000834 672 | 000835 673 | 000836 674 | 000837 675 | 000838 676 | 000839 677 | 000840 678 | 000841 679 | 000842 680 | 000843 681 | 000844 682 | 000845 683 | 000847 684 | 000849 685 | 000850 686 | 000851 687 | 000852 688 | 000853 689 | 000854 690 | 000855 691 | 000856 692 | 000857 693 | 000859 694 | 000860 695 | 000861 696 | 000862 697 | 000864 698 | 000865 699 | 000866 700 | 000868 701 | 000870 702 | 000871 703 | 000872 704 | 000873 705 | 000874 706 | 000876 707 | 000877 708 | 000878 709 | 000879 710 | 000881 711 | 000882 712 | 000885 713 | 000886 714 | 000887 715 | 000889 716 | 000890 717 | 000891 718 | 
000892 719 | 000895 720 | 000896 721 | 000899 722 | 000901 723 | 000902 724 | 000903 725 | 000904 726 | 000905 727 | 000906 728 | 000907 729 | 000908 730 | 000909 731 | 000911 732 | 000913 733 | 000914 734 | 000915 735 | 000916 736 | 000917 737 | 000923 738 | 000924 739 | 000925 740 | 000927 741 | 000928 742 | 000929 743 | 000930 744 | 000931 745 | 000932 746 | 000933 747 | 000934 748 | 000935 749 | 000936 750 | 000937 751 | 000938 752 | 000939 753 | 000940 754 | 000941 755 | 000942 756 | 000943 757 | 000944 758 | 000945 759 | 000948 760 | 000949 761 | 000950 762 | 000951 763 | 000952 764 | 000953 765 | 000955 766 | 000956 767 | 000957 768 | 000958 769 | 000959 770 | 000962 771 | 000963 772 | 000964 773 | 000965 774 | 000966 775 | 000968 776 | 000969 777 | 000970 778 | 000972 779 | 000973 780 | 000974 781 | 000975 782 | 000976 783 | 000979 784 | 000980 785 | 000981 786 | 000983 787 | 000985 788 | 000987 789 | 000990 790 | 000991 791 | 000992 792 | 000993 793 | 000994 794 | 000995 795 | 000998 796 | 000999 797 | 001000 798 | 001001 799 | 001002 800 | 001003 801 | 001004 802 | 001005 803 | 001008 804 | 001009 805 | 001010 806 | 001011 807 | 001012 808 | 001013 809 | 001014 810 | 001015 811 | 001016 812 | 001017 813 | 001018 814 | 001019 815 | 001021 816 | 001022 817 | 001023 818 | 001024 819 | 001025 820 | 001026 821 | 001027 822 | 001028 823 | 001030 824 | 001031 825 | 001032 826 | 001033 827 | 001034 828 | 001035 829 | 001037 830 | 001038 831 | 001039 832 | 001040 833 | 001041 834 | 001042 835 | 001045 836 | 001046 837 | 001048 838 | 001049 839 | 001050 840 | 001051 841 | 001052 842 | 001053 843 | 001054 844 | 001055 845 | 001056 846 | 001058 847 | 001060 848 | 001061 849 | 001063 850 | 001064 851 | 001065 852 | 001066 853 | 001067 854 | 001070 855 | 001071 856 | 001072 857 | 001073 858 | 001074 859 | 001077 860 | 001079 861 | 001081 862 | 001082 863 | 001083 864 | 001085 865 | 001086 866 | 001087 867 | 001088 868 | 001089 869 | 001090 870 | 001092 871 | 001093 872 | 001094 873 | 001095 874 | 001097 875 | 001098 876 | 001100 877 | 001101 878 | 001102 879 | 001103 880 | 001105 881 | 001106 882 | 001107 883 | 001108 884 | 001109 885 | 001110 886 | 001111 887 | 001112 888 | 001113 889 | 001115 890 | 001116 891 | 001117 892 | 001118 893 | 001119 894 | 001120 895 | 001121 896 | 001122 897 | 001123 898 | 001124 899 | 001125 900 | 001126 901 | 001128 902 | 001131 903 | 001132 904 | 001133 905 | 001134 906 | 001135 907 | 001136 908 | 001137 909 | 001138 910 | 001140 911 | 001141 912 | 001142 913 | 001143 914 | 001144 915 | 001145 916 | 001148 917 | 001149 918 | 001150 919 | 001151 920 | 001152 921 | 001153 922 | 001154 923 | 001155 924 | 001156 925 | 001157 926 | 001158 927 | 001159 928 | 001160 929 | -------------------------------------------------------------------------------- /data/VOCdevkit/VOC2007/ImageSets/Main/val.txt: -------------------------------------------------------------------------------- 1 | 000017 2 | 000022 3 | 000055 4 | 000068 5 | 000092 6 | 000098 7 | 000118 8 | 000119 9 | 000124 10 | 000141 11 | 000148 12 | 000155 13 | 000161 14 | 000211 15 | 000212 16 | 000214 17 | 000222 18 | 000224 19 | 000254 20 | 000259 21 | 000297 22 | 000300 23 | 000304 24 | 000305 25 | 000307 26 | 000308 27 | 000315 28 | 000330 29 | 000333 30 | 000375 31 | 000378 32 | 000382 33 | 000404 34 | 000425 35 | 000436 36 | 000437 37 | 000442 38 | 000461 39 | 000481 40 | 000491 41 | 000504 42 | 000508 43 | 000519 44 | 000524 45 | 000526 46 | 000581 47 | 000603 48 | 000629 49 | 000639 50 | 000640 51 | 
000644 52 | 000646 53 | 000655 54 | 000660 55 | 000662 56 | 000695 57 | 000716 58 | 000735 59 | 000737 60 | 000752 61 | 000771 62 | 000773 63 | 000807 64 | 000810 65 | 000812 66 | 000820 67 | 000827 68 | 000870 69 | 000878 70 | 000879 71 | 000882 72 | 000891 73 | 000906 74 | 000909 75 | 000911 76 | 000925 77 | 000928 78 | 000965 79 | 000968 80 | 000991 81 | 000999 82 | 001011 83 | 001023 84 | 001031 85 | 001045 86 | 001050 87 | 001054 88 | 001073 89 | 001100 90 | 001106 91 | 001153 92 | 001155 93 | 001157 94 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/annotations_cache/annots.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/data/VOCdevkitVOC2007/annotations_cache/annots.pkl -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_train_None.txt: -------------------------------------------------------------------------------- 1 | 000221 0.014 6.2 39.9 631.2 464.1 2 | 000223 0.014 6.0 28.5 555.4 374.2 3 | 000236 0.012 4.9 31.2 472.5 370.8 4 | 000238 0.012 3.9 29.1 471.7 363.1 5 | 000744 0.013 -222.6 234.6 258.1 341.3 6 | 001112 0.012 417.1 -44.6 629.0 65.2 7 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_trainval_None.txt: -------------------------------------------------------------------------------- 1 | 000221 0.014 6.2 39.9 631.2 464.1 2 | 000223 0.014 6.0 28.5 555.4 374.2 3 | 000224 0.040 109.3 24.8 385.1 322.4 4 | 000224 0.017 268.0 195.9 583.6 433.1 5 | 000236 0.012 4.9 31.2 472.5 370.8 6 | 000238 0.012 3.9 29.1 471.7 363.1 7 | 000744 0.013 -222.6 234.6 258.1 341.3 8 | 001112 0.012 417.1 -44.6 629.0 65.2 9 | -------------------------------------------------------------------------------- /data/VOCdevkitVOC2007/results/det_val_None.txt: -------------------------------------------------------------------------------- 1 | 000224 0.040 109.3 24.8 385.1 322.4 2 | 000224 0.017 268.0 195.9 583.6 433.1 3 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 2 | 3 | #from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 4 | from .config import * 5 | import torch 6 | import cv2 7 | import numpy as np 8 | 9 | def detection_collate(batch): 10 | """Custom collate fn for dealing with batches of images that have a different 11 | number of associated object annotations (bounding boxes). 
12 | 13 | Arguments: 14 | batch: (tuple) A tuple of tensor images and lists of annotations 15 | 16 | Return: 17 | A tuple containing: 18 | 1) (tensor) batch of images stacked on their 0 dim 19 | 2) (list of tensors) annotations for a given image are stacked on 20 | 0 dim 21 | """ 22 | targets = [] 23 | imgs = [] 24 | for sample in batch: 25 | imgs.append(sample[0]) 26 | targets.append(torch.FloatTensor(sample[1])) 27 | return torch.stack(imgs, 0), targets 28 | 29 | 30 | def base_transform(image, size, mean): 31 | x = cv2.resize(image, (size, size)).astype(np.float32) 32 | x -= mean 33 | x = x.astype(np.float32) 34 | return x 35 | 36 | 37 | class BaseTransform: 38 | def __init__(self, size, mean): 39 | self.size = size 40 | self.mean = np.array(mean, dtype=np.float32) 41 | 42 | def __call__(self, image, boxes=None, labels=None): 43 | return base_transform(image, self.size, self.mean), boxes, labels 44 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | HOME = os.path.expanduser("~") 6 | 7 | # for making bounding boxes pretty 8 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 9 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 10 | 11 | MEANS = (104, 117, 123) 12 | 13 | # SSD300 CONFIGS 14 | voc = { 15 | 'num_classes': 3, # [set this to your own number of classes + 1 for background] originally there was only the single 'ship' class, so this used to be 2 16 | 'lr_steps': (60000, 90000, 120000), # ## originally (80000, 100000, 120000); modified 17 | 'max_iter': 120000, # [set this to your own number of training iterations] 18 | 'feature_maps': [38, 19, 10, 5, 3, 1], 19 | 'min_dim': 300, 20 | 'steps': [8, 16, 32, 64, 100, 300], 21 | 'min_sizes': [30, 60, 111, 162, 213, 264], 22 | 'max_sizes': [60, 111, 162, 213, 264, 315], 23 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 24 | 'variance': [0.1, 0.2], 25 | 'clip': True, 26 | 'name': 'VOC', 27 | } 28 | 29 | # coco = { 30 | # 'num_classes': 91, 31 | # 'lr_steps': (280000, 360000, 400000), 32 | # 'max_iter': 400000, 33 | # 'feature_maps': [38, 19, 10, 5, 3, 1], 34 | # 'min_dim': 300, 35 | # 'steps': [8, 16, 32, 64, 100, 300], 36 | # 'min_sizes': [21, 45, 99, 153, 207, 261], 37 | # 'max_sizes': [45, 99, 153, 207, 261, 315], 38 | # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 39 | # 'variance': [0.1, 0.2], 40 | # 'clip': True, 41 | # 'name': 'COCO', 42 | # } 43 | 
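A quick cross-check of the prior-box settings above: the per-layer counts implied by 'feature_maps' and 'aspect_ratios' should sum to the standard SSD300 total of 8732 priors. A minimal standalone sketch (names mirror the config keys; see layers/functions/prior_box.py for the real generator):

# Each feature-map cell gets 2 priors at aspect ratio 1 (scales s_k and
# sqrt(s_k * s_{k+1})) plus 2 priors per extra aspect ratio (ar and 1/ar).
feature_maps = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
total = sum(f * f * (2 + 2 * len(ars)) for f, ars in zip(feature_maps, aspect_ratios))
print(total)  # 8732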
-------------------------------------------------------------------------------- /data/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/data/example.jpg -------------------------------------------------------------------------------- /data/scripts/COCO2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2014.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2014.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations ] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2014.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2014.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2014.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2014.zip 60 | rm ../images/val2014.zip 61 | rm ./annotations_trainval2014.zip 62 | 63 | echo "Creating trainval35k dataset..." 64 | 65 | # Download annotations json 66 | echo "Downloading trainval35k annotations from S3" 67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip 68 | 69 | # combine train and val 70 | echo "Combining train and val images" 71 | mkdir ../images/trainval35k 72 | cd ../images/train2014 73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp 74 | cd ../val2014 75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + 76 | 77 | 78 | end=`date +%s` 79 | runtime=$((end-start)) 80 | 81 | echo "Completed in " $runtime " seconds" 82 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds"
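Both VOC download scripts take an optional target directory and fall back to ~/data when none is given. A hedged usage note (the path is a placeholder):

# sh data/scripts/VOC2007.sh /path/to/data   # downloads the trainval + test tars, extracts them, then removes the tars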
-------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME, MEANS, voc, COLORS # ################################ 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | # VOC_CLASSES = ( # always index 0 21 | # 'aeroplane', 'bicycle', 'bird', 'boat', 22 | # 'bottle', 'bus', 'car', 'cat', 'chair', 23 | # 'cow', 'diningtable', 'dog', 'horse', 24 | # 'motorbike', 'person', 'pottedplant', 25 | # 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # ##VOC_CLASSES = ('None', 'ship')  # writing just one class name here causes a serious problem 28 | # ##VOC_CLASSES = ('ship')  # this fails badly: a single-element tuple needs a trailing comma after 'ship' 29 | VOC_CLASSES=('face', 30 | 'face_mask') # ##************************************************************** 31 | # note: if you used our download scripts, this should be right 32 | VOC_ROOT = "/data/lp/project/ssd.pytorch/data/VOCdevkit/" # personally I think you should set this path yourself 33 | # VOC_ROOT = osp.join(HOME, "data/VOCdevkit/") # ### removed HOME ### I think this should point to your own data location [this is the default read path for the data] 34 | 35 | 36 | class VOCAnnotationTransform(object): 37 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 38 | Initialized with a dictionary lookup of classnames to indexes 39 | 40 | Arguments: 41 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 42 | (default: alphabetic indexing of VOC's 20 classes) 43 | keep_difficult (bool, optional): keep difficult instances or not 44 | (default: False) 45 | height (int): height 46 | width (int): width 47 | """ 48 | 49 | def __init__(self, class_to_ind=None, keep_difficult=False): 50 | self.class_to_ind = class_to_ind or dict( # class_to_ind maps class names to indices, e.g. {'aeroplane': 0, 'bicycle': 1} 51 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 52 | self.keep_difficult = keep_difficult 53 | 54 | def __call__(self, target, width, height): 55 | """ 56 | Arguments: 57 | target (annotation) : the target annotation to be made usable 58 | will be an ET.Element 59 | Returns: 60 | a list containing lists of bounding boxes [bbox coords, class name] 61 | """ 62 | res = [] 63 | for obj in target.iter('object'): 64 | difficult = int(obj.find('difficult').text) == 1 65 | if not self.keep_difficult and difficult: 66 | continue 67 | name = obj.find('name').text.lower().strip() # ## strip() returns a new string with leading/trailing characters removed # strips surrounding whitespace 68 | bbox = obj.find('bndbox') 69 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 70 | bndbox = [] 71 | for i, pt in enumerate(pts): 72 | cur_pt = int(bbox.find(pt).text) - 1 73 | # scale height or width 74 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height # relative position of the object within the image, in [0, 1] 75 | bndbox.append(cur_pt) 76 | label_idx = self.class_to_ind[name] 77 | bndbox.append(label_idx) 78 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 79 | # img_id = target.find('filename').text[:-4] 80 | 81 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ]
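# A hedged worked example of the transform above (numbers invented): a 'face'
# object with bndbox (50, 80, 150, 280) in a 600x400 image becomes
#   [(50-1)/600, (80-1)/400, (150-1)/600, (280-1)/400, class_to_ind['face']]
#   -> [[0.0817, 0.1975, 0.2483, 0.6975, 0]]
# i.e. x-coordinates are scaled by the width and y-coordinates by the height.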
82 | 83 | 84 | class VOCDetection(data.Dataset): 85 | 86 | """VOC Detection Dataset Object 87 | 88 | input is image, target is annotation 89 | 90 | Arguments: 91 | root (string): filepath to VOCdevkit folder. 92 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 93 | transform (callable, optional): transformation to perform on the 94 | input image 95 | target_transform (callable, optional): transformation to perform on the 96 | target `annotation` 97 | (eg: take in caption string, return tensor of word indices) 98 | dataset_name (string, optional): which dataset to load 99 | (default: 'VOC2007') 100 | """ 101 | 102 | def __init__(self, 103 | root, # root directory of the VOCdevkit folder 104 | image_sets=[('2007', 'trainval')], # datasets to use, given as string pairs, e.g. ('2012', 'trainval') 105 | transform=None, # image preprocessing method 106 | target_transform=VOCAnnotationTransform(), # annotation preprocessing method 107 | dataset_name='VOC0712'): # name of the dataset 108 | self.root = root # root of the VOCdevkit folder 109 | self.image_set = image_sets # which image sets to use 110 | self.transform = transform # image transform 111 | self.target_transform = target_transform # annotation transform 112 | self.name = dataset_name # dataset name 113 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') # annotation location, with two '%s' slots left unfilled 114 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') # image location, with two '%s' slots left unfilled 115 | self.ids = list() # names of all images in the dataset, without file extensions 116 | for (year, name) in image_sets: # [image_sets] holds entries like ('2007', 'trainval') naming the sets to load 117 | # read the set's image names; together with _annopath and _imgpath they locate each image and its annotation file 118 | rootpath = osp.join(self.root, 'VOC' + year) # ...../VOC2007 119 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 120 | # ...../VOC2007/ImageSets/Main/(test val train).txt 121 | self.ids.append((rootpath, line.strip())) 122 | # open the set's txt file and read it line by line, stripping surrounding whitespace 123 | # [ids holds one entry per image: a tuple of (rootpath, image name without .jpg)] ids is a list of such tuples 124 | 125 | def __getitem__(self, index): 126 | im, gt, h, w = self.pull_item(index) 127 | 128 | return im, gt 129 | 130 | def __len__(self): 131 | return len(self.ids) 132 | 133 | def pull_item(self, index): 134 | img_id = self.ids[index] # ('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007', '000001') 135 | 136 | target = ET.parse(self._annopath % img_id).getroot() # fill the two '%s' slots of self._annopath with img_id to get the xml object to parse 137 | img = cv2.imread(self._imgpath % img_id) # fill the two '%s' slots of self._imgpath with img_id to load the corresponding image 138 | height, width, channels = img.shape # image height, width and channel count 139 | 140 | if self.target_transform is not None: # preprocess the annotation 141 | target = self.target_transform(target, width, height) # returns [xmin, ymin, xmax, ymax, label] 142 | if self.transform is not None: # the default transform for the test set is None 143 | target = np.array(target) # the transform below is hard to follow! 144 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) # input image, positions (4 values), labels (1 value) 145 | # to rgb 146 | img = img[:, :, (2, 1, 0)] # opencv reads images as BGR; this converts them to RGB 147 | # img = img.transpose(2, 0, 1) 148 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) # reshape labels from (x,) to (x, 1), then stack back into the original layout 149 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width # permute moves channels first, the layout pytorch expects 150 | # channels-first keeps the subsequent torch training code uniform. 151 | # ###############################################################################################################################################
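# A hedged sketch of how this dataset is usually consumed together with
# detection_collate from data/__init__.py (batch size and root are placeholders):
#
#   dataset = VOCDetection(VOC_ROOT, [('2007', 'trainval')],
#                          transform=BaseTransform(300, MEANS))
#   loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True,
#                                        collate_fn=detection_collate)
#   images, targets = next(iter(loader))  # images: [4, 3, 300, 300]; targets: list of 4 tensors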
152 | 153 | def pull_image(self, index): 154 | """Returns the original image object at index in PIL form 155 | 156 | Note: not using self.__getitem__(), as any transformations passed in 157 | could mess up this functionality. 158 | 159 | Argument: 160 | index (int): index of img to show 161 | Return: 162 | PIL img 163 | """ 164 | img_id = self.ids[index] 165 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) # read the image directly from its path 166 | 167 | def pull_anno(self, index): 168 | """Returns the original annotation of image at index 169 | 170 | Note: not using self.__getitem__(), as any transformations passed in 171 | could mess up this functionality. 172 | 173 | Argument: 174 | index (int): index of img to get annotation of 175 | Return: 176 | list: [img_id, [(label, bbox coords),...]] 177 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 178 | """ 179 | img_id = self.ids[index] 180 | anno = ET.parse(self._annopath % img_id).getroot() 181 | gt = self.target_transform(anno, 1, 1) # ## the trailing 1, 1 fill the width and height arguments of VOCAnnotationTransform, so the coordinates are left un-scaled (divided by 1) 182 | return img_id[1], gt # returns the image name (without extension) and the ground truth [[xmin, ymin, xmax, ymax, label_ind], ... ] 183 | 
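# Hedged example of pull_anno output (annotation values invented): with
# width=height=1 the coordinates stay in absolute pixels, e.g.
#   ('000017', [[47.0, 79.0, 371.0, 299.0, 1]])   # one 'face_mask' instance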
184 | def pull_tensor(self, index): 185 | """Returns the original image at an index in tensor form 186 | 187 | Note: not using self.__getitem__(), as any transformations passed in 188 | could mess up this functionality. 189 | 190 | Argument: 191 | index (int): index of img to show 192 | Return: 193 | tensorized version of img, squeezed 194 | """ 195 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 196 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/demo/__init__.py -------------------------------------------------------------------------------- /demo/live.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | from torch.autograd import Variable 4 | import cv2 5 | import time 6 | from imutils.video import FPS, WebcamVideoStream 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 10 | parser.add_argument('--weights', default='weights/ssd_300_VOC0712.pth', 11 | type=str, help='Trained state_dict file path') 12 | parser.add_argument('--cuda', default=False, type=bool, 13 | help='Use cuda in live demo') 14 | args = parser.parse_args() 15 | 16 | COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 17 | FONT = cv2.FONT_HERSHEY_SIMPLEX 18 | 19 | 20 | def cv2_demo(net, transform): 21 | def predict(frame): 22 | height, width = frame.shape[:2] 23 | x = torch.from_numpy(transform(frame)[0]).permute(2, 0, 1) 24 | x = Variable(x.unsqueeze(0)) 25 | y = net(x) # forward pass 26 | detections = y.data 27 | # scale each detection back up to the image 28 | scale = torch.Tensor([width, height, width, height]) 29 | for i in range(detections.size(1)): 30 | j = 0 31 | while detections[0, i, j, 0] >= 0.6: 32 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 33 | cv2.rectangle(frame, 34 | (int(pt[0]), int(pt[1])), 35 | (int(pt[2]), int(pt[3])), 36 | COLORS[i % 3], 2) 37 | cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), 38 | FONT, 2, (255, 255, 255), 2, cv2.LINE_AA) 39 | j += 1 40 | return frame 41 | 42 | # start video stream thread, allow buffer to fill 43 | print("[INFO] starting threaded video stream...") 44 | stream = WebcamVideoStream(src=0).start() # default camera 45 | time.sleep(1.0) 46 | # start fps timer 47 | # loop over frames from the video file stream 48 | while True: 49 | # grab next frame 50 | frame = stream.read() 51 | key = cv2.waitKey(1) & 0xFF 52 | 53 | # update FPS counter 54 | fps.update() 55 | frame = predict(frame) 56 | 57 | # keybindings for display 58 | if key == ord('p'): # pause 59 | while True: 60 | key2 = cv2.waitKey(1) or 0xff 61 | cv2.imshow('frame', frame) 62 | if key2 == ord('p'): # resume 63 | break 64 | cv2.imshow('frame', frame) 65 | if key == 27: # exit 66 | break 67 | 68 | 69 | if __name__ == '__main__': 70 | import sys 71 | from os import path 72 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 73 | 74 | from data import BaseTransform, VOC_CLASSES as labelmap 75 | from ssd import build_ssd 76 | 77 | net = build_ssd('test', 300, 21) # initialize SSD 78 | net.load_state_dict(torch.load(args.weights)) 79 | transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0)) 80 | 81 | fps = FPS().start() 82 | cv2_demo(net.eval(), transform) 83 | # stop the timer and display FPS information 84 | fps.stop() 85 | 86 | print("[INFO] elapsed time: {:.2f}".format(fps.elapsed())) 87 | print("[INFO] approx.
FPS: {:.2f}".format(fps.fps())) 88 | 89 | # cleanup 90 | cv2.destroyAllWindows() 91 | stream.stop() 92 | -------------------------------------------------------------------------------- /doc/SSD.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/SSD.jpg -------------------------------------------------------------------------------- /doc/detection_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_example.png -------------------------------------------------------------------------------- /doc/detection_example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_example2.png -------------------------------------------------------------------------------- /doc/detection_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/detection_examples.png -------------------------------------------------------------------------------- /doc/ssd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/doc/ssd.png -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """Adapted from: 4 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 5 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 6 | Licensed under The MIT License [see LICENSE for details] 7 | """ 8 | 9 | from __future__ import print_function 10 | import torch 11 | import torch.nn as nn 12 | import torch.backends.cudnn as cudnn 13 | from torch.autograd import Variable 14 | from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 15 | from data import VOC_CLASSES as labelmap 16 | import torch.utils.data as data 17 | 18 | from ssd import build_ssd 19 | 20 | import sys 21 | import os 22 | import time 23 | import argparse 24 | import numpy as np 25 | import pickle 26 | import cv2 27 | 28 | if sys.version_info[0] == 2: 29 | import xml.etree.cElementTree as ET 30 | else: 31 | import xml.etree.ElementTree as ET 32 | 33 | 34 | def str2bool(v): 35 | return v.lower() in ("yes", "true", "t", "1") 36 | 37 | 38 | parser = argparse.ArgumentParser( 39 | description='Single Shot MultiBox Detector Evaluation') 40 | parser.add_argument('--trained_model', 41 | default='weights/ssd300_VOC_10000.pth', type=str, 42 | help='Trained state_dict file path to open') 43 | parser.add_argument('--save_folder', default='eval/', type=str, 44 | help='File path to save results') 45 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 46 | help='Detection confidence threshold') 47 | parser.add_argument('--top_k', default=5, type=int, 48 | help='Further restrict the number of predictions to parse') 49 | parser.add_argument('--cuda', default=True, type=str2bool, 50 | help='Use cuda to train model') 51 | parser.add_argument('--voc_root', 
default=VOC_ROOT, 52 | help='Location of VOC root directory') 53 | parser.add_argument('--cleanup', default=True, type=str2bool, 54 | help='Cleanup and remove results files following eval') 55 | 56 | args = parser.parse_args() 57 | 58 | if not os.path.exists(args.save_folder): 59 | os.mkdir(args.save_folder) 60 | 61 | if torch.cuda.is_available(): 62 | if args.cuda: 63 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 64 | if not args.cuda: 65 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 66 | CUDA. Run with --cuda for optimal eval speed.") 67 | torch.set_default_tensor_type('torch.FloatTensor') 68 | else: 69 | torch.set_default_tensor_type('torch.FloatTensor') 70 | 71 | annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml') 72 | imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg') 73 | imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 74 | # 'Main', '{:s}.txt') 75 | 'Main', '{}.txt') 76 | YEAR = '2007' 77 | devkit_path = args.voc_root + 'VOC' + YEAR 78 | dataset_mean = (104, 117, 123) 79 | set_type = 'test' 80 | 81 | 82 | class Timer(object): 83 | """A simple timer.""" 84 | def __init__(self): 85 | self.total_time = 0. 86 | self.calls = 0 87 | self.start_time = 0. 88 | self.diff = 0. 89 | self.average_time = 0. 90 | 91 | def tic(self): 92 | # using time.time instead of time.clock because time.clock 93 | # does not normalize for multithreading 94 | self.start_time = time.time() 95 | 96 | def toc(self, average=True): 97 | self.diff = time.time() - self.start_time 98 | self.total_time += self.diff 99 | self.calls += 1 100 | self.average_time = self.total_time / self.calls 101 | if average: 102 | return self.average_time 103 | else: 104 | return self.diff 105 | 106 | 107 | def parse_rec(filename): 108 | """ Parse a PASCAL VOC xml file """ 109 | tree = ET.parse(filename) 110 | objects = [] 111 | for obj in tree.findall('object'): 112 | obj_struct = {} 113 | obj_struct['name'] = obj.find('name').text 114 | # for some reason the fields below could not be read, so default values are assigned directly 115 | # obj_struct['pose'] = obj.find('pose').text 116 | # obj_struct['truncated'] = int(obj.find('truncated').text) 117 | # obj_struct['difficult'] = int(obj.find('difficult').text) 118 | obj_struct['pose'] = 'Unspecified' 119 | obj_struct['truncated'] = 0 120 | obj_struct['difficult'] = 0 121 | bbox = obj.find('bndbox') 122 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 123 | int(bbox.find('ymin').text) - 1, 124 | int(bbox.find('xmax').text) - 1, 125 | int(bbox.find('ymax').text) - 1] 126 | objects.append(obj_struct) 127 | 128 | return objects 129 | 130 | 131 | def get_output_dir(name, phase): 132 | """Return the directory where experimental artifacts are placed. 133 | If the directory does not exist, it is created. 134 | A canonical path is built using the name from an imdb and a network 135 | (if not None).
136 | """ 137 | filedir = os.path.join(name, phase) 138 | if not os.path.exists(filedir): 139 | os.makedirs(filedir) 140 | return filedir 141 | 142 | 143 | def get_voc_results_file_template(image_set, cls): 144 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 145 | filename = 'det_' + image_set + '_%s.txt' % (cls) 146 | filedir = os.path.join(devkit_path, 'results') 147 | if not os.path.exists(filedir): 148 | os.makedirs(filedir) 149 | path = os.path.join(filedir, filename) 150 | return path 151 | 152 | 153 | def write_voc_results_file(all_boxes, dataset): 154 | for cls_ind, cls in enumerate(labelmap): 155 | print('Writing {:s} VOC results file'.format(cls)) 156 | filename = get_voc_results_file_template(set_type, cls) 157 | with open(filename, 'wt') as f: 158 | for im_ind, index in enumerate(dataset.ids): 159 | dets = all_boxes[cls_ind+1][im_ind] 160 | if dets == []: 161 | continue 162 | # the VOCdevkit expects 1-based indices 163 | for k in range(dets.shape[0]): 164 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 165 | format(index[1], dets[k, -1], 166 | dets[k, 0] + 1, dets[k, 1] + 1, 167 | dets[k, 2] + 1, dets[k, 3] + 1)) 168 | 169 | 170 | def do_python_eval(output_dir='output', use_07=True): 171 | cachedir = os.path.join(devkit_path, 'annotations_cache') 172 | aps = [] 173 | # The PASCAL VOC metric changed in 2010 174 | use_07_metric = use_07 175 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 176 | if not os.path.isdir(output_dir): 177 | os.mkdir(output_dir) 178 | for i, cls in enumerate(labelmap): 179 | filename = get_voc_results_file_template(set_type, cls) 180 | rec, prec, ap = voc_eval( 181 | filename, annopath, imgsetpath.format(set_type), cls, cachedir, 182 | ovthresh=0.5, use_07_metric=use_07_metric) 183 | aps += [ap] 184 | print('AP for {} = {:.4f}'.format(cls, ap)) 185 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 186 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 187 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 188 | print('~~~~~~~~') 189 | print('Results:') 190 | for ap in aps: 191 | print('{:.3f}'.format(ap)) 192 | print('{:.3f}'.format(np.mean(aps))) 193 | print('~~~~~~~~') 194 | print('') 195 | print('--------------------------------------------------------------') 196 | print('Results computed with the **unofficial** Python eval code.') 197 | print('Results should be very close to the official MATLAB eval code.') 198 | print('--------------------------------------------------------------') 199 | 200 | 201 | def voc_ap(rec, prec, use_07_metric=True): 202 | """ ap = voc_ap(rec, prec, [use_07_metric]) 203 | Compute VOC AP given precision and recall. 204 | If use_07_metric is true, uses the 205 | VOC 07 11 point method (default:True). 206 | """ 207 | if use_07_metric: 208 | # 11 point metric 209 | ap = 0. 210 | for t in np.arange(0., 1.1, 0.1): 211 | if np.sum(rec >= t) == 0: 212 | p = 0 213 | else: 214 | p = np.max(prec[rec >= t]) 215 | ap = ap + p / 11. 
216 | else: 217 | # correct AP calculation 218 | # first append sentinel values at the end 219 | mrec = np.concatenate(([0.], rec, [1.])) 220 | mpre = np.concatenate(([0.], prec, [0.])) 221 | 222 | # compute the precision envelope 223 | for i in range(mpre.size - 1, 0, -1): 224 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 225 | 226 | # to calculate area under PR curve, look for points 227 | # where X axis (recall) changes value 228 | i = np.where(mrec[1:] != mrec[:-1])[0] 229 | 230 | # and sum (\Delta recall) * prec 231 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 232 | return ap 233 | 234 | 235 | def voc_eval(detpath, 236 | annopath, 237 | imagesetfile, 238 | classname, 239 | cachedir, 240 | ovthresh=0.5, 241 | use_07_metric=True): 242 | """rec, prec, ap = voc_eval(detpath, 243 | annopath, 244 | imagesetfile, 245 | classname, 246 | [ovthresh], 247 | [use_07_metric]) 248 | Top level function that does the PASCAL VOC evaluation. 249 | detpath: Path to detections 250 | detpath.format(classname) should produce the detection results file. 251 | annopath: Path to annotations 252 | annopath.format(imagename) should be the xml annotations file. 253 | imagesetfile: Text file containing the list of images, one image per line. 254 | classname: Category name (duh) 255 | cachedir: Directory for caching the annotations 256 | [ovthresh]: Overlap threshold (default = 0.5) 257 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 258 | (default True) 259 | """ 260 | # assumes detections are in detpath.format(classname) 261 | # assumes annotations are in annopath.format(imagename) 262 | # assumes imagesetfile is a text file with each line an image name 263 | # cachedir caches the annotations in a pickle file 264 | # first load gt 265 | if not os.path.isdir(cachedir): 266 | os.mkdir(cachedir) 267 | cachefile = os.path.join(cachedir, 'annots.pkl') 268 | # read list of images 269 | with open(imagesetfile, 'r') as f: 270 | lines = f.readlines() 271 | imagenames = [x.strip() for x in lines] 272 | if not os.path.isfile(cachefile): 273 | # load annots 274 | recs = {} 275 | for i, imagename in enumerate(imagenames): 276 | recs[imagename] = parse_rec(annopath % (imagename)) 277 | if i % 100 == 0: 278 | print('Reading annotation for {:d}/{:d}'.format( 279 | i + 1, len(imagenames))) 280 | # save 281 | print('Saving cached annotations to {:s}'.format(cachefile)) 282 | with open(cachefile, 'wb') as f: 283 | pickle.dump(recs, f) 284 | else: 285 | # load 286 | with open(cachefile, 'rb') as f: 287 | recs = pickle.load(f) 288 | 289 | # extract gt objects for this class 290 | class_recs = {} 291 | npos = 0 292 | for imagename in imagenames: 293 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 294 | bbox = np.array([x['bbox'] for x in R]) 295 | # difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 296 | difficult = np.array([None for x in R]).astype(np.bool) # the difficult field could not be used here for some reason, so it is replaced with an all-False list 297 | det = [False] * len(R) 298 | npos = npos + sum(~difficult) 299 | class_recs[imagename] = {'bbox': bbox, 300 | 'difficult': difficult, 301 | 'det': det} 302 | 303 | # read dets 304 | detfile = detpath.format(classname) 305 | with open(detfile, 'r') as f: 306 | lines = f.readlines() 307 | if any(lines): 308 | 309 | splitlines = [x.strip().split(' ') for x in lines] 310 | image_ids = [x[0] for x in splitlines] 311 | confidence = np.array([float(x[1]) for x in splitlines]) 312 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 313 | 314 | # sort by
confidence 315 | sorted_ind = np.argsort(-confidence) 316 | sorted_scores = np.sort(-confidence) 317 | BB = BB[sorted_ind, :] 318 | image_ids = [image_ids[x] for x in sorted_ind] 319 | 320 | # go down dets and mark TPs and FPs 321 | nd = len(image_ids) 322 | tp = np.zeros(nd) 323 | fp = np.zeros(nd) 324 | for d in range(nd): 325 | R = class_recs[image_ids[d]] 326 | bb = BB[d, :].astype(float) 327 | ovmax = -np.inf 328 | BBGT = R['bbox'].astype(float) 329 | if BBGT.size > 0: 330 | # compute overlaps 331 | # intersection 332 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 333 | iymin = np.maximum(BBGT[:, 1], bb[1]) 334 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 335 | iymax = np.minimum(BBGT[:, 3], bb[3]) 336 | iw = np.maximum(ixmax - ixmin, 0.) 337 | ih = np.maximum(iymax - iymin, 0.) 338 | inters = iw * ih 339 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 340 | (BBGT[:, 2] - BBGT[:, 0]) * 341 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 342 | overlaps = inters / uni 343 | ovmax = np.max(overlaps) 344 | jmax = np.argmax(overlaps) 345 | 346 | if ovmax > ovthresh: 347 | if not R['difficult'][jmax]: 348 | if not R['det'][jmax]: 349 | tp[d] = 1. 350 | R['det'][jmax] = 1 351 | else: 352 | fp[d] = 1. 353 | else: 354 | fp[d] = 1. 355 | 356 | # compute precision recall 357 | fp = np.cumsum(fp) 358 | tp = np.cumsum(tp) 359 | rec = tp / float(npos) 360 | # avoid divide by zero in case the first detection matches a difficult 361 | # ground truth 362 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 363 | ap = voc_ap(rec, prec, use_07_metric) 364 | else: 365 | rec = -1. 366 | prec = -1. 367 | ap = -1. 368 | 369 | return rec, prec, ap 370 | 371 | 372 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 373 | im_size=300, thresh=0.05): 374 | num_images = len(dataset) 375 | # all detections are collected into: 376 | # all_boxes[cls][image] = N x 5 array of detections in 377 | # (x1, y1, x2, y2, score) 378 | all_boxes = [[[] for _ in range(num_images)] 379 | for _ in range(len(labelmap)+1)] 380 | 381 | # timers 382 | _t = {'im_detect': Timer(), 'misc': Timer()} 383 | output_dir = get_output_dir('ssd300_120000', set_type) 384 | det_file = os.path.join(output_dir, 'detections.pkl') 385 | 386 | for i in range(num_images): 387 | im, gt, h, w = dataset.pull_item(i) 388 | 389 | x = Variable(im.unsqueeze(0)) 390 | if args.cuda: 391 | x = x.cuda() 392 | _t['im_detect'].tic() 393 | detections = net(x).data 394 | detect_time = _t['im_detect'].toc(average=False) 395 | 396 | # skip j = 0, because it's the background class 397 | for j in range(1, detections.size(1)): 398 | dets = detections[0, j, :] 399 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 400 | dets = torch.masked_select(dets, mask).view(-1, 5) 401 | if dets.size(0) == 0: 402 | continue 403 | boxes = dets[:, 1:] 404 | boxes[:, 0] *= w 405 | boxes[:, 2] *= w 406 | boxes[:, 1] *= h 407 | boxes[:, 3] *= h 408 | scores = dets[:, 0].cpu().numpy() 409 | cls_dets = np.hstack((boxes.cpu().numpy(), 410 | scores[:, np.newaxis])).astype(np.float32, 411 | copy=False) 412 | all_boxes[j][i] = cls_dets 413 | 414 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 415 | num_images, detect_time)) 416 | 417 | with open(det_file, 'wb') as f: 418 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 419 | 420 | print('Evaluating detections') 421 | evaluate_detections(all_boxes, output_dir, dataset) 422 | 423 | 424 | def evaluate_detections(box_list, output_dir, dataset): 425 | write_voc_results_file(box_list, dataset) 426 | do_python_eval(output_dir) 427 | 428 | 429 
| if __name__ == '__main__': 430 | # load net 431 | num_classes = len(labelmap) + 1 # +1 for background 432 | net = build_ssd('test', 300, num_classes) # initialize SSD 433 | net.load_state_dict(torch.load(args.trained_model)) 434 | net.eval() 435 | print('Finished loading model!') 436 | # load data 437 | dataset = VOCDetection(args.voc_root, [('2007', set_type)], 438 | BaseTransform(300, dataset_mean), 439 | VOCAnnotationTransform()) 440 | if args.cuda: 441 | net = net.cuda() 442 | cudnn.benchmark = True 443 | # evaluation 444 | test_net(args.save_folder, net, args.cuda, dataset, 445 | BaseTransform(net.size, dataset_mean), args.top_k, 300, 446 | thresh=args.confidence_threshold) 447 | -------------------------------------------------------------------------------- /focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | 7 | class FocalLoss(nn.Module): 8 | r""" 9 | This criterion is an implementation of Focal Loss, which is proposed in 10 | Focal Loss for Dense Object Detection. 11 | 12 | Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) 13 | 14 | The losses are averaged across observations for each minibatch. 15 | 16 | Args: 17 | alpha(1D Tensor, Variable) : the scalar factor for this criterion 18 | gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), 19 | putting more focus on hard, misclassified examples 20 | size_average(bool): By default, the losses are averaged over observations for each minibatch. 21 | However, if the field size_average is set to False, the losses are 22 | instead summed for each minibatch. 23 | 24 | 25 | """ 26 | def __init__(self, alpha, gamma=2, class_num=5, size_average=False): 27 | super(FocalLoss, self).__init__() 28 | if alpha is None: 29 | self.alpha = Variable(torch.ones(class_num, 1)) 30 | else: 31 | if isinstance(alpha, Variable): 32 | self.alpha = alpha 33 | else: 34 | self.alpha = Variable(alpha) 35 | 36 | self.gamma = gamma 37 | 38 | # self.class_num = class_num 39 | self.size_average = size_average 40 | 41 | def forward(self, inputs, targets): 42 | N = inputs.size(0) # batch_size 43 | C = inputs.size(1) # channels 44 | P = F.softmax(inputs, dim=1) 45 | 46 | class_mask = inputs.data.new(N, C).fill_(0) 47 | class_mask = Variable(class_mask) 48 | ids = targets.view(-1, 1) 49 | class_mask.scatter_(1, ids.data, 1.) 50 | # print(class_mask) 51 | 52 | if inputs.is_cuda and not self.alpha.is_cuda: 53 | self.alpha = self.alpha.cuda() 54 | alpha = self.alpha[ids.data.view(-1)] 55 | 56 | probs = (P*class_mask).sum(1).view(-1, 1) 57 | 58 | log_p = probs.log() 59 | # print('probs size= {}'.format(probs.size())) 60 | # print(probs) 61 | 62 | batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 63 | 64 | # print('-----batch_loss------') 65 | # print(batch_loss) 66 | 67 | if self.size_average: 68 | loss = batch_loss.mean() 69 | else: 70 | loss = batch_loss.sum() 71 | return loss 72 | 
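A hedged usage sketch for the criterion above (shapes are placeholders; alpha=None falls back to uniform class weights per the constructor):

import torch
from focal_loss import FocalLoss

criterion = FocalLoss(alpha=None, gamma=2, class_num=5, size_average=True)
logits = torch.randn(8, 5)               # raw class scores for 8 samples
targets = torch.randint(0, 5, (8,))      # ground-truth class indices
loss = criterion(logits, targets)        # mean focal loss over the batch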
-------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | # incoming boxes are in center-size form: (cx, cy, w, h) 14 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 15 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 16 | 17 | 18 | def center_size(boxes): 19 | """ Convert prior_boxes to (cx, cy, w, h) 20 | representation for comparison to center-size form ground truth data. 21 | Args: 22 | boxes: (tensor) point_form boxes 23 | Return: 24 | boxes: (tensor) Converted cx, cy, w, h form of boxes. 25 | """ 26 | return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 27 | boxes[:, 2:] - boxes[:, :2]), 1) # w, h 28 | 29 | 30 | def intersect(box_a, box_b): 31 | """ We resize both tensors to [A,B,2] without new malloc: 32 | [A,2] -> [A,1,2] -> [A,B,2] 33 | [B,2] -> [1,B,2] -> [A,B,2] 34 | Then we compute the area of intersect between box_a and box_b. 35 | Args: 36 | box_a: (tensor) bounding boxes, Shape: [A,4]. 37 | box_b: (tensor) bounding boxes, Shape: [B,4]. 38 | Return: 39 | (tensor) intersection area, Shape: [A,B]. 40 | """ 41 | A = box_a.size(0) # number of rows in box_a [the ground-truth boxes], a small number 42 | B = box_b.size(0) # number of rows in box_b [all generated prior boxes], 8732 43 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 44 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 45 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 46 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 47 | inter = torch.clamp((max_xy - min_xy), min=0) 48 | return inter[:, :, 0] * inter[:, :, 1] # intersection areas of [A: ground-truth boxes] x [B: prior boxes], as an [A rows, B columns (8732)] grid 49 | 50 | 
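# A hedged round-trip check for the two conversions above (values invented):
#   b = torch.tensor([[0.5, 0.5, 0.2, 0.4]])     # (cx, cy, w, h)
#   point_form(b)                                 # tensor([[0.4, 0.3, 0.6, 0.7]])
#   center_size(point_form(b))                    # recovers b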
51 | def jaccard(box_a, box_b): 52 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 53 | is simply the intersection over union of two boxes. Here we operate on 54 | ground truth boxes and default boxes. 55 | E.g.: 56 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 57 | Args: 58 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] = [number of objects, 4] 59 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] = [number of priors (8732), 4] 60 | Return: 61 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 62 | """ 63 | inter = intersect(box_a, box_b) # intersection areas as an [A rows: objects, B columns: 8732 priors] grid 64 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 65 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 66 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 67 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 68 | union = area_a + area_b - inter 69 | return inter / union # [A,B] [IoU between each object and every prior] 70 | 
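# Hedged example (values invented): one ground truth against two priors in
# point form gives a [1, 2] IoU matrix:
#   jaccard(torch.tensor([[0.0, 0.0, 0.5, 0.5]]),
#           torch.tensor([[0.0, 0.0, 0.5, 0.5], [0.25, 0.25, 0.75, 0.75]]))
#   -> tensor([[1.0000, 0.1429]])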
89 | """ 90 | # jaccard index 91 | overlaps = jaccard( #输入为真实的ground_truth的框truths 和 生成的所有预测框priors 92 | truths, 93 | point_form(priors) #priors【生成框记录的中点信息和宽高信息】point_form函数将其转化为【左上角,右下角信息】 94 | )#返回overlaps目标和所有prior的交并比 【A行:目标数,B列:prior数8732】的相交区域网格形式 95 | # (Bipartite Matching) 96 | # [1,num_objects] best prior for each ground truth 97 | # 【返回每一行 ground_truth对应的最高的prior的IOU,返回该IOU对应的index】 98 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) #返回的均为列向量【行:目标数,列:1】 99 | # [1,num_priors] best ground truth for each prior 100 | # 【返回每一个prior对应的ground_truth的最大IOU,返回对应的IUO的index】 101 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) #返回的均为行向量【行:1,列:priors数目8732】 102 | #下面四行全部压缩成向量 103 | best_truth_idx.squeeze_(0) #将第一个维度数为1的删除 变成 torch.Size([8732]) 注意squeeze后面有'_'这代表有在本身上进行修改 104 | best_truth_overlap.squeeze_(0) #将第一个维度数为1的删除 变成 torch.Size([8732]) 105 | best_prior_idx.squeeze_(1) #将第二个维度为1的删除,变成 torch.Size([目标数]) #里面包含的是ground_truth对应的最大的prior_box的index,该index的范围是【0-8731】 106 | best_prior_overlap.squeeze_(1) #将第二个维度为1的删除,变成 torch.Size([目标数]) 107 | #下面这行说明,每个ground_truth对应的prior一定是positive正样本,不论其IOU大小是否与阈值的大小 108 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior##在0的这个维度上,在best_prior_idx这些位置,填充为2 109 | #输出的best_truth_overlap仍然是8732长度的向量,每个GT匹配的最好的prior_box的面积为‘2’ 110 | # TODO refactor: index best_prior_idx with long tensor 111 | # ensure every gt matches with its prior of max overlap 112 | for j in range(best_prior_idx.size(0)): #目标数的循环 113 | best_truth_idx[best_prior_idx[j]] = j # 给每个prior标记上对应的最好的ground truth的 标签 输出8732向量 114 | matches = truths[best_truth_idx] # Shape: [num_priors(8732),4] 将每个GT的标签复制到8732份数(重复) 115 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors(8732)] +1是因为0作为背景 将每个GT的标签重复复制8732份数 116 | '''通过上面两行代码,8732个prior_box,每个都对应一个GT,若没有重合的区域,那么就对应最后个GT''' 117 | conf[best_truth_overlap < threshold] = 0 # # label 0 as background 118 | loc = encode(matches, priors, variances) # # [g_cxcy, g_wh] 输入【ground_truth信息 先验框信息】 输出【编码后的位置信息】【8732,4】 119 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 【idx是batch里面的每一张图片】 120 | conf_t[idx] = conf # [num_priors] top class label for each prior 121 | 122 | 123 | def encode(matched, priors, variances): 124 | """Encode the variances from the priorbox layers into the ground truth boxes 125 | we have matched (based on jaccard overlap) with the prior boxes. 126 | Args: 127 | matched: (tensor) Coords of ground truth for each prior in point-form 128 | Shape: [num_priors, 4]. 129 | priors: (tensor) Prior boxes in center-offset form 130 | Shape: [num_priors,4]. 131 | variances: (list[float]) Variances of priorboxes 132 | Return: 133 | encoded boxes (tensor), Shape: [num_priors, 4] 134 | """ 135 | 136 | # dist b/t match center and prior's center 137 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 138 | # encode variance 139 | g_cxcy /= (variances[0] * priors[:, 2:]) 140 | # match wh / prior wh 141 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 142 | g_wh = torch.log(g_wh) / variances[1] 143 | # return target for smooth_l1_loss 144 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 145 | 146 | 147 | # Adapted from https://github.com/Hakuyume/chainer-ssd 148 | def decode(loc, priors, variances): 149 | """Decode locations from predictions using priors to undo 150 | the encoding we did for offset regression at train time. 
151 | Args: 152 | loc (tensor): location predictions for loc layers, 153 | Shape: [num_priors,4] 154 | priors (tensor): Prior boxes in center-offset form. 155 | Shape: [num_priors,4]. 156 | variances: (list[float]) Variances of priorboxes 157 | Return: 158 | decoded bounding box predictions 159 | """ 160 | 161 | boxes = torch.cat(( 162 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 163 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 164 | boxes[:, :2] -= boxes[:, 2:] / 2 165 | boxes[:, 2:] += boxes[:, :2] 166 | return boxes 167 | 168 | 169 | def log_sum_exp(x): 170 | """Utility function for computing log_sum_exp while determining 171 | This will be used to determine unaveraged confidence loss across 172 | all examples in a batch. 173 | Args: 174 | x (Variable(tensor)): conf_preds from conf layers 175 | """ 176 | x_max = x.data.max() 177 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 178 | 179 | 180 | # Original author: Francisco Massa: 181 | # https://github.com/fmassa/object-detection.torch 182 | # Ported to PyTorch by Max deGroot (02/01/2017) 183 | def nms(boxes, scores, overlap=0.5, top_k=200): 184 | """Apply non-maximum suppression at test time to avoid detecting too many 185 | overlapping bounding boxes for a given object. 186 | Args: 187 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 188 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 189 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 190 | top_k: (int) The Maximum number of box preds to consider. 191 | Return: 192 | The indices of the kept boxes with respect to num_priors. 193 | """ 194 | 195 | keep = scores.new(scores.size(0)).zero_().long() 196 | if boxes.numel() == 0: 197 | return keep 198 | x1 = boxes[:, 0] 199 | y1 = boxes[:, 1] 200 | x2 = boxes[:, 2] 201 | y2 = boxes[:, 3] 202 | area = torch.mul(x2 - x1, y2 - y1) 203 | v, idx = scores.sort(0) # sort in ascending order 204 | # I = I[v >= 0.01] 205 | idx = idx[-top_k:] # indices of the top-k largest vals 206 | xx1 = boxes.new() 207 | yy1 = boxes.new() 208 | xx2 = boxes.new() 209 | yy2 = boxes.new() 210 | w = boxes.new() 211 | h = boxes.new() 212 | 213 | # keep = torch.Tensor() 214 | count = 0 215 | while idx.numel() > 0: 216 | i = idx[-1] # index of current largest val 217 | # keep.append(i) 218 | keep[count] = i 219 | count += 1 220 | if idx.size(0) == 1: 221 | break 222 | idx = idx[:-1] # remove kept element from view 223 | # load bboxes of next highest vals 224 | torch.index_select(x1, 0, idx, out=xx1) 225 | torch.index_select(y1, 0, idx, out=yy1) 226 | torch.index_select(x2, 0, idx, out=xx2) 227 | torch.index_select(y2, 0, idx, out=yy2) 228 | # store element-wise max with next highest score 229 | xx1 = torch.clamp(xx1, min=x1[i]) 230 | yy1 = torch.clamp(yy1, min=y1[i]) 231 | xx2 = torch.clamp(xx2, max=x2[i]) 232 | yy2 = torch.clamp(yy2, max=y2[i]) 233 | w.resize_as_(xx2) 234 | h.resize_as_(yy2) 235 | w = xx2 - xx1 236 | h = yy2 - yy1 237 | # check sizes of xx1 and xx2.. 
after each iteration 238 | w = torch.clamp(w, min=0.0) 239 | h = torch.clamp(h, min=0.0) 240 | inter = w*h 241 | # IoU = i / (area(a) + area(b) - i) 242 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 243 | union = (rem_areas - inter) + area[i] 244 | IoU = inter/union # store result in iou 245 | # keep only elements with an IoU <= overlap 246 | idx = idx[IoU.le(overlap)] 247 | return keep, count 248 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from ..box_utils import decode, nms 4 | from data import voc as cfg 5 | 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | def __init__(self, num_classes, bkg_label, top_k, conf_thresh, nms_thresh): 14 | self.num_classes = num_classes 15 | self.background_label = bkg_label 16 | self.top_k = top_k 17 | # Parameters used in nms. 18 | self.nms_thresh = nms_thresh 19 | if nms_thresh <= 0: 20 | raise ValueError('nms_threshold must be non negative.') 21 | self.conf_thresh = conf_thresh 22 | self.variance = cfg['variance'] 23 | 24 | def forward(self, loc_data, conf_data, prior_data): 25 | """ 26 | Args: 27 | loc_data: (tensor) Loc preds from loc layers 28 | Shape: [batch,num_priors*4] 29 | conf_data: (tensor) Shape: Conf preds from conf layers 30 | Shape: [batch*num_priors,num_classes] 31 | prior_data: (tensor) Prior boxes and variances from priorbox layers 32 | Shape: [1,num_priors,4] 33 | """ 34 | num = loc_data.size(0) # batch size 35 | num_priors = prior_data.size(0) 36 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 37 | conf_preds = conf_data.view(num, num_priors, 38 | self.num_classes).transpose(2, 1) 39 | 40 | # Decode predictions into bboxes. 
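# A hedged walk-through of the decode step below (values invented): a prior at
# (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.2) with predicted offsets (0, 0, 0, 0)
# decodes to itself in point form, (xmin, ymin, xmax, ymax) = (0.4, 0.4, 0.6, 0.6);
# nonzero offsets move the center by loc[:2] * 0.1 * (w, h) and rescale the
# size by exp(loc[2:] * 0.2), inverting encode() in layers/box_utils.py.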
41 | for i in range(num): 42 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 43 | # For each class, perform nms 44 | conf_scores = conf_preds[i].clone() 45 | 46 | for cl in range(1, self.num_classes): 47 | c_mask = conf_scores[cl].gt(self.conf_thresh) 48 | scores = conf_scores[cl][c_mask] 49 | if scores.size(0) == 0: 50 | continue 51 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 52 | boxes = decoded_boxes[l_mask].view(-1, 4) 53 | # idx of highest scoring and non-overlapping boxes per class 54 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 55 | output[i, cl, :count] = \ 56 | torch.cat((scores[ids[:count]].unsqueeze(1), 57 | boxes[ids[:count]]), 1) 58 | flt = output.contiguous().view(num, -1, 5) 59 | _, idx = flt[:, :, 0].sort(1, descending=True) 60 | _, rank = idx.sort(1) 61 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 62 | return output 63 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | import torch 5 | 6 | 7 | class PriorBox(object): 8 | """Compute priorbox coordinates in center-offset form for each source 9 | feature map. 10 | """ 11 | """cfg= voc = { 12 | 'num_classes': 2, # [set to your own number of classes] 13 | 'lr_steps': (80000, 100000, 120000), 14 | 'max_iter': 120000, # [set to your own number of iterations] 15 | 'feature_maps': [38, 19, 10, 5, 3, 1], 16 | 'min_dim': 300, 17 | 'steps': [8, 16, 32, 64, 100, 300], 18 | 'min_sizes': [30, 60, 111, 162, 213, 264], 19 | 'max_sizes': [60, 111, 162, 213, 264, 315], 20 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 21 | 'variance': [0.1, 0.2], 22 | 'clip': True, 23 | 'name': 'VOC',} 24 | """ 25 | def __init__(self, cfg): 26 | super(PriorBox, self).__init__() 27 | self.image_size = cfg['min_dim'] 28 | # number of priors for feature map location (either 4 or 6) 29 | self.num_priors = len(cfg['aspect_ratios']) # 6 30 | self.variance = cfg['variance'] or [0.1] 31 | self.feature_maps = cfg['feature_maps'] 32 | self.min_sizes = cfg['min_sizes'] 33 | self.max_sizes = cfg['max_sizes'] 34 | self.steps = cfg['steps'] 35 | self.aspect_ratios = cfg['aspect_ratios'] 36 | self.clip = cfg['clip'] 37 | self.version = cfg['name'] 38 | for v in self.variance: 39 | if v <= 0: 40 | raise ValueError('Variances must be greater than 0') 41 | 42 | def forward(self): 43 | mean = [] 44 | for k, f in enumerate(self.feature_maps): # size of each feature layer: 'feature_maps': [38, 19, 10, 5, 3, 1], 45 | for i, j in product(range(f), repeat=2): # generate the planar grid coordinates 46 | f_k = self.image_size / self.steps[k] # 300/[8, 16, 32, 64, 100, 300] f_k=[37.5, 18.75, 9.375, 4.6875, 3, 1] 47 | # unit center x,y 48 | cx = (j + 0.5) / f_k 49 | cy = (i + 0.5) / f_k 50 | 51 | # aspect_ratio: 1 52 | # rel size: min_size 53 | s_k = self.min_sizes[k]/self.image_size # 'min_sizes': [30, 60, 111, 162, 213, 264]/300=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88]=s_k 54 | mean += [cx, cy, s_k, s_k] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) # 'max_sizes': [60, 111, 162, 213, 264, 315]/300=[0.2 0.37 0.54 0.71 0.88 1.05] 59 | mean += [cx, cy, s_k_prime, s_k_prime] 60 | 61 | # rest of aspect ratios 62 | for ar in self.aspect_ratios[k]: # 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 63 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 64 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 65 | # back to torch land 66 | output = torch.Tensor(mean).view(-1, 4) 67 | if self.clip: 68 | output.clamp_(max=1, min=0) # clamp out-of-range center coordinates and prior widths/heights to [0, 1] 69 | return output # output holds the positions and sizes of all generated anchor boxes 70 | 
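A hedged usage sketch (the 8732 total assumes the SSD300 settings in data/config.py):

from data import voc
from layers import PriorBox

priors = PriorBox(voc).forward()   # tensor [8732, 4] in (cx, cy, w, h), clipped to [0, 1]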
--------------------------------------------------------------------------------
/layers/functions/prior_box.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from math import sqrt as sqrt
3 | from itertools import product as product
4 | import torch
5 | 
6 | 
7 | class PriorBox(object):
8 |     """Compute priorbox coordinates in center-offset form for each source
9 |     feature map.
10 |     """
11 |     """cfg = voc = {
12 |         'num_classes': 2,  # [set to your own number of classes]
13 |         'lr_steps': (80000, 100000, 120000),
14 |         'max_iter': 120000,  # [set to your own iteration budget]
15 |         'feature_maps': [38, 19, 10, 5, 3, 1],
16 |         'min_dim': 300,
17 |         'steps': [8, 16, 32, 64, 100, 300],
18 |         'min_sizes': [30, 60, 111, 162, 213, 264],
19 |         'max_sizes': [60, 111, 162, 213, 264, 315],
20 |         'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
21 |         'variance': [0.1, 0.2],
22 |         'clip': True,
23 |         'name': 'VOC',}
24 |     """
25 |     def __init__(self, cfg):
26 |         super(PriorBox, self).__init__()
27 |         self.image_size = cfg['min_dim']
28 |         # number of priors for feature map location (either 4 or 6)
29 |         self.num_priors = len(cfg['aspect_ratios'])  # 6
30 |         self.variance = cfg['variance'] or [0.1]
31 |         self.feature_maps = cfg['feature_maps']
32 |         self.min_sizes = cfg['min_sizes']
33 |         self.max_sizes = cfg['max_sizes']
34 |         self.steps = cfg['steps']
35 |         self.aspect_ratios = cfg['aspect_ratios']
36 |         self.clip = cfg['clip']
37 |         self.version = cfg['name']
38 |         for v in self.variance:
39 |             if v <= 0:
40 |                 raise ValueError('Variances must be greater than 0')
41 | 
42 |     def forward(self):
43 |         mean = []
44 |         for k, f in enumerate(self.feature_maps):  # per-layer grid sizes: 'feature_maps': [38, 19, 10, 5, 3, 1]
45 |             for i, j in product(range(f), repeat=2):  # every (row, col) cell of the k-th grid
46 |                 f_k = self.image_size / self.steps[k]  # 300 / step: f_k = [37.5, 18.75, 9.375, 4.6875, 3, 1]
47 |                 # unit center x,y
48 |                 cx = (j + 0.5) / f_k
49 |                 cy = (i + 0.5) / f_k
50 | 
51 |                 # aspect_ratio: 1
52 |                 # rel size: min_size
53 |                 s_k = self.min_sizes[k] / self.image_size  # 'min_sizes' / 300 = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88] = s_k
54 |                 mean += [cx, cy, s_k, s_k]
55 | 
56 |                 # aspect_ratio: 1
57 |                 # rel size: sqrt(s_k * s_(k+1))
58 |                 s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size))  # 'max_sizes' / 300 = [0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
59 |                 mean += [cx, cy, s_k_prime, s_k_prime]
60 | 
61 |                 # rest of aspect ratios: 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
62 |                 for ar in self.aspect_ratios[k]:
63 |                     mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)]
64 |                     mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)]
65 |         # back to torch land
66 |         output = torch.Tensor(mean).view(-1, 4)
67 |         if self.clip:
68 |             output.clamp_(max=1, min=0)  # clamp prior centers and widths/heights into [0, 1]
69 |         return output  # output holds the (cx, cy, w, h) of every generated prior/anchor
70 | 
--------------------------------------------------------------------------------
/layers/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .l2norm import L2Norm
2 | from .multibox_loss import MultiBoxLoss
3 | 
4 | __all__ = ['L2Norm', 'MultiBoxLoss']
5 | 
--------------------------------------------------------------------------------
/layers/modules/l2norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Function
4 | from torch.autograd import Variable
5 | import torch.nn.init as init
6 | 
7 | class L2Norm(nn.Module):
8 |     def __init__(self, n_channels, scale):
9 |         super(L2Norm, self).__init__()
10 |         self.n_channels = n_channels
11 |         self.gamma = scale or None
12 |         self.eps = 1e-10
13 |         self.weight = nn.Parameter(torch.Tensor(self.n_channels))
14 |         self.reset_parameters()
15 | 
16 |     def reset_parameters(self):
17 |         init.constant_(self.weight, self.gamma)
18 | 
19 |     def forward(self, x):
20 |         norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
21 |         # L2-normalize along the channel dim, then rescale by the learned per-channel weight
22 |         x = torch.div(x, norm)
23 |         out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
24 |         return out
25 | 
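Each grid cell therefore gets 2 square priors plus 2 per extra aspect ratio, i.e. [4, 6, 6, 6, 4, 4] boxes across the six maps. A quick standalone sanity check that this reproduces the 8732 priors quoted throughout the repo:

feature_maps = [38, 19, 10, 5, 3, 1]
boxes_per_cell = [2 + 2 * len(ars) for ars in [[2], [2, 3], [2, 3], [2, 3], [2], [2]]]
print(boxes_per_cell)                                                # [4, 6, 6, 6, 4, 4]
print(sum(f * f * b for f, b in zip(feature_maps, boxes_per_cell)))  # 8732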
--------------------------------------------------------------------------------
/layers/modules/multibox_loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | from data import config as cfg  # ## config switched from the original coco to voc0712
7 | from ..box_utils import match, log_sum_exp
8 | import focal_loss  # ## FocalLoss is used in place of the plain cross-entropy term
9 | 
10 | 
11 | class MultiBoxLoss(nn.Module):
12 |     """SSD Weighted Loss Function
13 |     Compute Targets:
14 |         1) Produce Confidence Target Indices by matching ground truth boxes
15 |            with (default) 'priorboxes' that have jaccard index > threshold parameter
16 |            (default threshold: 0.5).
17 |            ### a prior whose IoU with a ground-truth box exceeds 0.5 is treated as positive
18 |         2) Produce localization target by 'encoding' variance into offsets of ground
19 |            truth boxes and their matched 'priorboxes'.
20 |            ### the encoding step
21 |         3) Hard negative mining to filter the excessive number of negative examples
22 |            that comes with using a large number of default bounding boxes.
23 |            (default negative:positive ratio 3:1)
24 |            ### hard negative mining keeps positives:negatives at 1:3
25 |     Objective Loss:
26 |         L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
27 |         ### two terms: classification cross-entropy and Smooth L1 localization loss
28 |         Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
29 |         weighted by α which is set to 1 by cross val.
30 |         Args:
31 |             c: class confidences (predicted class scores)
32 |             l: predicted boxes (predicted box regressions)
33 |             g: ground truth boxes
34 |             N: number of matched default boxes
35 |         See: https://arxiv.org/pdf/1512.02325.pdf for more details.
36 |     """
37 |     # Args: num_classes, IoU threshold, use priors for matching (bool), background label, hard negative mining (bool), negative:positive ratio, IoU ceiling for a hard negative, encode targets (bool), use GPU (default True)
38 |     def __init__(self, num_classes, overlap_thresh, prior_for_matching,
39 |                  bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, use_gpu=True):
40 |         super(MultiBoxLoss, self).__init__()
41 |         self.use_gpu = use_gpu
42 |         self.num_classes = num_classes
43 |         self.threshold = overlap_thresh  # ## IoU matching threshold
44 |         self.background_label = bkg_label  # ## background label value (appears unused below)
45 |         self.encode_target = encode_target  # ## also appears unused below
46 |         self.use_prior_for_matching = prior_for_matching  # ## match against the prior boxes (bool)
47 |         self.do_neg_mining = neg_mining  # ## hard negative mining (bool)
48 |         self.negpos_ratio = neg_pos  # ## negative:positive ratio
49 |         self.neg_overlap = neg_overlap  # ## IoU ceiling for a hard negative
50 |         self.variance = cfg.voc['variance']
51 | 
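The "encoding" named in step 2 turns a matched ground-truth box into the offsets the localization head regresses: center deltas scaled by variance[0] and the prior size, log size ratios scaled by variance[1]. A standalone sketch with boxes already in (cx, cy, w, h) center form (the repo's encode() in box_utils first converts matched corner-form boxes to center form; values here are made up):

import torch

variances = [0.1, 0.2]
prior = torch.tensor([0.50, 0.50, 0.20, 0.20])   # (cx, cy, w, h)
gt    = torch.tensor([0.55, 0.48, 0.25, 0.15])   # matched ground truth, center form

g_cxcy = (gt[:2] - prior[:2]) / (variances[0] * prior[2:])  # scaled center offsets
g_wh = torch.log(gt[2:] / prior[2:]) / variances[1]         # scaled log size ratios
print(torch.cat([g_cxcy, g_wh]))  # the 4-vector stored in loc_t for this prior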
52 |     # ## forward also performs the [hard negative mining]
53 |     # ## arg 1: the net output, predictions = (loc, conf, priors)
54 |     # ## arg 2: targets, the ground-truth boxes and labels
55 |     def forward(self, predictions, targets):
56 |         """Multibox Loss
57 |         Args:
58 |             predictions (tuple): A tuple containing loc preds, conf preds,
59 |                 and prior boxes from SSD net.
60 |                 conf shape: torch.size(batch_size, num_priors, num_classes)
61 |                 loc shape: torch.size(batch_size, num_priors, 4)
62 |                 priors shape: torch.size(num_priors, 4)
63 | 
64 |             targets (tensor): Ground truth boxes and labels for a batch,
65 |                 shape: [batch_size, num_objs, 5] (last idx is the label).
66 |         """
67 |         loc_data, conf_data, priors = predictions  # predicted offsets, predicted class scores, all prior boxes
68 |         num = loc_data.size(0)  # batch size
69 |         priors = priors[:loc_data.size(1), :]  # all priors, [8732, 4] (the slice is effectively a no-op)
70 |         num_priors = (priors.size(0))  # 8732 anchors
71 |         num_classes = self.num_classes
72 | 
73 |         # match priors (default boxes) and ground truth boxes
74 |         # ## loc_t and conf_t start out uninitialized and are filled in by match()
75 |         loc_t = torch.Tensor(num, num_priors, 4)  # [batch_size, 8732, 4]: encoded box target per prior
76 |         # records the class of every default box; class 0 means negative/background
77 |         conf_t = torch.LongTensor(num, num_priors)  # [batch_size, 8732]: class target per prior
78 |         for idx in range(num):  # loop over the images in the batch
79 |             # targets are 5-dim per object: 4 coordinates, then the label
80 |             truths = targets[idx][:, :-1].data  # ground-truth box coordinates
81 |             labels = targets[idx][:, -1].data  # ground-truth class labels
82 |             defaults = priors.data  # [8732, 4]; priors are the same for every image in the batch
83 | 
84 |             # [MATCH] args: threshold, ground truth, priors, variances, labels, loc buffer, conf buffer, batch index
85 |             match(self.threshold, truths, defaults, self.variance, labels, loc_t, conf_t, idx)
86 |             # match() assigns the best prior to each ground truth and the best ground truth to each prior
87 |             # the encoded offsets ([g_cx, g_cy, g_w, g_h]) go to loc_t, the top class label per prior to conf_t
88 |             # i.e. match() fills loc_t and conf_t in place (encoded locations and class targets)
89 |             # loc_t:  [batch_size, 8732, 4]
90 |             # conf_t: [batch_size, 8732]
91 |         if self.use_gpu:  # move the encoded targets to the GPU
92 |             loc_t = loc_t.cuda()  # [batch_size, 8732, 4]: per-image encoded box targets
93 |             conf_t = conf_t.cuda()  # [batch_size, 8732]
94 |         # wrap targets
95 |         loc_t = Variable(loc_t, requires_grad=False)  # [batch_size, 8732, 4] encoded offsets to learn
96 |         conf_t = Variable(conf_t, requires_grad=False)
97 |         # [batch_size, 8732] top class label for each prior; conf_t holds the targets
98 | 
99 |         pos = conf_t > 0  # bool [batch_size, 8732]: True where a prior matched an object rather than background
100 |         num_pos = pos.sum(dim=1, keepdim=True)  # number of positive priors per image
101 | 
102 |         # Localization Loss (Smooth L1)
103 |         # Shape: [batch, num_priors, 4]
104 |         # loc_loss only considers positives; loc_data is the predicted tensor
105 |         # ## pos_idx is bool [batch_size, 8732, 4]: True at object priors, False at background
106 |         pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
107 |         # append a trailing dim to pos, then expand the bool mask [batch_size, 8732] -> [batch_size, 8732, 4]
108 |         loc_p = loc_data[pos_idx].view(-1, 4)  # ## predicted offsets at positive priors (p = positive): [num_positives, 4]
109 |         loc_t = loc_t[pos_idx].view(-1, 4)  # ## the GT-encoded offsets at the same priors
110 |         # Smooth L1 between the network's predicted offsets and the encoded ground truth
111 |         '''
112 |         [loss_l] is the localization loss
113 |         '''
114 |         loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)  # ## size_average is deprecated; newer PyTorch prefers reduction='sum'
115 |         # ############################################################################################################################################
116 |         # ############################################################################################################################################
117 |         '''[Hard Negative Mining]'''
118 |         # conf_data: torch.size(batch_size, num_priors, num_classes)
119 |         batch_conf = conf_data.view(-1, self.num_classes)  # [batch_size*8732 rows, num_classes cols]: every prior in the batch
120 |         # [per-prior confidence loss, following the paper]
121 |         # ## conf_t.view(-1, 1): [batch_size*8732, 1], the matched class target per prior
122 |         # ## batch_conf: [batch_size*8732, num_classes], per-class scores per prior
123 |         loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))  # ## per-prior cross-entropy: logsumexp(scores) - score[target] (see the check below)
124 |         # loss_c: torch.Size([batch_size*8732, 1])
125 | 
126 |         # [Hard Negative Mining]
127 |         # loss_c[pos.view(-1, 1)] = 0  ### replaced by the reordered pair below
128 |         loss_c = loss_c.view(num, -1)  # ## reshaped first (swapped with the line below): [batch_size, 8732]
129 |         loss_c[pos] = 0  # ## zero the positives' loss; only background priors keep a nonzero loss (pos is bool [batch_size, 8732])
130 |         _, loss_idx = loss_c.sort(1, descending=True)  # _ holds each row sorted descending; loss_idx the original index of each sorted element
131 |         _, idx_rank = loss_idx.sort(1)  # ## idx_rank: [batch_size, 8732]
132 |         # ## first sort: indices in descending-loss order; the second sort turns them into each prior's rank in that order (the same double-argsort trick shown after detection.py)
133 |         # ## summary: positives come from IoU matching against the defaults; negatives are the highest-loss background priors
134 |         # ## cast pos from bool to long first; num_pos: [batch_size, 1], the count of object priors per image
135 |         num_pos = pos.long().sum(1, keepdim=True)
136 |         # ## max=pos.size(1)-1 caps the negatives at the number of priors per image
137 |         # ## negpos_ratio*num_pos: three negatives for every positive
138 |         num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)  # num_neg: torch.Size([batch_size, 1])
139 |         # ## num_pos and num_neg are both [batch_size, 1]: per-image positive and negative counts at a 1:3 ratio
140 | 
141 |         # ## neg (bool) [batch_size, 8732]: True for the num_neg highest-loss background priors per image
142 |         neg = idx_rank < num_neg.expand_as(idx_rank)
143 |         # the confidence loss covers both positives and the mined negatives
144 |         # pos and neg are bool, so pos_idx and neg_idx are bool as well
145 |         # ## pos_idx and neg_idx: [batch_size, 8732, num_classes]
146 |         pos_idx = pos.unsqueeze(2).expand_as(conf_data)
147 |         neg_idx = neg.unsqueeze(2).expand_as(conf_data)
148 | 
149 |         # ## conf_p: [selected priors, num_classes]
150 |         # ## conf_p covers positives and mined negatives alike
151 |         conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
152 |         # ## the net's per-class predictions at the selected priors
153 |         targets_weighted = conf_t[(pos+neg).gt(0)]  # ## the matching ground-truth labels
154 |         '''
155 |         [loss_c] is the classification loss
156 |         '''
157 |         # ## arg 1: conf_p, the net's per-class predictions at the selected priors
158 |         # ## arg 2: targets_weighted, the target labels (long)
159 |         # ## FocalLoss targets the classification term: (1) positive/negative imbalance, (2) easy and hard examples contribute unequally to the loss
160 |         # ## -------------------------------------------------------------------------------------------------
161 |         compute_c_loss = focal_loss.FocalLoss(alpha=None, gamma=2, class_num=num_classes, size_average=False)
162 |         loss_c = compute_c_loss(conf_p, targets_weighted)
163 |         # ## original loss below; keep it commented out while FocalLoss is in use
164 |         # loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
165 |         # ## Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
166 |         # ## -------------------------------------------------------------------------------------------------
167 | 
168 |         N = num_pos.data.sum()  # ## N: total number of matched (positive) priors in the batch
169 |         N = N.double()
170 |         loss_l = loss_l.double()  # cast both loss terms to double to match N
171 |         loss_c = loss_c.double()
172 |         loss_l /= N
173 |         loss_c /= N
174 |         return loss_l, loss_c
175 | 
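The mining criterion on line 123 looks opaque, but it is just the per-prior cross-entropy written out: -log softmax(x)[target] = logsumexp(x) - x[target]. A standalone check with made-up scores (torch.logsumexp stands in for the repo's numerically-stabilized log_sum_exp helper):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch_conf = torch.randn(6, 3)             # 6 priors, 3 classes (illustrative)
conf_t = torch.tensor([0, 2, 1, 0, 1, 2])  # matched class per prior

loss_c = torch.logsumexp(batch_conf, 1, keepdim=True) - batch_conf.gather(1, conf_t.view(-1, 1))
ref = F.cross_entropy(batch_conf, conf_t, reduction='none').unsqueeze(1)
print(torch.allclose(loss_c, ref))         # True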
--------------------------------------------------------------------------------
/loc-txt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {},
7 |    "outputs": [
8 |     {
9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "train and val size 616\n",
13 |       "train size 554\n"
14 |      ]
15 |     }
16 |    ],
17 |    "source": [
18 |     "import os \n",
19 |     "import random \n",
20 |     " \n",
21 |     "xmlfilepath=r'/data/lp/project/ssd.pytorch/xml_zc_fz' \n",
22 |     "saveBasePath=r'/data/lp/project/ssd.pytorch/txtsave'\n",
23 |     " \n",
24 |     "trainval_percent=1.0\n",
25 |     "train_percent=0.9\n",
26 |     "total_xml = os.listdir(xmlfilepath) \n",
27 |     "num=len(total_xml) \n",
28 |     "list=range(num) \n",
29 |     "tv=int(num*trainval_percent) \n",
30 |     "tr=int(tv*train_percent) \n",
31 |     "trainval= random.sample(list,tv) \n",
32 |     "train=random.sample(trainval,tr) \n",
33 |     " \n",
34 |     "print(\"train and val size\",tv) \n",
35 |     "print(\"train size\",tr) \n",
36 |     "ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w') \n",
37 |     "ftest = open(os.path.join(saveBasePath,'test.txt'), 'w') \n",
38 |     "ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w') \n",
39 |     "fval = open(os.path.join(saveBasePath,'val.txt'), 'w') \n",
40 |     " \n",
41 |     "for i in list: \n",
42 |     "    name=total_xml[i][:-4]+'\\n' \n",
43 |     "    if i in trainval: \n",
44 |     "        ftrainval.write(name) \n",
45 |     "        if i in train: \n",
46 |     "            ftrain.write(name) \n",
47 |     "        else: \n",
48 |     "            fval.write(name) \n",
49 |     "    else: \n",
50 |     "        ftest.write(name) \n",
51 |     " \n",
52 |     "ftrainval.close() \n",
53 |     "ftrain.close() \n",
54 |     "fval.close() \n",
55 |     "ftest.close() "
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {
62 |     "collapsed": true
63 |    },
64 |    "outputs": [],
65 |    "source": []
66 |   }
67 |  ],
68 |  "metadata": {
69 |   "kernelspec": {
70 |    "display_name": "Python 3",
71 |    "language": "python",
72 |    "name": "python3"
73 |   },
74 |   "language_info": {
75 |    "codemirror_mode": {
76 |     "name": "ipython",
77 |     "version": 3
78 |    },
79 |    "file_extension": ".py",
80 |    "mimetype": "text/x-python",
81 |    "name": "python",
82 |    "nbconvert_exporter": "python",
83 |    "pygments_lexer": "ipython3",
84 |    "version": "3.6.10"
85 |   }
86 |  },
87 |  "nbformat": 4,
88 |  "nbformat_minor": 2
89 | }
90 | 
--------------------------------------------------------------------------------
/ssd.py:
--------------------------------------------------------------------------------
1 | # import torch
2 | # import torch.nn as nn
3 | # import torch.nn.functional as F
4 | # from torch.autograd import 
Variable 5 | # from layers import * 6 | # from data import voc #coco 7 | # import os 8 | # 9 | # 10 | # class SSD(nn.Module): 11 | # """Single Shot Multibox Architecture 12 | # The network is composed of a base VGG network followed by the 13 | # added multibox conv layers. Each multibox layer branches into 14 | # 1) conv2d for class conf scores 15 | # 2) conv2d for localization predictions 16 | # 3) associated priorbox layer to produce default bounding 17 | # Args: 18 | # phase: (string) Can be "test" or "train" 19 | # size: input image size 20 | # base: VGG16 layers for input, size of either 300 or 500 21 | # extras: extra layers that feed to multibox loc and conf layers 22 | # head: "multibox head" consists of loc and conf conv layers 23 | # """ 24 | # def __init__(self, phase, size, base, extras, head, num_classes): 25 | # super(SSD, self).__init__() 26 | # self.phase = phase#训练的状态是train还是test 27 | # self.num_classes = num_classes 28 | # self.cfg =voc #(coco, )[num_classes == 2]#voc和coco都是字典型 找到num_classes键 对应为值为21的模型,这里返回【voc】 29 | # self.priorbox = PriorBox(self.cfg) #实例化一个类PriorBox,类实现的功能是生成所有的先验框 prior anchors 30 | # self.priors = Variable(self.priorbox.forward(), volatile=True)#结合上面一句话执行生成先验框的操作,priors保存的是【tensor 8760行4列】 31 | # self.size = size #图片大小 32 | # 33 | # # SSD network 34 | # self.vgg = nn.ModuleList(base)#####SSD前面的VGG16层 35 | # # Layer learns to scale the l2 normalized features from conv4_3 36 | # #conv4-3需要做L2归一化 37 | # self.L2Norm = L2Norm(512, 20) 38 | # self.extras = nn.ModuleList(extras)#SSD后面添加的额外层 39 | # #head包括两个list【第一个list是位置预测,第二个list是类别预测】 40 | # self.loc = nn.ModuleList(head[0]) 41 | # self.conf = nn.ModuleList(head[1]) 42 | # 43 | # if phase == 'test': #看train步骤的时候别看 44 | # self.softmax = nn.Softmax(dim=-1) #最后一个维度是预测的类别信息,要经过softmax 45 | # self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 46 | # 47 | # def forward(self, x): 48 | # """Applies network layers and ops on input image(s) x. 49 | # 50 | # Args: 51 | # x: input image or batch of images. Shape: [batch,3,300,300]. 52 | # 53 | # Return: 54 | # Depending on phase: 55 | # test: 56 | # Variable(tensor) of output class label predictions, 57 | # confidence score, and corresponding location predictions for 58 | # each object detected. 
Shape: [batch,topk,7] 59 | # 60 | # train: 61 | # list of concat outputs from: 62 | # 1: confidence layers, Shape: [batch*num_priors,num_classes] 63 | # 2: localization layers, Shape: [batch,num_priors*4] 64 | # 3: priorbox layers, Shape: [2,num_priors*4] 65 | # """ 66 | # sources = list() 67 | # loc = list() 68 | # conf = list() 69 | # 70 | # # apply vgg up to 【conv4_3 relu激活后再L2Norm操作后的输出tensor】 71 | # for k in range(23): 72 | # x = self.vgg[k](x) 73 | # s = self.L2Norm(x) 74 | # sources.append(s) 75 | # 76 | # # apply vgg up to 【fc7 也就是vgg基础层最后一层 relu激活层操作后的输出tensor】 77 | # for k in range(23, len(self.vgg)): 78 | # x = self.vgg[k](x) 79 | # sources.append(x) 80 | # 81 | # # apply extra layers and cache source layer outputs【将额外添加的4个tensor提取出来】 82 | # for k, v in enumerate(self.extras): 83 | # x = F.relu(v(x), inplace=True) 84 | # if k % 2 == 1: 85 | # sources.append(x) 86 | # #到此为止 【sources里面包括了6个特征层】 87 | # # apply multibox head to source layers 88 | # for (x, l, c) in zip(sources, self.loc, self.conf):#【sources loc conf都是具有六个元素的list】 89 | # # [b, C, H, W]——[b, H, W, C],因为我们最后要在通道这个维度上做softmax 90 | # loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 91 | # conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 92 | # 93 | # loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 94 | # conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)###########没有看特别明白 95 | # 96 | # if self.phase == "test": 97 | # output = self.detect( 98 | # loc.view(loc.size(0), -1, 4),# loc preds 99 | # self.softmax(conf.view(conf.size(0), -1,self.num_classes)), # conf preds 100 | # self.priors.type(type(x.data))# default boxes 101 | # ) 102 | # else: 103 | # output = ( 104 | # loc.view(loc.size(0), -1, 4), 105 | # conf.view(conf.size(0), -1, self.num_classes), 106 | # self.priors 107 | # ) 108 | # return output 109 | # 110 | # def load_weights(self, base_file): 111 | # other, ext = os.path.splitext(base_file) 112 | # if ext == '.pkl' or '.pth': 113 | # print('Loading weights into state dict...') 114 | # self.load_state_dict(torch.load(base_file, 115 | # map_location=lambda storage, loc: storage)) 116 | # print('Finished!') 117 | # else: 118 | # print('Sorry only .pth and .pkl files supported.') 119 | # 120 | # 121 | # # This function is derived from torchvision VGG make_layers() 122 | # # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 123 | # def vgg(cfg, i, batch_norm=False): 124 | # layers = [] # 用于存放vgg网络的list 125 | # in_channels = i# 最前面那层的维度--300*300*3,因此i=3 我的理解是输入时候的维度 126 | # for v in cfg: # 代码厉害的地方,循环建立多层,数据信息存放在一个字典中 127 | # if v == 'M': #'M'代表Maxpooling ceil_mode=False # maxpooling 时边缘不补 128 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 129 | # elif v == 'C':#'C'代表Maxpooling ceil_mode=True # maxpooling 时边缘补NAN 130 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 131 | # else: 132 | # conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 133 | # if batch_norm: 134 | # layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 135 | # else: 136 | # layers += [conv2d, nn.ReLU(inplace=True)] 137 | # in_channels = v 138 | # pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 139 | # conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 140 | # # dilation=卷积核元素之间的间距,扩大卷积感受野的范围,没有增加卷积size 141 | # conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 142 | # layers += [pool5, conv6, 143 | # nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 144 | # return layers #返回的是vgg的结构 145 | # 146 | # 147 | # def add_extras(cfg, i, 
batch_norm=False): 148 | # # Extra layers added to VGG for feature scaling 149 | # layers = [] 150 | # in_channels = i 151 | # flag = False 152 | # for k, v in enumerate(cfg): 153 | # if in_channels != 'S':# S代表stride,为2时候就相当于缩小feature map 154 | # if v == 'S': 155 | # layers += [nn.Conv2d(in_channels, cfg[k + 1], 156 | # kernel_size=(1, 3)[flag], stride=2, padding=1)] 157 | # else: 158 | # layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 159 | # flag = not flag 160 | # in_channels = v 161 | # return layers 162 | # 163 | # 164 | # def multibox(vgg, extra_layers, cfg, num_classes): 165 | # loc_layers = [] # loc_layers的输出维度是default box的种类(4or6)*4 166 | # conf_layers = [] # conf_layers的输出维度是default box的种类(4or6)*num_class 167 | # vgg_source = [21, -2] #第21层和倒数第二层 168 | # for k, v in enumerate(vgg_source): 169 | # loc_layers += [nn.Conv2d(vgg[v].out_channels, 170 | # cfg[k] * 4, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*4】 171 | # conf_layers += [nn.Conv2d(vgg[v].out_channels, 172 | # cfg[k] * num_classes, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*num_classes】 173 | # for k, v in enumerate(extra_layers[1::2], 2): 174 | # loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 175 | # * 4, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*4】 176 | # conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 177 | # * num_classes, kernel_size=3, padding=1)] #特征图的尺寸没有改变,通道数变成 【4/6*num_classes】 178 | # return vgg, extra_layers, (loc_layers, conf_layers) 179 | # #返回的是vgg,extra_layers的结构 以及六个特征层提取的【位置回归特征图,类别回归特征图】 180 | # 181 | # 182 | # base = { 183 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 184 | # 512, 512, 512], 185 | # '512': [], 186 | # } 187 | # extras = { 188 | # '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 189 | # '512': [], 190 | # } 191 | # mbox = { 192 | # '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 193 | # '512': [], 194 | # } 195 | # 196 | # 197 | # def build_ssd(phase, size=300, num_classes=21):#阶段【train or test】 输入图片尺寸大小 类别数 198 | # if phase != "test" and phase != "train":#分成训练和测试两个阶段 199 | # print("ERROR: Phase: " + phase + " not recognized") 200 | # return 201 | # if size != 300: 202 | # print("ERROR: You specified size " + repr(size) + ". However, " + 203 | # "currently only SSD300 (size=300) is supported!") 204 | # return 205 | # base_, extras_, head_ = multibox(vgg(base[str(size)], 3), #网络结构是先经过vgg+add_extras 这里的vgg输出通道是1024,add_extras输入为1024 206 | # add_extras(extras[str(size)], 1024), 207 | # mbox[str(size)], num_classes) 208 | # #返回的head_是个元组,里面包括两个list【第一个list是位置预测,第二个list是类别预测】,每个list 6个元素,每个元素是个特征层tensor 209 | # return SSD(phase, size, base_, extras_, head_, num_classes) 210 | ######################版本二################################## 211 | # import torch 212 | # import torch.nn as nn 213 | # import torch.nn.functional as F 214 | # from torch.autograd import Variable 215 | # # from SSD_pytorch.models import * 216 | # # from SSD_pytorch.utils.config import opt 217 | # import os 218 | # from torch.autograd import Variable 219 | # from layers import * 220 | # from data import voc #coco 221 | # 222 | # 223 | # class SSD(nn.Module): 224 | # """Single Shot Multibox Architecture 225 | # The network is composed of a base VGG network followed by the 226 | # added multibox conv layers. 
Each multibox layer branches into 227 | # 1) conv2d for class conf scores 228 | # 2) conv2d for localization predictions 229 | # 3) associated priorbox layer to produce default bounding 230 | # boxes specific to the layer's feature map size. 231 | # SSD模型由去掉全连接层的vgg网络为基础组成。在之后添加了多盒转化层。 232 | # 每个多盒层分支是: 233 | # 1)conv2d 获取分类置信度 234 | # 2)conv2d进行坐标位置预测 235 | # 3)相关层去产生特定于该层特征图大小的默认的预测框bounding boxes 236 | # 237 | # 238 | # 239 | # See: https://arxiv.org/pdf/1512.02325.pdf for more details. 240 | # 241 | # Args: 242 | # phase: (string) Can be "test" or "train" 243 | # size: input image size 输入的图像尺寸 244 | # base: VGG16 layers for input, size of either 300 or 500 经过修改的vgg网络 245 | # extras: extra layers that feed to multibox loc and conf layers 246 | # 提供多盒定位的格外层 和 分类置信层(vgg网络后面新增的额外层) 247 | # head: "multibox head" consists of loc and conf conv layers 248 | # 由定位和分类卷积层组成的multibox head 249 | # (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 250 | # """ 251 | # 252 | # def __init__(self, phase, size, base, extras, head, num_classes): 253 | # super(SSD, self).__init__() 254 | # self.phase = phase 255 | # self.num_classes = num_classes 256 | # self.cfg = voc 257 | # # 新定义一个类,该类的功能:对于每个feature map,生成预测框(中心坐标及偏移量) 258 | # self.priorbox = PriorBox(self.cfg) 259 | # # 调用forward,返回生成的预测框结果 260 | # # 对于所有预测的feature map,存储着生成的不同长宽比的默认框(可以理解为anchor) 261 | # self.priors = Variable(self.priorbox.forward(), volatile=True) 262 | # #300 263 | # self.size = size 264 | # 265 | # # SSD network范围 266 | # # 经过修改的vgg网络 267 | # self.vgg = nn.ModuleList(base)################################################ 268 | # # Layer learns to scale the l2 normalized features from conv4_3 269 | # # Layer层从conv4_3学习去缩放l2正则化特征 270 | # # 论文中conv4_3 相比较于其他的layers,有着不同的 feature scale,我们使用 ParseNet 中的 L2 normalization 技术 271 | # # 将conv4_3 feature map 中每一个位置的 feature norm scale 到 20,并且在 back-propagation 中学习这个 scale 272 | # self.L2Norm = L2Norm(512, 20) 273 | # # vgg网络后面新增的额外层 274 | # self.extras = nn.ModuleList(extras)#################################################### 275 | # # vgg与extras中进行分类和回归的层 276 | # self.loc = nn.ModuleList(head[0]) 277 | # self.conf = nn.ModuleList(head[1]) 278 | # 279 | # # 如果网络用于测试,则加入softmax和检测 280 | # if phase == 'test': 281 | # self.softmax = nn.Softmax(dim=-1) 282 | # self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 283 | # 284 | # #=====bobo新增================== 285 | # # pool2到conv4_3 扩张卷积,尺度少一半 286 | # self.DilationConv_128_128= nn.Conv2d(in_channels=128,out_channels= 128, kernel_size=3, padding=2, dilation=2,stride=2) 287 | # # conv4_3到conv4_3 尺度不变 288 | # self.conv_512_256 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 289 | # # fc7 到 conv4_3 反卷积上采样,尺度大一倍 290 | # self.DeConv_1024_128 = nn.ConvTranspose2d(in_channels=1024,out_channels=128,kernel_size=2,stride=2) 291 | # 292 | # # conv4_3 到FC7 扩张卷积,尺度少一半 293 | # self.DilationConv_512_128 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2,stride=2) 294 | # # FC7到FC7 尺度不变 295 | # self.conv_1024_256 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=3, padding=1, stride=1) 296 | # # conv8_2 到 FC7 反卷积上采样,尺度大一倍 10->19 297 | # self.DeConv_512_128 = nn.ConvTranspose2d(in_channels=512, out_channels=128, kernel_size=3, stride=2,padding=1) 298 | # 299 | # 300 | # # conv5_3到conv8_2 301 | # self.DilationConv_512_128_2 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, stride=2) 302 | # # conv8_2到conv8_2 尺度不变 303 | # self.conv_512_256_2 = 
nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 304 | # # conv9_2到conv8_2 305 | # self.DeConv_256_128_2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2) 306 | # 307 | # # 平滑层 308 | # self.smooth = nn.Conv2d(512, 512, kernel_size = 3, padding = 1, stride = 1) 309 | # 310 | # # 通道数BN层的参数是输出通道数out_channels 311 | # self.bn = nn.BatchNorm2d(128) 312 | # def forward(self, x): 313 | # """Applies network layers and ops on input image(s) x. 314 | # 前向传播 315 | # 316 | # Args: 317 | # x: input image or batch of images. Shape: [batch,3,300,300]. 318 | # 319 | # Return: 320 | # Depending on phase: 321 | # test测试集: 322 | # Variable(tensor) of output class label predictions, 323 | # confidence score, and corresponding location predictions for 324 | # each object detected. Shape: [batch,topk,7] 325 | # 326 | # train训练集: 327 | # list of concat outputs from: 328 | # 1: 分类层confidence layers, Shape: [batch*num_priors,num_classes] 329 | # 2: 回归定位层localization layers, Shape: [batch,num_priors*4] 330 | # 3: priorbox layers, Shape: [2,num_priors*4] 331 | # """ 332 | # # sources保存 网络生成的不同层feature map结果,以便使用这些feature map来进行分类与回归 333 | # sources = list() 334 | # # 保存预测层不同feature map通过回归和分类网络的输出结果 335 | # loc = list() 336 | # conf = list() 337 | # 338 | # # 原论文中vgg的conv4_3,relu之后加入L2 Normalization正则化,然后保存feature map 339 | # # apply vgg up to conv4_3 relu 340 | # # 将vgg层的feature map保存 341 | # # k的范围为0-22 342 | # 343 | # #=========开始保存 所需的所有中间信息 344 | # 345 | # 346 | # # 保存pool2(pool下标从1开始)的结果 347 | # # 经过maxpool,所以不需要L2Norm正则化 348 | # for k in range(10): 349 | # x = self.vgg[k](x) 350 | # sources.append(x) 351 | # 352 | # # 保存conv4_3结果 353 | # for k in range(10,23): 354 | # x = self.vgg[k](x) 355 | # s = self.L2Norm(x) 356 | # sources.append(s) 357 | # 358 | # # 保存conv5_3结果 类似conv4_3原仓库一样,加入L2Norm 359 | # for k in range(23, 30): 360 | # x = self.vgg[k](x) 361 | # s = self.L2Norm(x) 362 | # sources.append(s) 363 | # 364 | # # 保存 原fc7的输出结果 365 | # # apply vgg up to fc7,即将原fc7层更改为卷积层输出的结果,经过relu之后保存结果 366 | # # k的范围为23 - 结束 367 | # for k in range(30, len(self.vgg)): 368 | # x = self.vgg[k](x) 369 | # sources.append(x) 370 | # 371 | # # 将新加的额外层 conv8_2、conv9_2、conv10_2、conv11_2结果保存 372 | # # apply extra layers and cache source layer outputs 373 | # # 将新增层的feature map保存 374 | # for k, v in enumerate(self.extras): 375 | # # 每经过一个conv卷积,都relu一下 376 | # x = F.relu(v(x), inplace=True) 377 | # # 论文中隔一个conv保存一个结果 378 | # if k % 2 == 1: 379 | # sources.append(x) 380 | # 381 | # # 此时sources保存了所有中间结果,论文中的pool2、conv4_3、conv5_3、fc7、conv8_2、conv9_2、conv10_2、conv11_2 382 | # 383 | # # sources_final保存各层融合之后的最终结果 384 | # sources_final=list() 385 | # 386 | # # con4_3层融合结果 self.bn1(self.conv1(x)) 在通道维度上融合 387 | # conv4_fp=torch.cat((F.relu(self.bn(self.DilationConv_128_128(sources[0])),inplace=True), F.relu(self.conv_512_256(sources[1]),inplace=True), F.relu(self.DeConv_1024_128(sources[3]),inplace=True)),1) 388 | # sources_final.append(F.relu( self.smooth(conv4_fp) , inplace=True)) 389 | # # FC7层融合结果 390 | # fc7_fp = torch.cat((F.relu( self.bn(self.DilationConv_512_128(sources[1])) ,inplace=True),F.relu( self.conv_1024_256(sources[3]),inplace=True) ,F.relu( self.DeConv_512_128(sources[4]),inplace=True)),1) 391 | # sources_final.append(F.relu( self.smooth(fc7_fp) , inplace=True)) 392 | # # conv8_2层融合结果 393 | # conv8_fp= torch.cat(( F.relu( self.bn(self.DilationConv_512_128_2(sources[2])),inplace=True) ,F.relu(self.conv_512_256_2(sources[4]) ,inplace=True) ,F.relu( 
self.DeConv_256_128_2(sources[5]),inplace=True) ),1) 394 | # sources_final.append( F.relu( self.smooth(conv8_fp) , inplace=True) ) 395 | # 396 | # 397 | # # 保存 conv9_2、conv10_2、conv11_2 398 | # sources_final.append(sources[5]) 399 | # sources_final.append(sources[6]) 400 | # sources_final.append(sources[7]) 401 | # 402 | # 403 | # # apply multibox head to source layers 404 | # # permute 将tensor的维度换位 参数为换位顺序 405 | # #contiguous 返回一个内存连续的有相同数据的tensor 406 | # 407 | # #source保存的是每个预测层的网络输出,即feature maps 408 | # #loc 通过使用feature map去预测回归 409 | # #conf通过使用feature map去预测分类 410 | # for (x, l, c) in zip(sources_final, self.loc, self.conf): 411 | # loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 412 | # conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 413 | # # 在给定维度上对输入的张量序列seq 进行连接操作 dimension=1表示在列上连接 414 | # loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 415 | # conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 416 | # # 测试集上的输出 417 | # if self.phase == "test": 418 | # output = self.detect( 419 | # loc.view(loc.size(0), -1, 4), # loc preds 定位的预测 420 | # self.softmax(conf.view(conf.size(0), -1, 421 | # self.num_classes)), # conf preds 分类的预测 422 | # self.priors.type(type(x.data)) # default boxes 预测框 423 | # ) 424 | # else: 425 | # # 训练集上的输出 426 | # output = ( 427 | # loc.view(loc.size(0), -1, 4), # loc preds [32,8732,4] 通过网络输出的定位的预测 428 | # conf.view(conf.size(0), -1, self.num_classes), #conf preds [32,8732,21] 通过网络输出的分类的预测 429 | # self.priors # 不同feature map根据公式生成的锚结果 [8732,4] 内容为 中心点坐标和宽高 430 | # ) 431 | # return output 432 | # 433 | # 434 | # def load_weights(self, base_file): 435 | # other, ext = os.path.splitext(base_file) 436 | # if ext == '.pkl' or '.pth': 437 | # print('Loading weights into state dict...') 438 | # self.load_state_dict(torch.load(base_file, 439 | # map_location=lambda storage, loc: storage)) 440 | # print('Finished!') 441 | # else: 442 | # print('Sorry only .pth and .pkl files supported.') 443 | # 444 | # 445 | # 446 | # 447 | # # This function is derived from torchvision VGG make_layers() 448 | # # 此方法源自torchvision VGG make_layers() 449 | # # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 450 | # def vgg(cfg, i, batch_norm=False): 451 | # ''' 452 | # vgg的结构 453 | # cfg: vgg的结构 454 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 455 | # 512, 512, 512], 456 | # i: 3 输入图像通道数 457 | # batch_norm 为False。若为True,则网络中加入batch_norm 458 | # 459 | # 返回没有全连接层的vgg网络 460 | # ''' 461 | # #保存vgg所有层 462 | # layers = [] 463 | # #输入图像通道数 464 | # in_channels = i 465 | # for v in cfg: #M与C会导致生成的feature map大小出现变化 466 | # if v == 'M': #最大池化层 默认floor模式 467 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 468 | # elif v == 'C': #最大池化层 ceil模式 两种不同的maxpool方式 参考https://blog.csdn.net/GZHermit/article/details/79351803 469 | # layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 470 | # else: 471 | # # 卷积 472 | # conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 473 | # if batch_norm: 474 | # layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 475 | # else: 476 | # layers += [conv2d, nn.ReLU(inplace=True)] 477 | # in_channels = v 478 | # # 论文将 Pool5 layer 的参数,从 卷积核2×2步长为2 转变成 卷积核3×3 步长为1 外加一个 pad 479 | # pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 480 | # # 论文中将VGG的FC6 layer、FC7 layer 转成为 卷积层conv6,conv7 并从模型的FC6、FC7 上的参数,进行采样得到这两个卷积层的 参数 481 | # #输入通道512 输出通道为1024 卷积核为3 padding为6 dilation为卷积核中元素之间的空洞大小 482 | # # 修改Pool5 layer参数,导致感受野大小改变。所以conv6采用 atrous 算法,即孔填充算法。 483 | # # 
孔填充算法将卷积 weights 膨胀扩大,即原来卷积核是 3x3,膨胀后,可能变成 7x7 了,这样 receptive field 变大了,而 score map 也很大,即输出变成 dense 484 | # #这么做的好处是,输出的 score map 变大了,即是 dense 的输出了,而且 receptive field 不会变小,而且可以变大。这对做分割、检测等工作非常重要。 485 | # conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 486 | # #输入通道512 输出通道为1024 卷积核为3 487 | # conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 488 | # #将 修改的层也加入到vgg网络中 489 | # layers += [pool5, conv6, 490 | # nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 491 | # return layers 492 | # 493 | # 494 | # def add_extras(cfg, i, batch_norm=False): 495 | # ''' 496 | # vgg网络后面新增的额外层 497 | # :param cfg: '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 498 | # :param i: 1024 输入通道数 499 | # :param batch_norm: flase 500 | # :return: 501 | # ''' 502 | # # 添加到VGG的额外图层用于特征缩放 503 | # layers = [] 504 | # #1024 输入通道数 505 | # in_channels = i 506 | # # 控制卷积核尺寸,一维数组选前一个数还是后一个数。在每次循环时flag都改变,导致网络的卷积核尺寸为1,3,1,3交替 507 | # # False 为1,True为3 508 | # # SSD网络图中s1指步长为1,s2指步长为2 509 | # # 在该代码中,S代表步长为2,无S代表默认,即步长为1,所以cfg与论文网络结构完全匹配 510 | # flag = False 511 | # # enumerate枚举 k为下标 v为值 512 | # for k, v in enumerate(cfg): 513 | # if in_channels != 'S': 514 | # if v == 'S': 515 | # layers += [nn.Conv2d(in_channels, cfg[k + 1], 516 | # kernel_size=(1, 3)[flag], stride=2, padding=1)] 517 | # else: 518 | # layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 519 | # flag = not flag 520 | # in_channels = v 521 | # return layers 522 | # 523 | # 524 | # def multibox(vgg, extra_layers, cfg, num_classes): 525 | # ''' 526 | # :param vgg: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 527 | # :param extra_layers: vgg网络后面新增的额外层 528 | # :param cfg: '300': [4, 6, 6, 6, 4, 4], 不同部分的feature map上一个网格预测多少框 529 | # :param num_classes: 20分类+1背景,共21类 530 | # :return: 531 | # ''' 532 | # # 保存所有参与预测的网络层 533 | # loc_layers = [] 534 | # conf_layers = [] 535 | # # 传入的修改过的vgg网络用于预测的网络是21层以及 倒数第二层 536 | # vgg_source = [21, -2] 537 | # for k, v in enumerate(vgg_source): 538 | # # 按照fp-ssd论文,将1024改为512通道 539 | # if k==1: 540 | # in_channels=512 541 | # else: 542 | # in_channels=vgg[v].out_channels 543 | # #4是回归的坐标参数 cfg代表该层feature map上一个网格预测多少框 544 | # loc_layers += [nn.Conv2d(in_channels, 545 | # cfg[k] * 4, kernel_size=3, padding=1)] 546 | # #num_classes是类别数 cfg代表该层feature map上一个网格预测多少框 547 | # conf_layers += [nn.Conv2d(in_channels, 548 | # cfg[k] * num_classes, kernel_size=3, padding=1)] 549 | # # [x::y] 从下标x开始,每隔y取值 550 | # #论文中新增层也是每隔一个层添加一个预测层 551 | # # 将新增的额外层中的预测层也添加上 start=2:下标起始位置 552 | # for k, v in enumerate(extra_layers[1::2], 2): 553 | # loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 554 | # * 4, kernel_size=3, padding=1)] 555 | # conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 556 | # * num_classes, kernel_size=3, padding=1)] 557 | # return vgg, extra_layers, (loc_layers, conf_layers) 558 | # 559 | # 560 | # base = { 561 | # # 数字为每层feature map的层数 M代表最大池化层(默认floor模式) C代表最大池化层(ceil模式) (去掉vgg16的最后的 maxpool、fc、fc、fc、softmax) 562 | # '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 563 | # 512, 512, 512], 564 | # '512': [], 565 | # } 566 | # extras = { 567 | # # 每个特征图都是由 两个conv 组成, conv1x1 和conv3x3 568 | # '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 569 | # '512': [], 570 | # } 571 | # mbox = { 572 | # '300': [4, 6, 6, 6, 4, 4], # 不同部分的feature map上一个网格预测多少框 573 | # '512': [], 574 | # } 575 | # 576 | # 577 | # def build_ssd(phase, size=300, num_classes=21): 578 | # ''' 579 | # 新建SSD模型 580 | # ''' 581 | # # 训练或测试 582 | # if phase != "test" and phase != "train": 583 | # 
print("ERROR: Phase: " + phase + " not recognized") 584 | # return 585 | # #当前SSD300只支持大小300×300的数据集训练 586 | # if size != 300: 587 | # print("ERROR: You specified size " + repr(size) + ". However, " + 588 | # "currently only SSD300 (size=300) is supported!") 589 | # return 590 | # 591 | # #base_: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 592 | # #extras_: vgg网络后面新增的额外层 593 | # # head_ : (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 594 | # base_, extras_, head_ = multibox(vgg(base[str(size)], 3), #vgg方法返回 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 595 | # add_extras(extras[str(size)], 1024), #vgg网络后面新增的额外层 596 | # mbox[str(size)], #mbox指不同部分的feature map上一个网格预测多少框 597 | # num_classes) 598 | # # phase:'train' size:300 num_classes: 21 类别数(20类+1背景) 599 | # return SSD(phase, size, base_, extras_, head_, num_classes) 600 | import torch 601 | import torch.nn as nn 602 | import torch.nn.functional as F 603 | from torch.autograd import Variable 604 | import os 605 | from layers import * 606 | from data import voc # coco 607 | ############################################################################### 608 | # 【通道显著性模块】 609 | class ChannelAttention(nn.Module): 610 | def __init__(self, in_planes, ratio=16): 611 | super(ChannelAttention, self).__init__() 612 | # 特征图先经过最大池化和平均池化 结果是1*1*通道数的tensor【最大池化,平均池化】 613 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 614 | self.max_pool = nn.AdaptiveMaxPool2d(1) 615 | # 在经过全连接层先降低维度再升高维度,进行特征融合【MLP】 616 | self.fc1 = nn.Conv2d(in_planes, in_planes // 16, 1, bias=False) 617 | self.relu1 = nn.ReLU() 618 | self.fc2 = nn.Conv2d(in_planes // 16, in_planes, 1, bias=False) 619 | # 【激活层】 620 | self.sigmoid = nn.Sigmoid() 621 | 622 | def forward(self, x): 623 | avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) 624 | max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x)))) 625 | out = avg_out + max_out # 相加之后每个像素点的位置元素相加 626 | return self.sigmoid(out) 627 | 628 | # 【空间显著性模块】 629 | class SpatialAttention(nn.Module): 630 | def __init__(self, kernel_size=7): 631 | super(SpatialAttention, self).__init__() 632 | assert kernel_size in (3, 7), 'kernel size must be 3 or 7' # 这里设定kernal_size必须是3,7 633 | padding = 3 if kernel_size == 7 else 1 634 | 635 | self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) 636 | self.sigmoid = nn.Sigmoid() 637 | 638 | def forward(self, x): 639 | avg_out = torch.mean(x, dim=1, keepdim=True) 640 | max_out, _ = torch.max(x, dim=1, keepdim=True) # 会返回结果元素的值 和 对应的位置index 641 | x = torch.cat([avg_out, max_out], dim=1) 642 | x = self.conv1(x) 643 | return self.sigmoid(x) 644 | 645 | # 【Bottleneck将特征图先经过 通道显著性模块,再经过 空间显著性模块】 646 | class Bottleneck(nn.Module): # 将通道显著性和空间显著性模块相连接 647 | def __init__(self, inplanes, stride=1, downsample=None): 648 | super(Bottleneck, self).__init__() 649 | self.ca = ChannelAttention(inplanes) 650 | self.sa = SpatialAttention() 651 | self.stride = stride 652 | self.relu = nn.ReLU(inplace=True) 653 | 654 | def forward(self, x): 655 | save = x # 先将原本的特征图保存下来 656 | out = self.ca(x) * x # 先经过通道显著性模块 657 | out = self.sa(out) * out # 再经过空间显著性模块 658 | out += save ###这里不应该是相乘吗?????为啥变成了相加 659 | out = self.relu(out) # 最后再经过relu激活函数 660 | return out # 输出结果尺寸不变,但是通道数变成了【planes * 4】这就是残差模块 661 | 662 | #############################【SSD中融合特征显著性模块CBAM】###################### 663 | class SSD(nn.Module): 664 | """Single Shot Multibox Architecture 665 | The network is composed of a base VGG network followed by the 666 | added multibox conv layers. 
Each multibox layer branches into 667 | 1) conv2d for class conf scores 668 | 2) conv2d for localization predictions 669 | 3) associated priorbox layer to produce default bounding 670 | boxes specific to the layer's feature map size. 671 | SSD模型由去掉全连接层的vgg网络为基础组成。在之后添加了多盒转化层。 672 | 每个多盒层分支是: 673 | 1)conv2d 获取分类置信度 674 | 2)conv2d进行坐标位置预测 675 | 3)相关层去产生特定于该层特征图大小的默认的预测框bounding boxes 676 | 677 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 678 | 679 | Args: 680 | phase: (string) Can be "test" or "train" 681 | size: input image size 输入的图像尺寸 682 | base: VGG16 layers for input, size of either 300 or 500 经过修改的vgg网络 683 | extras: extra layers that feed to multibox loc and conf layers 684 | 提供多盒定位的格外层 和 分类置信层(vgg网络后面新增的额外层) 685 | head: "multibox head" consists of loc and conf conv layers 686 | 由定位和分类卷积层组成的multibox head 687 | (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 688 | """ 689 | 690 | def __init__(self, phase, size, base, extras, head, num_classes): 691 | super(SSD, self).__init__() 692 | self.phase = phase 693 | self.num_classes = num_classes 694 | self.cfg = voc 695 | # 新定义一个类,该类的功能:对于每个feature map,生成预测框(中心坐标及偏移量) 696 | self.priorbox = PriorBox(self.cfg) 697 | # 调用forward,返回生成的预测框结果 698 | # 对于所有预测的feature map,存储着生成的不同长宽比的默认框(可以理解为anchor) 699 | self.priors = Variable(self.priorbox.forward(), volatile=True) 700 | # 300 701 | self.size = size 702 | 703 | # SSD network范围 704 | # 经过修改的vgg网络 705 | self.vgg = nn.ModuleList(base) ################################################ 706 | # Layer learns to scale the l2 normalized features from conv4_3 707 | # Layer层从conv4_3学习去缩放l2正则化特征 708 | # 论文中conv4_3 相比较于其他的layers,有着不同的 feature scale,我们使用 ParseNet 中的 L2 normalization 技术 709 | # 将conv4_3 feature map 中每一个位置的 feature norm scale 到 20,并且在 back-propagation 中学习这个 scale 710 | self.L2Norm = L2Norm(512, 20) 711 | # vgg网络后面新增的额外层 712 | self.extras = nn.ModuleList(extras) 713 | # vgg与extras中进行分类和回归的层 714 | self.loc = nn.ModuleList(head[0]) 715 | self.conf = nn.ModuleList(head[1]) 716 | 717 | # 如果网络用于测试,则加入softmax和检测 718 | if phase == 'test': 719 | self.softmax = nn.Softmax(dim=-1) 720 | self.detect = Detect(num_classes, 0, 200, 0.01, 0.45) 721 | 722 | # =====bobo新增================== 723 | # pool2到conv4_3 扩张卷积,尺度少一半 724 | self.DilationConv_128_128 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=2, dilation=2, 725 | stride=2) 726 | # conv4_3到conv4_3 尺度不变 727 | self.conv_512_256 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 728 | # fc7 到 conv4_3 反卷积上采样,尺度大一倍 729 | self.DeConv_1024_128 = nn.ConvTranspose2d(in_channels=1024, out_channels=128, kernel_size=2, stride=2) 730 | 731 | # conv4_3 到FC7 扩张卷积,尺度少一半 732 | self.DilationConv_512_128 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, 733 | stride=2) 734 | # FC7到FC7 尺度不变 735 | self.conv_1024_256 = nn.Conv2d(in_channels=1024, out_channels=256, kernel_size=3, padding=1, stride=1) 736 | # conv8_2 到 FC7 反卷积上采样,尺度大一倍 10->19 737 | self.DeConv_512_128 = nn.ConvTranspose2d(in_channels=512, out_channels=128, kernel_size=3, stride=2, padding=1) 738 | 739 | # conv5_3到conv8_2 740 | self.DilationConv_512_128_2 = nn.Conv2d(in_channels=512, out_channels=128, kernel_size=3, padding=2, dilation=2, 741 | stride=2) 742 | # conv8_2到conv8_2 尺度不变 743 | self.conv_512_256_2 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=3, padding=1, stride=1) 744 | # conv9_2到conv8_2 745 | self.DeConv_256_128_2 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, 
stride=2) 746 | 747 | # 平滑层 748 | self.smooth = nn.Conv2d(512, 512, kernel_size=3, padding=1, stride=1) 749 | 750 | # 通道数BN层的参数是输出通道数out_channels 751 | self.bn = nn.BatchNorm2d(128) 752 | 753 | # CBAM模块【6个特征层:512 512 512 256 256 256 】 754 | self.CBAM1 = Bottleneck(512) 755 | self.CBAM2 = Bottleneck(512) 756 | self.CBAM3 = Bottleneck(512) 757 | self.CBAM4 = Bottleneck(256) 758 | self.CBAM5 = Bottleneck(256) 759 | self.CBAM6 = Bottleneck(256) 760 | 761 | def forward(self, x): 762 | """Applies network layers and ops on input image(s) x. 763 | 前向传播 764 | Args: 765 | x: input image or batch of images. Shape: [batch,3,300,300]. 766 | 767 | Return: 768 | Depending on phase: 769 | test测试集: 770 | Variable(tensor) of output class label predictions, 771 | confidence score, and corresponding location predictions for 772 | each object detected. Shape: [batch,topk,7] 773 | 774 | train训练集: 775 | list of concat outputs from: 776 | 1: 分类层confidence layers, Shape: [batch*num_priors,num_classes] 777 | 2: 回归定位层localization layers, Shape: [batch,num_priors*4] 778 | 3: priorbox layers, Shape: [2,num_priors*4] 779 | """ 780 | # sources保存 网络生成的不同层feature map结果,以便使用这些feature map来进行分类与回归 781 | sources = list() 782 | # 保存预测层不同feature map通过回归和分类网络的输出结果 783 | loc = list() 784 | conf = list() 785 | 786 | # 原论文中vgg的conv4_3,relu之后加入L2 Normalization正则化,然后保存feature map 787 | # apply vgg up to conv4_3 relu 788 | # 将vgg层的feature map保存 789 | # k的范围为0-22 790 | # =========开始保存 所需的所有中间信息 791 | 792 | # 保存pool2(pool下标从1开始)的结果 793 | # 经过maxpool,所以不需要L2Norm正则化 794 | for k in range(10): 795 | x = self.vgg[k](x) 796 | sources.append(x) 797 | 798 | # 保存conv4_3结果 799 | for k in range(10, 23): 800 | x = self.vgg[k](x) 801 | s = self.L2Norm(x) 802 | sources.append(s) 803 | 804 | # 保存conv5_3结果 类似conv4_3原仓库一样,加入L2Norm 805 | for k in range(23, 30): 806 | x = self.vgg[k](x) 807 | s = self.L2Norm(x) 808 | sources.append(s) 809 | 810 | # 保存 原fc7的输出结果 811 | # apply vgg up to fc7,即将原fc7层更改为卷积层输出的结果,经过relu之后保存结果 812 | # k的范围为23 - 结束 813 | for k in range(30, len(self.vgg)): 814 | x = self.vgg[k](x) 815 | sources.append(x) 816 | 817 | # 将新加的额外层 conv8_2、conv9_2、conv10_2、conv11_2结果保存 818 | # apply extra layers and cache source layer outputs 819 | # 将新增层的feature map保存 820 | for k, v in enumerate(self.extras): 821 | # 每经过一个conv卷积,都relu一下 822 | x = F.relu(v(x), inplace=True) 823 | # 论文中隔一个conv保存一个结果 824 | if k % 2 == 1: 825 | sources.append(x) 826 | 827 | # 此时sources保存了所有中间结果,论文中的pool2、conv4_3、conv5_3、fc7、conv8_2、conv9_2、conv10_2、conv11_2 828 | # sources_final保存各层融合之后的最终结果 829 | sources_final = list() 830 | # con4_3层融合结果 self.bn1(self.conv1(x)) 在通道维度上融合 831 | conv4_fp = torch.cat((F.relu(self.bn(self.DilationConv_128_128(sources[0])), inplace=True), 832 | F.relu(self.conv_512_256(sources[1]), inplace=True), 833 | F.relu(self.DeConv_1024_128(sources[3]), inplace=True)), 1) 834 | # sources_final.append(F.relu( self.smooth(conv4_fp) , inplace=True)) 835 | conv4_fp = F.relu(self.smooth(conv4_fp), inplace=True) 836 | sources_final.append(self.CBAM1(conv4_fp)) 837 | # FC7层融合结果 838 | fc7_fp = torch.cat((F.relu(self.bn(self.DilationConv_512_128(sources[1])), inplace=True), 839 | F.relu(self.conv_1024_256(sources[3]), inplace=True), 840 | F.relu(self.DeConv_512_128(sources[4]), inplace=True)), 1) 841 | # sources_final.append(F.relu( self.smooth(fc7_fp) , inplace=True)) 842 | fc7_fp = F.relu(self.smooth(fc7_fp), inplace=True) 843 | sources_final.append(self.CBAM2(fc7_fp)) 844 | # conv8_2层融合结果 845 | conv8_fp = 
torch.cat((F.relu(self.bn(self.DilationConv_512_128_2(sources[2])), inplace=True), 846 | F.relu(self.conv_512_256_2(sources[4]), inplace=True), 847 | F.relu(self.DeConv_256_128_2(sources[5]), inplace=True)), 1) 848 | # sources_final.append(F.relu( self.smooth(conv8_fp) , inplace=True)) 849 | conv8_fp = F.relu(self.smooth(conv8_fp), inplace=True) 850 | sources_final.append(self.CBAM3(conv8_fp)) 851 | 852 | # 保存 conv9_2、conv10_2、conv11_2 853 | sources_final.append(self.CBAM4(sources[5])) 854 | sources_final.append(self.CBAM5(sources[6])) 855 | sources_final.append(self.CBAM6(sources[7])) 856 | 857 | # apply multibox head to source layers 858 | # permute 将tensor的维度换位 参数为换位顺序 859 | # contiguous 返回一个内存连续的有相同数据的tensor 860 | 861 | # source保存的是每个预测层的网络输出,即feature maps 862 | # loc 通过使用feature map去预测回归 863 | # conf通过使用feature map去预测分类 864 | for (x, l, c) in zip(sources_final, self.loc, self.conf): 865 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 866 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 867 | # 在给定维度上对输入的张量序列seq 进行连接操作 dimension=1表示在列上连接 868 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 869 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 870 | # 测试集上的输出 871 | if self.phase == "test": 872 | output = self.detect( 873 | loc.view(loc.size(0), -1, 4), # loc preds 定位的预测 874 | self.softmax(conf.view(conf.size(0), -1, 875 | self.num_classes)), # conf preds 分类的预测 876 | self.priors.type(type(x.data)) # default boxes 预测框 877 | ) 878 | else: 879 | # 训练集上的输出 880 | output = ( 881 | loc.view(loc.size(0), -1, 4), # loc preds [32,8732,4] 通过网络输出的定位的预测 882 | conf.view(conf.size(0), -1, self.num_classes), # conf preds [32,8732,21] 通过网络输出的分类的预测 883 | self.priors # 不同feature map根据公式生成的锚结果 [8732,4] 内容为 中心点坐标和宽高 884 | ) 885 | return output 886 | 887 | def load_weights(self, base_file): 888 | other, ext = os.path.splitext(base_file) 889 | if ext == '.pkl' or '.pth': 890 | print('Loading weights into state dict...') 891 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 892 | print('Finished!') 893 | else: 894 | print('Sorry only .pth and .pkl files supported.') 895 | 896 | 897 | # This function is derived from torchvision VGG make_layers() 898 | # 此方法源自torchvision VGG make_layers() 899 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 900 | def vgg(cfg, i, batch_norm=False): 901 | ''' 902 | vgg的结构 903 | cfg: vgg的结构 904 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 905 | 512, 512, 512], 906 | i: 3 输入图像通道数 907 | batch_norm 为False。若为True,则网络中加入batch_norm 908 | 909 | 返回没有全连接层的vgg网络 910 | ''' 911 | # 保存vgg所有层 912 | layers = [] 913 | # 输入图像通道数 914 | in_channels = i 915 | for v in cfg: # M与C会导致生成的feature map大小出现变化 916 | if v == 'M': # 最大池化层 默认floor模式 917 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 918 | elif v == 'C': # 最大池化层 ceil模式 两种不同的maxpool方式 参考https://blog.csdn.net/GZHermit/article/details/79351803 919 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 920 | else: 921 | # 卷积 922 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 923 | if batch_norm: 924 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 925 | else: 926 | layers += [conv2d, nn.ReLU(inplace=True)] 927 | in_channels = v 928 | # 论文将 Pool5 layer 的参数,从 卷积核2×2步长为2 转变成 卷积核3×3 步长为1 外加一个 pad 929 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 930 | # 论文中将VGG的FC6 layer、FC7 layer 转成为 卷积层conv6,conv7 并从模型的FC6、FC7 上的参数,进行采样得到这两个卷积层的 参数 931 | # 输入通道512 输出通道为1024 卷积核为3 
padding为6 dilation为卷积核中元素之间的空洞大小 932 | # 修改Pool5 layer参数,导致感受野大小改变。所以conv6采用 atrous 算法,即孔填充算法。 933 | # 孔填充算法将卷积 weights 膨胀扩大,即原来卷积核是 3x3,膨胀后,可能变成 7x7 了,这样 receptive field 变大了,而 score map 也很大,即输出变成 dense 934 | # 这么做的好处是,输出的 score map 变大了,即是 dense 的输出了,而且 receptive field 不会变小,而且可以变大。这对做分割、检测等工作非常重要。 935 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 936 | # 输入通道512 输出通道为1024 卷积核为3 937 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 938 | # 将 修改的层也加入到vgg网络中 939 | layers += [pool5, conv6, 940 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 941 | return layers 942 | 943 | 944 | def add_extras(cfg, i, batch_norm=False): 945 | ''' 946 | vgg网络后面新增的额外层 947 | :param cfg: '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 948 | :param i: 1024 输入通道数 949 | :param batch_norm: flase 950 | :return: 951 | ''' 952 | # 添加到VGG的额外图层用于特征缩放 953 | layers = [] 954 | # 1024 输入通道数 955 | in_channels = i 956 | # 控制卷积核尺寸,一维数组选前一个数还是后一个数。在每次循环时flag都改变,导致网络的卷积核尺寸为1,3,1,3交替 957 | # False 为1,True为3 958 | # SSD网络图中s1指步长为1,s2指步长为2 959 | # 在该代码中,S代表步长为2,无S代表默认,即步长为1,所以cfg与论文网络结构完全匹配 960 | flag = False 961 | # enumerate枚举 k为下标 v为值 962 | for k, v in enumerate(cfg): 963 | if in_channels != 'S': 964 | if v == 'S': 965 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 966 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 967 | else: 968 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 969 | flag = not flag 970 | in_channels = v 971 | return layers 972 | 973 | 974 | def multibox(vgg, extra_layers, cfg, num_classes): 975 | ''' 976 | :param vgg: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 977 | :param extra_layers: vgg网络后面新增的额外层 978 | :param cfg: '300': [4, 6, 6, 6, 4, 4], 不同部分的feature map上一个网格预测多少框 979 | :param num_classes: 20分类+1背景,共21类 980 | :return: 981 | ''' 982 | # 保存所有参与预测的网络层 983 | loc_layers = [] 984 | conf_layers = [] 985 | # 传入的修改过的vgg网络用于预测的网络是21层以及 倒数第二层 986 | vgg_source = [21, -2] 987 | for k, v in enumerate(vgg_source): 988 | # 按照fp-ssd论文,将1024改为512通道 989 | if k == 1: 990 | in_channels = 512 991 | else: 992 | in_channels = vgg[v].out_channels 993 | # 4是回归的坐标参数 cfg代表该层feature map上一个网格预测多少框 994 | loc_layers += [nn.Conv2d(in_channels, 995 | cfg[k] * 4, kernel_size=3, padding=1)] 996 | # num_classes是类别数 cfg代表该层feature map上一个网格预测多少框 997 | conf_layers += [nn.Conv2d(in_channels, 998 | cfg[k] * num_classes, kernel_size=3, padding=1)] 999 | # [x::y] 从下标x开始,每隔y取值 1000 | # 论文中新增层也是每隔一个层添加一个预测层 1001 | # 将新增的额外层中的预测层也添加上 start=2:下标起始位置 1002 | for k, v in enumerate(extra_layers[1::2], 2): 1003 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 1004 | * 4, kernel_size=3, padding=1)] 1005 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 1006 | * num_classes, kernel_size=3, padding=1)] 1007 | return vgg, extra_layers, (loc_layers, conf_layers) 1008 | 1009 | 1010 | base = { 1011 | # 数字为每层feature map的层数 M代表最大池化层(默认floor模式) C代表最大池化层(ceil模式) (去掉vgg16的最后的 maxpool、fc、fc、fc、softmax) 1012 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 1013 | 512, 512, 512], 1014 | '512': [], 1015 | } 1016 | extras = { 1017 | # 每个特征图都是由 两个conv 组成, conv1x1 和conv3x3 1018 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 1019 | '512': [], 1020 | } 1021 | mbox = { 1022 | '300': [4, 6, 6, 6, 4, 4], # 不同部分的feature map上一个网格预测多少框 1023 | '512': [], 1024 | } 1025 | 1026 | 1027 | def build_ssd(phase, size=300, num_classes=21): 1028 | ''' 1029 | 新建SSD模型 1030 | ''' 1031 | # 训练或测试 1032 | if phase != "test" and phase != "train": 1033 | print("ERROR: Phase: " + phase + " not recognized") 1034 
| return 1035 | # 当前SSD300只支持大小300×300的数据集训练 1036 | if size != 300: 1037 | print("ERROR: You specified size " + repr(size) + ". However, " + 1038 | "currently only SSD300 (size=300) is supported!") 1039 | return 1040 | 1041 | # base_: 经过修改后的vgg网络(去掉全连接层,修改pool5参数并添加新层) 1042 | # extras_: vgg网络后面新增的额外层 1043 | # head_ : (loc_layers, conf_layers) vgg与extras中进行分类和回归的层 1044 | base_, extras_, head_ = multibox(vgg(base[str(size)], 3), 1045 | add_extras(extras[str(size)], 1024), 1046 | mbox[str(size)], 1047 | num_classes) 1048 | # phase:'train' size:300 num_classes: 21 类别数(20类+1背景) 1049 | return SSD(phase, size, base_, extras_, head_, num_classes) 1050 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # test.py 用于测试单张图片的效果 2 | from __future__ import print_function 3 | import sys 4 | import os 5 | import argparse 6 | import torch 7 | import torch.nn as nn 8 | import torch.backends.cudnn as cudnn 9 | import torchvision.transforms as transforms 10 | from torch.autograd import Variable 11 | from data import VOC_ROOT, VOC_CLASSES as labelmap 12 | from PIL import Image 13 | from data import VOCAnnotationTransform, VOCDetection, BaseTransform, VOC_CLASSES 14 | import torch.utils.data as data 15 | from ssd import build_ssd 16 | 17 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 18 | parser.add_argument('--trained_model', default='weights/ssd300_VOC_10000.pth', # #########修改检测模型的路径 19 | type=str, help='Trained state_dict file path to open') 20 | parser.add_argument('--save_folder', default='eval/', type=str, 21 | help='Dir to save results') 22 | parser.add_argument('--visual_threshold', default=0.6, type=float, 23 | help='Final confidence threshold') 24 | parser.add_argument('--cuda', default=True, type=bool, 25 | help='Use cuda to train model') 26 | parser.add_argument('--voc_root', default='data/VOCdevkit', help='Location of VOC root directory') # ###修改读取图片的路径【VOC_ROOT】 27 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 28 | args = parser.parse_args() 29 | 30 | if args.cuda and torch.cuda.is_available(): 31 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 32 | else: 33 | torch.set_default_tensor_type('torch.FloatTensor') 34 | 35 | if not os.path.exists(args.save_folder): 36 | os.mkdir(args.save_folder) 37 | 38 | 39 | # 输入参数:【VOC数据集root,网络,cuda,输入的测试数据,预处理函数, 阈值】 40 | def test_net(save_folder, net, cuda, testset, transform, thresh): 41 | # dump predictions and assoc. 
42 |     filename = save_folder+'test1.txt'  # name of the saved txt file
43 |     num_images = len(testset)  # number of test images
44 |     for i in range(num_images):  # iterate over every image
45 |         print('Testing image {:d}/{:d}....'.format(i+1, num_images))  # display is 1-based, hence i+1
46 | 
47 |         img = testset.pull_image(i)  # pull_image reads one image with cv2.imread
48 |         img_id, annotation = testset.pull_anno(i)  # pull_anno reads the image id and annotation
49 | 
50 |         x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1)  # transform returns a tuple; take the image and move the channel axis to the front
51 |         x = Variable(x.unsqueeze(0))  # add a batch dimension back to get a 4-D tensor
52 | 
53 |         with open(filename, mode='a') as f:  # first write the ground-truth boxes
54 |             f.write('\nGROUND TRUTH FOR: '+img_id+'\n')
55 |             for box in annotation:
56 |                 f.write('label: '+' || '.join(str(b) for b in box)+'\n')
57 |         if cuda:
58 |             x = x.cuda()  # move the single image onto the GPU
59 | 
60 |         y = net(x)  # forward pass
61 |         detections = y.data
62 |         # scale each detection back up to the image
63 |         scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
64 |         pred_num = 0
65 |         for cls in range(detections.size(1)):  # loop over classes (renamed from i, which shadowed the image index)
66 |             j = 0
67 |             while j < detections.size(2) and detections[0, cls, j, 0] >= thresh:  # was a hardcoded 0.6; use the thresh argument and bound j
68 |                 if pred_num == 0:
69 |                     with open(filename, mode='a') as f:
70 |                         f.write('PREDICTIONS: '+'\n')
71 |                 score = detections[0, cls, j, 0]
72 |                 label_name = labelmap[cls-1]
73 |                 pt = (detections[0, cls, j, 1:]*scale).cpu().numpy()
74 |                 coords = (pt[0], pt[1], pt[2], pt[3])
75 |                 pred_num += 1
76 |                 with open(filename, mode='a') as f:
77 |                     f.write(str(pred_num)+' label: '+label_name+' score: ' +
78 |                             str(score) + ' '+' || '.join(str(c) for c in coords) + '\n')
79 |                 j += 1
80 | 
81 | 
82 | def test_voc():
83 |     # load net
84 |     num_classes = len(VOC_CLASSES) + 1                   # +1 background [the author doubts this +1 is needed]
85 |     net = build_ssd('test', 300, num_classes)            # initialize SSD
86 |     net.load_state_dict(torch.load(args.trained_model))  # load the trained weights into the freshly built network
87 |     net.eval()  # switch to eval() mode
88 |     print('Finished loading model!')
89 |     # load data
90 |     testset = VOCDetection(args.voc_root, [('2007', 'test')], None, VOCAnnotationTransform())  # set the second argument to the image set you want to evaluate
91 |     if args.cuda:
92 |         net = net.cuda()
93 |         cudnn.benchmark = True
94 |     # evaluation
95 |     # arguments: [results save folder, network, cuda, test dataset, preprocessing transform, threshold]
96 |     test_net(args.save_folder,  # results save folder
97 |              net,  # network
98 |              args.cuda,  # cuda
99 |              testset,  # test dataset
100 |              BaseTransform(net.size, (104, 117, 123)),  # preprocessing transform
101 |              thresh=args.visual_threshold  # threshold
102 |              )
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     test_voc()
107 | 
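For reference, the detections tensor unpacked above has shape [batch, num_classes, top_k, 5], each row holding (score, x1, y1, x2, y2) with coordinates relative to the input size. A minimal standalone sketch of the same decoding step test_net performs; the helper name and thresh default are mine, and it assumes rows come back score-sorted, as this repo's Detect layer produces:

import torch

def decode_detections(detections, img_shape, labelmap, thresh=0.6):
    """Collect (label, score, [x1, y1, x2, y2]) triples above thresh for one image."""
    h, w = img_shape[:2]
    scale = torch.tensor([w, h, w, h], dtype=torch.float32, device=detections.device)
    results = []
    for cls in range(1, detections.size(1)):      # class 0 is background, skip it
        for j in range(detections.size(2)):
            score = detections[0, cls, j, 0].item()
            if score < thresh:
                break                             # rows are sorted by score, so stop early
            box = (detections[0, cls, j, 1:] * scale).tolist()
            results.append((labelmap[cls - 1], score, box))
    return results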
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from data import *
3 | from utils.augmentations import SSDAugmentation
4 | from layers.modules import MultiBoxLoss
5 | from ssd import build_ssd
6 | import time
7 | import torch
8 | from torch.autograd import Variable
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.backends.cudnn as cudnn
12 | import torch.nn.init as init
13 | import torch.utils.data as data
14 | import argparse
15 | import visdom as viz
16 | import os
17 | os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # pick the GPU used for training
18 | 
19 | 
20 | def str2bool(v):
21 |     return v.lower() in ("yes", "true", "t", "1")
22 | 
23 | 
24 | parser = argparse.ArgumentParser(
25 |     description='Single Shot MultiBox Detector Training With Pytorch')
26 | train_set = parser.add_mutually_exclusive_group()
27 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
28 |                     type=str, help='VOC or COCO')
29 | parser.add_argument('--dataset_root', default="data/VOCdevkit/",  # change [dataset_root] as needed
30 |                     help='Dataset root directory path')
31 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth',  # [pretrained base weights]; save_folder is prepended on load, so no 'weights/' prefix here
32 |                     help='Pretrained base model')
33 | parser.add_argument('--batch_size', default=16, type=int,  # [adjust batch_size]
34 |                     help='Batch size for training')
35 | parser.add_argument('--resume', default='weights/ssd300_VOC_500.pth', type=str,  # [resume from a checkpoint]; None trains from scratch
36 |                     help='Checkpoint state_dict file to resume training from')
37 | parser.add_argument('--start_iter', default=501, type=int,
38 |                     help='Resume training at this iter')
39 | parser.add_argument('--num_workers', default=2, type=int,  # [num_workers]
40 |                     help='Number of workers used in dataloading')
41 | parser.add_argument('--cuda', default=True, type=str2bool,
42 |                     help='Use CUDA to train model')
43 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,  # [adjust learning rate]
44 |                     help='initial learning rate')
45 | parser.add_argument('--momentum', default=0.9, type=float,
46 |                     help='Momentum value for optim')
47 | parser.add_argument('--weight_decay', default=5e-4, type=float,
48 |                     help='Weight decay for SGD')
49 | parser.add_argument('--gamma', default=0.1, type=float,
50 |                     help='Gamma update for SGD')
51 | parser.add_argument('--visdom', default=False, type=str2bool,  # loss visualization, disabled for this run
52 |                     help='Use visdom for loss visualization')
53 | parser.add_argument('--save_folder', default='weights/',
54 |                     help='Directory for saving checkpoint models')
55 | args = parser.parse_args()
56 | 
57 | if torch.cuda.is_available():
58 |     if args.cuda:
59 |         torch.set_default_tensor_type('torch.cuda.FloatTensor')
60 |     if not args.cuda:
61 |         print("WARNING: It looks like you have a CUDA device, but aren't " +
62 |               "using CUDA.\nRun with --cuda for optimal training speed.")
63 |         torch.set_default_tensor_type('torch.FloatTensor')
64 | else:
65 |     torch.set_default_tensor_type('torch.FloatTensor')
66 | 
67 | if not os.path.exists(args.save_folder):
68 |     os.mkdir(args.save_folder)
69 | 
70 | 
71 | def train():
72 |     cfg = voc  # voc is a dict holding the network's configuration
73 |     dataset = VOCDetection(  # the VOC dataset class
74 |         root=args.dataset_root,  # dataset root directory
75 |         transform=SSDAugmentation(cfg['min_dim'], MEANS))  # image preprocessing (input size and means); the class default is None, and I think the trailing MEANS could be dropped
76 | 
77 |     if args.visdom:  # visualization tooling, safe to ignore ###################
78 |         import visdom
79 |         viz = visdom.Visdom()
80 | 
81 |     ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
82 |     # phase [train or test], input image size, number of classes
83 |     # build_ssd is a function defined in ssd.py
84 |     # it returns an instance of class SSD(nn.Module), so ssd_net is an SSD object
85 |     # ssd_net carries everything SSD inherits from nn.Module plus the author's added methods
86 |     # the SSD class defines the base (VGG16 with the FC layers reworked), the extras (the paper's multi-scale feature maps), and the head
87 |     # convolving the 6 selected feature maps yields, for every default box, per-class confidences and box coordinates
88 |     net = ssd_net  # at this point SSD has only run __init__(), not forward(); net is the same object
89 | 
90 |     if args.cuda:  # whether to spread the model over multiple GPUs {for my task I would not use multi-GPU}
91 |         net = torch.nn.DataParallel(ssd_net)
92 |         cudnn.benchmark = True
93 |     if args.resume:  # resume defaults to None, i.e. not continuing from a checkpoint [ideally the checkpoint would also store the optimizer state]
94 |         # [model_state_dict, optimizer_state_dict, epoch]
95 |         print('Resuming training, loading {}...'.format(args.resume))
96 |         ssd_net.load_weights(args.resume)
97 |     else:  # otherwise load the pretrained VGG base weights straight from the weights folder
98 |         vgg_weights = torch.load(args.save_folder + args.basenet)  # weights of the VGG base inside ssd_net
99 |         print('Loading base network...')
100 |         ssd_net.vgg.load_state_dict(vgg_weights)  # only the vgg module of ssd_net gets pretrained weights; the extras, feature-fusion and CBAM modules start from scratch
101 |     if args.cuda:  # move the model onto the GPU for training
102 |         net = net.cuda()
103 |     if not args.resume:  # ######################################################################
104 |         print('Initializing weights...')  # when not resuming from a checkpoint, the remaining extras/loc/conf layers get xavier initialization
105 |         # initialize newly added layers' weights with xavier method
106 |         ssd_net.extras.apply(weights_init)  # extras module: weights and biases initialized by the xavier helper
107 |         ssd_net.loc.apply(weights_init)  # loc module: weights and biases initialized by the xavier helper
108 |         ssd_net.conf.apply(weights_init)  # conf module: weights and biases initialized by the xavier helper
109 | 
110 |     # [optimizer] net.parameters() are the model parameters; plus learning rate, momentum and weight decay
111 |     optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
112 |     # loss definition [MultiBoxLoss is a class computing the network loss; criterion is an instance of it]
113 |     # [loss] the key part: criterion is an nn.Module whose output has two parts, loss_c and loss_l
114 |     criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, False, args.cuda)
115 |     # switch to training mode
116 |     net.train()
117 |     # loss counters
118 |     loc_loss = 0
119 |     conf_loss = 0
120 |     epoch = 0
121 |     print('Loading the dataset...')
122 |     epoch_size = len(dataset) // args.batch_size  # number of batches per epoch
123 |     print('Training SSD on:', dataset.name)
124 |     print('Using the specified args:')
125 |     print(args)  # print the configured arguments
126 | 
127 |     step_index = 0
128 |     # visualization
129 |     if args.visdom:  # defaults to False
130 |         vis_title = 'SSD.PyTorch on ' + dataset.name
131 |         vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss']
132 |         iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend)
133 |         epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend)
134 | 
135 |     data_loader = data.DataLoader(dataset, args.batch_size,
136 |                                   num_workers=args.num_workers,  # I changed the default to 0
137 |                                   shuffle=True,
138 |                                   collate_fn=detection_collate,  # detection_collate merges batch_size images into one batch
139 |                                   pin_memory=True)
140 |     batch_iterator = iter(data_loader)  # batch iterator, steps through the batches
141 |     for iteration in range(args.start_iter, cfg['max_iter']):  # iterate up to the maximum iteration count
142 |         if args.visdom and iteration != 0 and (iteration % epoch_size == 0):  # never runs while args.visdom stays False
143 |             update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size)
144 |             # reset epoch loss counters
145 |             loc_loss = 0
146 |             conf_loss = 0
147 |             epoch += 1
148 | 
149 |         if iteration in cfg['lr_steps']:  # decay the learning rate at the scheduled iterations
150 |             step_index += 1
151 |             adjust_learning_rate(optimizer, args.gamma, step_index)
152 | 
153 |         # load train data
154 |         try:
155 |             images, targets = next(batch_iterator)
156 |             # images and targets are the loaded training data
157 |         except StopIteration:
158 |             batch_iterator = iter(data_loader)  # re-create the shared iterator (fixes the 'bath_iterator' typo, which left batch_iterator exhausted)
159 |             images, targets = next(batch_iterator)
160 |         # images: [batch_size, 3, 300, 300]
161 |         # targets: [batch_size, num_objects, 5]
162 |         # num_objects is the number of ground truths in one image; 5 = four coordinates plus one label
163 |         if args.cuda:  # move the data onto cuda
164 |             images = Variable(images.cuda())
165 |             targets = [Variable(ann.cuda(), volatile=True) for ann in targets]
166 |         else:
167 |             images = Variable(images)
168 |             targets = [Variable(ann, volatile=True) for ann in targets]
169 |         # forward
170 |         t0 = time.time()
171 |         # out is the forward output of net: a tuple with 3 parts [loc, conf, priors]
172 |         out = net(images)
173 |         # backprop: zero the optimizer's gradients
174 |         optimizer.zero_grad()
175 |         # criterion is an nn.Module; the call below runs its forward pass [worth a close read, it contains the hard negative mining]
176 |         # ###################################[[[the training-stage loss!!!]]]######################################
177 |         # input 1: the network output out: [loc, conf, priors]
178 |         # input 2: targets: the ground-truth box and label values
179 |         loss_l, loss_c = criterion(out, targets)  # criterion is the MultiBoxLoss instance; its forward pass returns [loss_l, loss_c]
180 |         loss = loss_l + loss_c  # total loss
181 |         loss.backward()
182 |         optimizer.step()
183 |         t1 = time.time()
184 |         # the two counters below do not seem to be used elsewhere
185 |         loc_loss += loss_l.data  # ### unclear whether to use .item() or .data (on recent PyTorch, .item() is the safe choice)
186 |         conf_loss += loss_c.data  # ### same .item() vs .data question
187 | 
188 |         if iteration % 10 == 0:
189 |             print('timer: %.4f sec.' % (t1 - t0))
190 |             print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % loss.data, end=' ')  # same .item() vs .data question
191 | 
192 |         if args.visdom:
193 |             update_vis_plot(iteration, loss_l.data, loss_c.data, iter_plot, epoch_plot, 'append')
194 | 
195 |         if iteration != 0 and iteration % 2000 == 0:
196 |             # how often to checkpoint; while experimenting, lower this (e.g. to 500) to save time
197 |             print('Saving state, iter:', iteration)  # the saved checkpoint
198 |             torch.save(ssd_net.state_dict(), 'weights/ssd300_VOC_' + repr(iteration) + '.pth')  # checkpoint path
199 |     torch.save(ssd_net.state_dict(), args.save_folder + '' + args.dataset + '.pth')  # final save: only the parameters, not the whole model
200 | 
201 | 
202 | def adjust_learning_rate(optimizer, gamma, step):
203 |     """Sets the learning rate to the initial LR decayed by 10 at every
204 |        specified step
205 |     """
206 |     lr = args.lr * (gamma ** step)
207 |     for param_group in optimizer.param_groups:
208 |         param_group['lr'] = lr
209 | 
210 | 
211 | def xavier(param):
212 |     init.xavier_uniform_(param)  # the in-place underscore variant; plain xavier_uniform is deprecated
213 | 
214 | 
215 | def weights_init(m):
216 |     if isinstance(m, nn.Conv2d):
217 |         xavier(m.weight.data)
218 |         m.bias.data.zero_()
219 | 
220 | 
221 | def create_vis_plot(_xlabel, _ylabel, _title, _legend):
222 |     return viz.line(
223 |         X=torch.zeros((1,)).cpu(),
224 |         Y=torch.zeros((1, 3)).cpu(),
225 |         opts=dict(
226 |             xlabel=_xlabel,
227 |             ylabel=_ylabel,
228 |             title=_title,
229 |             legend=_legend
230 |         )
231 |     )
232 | 
233 | 
234 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
235 |                     epoch_size=1):
236 |     viz.line(
237 |         X=torch.ones((1, 3)).cpu() * iteration,
238 |         Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size,
239 |         win=window1,
240 |         update=update_type
241 |     )
242 |     # initialize epoch plot on first iteration
243 |     if iteration == 0:
244 |         viz.line(
245 |             X=torch.zeros((1, 3)).cpu(),
246 |             Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(),
247 |             win=window2,
248 |             update=True
249 |         )
250 | 
251 | 
252 | if __name__ == '__main__':
253 |     train()
254 | 
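adjust_learning_rate above implements plain step decay, lr = args.lr * gamma ** step, so with the defaults (lr=1e-4, gamma=0.1) each entry in cfg['lr_steps'] divides the rate by ten. A two-line sketch of the arithmetic:

base_lr, gamma = 1e-4, 0.1
for step in range(3):
    print(step, base_lr * gamma ** step)  # 0 1e-04, 1 1e-05, 2 1e-06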
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .augmentations import SSDAugmentation
--------------------------------------------------------------------------------
/utils/augmentations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torchvision import transforms
3 | import cv2
4 | import numpy as np
5 | import types
6 | from numpy import random
7 | 
8 | 
9 | def intersect(box_a, box_b):
10 |     max_xy = np.minimum(box_a[:, 2:], box_b[2:])
11 |     min_xy = np.maximum(box_a[:, :2], box_b[:2])
12 |     inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf)
13 |     return inter[:, 0] * inter[:, 1]
14 | 
15 | 
16 | def jaccard_numpy(box_a, box_b):
17 |     """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
18 |     is simply the intersection over union of two boxes.
19 |     E.g.:
20 |         A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
21 |     Args:
22 |         box_a: Multiple bounding boxes, Shape: [num_boxes,4]
23 |         box_b: Single bounding box, Shape: [4]
24 |     Return:
25 |         jaccard overlap: Shape: [box_a.shape[0]]
26 |     """
27 |     inter = intersect(box_a, box_b)
28 |     area_a = ((box_a[:, 2]-box_a[:, 0]) *
29 |               (box_a[:, 3]-box_a[:, 1]))  # [A]
30 |     area_b = ((box_b[2]-box_b[0]) *
31 |               (box_b[3]-box_b[1]))  # scalar
32 |     union = area_a + area_b - inter
33 |     return inter / union  # [A]
34 | 
35 | 
36 | class Compose(object):
37 |     """Composes several augmentations together.
38 |     Args:
39 |         transforms (List[Transform]): list of transforms to compose.
40 |     Example:
41 |         >>> augmentations.Compose([
42 |         >>>     transforms.CenterCrop(10),
43 |         >>>     transforms.ToTensor(),
44 |         >>> ])
45 |     """
46 | 
47 |     def __init__(self, transforms):
48 |         self.transforms = transforms
49 | 
50 |     def __call__(self, img, boxes=None, labels=None):
51 |         for t in self.transforms:
52 |             img, boxes, labels = t(img, boxes, labels)
53 |         return img, boxes, labels
54 | 
55 | 
56 | class Lambda(object):
57 |     """Applies a lambda as a transform."""
58 | 
59 |     def __init__(self, lambd):
60 |         assert isinstance(lambd, types.LambdaType)
61 |         self.lambd = lambd
62 | 
63 |     def __call__(self, img, boxes=None, labels=None):
64 |         return self.lambd(img, boxes, labels)
65 | 
66 | 
67 | class ConvertFromInts(object):
68 |     def __call__(self, image, boxes=None, labels=None):
69 |         return image.astype(np.float32), boxes, labels
70 | 
71 | 
72 | class SubtractMeans(object):
73 |     def __init__(self, mean):
74 |         self.mean = np.array(mean, dtype=np.float32)
75 | 
76 |     def __call__(self, image, boxes=None, labels=None):
77 |         image = image.astype(np.float32)
78 |         image -= self.mean
79 |         return image.astype(np.float32), boxes, labels
80 | 
81 | 
82 | class ToAbsoluteCoords(object):
83 |     def __call__(self, image, boxes=None, labels=None):
84 |         height, width, channels = image.shape
85 |         boxes[:, 0] *= width
86 |         boxes[:, 2] *= width
87 |         boxes[:, 1] *= height
88 |         boxes[:, 3] *= height
89 | 
90 |         return image, boxes, labels
91 | 
92 | 
93 | class ToPercentCoords(object):
94 |     def __call__(self, image, boxes=None, labels=None):
95 |         height, width, channels = image.shape
96 |         boxes[:, 0] /= width
97 |         boxes[:, 2] /= width
98 |         boxes[:, 1] /= height
99 |         boxes[:, 3] /= height
100 | 
101 |         return image, boxes, labels
102 | 
103 | 
104 | class Resize(object):
105 |     def __init__(self, size=300):
106 |         self.size = size
107 | 
108 |     def __call__(self, image, boxes=None, labels=None):
109 |         image = cv2.resize(image, (self.size,
110 |                                    self.size))
111 |         return image, boxes, labels
112 | 
113 | 
114 | class RandomSaturation(object):
115 |     def __init__(self, lower=0.5, upper=1.5):
116 |         self.lower = lower
117 |         self.upper = upper
118 |         assert self.upper >= self.lower, "saturation upper must be >= lower."
119 |         assert self.lower >= 0, "saturation lower must be non-negative."
120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9
226 |             (0.1, None),
227 |             (0.3, None),
228 |             (0.7, None),
229 |             (0.9, None),
230 |             # randomly sample a patch
231 |             (None, None),
232 |         )
233 | 
234 |     def __call__(self, image, boxes=None, labels=None):
235 |         height, width, _ = image.shape
236 |         while True:
237 |             # randomly choose a mode
238 |             mode = self.sample_options[random.randint(len(self.sample_options))]  # index with randint; np.random.choice rejects this ragged options tuple on newer NumPy
239 |             if mode is None:
240 |                 return image, boxes, labels
241 | 
242 |             min_iou, max_iou = mode
243 |             if min_iou is None:
244 |                 min_iou = float('-inf')
245 |             if max_iou is None:
246 |                 max_iou = float('inf')
247 | 
248 |             # max trials (50)
249 |             for _ in range(50):
250 |                 current_image = image
251 | 
252 |                 w = random.uniform(0.3 * width, width)
253 |                 h = random.uniform(0.3 * height, height)
254 | 
255 |                 # aspect ratio constraint b/t .5 & 2
256 |                 if h / w < 0.5 or h / w > 2:
257 |                     continue
258 | 
259 |                 left = random.uniform(width - w)
260 |                 top = random.uniform(height - h)
261 | 
262 |                 # convert to integer rect x1,y1,x2,y2
263 |                 rect = np.array([int(left), int(top), int(left+w), int(top+h)])
264 | 
265 |                 # calculate IoU (jaccard overlap) b/t the cropped and gt boxes
266 |                 overlap = jaccard_numpy(boxes, rect)
267 | 
268 |                 # is min and max overlap constraint satisfied? if not try again
269 |                 if overlap.min() < min_iou and max_iou < overlap.max():
270 |                     continue
271 | 
272 |                 # cut the crop from the image
273 |                 current_image = current_image[rect[1]:rect[3], rect[0]:rect[2],
274 |                                               :]
275 | 
276 |                 # keep overlap with gt box IF center in sampled patch
277 |                 centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
278 | 
279 |                 # mask in all gt boxes that are above and to the left of centers
280 |                 m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
281 | 
282 |                 # mask in all gt boxes that are under and to the right of centers
283 |                 m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
284 | 
285 |                 # mask in that both m1 and m2 are true
286 |                 mask = m1 * m2
287 | 
288 |                 # have any valid boxes? try again if not
289 |                 if not mask.any():
290 |                     continue
291 | 
292 |                 # take only matching gt boxes
293 |                 current_boxes = boxes[mask, :].copy()
294 | 
295 |                 # take only matching gt labels
296 |                 current_labels = labels[mask]
297 | 
298 |                 # should we use the box left and top corner or the crop's
299 |                 current_boxes[:, :2] = np.maximum(current_boxes[:, :2],
300 |                                                   rect[:2])
301 |                 # adjust to crop (by subtracting crop's left,top)
302 |                 current_boxes[:, :2] -= rect[:2]
303 | 
304 |                 current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:],
305 |                                                   rect[2:])
306 |                 # adjust to crop (by subtracting crop's left,top)
307 |                 current_boxes[:, 2:] -= rect[:2]
308 | 
309 |                 return current_image, current_boxes, current_labels
310 | 
311 | 
312 | class Expand(object):
313 |     def __init__(self, mean):
314 |         self.mean = mean
315 | 
316 |     def __call__(self, image, boxes, labels):
317 |         if random.randint(0, 2):  # ### I changed this line myself
318 |             return image, boxes, labels
319 | 
320 |         height, width, depth = image.shape
321 |         ratio = random.uniform(1, 4)  # draw a random real from (1, 4)
322 |         left = random.uniform(0, width*ratio - width)
323 |         top = random.uniform(0, height*ratio - height)
324 | 
325 |         expand_image = np.zeros(
326 |             (int(height*ratio), int(width*ratio), depth),
327 |             dtype=image.dtype)
328 |         expand_image[:, :, :] = self.mean
329 |         expand_image[int(top):int(top + height),
330 |                      int(left):int(left + width)] = image
331 |         image = expand_image
332 | 
333 |         boxes = boxes.copy()
334 |         boxes[:, :2] += (int(left), int(top))
335 |         boxes[:, 2:] += (int(left), int(top))
336 | 
337 |         return image, boxes, labels
338 | 
339 | 
340 | class RandomMirror(object):
341 |     def __call__(self, image, boxes, classes):
342 |         _, width, _ = image.shape
343 |         if random.randint(2):
344 |             image = image[:, ::-1]
345 |             boxes = boxes.copy()
346 |             boxes[:, 0::2] = width - boxes[:, 2::-2]
347 |         return image, boxes, classes
348 | 
349 | 
350 | class SwapChannels(object):
351 |     """Transforms a tensorized image by swapping the channels in the order
352 |     specified in the swap tuple.
353 |     Args:
354 |         swaps (int triple): final order of channels
355 |             eg: (2, 1, 0)
356 |     """
357 | 
358 |     def __init__(self, swaps):
359 |         self.swaps = swaps
360 | 
361 |     def __call__(self, image):
362 |         """
363 |         Args:
364 |             image (Tensor): image tensor to be transformed
365 |         Return:
366 |             a tensor with channels swapped according to swap
367 |         """
368 |         # if torch.is_tensor(image):
369 |         #     image = image.data.cpu().numpy()
370 |         # else:
371 |         #     image = np.array(image)
372 |         image = image[:, :, self.swaps]
373 |         return image
374 | 
375 | 
376 | class PhotometricDistort(object):
377 |     def __init__(self):
378 |         self.pd = [
379 |             RandomContrast(),
380 |             ConvertColor(transform='HSV'),
381 |             RandomSaturation(),
382 |             RandomHue(),
383 |             ConvertColor(current='HSV', transform='BGR'),
384 |             RandomContrast()
385 |         ]
386 |         self.rand_brightness = RandomBrightness()
387 |         self.rand_light_noise = RandomLightingNoise()
388 | 
389 |     def __call__(self, image, boxes, labels):
390 |         im = image.copy()
391 |         im, boxes, labels = self.rand_brightness(im, boxes, labels)
392 |         if random.randint(2):
393 |             distort = Compose(self.pd[:-1])
394 |         else:
395 |             distort = Compose(self.pd[1:])
396 |         im, boxes, labels = distort(im, boxes, labels)
397 |         return self.rand_light_noise(im, boxes, labels)
398 | 
399 | 
400 | class SSDAugmentation(object):
401 |     def __init__(self, size=300, mean=(104, 117, 123)):
402 |         self.mean = mean
403 |         self.size = size
404 |         self.augment = Compose([  # the processing pipeline applied to each input image
405 |             ConvertFromInts(),
406 |             ToAbsoluteCoords(),
407 |             PhotometricDistort(),
408 |             Expand(self.mean),
409 |             RandomSampleCrop(),
410 |             RandomMirror(),
411 |             ToPercentCoords(),
412 |             Resize(self.size),
413 |             SubtractMeans(self.mean)
414 |         ])
415 | 
416 |     def __call__(self, img, boxes, labels):
417 |         return self.augment(img, boxes, labels)
418 | 
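The whole pipeline above is driven through SSDAugmentation.__call__. A minimal sketch of exercising it on dummy data; the image, the relative-coordinate box and the label are fabricated for illustration, and it assumes the classes above are in scope (VOCDetection supplies the same three arguments during training):

import numpy as np

# fabricated 500x500 BGR image with one box in relative [x1, y1, x2, y2] form
img = np.random.randint(0, 256, (500, 500, 3), dtype=np.uint8)
boxes = np.array([[0.25, 0.25, 0.75, 0.75]], dtype=np.float32)
labels = np.array([1])

aug = SSDAugmentation(size=300, mean=(104, 117, 123))
out_img, out_boxes, out_labels = aug(img, boxes, labels)
print(out_img.shape, out_boxes, out_labels)  # (300, 300, 3) float32, jittered relative boxes, surviving labels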
--------------------------------------------------------------------------------
/xml2regresstxt.py:
--------------------------------------------------------------------------------
1 | # #!/usr/bin/env python
2 | # # -*- encoding: utf-8 -*-
3 | # '''
4 | # @File : bbox-regress.py
5 | # @Version : 1.0
6 | # @Author : 2014Vee
7 | # @Contact : 1976535998@qq.com
8 | # @License : (C)Copyright 2014Vee From UESTC
9 | # @Modify Time : 2020/4/14 9:44
10 | # @Description : generates the data files for bounding-box regression training
11 | # '''
12 | # import os
13 | # import random
14 | #
15 | # xmlfilepath = r'/data/lp/project/ssd.pytorch/xml_zc_fz'
16 | # saveBasePath = r'/data/lp/project/ssd.pytorch/txtsave'
17 | #
18 | # trainval_percent = 1.0
19 | # train_percent = 0.9
20 | # total_xml = os.listdir(xmlfilepath)
21 | # num = len(total_xml)
22 | # list = range(num)
23 | # tv = int(num * trainval_percent)
24 | # tr = int(tv * train_percent)
25 | # trainval = random.sample(list, tv)
26 | # train = random.sample(trainval, tr)
27 | #
28 | # print("train and val size", tv)
29 | # print("train size", tr)
30 | # ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
31 | # ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
32 | # ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
33 | # fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
34 | #
35 | # for i in list:
36 | #     name = total_xml[i][:-4] + '\n'
37 | #     if i in trainval:
38 | #         ftrainval.write(name)
39 | #         if i in train:
40 | #             ftrain.write(name)
41 | #         else:
42 | #             fval.write(name)
43 | #     else:
44 | #         ftest.write(name)
45 | #
46 | # ftrainval.close()
47 | # ftrain.close()
48 | # fval.close()
49 | # ftest.close()
50 | # # test
51 | 
52 | 
53 | tensors_list = [[[[1,2],[3,4],[5,6]],[[7,8],[9,10],[11,12]],[[13,14],[15,16],[17,18]],[[19,20],[21,22],[23,24]]], [[[25,26],[27,28],[29,30]],[[31,32],[33,34],[35,36]],[[37,38],[39,40],[41,42]],[[43,44],[45,46],[47,48]]], [[[49,50],[51,52],[53,54]],[[55,56],[57,58],[59,60]],[[61,62],[63,64],[65,66]],[[67,68],[69,70],[71,72]]], [[[73,74],[75,76],[77,78]],[[79,80],[81,82],[83,84]],[[85,86],[87,88],[89,90]],[[91,92],[93,94],[95,96]]], [[[97,98],[99,100],[101,102]],[[103,104],[105,106],[107,108]],[[109,110],[111,112],[113,114]],[[115,116],[117,118],[119,120]]]]  # leftover shape experiment: a 5x4x3x2 nested list
54 | print(tensors_list)
--------------------------------------------------------------------------------
/代码详解blog.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/代码详解blog.txt
--------------------------------------------------------------------------------
/保存权重/train.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from data import *
3 | from utils.augmentations import SSDAugmentation
4 | from layers.modules import MultiBoxLoss
5 | from ssd import build_ssd
6 | import time
7 | import torch
8 | from torch.autograd import Variable
9 | import torch.nn as nn
10 | import torch.optim as optim
11 | import torch.backends.cudnn as cudnn
12 | import torch.nn.init as init
13 | import torch.utils.data as data
14 | import argparse
15 | import visdom as viz
16 | import os
17 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" # 指定GPU做训练
18 | 
19 | 
20 | def str2bool(v):
21 |     return v.lower() in ("yes", "true", "t", "1")
22 | 
23 | 
24 | parser = argparse.ArgumentParser(
25 |     description='Single Shot MultiBox Detector Training With Pytorch')
26 | train_set = parser.add_mutually_exclusive_group()
27 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
28 |                     type=str, help='VOC or COCO')
29 | parser.add_argument('--dataset_root', default="data/VOCdevkit/", # 修改【dataset_root】
30 |                     help='Dataset root directory path')
31 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', # 【预训练好的权重系数】
32 |                     help='Pretrained base model')
33 | parser.add_argument('--batch_size', default=4, type=int, # 【修改batch_size】
34 |                     help='Batch size for training')
35 | parser.add_argument('--resume', default=None, type=str, # 【是否从某节点开始训练】没有就是None
36 |                     help='Checkpoint state_dict file to resume training from')
37 | parser.add_argument('--start_iter', default=0, type=int,
38 |                     help='Resume training at this iter')
39 | parser.add_argument('--num_workers', default=2, type=int, # 【num_workers】
40 |                     help='Number of workers used in dataloading')
41 | parser.add_argument('--cuda', default=True, type=str2bool,
42 |                     help='Use CUDA to train model')
43 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, # 【修改学习率】
44 |                     help='initial learning rate')
45 | parser.add_argument('--momentum', default=0.9, type=float,
46 |                     help='Momentum value for optim')
47 | parser.add_argument('--weight_decay', default=5e-4, type=float,
48 |                     help='Weight decay for SGD')
49 | parser.add_argument('--gamma', default=0.1, type=float,
50 |                     help='Gamma update for SGD')
51 | parser.add_argument('--visdom', default=False, type=str2bool, # 可视化 这次设置为【【】可视化】】】
52 |                     help='Use visdom for loss visualization')
53 | parser.add_argument('--save_folder', default='weights/',
54 |                     help='Directory for saving checkpoint models')
55 | args = parser.parse_args()
56 | 
57 | if torch.cuda.is_available():
58 |     if args.cuda:
59 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 60 | if not args.cuda: 61 | print("WARNING: It looks like you have a CUDA device, but aren't " + 62 | "using CUDA.\nRun with --cuda for optimal training speed.") 63 | torch.set_default_tensor_type('torch.FloatTensor') 64 | else: 65 | torch.set_default_tensor_type('torch.FloatTensor') 66 | 67 | if not os.path.exists(args.save_folder): 68 | os.mkdir(args.save_folder) 69 | 70 | 71 | def train(): 72 | cfg = voc # voc是一个字典 里面包括网络的一系列参数信息 73 | dataset = VOCDetection( # 是一个VOC数据的类 74 | root=args.dataset_root, # 数据集的根目录 75 | transform=SSDAugmentation(cfg['min_dim'], MEANS)) # 图片的预处理方法(输入图片的尺寸和均值) 原本类中定义为None 后面的MEANS我人为可以删除 76 | 77 | if args.visdom: # 这里是可视化工具,不用管################### 78 | import visdom 79 | viz = visdom.Visdom() 80 | 81 | ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes']) 82 | # 阶段【train or test】 输入图片尺寸大小 类别数 83 | # build_ssd是一个放在ssd.py的函数 84 | # return是一个类的对象,也就是class SSD(nn.Module),ssd_net也就是SSD类的一个对象 85 | # ssd_net拥有所有class SSD继承于nn.Module以及作者增加方法的所有属性 86 | # 在SSD这个类中就定义了网络的base部分(修改全连接层后的VGG16)和extras部分(论文作者加入的多尺度feature map)和head部分 87 | # 对选定的6个尺度下的feature map进行卷积操作得到的每个default box 的每一个分类类别的confidence以及位置坐标的信息 88 | net = ssd_net # 到这里class类SSD只完成了__init__()并没有执行__forward__() net是一个类 89 | 90 | if args.cuda: # 是否将模型放到多个个GPU上运行{我认为在我的任务中不要放在多线程GPU中} 91 | net = torch.nn.DataParallel(ssd_net) 92 | cudnn.benchmark = True 93 | if args.resume: # 【resume】的默认值是None,表示不是接着某个断点来继续训练这个模型 【其实checkpoint里面最好还要加上优化器的保存】 94 | # 【model_state_dict,optimizer_state_dict,epoch】 见深度之眼 95 | print('Resuming training, loading {}...'.format(args.resume)) 96 | ssd_net.load_weights(args.resume) 97 | else: # 那么就从weights文件夹下面直接加载预训练好vgg基础网络预训练权重 98 | vgg_weights = torch.load(args.save_folder + args.basenet) # 整个ssd_net中vgg基础网络的权重 99 | print('Loading base network...') 100 | ssd_net.vgg.load_state_dict(vgg_weights) # 只在整个ssd_net中的vgg模块中加载预训练好的权重,其余的extra,特征融合,CBAM模块没有加载预训练权重 101 | if args.cuda: # 将模型结构放在GPU上训练 102 | net = net.cuda() 103 | if not args.resume: # ###################################################################### 104 | print('Initializing weights...') # 如果不是接着某个断点接着训练,那么其余extras loc con都会xavier方法初始化 105 | # initialize newly added layers' weights with xavier method 106 | ssd_net.extras.apply(weights_init) # extras 模块由 xavier 方法默认初始化data和bias 107 | ssd_net.loc.apply(weights_init) # loc 模块由 xavier 方法默认初始化data和bias 108 | ssd_net.conf.apply(weights_init) # conf 模块由 xavier 方法默认初始化data和bias 109 | 110 | # 【优化器】net.parameters()是网络结构中的参数,学习率,动量,权重衰减率 111 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) 112 | # 定义损失函数部分【MultiBoxesLoss是一个类用于计算网络的损失,criterion是一个对象】 113 | # 【损失函数】 关键!!! 
criterion是个nn.Moudule的形式 里面包括两部分loss_c 和 loss_l 114 | criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5, False, args.cuda) 115 | # 前向传播 116 | net.train() 117 | # loss counters 118 | loc_loss = 0 119 | conf_loss = 0 120 | epoch = 0 121 | print('Loading the dataset...') 122 | epoch_size = len(dataset) // args.batch_size # 每个epoch中有多少个batch 123 | print('Training SSD on:', dataset.name) 124 | print('Using the specified args:') 125 | print(args) # 讲设定的参数打印出来 126 | 127 | step_index = 0 128 | # 可视化部分 129 | if args.visdom: # 默认值为False 130 | vis_title = 'SSD.PyTorch on ' + dataset.name 131 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 132 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 133 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 134 | 135 | data_loader = data.DataLoader(dataset, args.batch_size, 136 | num_workers=args.num_workers, # 默认值我修改成了0 137 | shuffle=True, 138 | collate_fn=detection_collate, # collate_fn将一个batch_size数目的图片进行合并成batch 139 | pin_memory=True) 140 | batch_iterator = iter(data_loader) # batch迭代器 依次迭代batch 141 | for iteration in range(args.start_iter, cfg['max_iter']): # 由最大迭代次数来迭代训练 142 | if args.visdom and iteration != 0 and (iteration % epoch_size == 0): # 因为args.visdom一直设置为False因此没有被调用 143 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 'append', epoch_size) 144 | # reset epoch loss counters 145 | loc_loss = 0 146 | conf_loss = 0 147 | epoch += 1 148 | 149 | if iteration in cfg['lr_steps']: # 通过多少次epoch调节一次学习率 150 | step_index += 1 151 | adjust_learning_rate(optimizer, args.gamma, step_index) 152 | 153 | # load train data 154 | try: 155 | images, targets = next(batch_iterator) 156 | # targets 和image都是读取的训练数据 157 | except StopIteration: 158 | bath_iterator = iter(data_loader) 159 | images, targets = next(bath_iterator) 160 | # images=【batch_size,3,300,300】 161 | # targets=【batch_size,num_object,5】 162 | # num_object代表一张图里面有几个ground truth,5代表四个位置信息和一个label 163 | if args.cuda: # 将数据放在cuda上 164 | images = Variable(images.cuda()) 165 | targets = [Variable(ann.cuda(), volatile=True) for ann in targets] 166 | else: 167 | images = Variable(images) 168 | targets = [Variable(ann, volatile=True) for ann in targets] 169 | # forward 170 | t0 = time.time() 171 | # ##out是netforward的输出:是个元组,里面包括3个部分[loc conf priors] 172 | out = net(images) 173 | # ## backprop 优化器梯度清零 174 | optimizer.zero_grad() 175 | # ## criterion是nn.Module形式,下面是调用它的forward模式【重点看,里面包括难例挖掘的内容】 176 | # ###################################【【【训练阶段的损失!!!】】】###################################### 177 | # ##输入参数1:网络结构net输出的out:[loc conf priors] 178 | # ##输入参数2:targets:真实目标的位置标签值 179 | loss_l, loss_c = criterion(out, targets) # criterion就是MultiBoxLoss类定义的对象,forward前传播返回的结果是【loss_l, loss_c】 180 | loss = loss_l + loss_c # 总loss 181 | loss.backward() 182 | optimizer.step() 183 | t1 = time.time() 184 | # 下面两行好像没有使用 185 | loc_loss += loss_l.data # ###到底是改成item()还是data 186 | conf_loss += loss_c.data # ###到底是改成item()还是data 187 | 188 | if iteration % 10 == 0: 189 | print('timer: %.4f sec.' 
% (t1 - t0))
190 |             print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % loss.data, end=' ') # 到底是改成item()还是data
191 | 
192 |         if args.visdom:
193 |             update_vis_plot(iteration, loss_l.data, loss_c.data, iter_plot, epoch_plot, 'append')
194 | 
195 |         if iteration != 0 and iteration % 10000 == 0:
196 |             # 迭代多少次保存一次模型。 在尝试阶段,为了节省时间,建议将根据迭代次数保存模型的参数调低,例如调节到500
197 |             print('Saving state, iter:', iteration) # 保存的checkpoint
198 |             torch.save(ssd_net.state_dict(), 'weights/ssd300_VOC_' + repr(iteration) + '.pth') # 保存模型的路径
199 |     torch.save(ssd_net.state_dict(), args.save_folder + '' + args.dataset + '.pth') # 最后的保存:不是保存整个模型,只是保存了参数
200 | 
201 | 
202 | def adjust_learning_rate(optimizer, gamma, step):
203 |     """Sets the learning rate to the initial LR decayed by 10 at every
204 |        specified step
205 |     """
206 |     lr = args.lr * (gamma ** step)
207 |     for param_group in optimizer.param_groups:
208 |         param_group['lr'] = lr
209 | 
210 | 
211 | def xavier(param):
212 |     init.xavier_uniform(param)
213 | 
214 | 
215 | def weights_init(m):
216 |     if isinstance(m, nn.Conv2d):
217 |         xavier(m.weight.data)
218 |         m.bias.data.zero_()
219 | 
220 | 
221 | def create_vis_plot(_xlabel, _ylabel, _title, _legend):
222 |     return viz.line(
223 |         X=torch.zeros((1,)).cpu(),
224 |         Y=torch.zeros((1, 3)).cpu(),
225 |         opts=dict(
226 |             xlabel=_xlabel,
227 |             ylabel=_ylabel,
228 |             title=_title,
229 |             legend=_legend
230 |         )
231 |     )
232 | 
233 | 
234 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
235 |                     epoch_size=1):
236 |     viz.line(
237 |         X=torch.ones((1, 3)).cpu() * iteration,
238 |         Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size,
239 |         win=window1,
240 |         update=update_type
241 |     )
242 |     # initialize epoch plot on first iteration
243 |     if iteration == 0:
244 |         viz.line(
245 |             X=torch.zeros((1, 3)).cpu(),
246 |             Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(),
247 |             win=window2,
248 |             update=True
249 |         )
250 | 
251 | 
252 | if __name__ == '__main__':
253 |     train()
254 | 
--------------------------------------------------------------------------------
/保存权重/代码详解blog.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/保存权重/代码详解blog.txt
--------------------------------------------------------------------------------
/显示检测结果code.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | with open('D:/Deep_learning/ssd.pytorch-master/eval/test1.txt','r') as f:
3 |     line=f.readline()
4 |     pic=list()
5 |     loc=list()
6 |     match={}
7 |     while line:
8 |         if 'GROUND TRUTH FOR:' in line:
9 |             pic.append(line[-7:-1])
10 |         if 'ship score' in line:
11 |             location=line.split(' ')[5:12:2]
12 |             location=[float(x) for x in location]
13 |             loc.append(location)
14 |         if len(line)==1:
15 |             match[pic[0]]=loc
16 |             pic=list()
17 |             loc=list()
18 |         line=f.readline()
19 |     f.close()  # redundant: the with-block already closes the file
20 | for i in match.keys():
21 |     #print('D:/Deep_learning/ssd.pytorch-master/data/VOCdevkit/VOC2007/ground_truth/'+i+'.jpg.jpg')
22 |     img=cv2.imread('/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/ground_truth/'+i+'.jpg.jpg')
23 |     #print(match[i], 'number of boxes in this image: ', len(match[i]))
24 |     for num in range(len(match[i])):
25 |         x1=int(match[i][num][0])
26 |         y1=int(match[i][num][1])
27 |         x2=int(match[i][num][2])
28 |         y2=int(match[i][num][3])
29 |         cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=2)
30 |     cv2.imwrite("/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/PREDECTION/"+i+'.jpg.jpg', img)
31 | 
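The viewer above leans on fixed character offsets: line[-7:-1] assumes six-character image ids, and split(' ')[5:12:2] assumes the exact token layout of a 'ship' prediction line. A minimal, more defensive sketch of the same parse; the helper name is mine, and the format is the one test_net writes above:

def parse_results(path):
    """Map image id -> list of [x1, y1, x2, y2] predicted boxes from a test_net dump."""
    boxes_by_img, img_id = {}, None
    with open(path) as f:
        for line in f:
            if 'GROUND TRUTH FOR:' in line:
                img_id = line.split(':')[-1].strip()
                boxes_by_img.setdefault(img_id, [])
            elif 'score:' in line and img_id is not None:
                parts = line.split('score:')[1].split('||')  # " <score> x1 ", " y1 ", " x2 ", " y2"
                coords = [float(parts[0].split()[-1])] + [float(p) for p in parts[1:]]
                boxes_by_img[img_id].append(coords)
    return boxes_by_img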
cv2.imwrite("/data/lp/project/ssd.pytorch/data/VOCdevkit/VOC2007/PREDECTION/"+i+'.jpg.jpg', img) 31 | -------------------------------------------------------------------------------- /训练步骤.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2014Vee/ssd-pytorch/b534eeee10f3b7df2da49934e47d67a4d62be048/训练步骤.txt --------------------------------------------------------------------------------