├── .gitignore
├── LICENSE
├── README.md
├── download_data.sh
├── test.py
├── test
│   ├── cat.jpg
│   └── person.jpg
├── train.py
├── utils
│   ├── __init__.py
│   ├── pascal_voc.py
│   └── timer.py
└── yolo
    ├── __init__.py
    ├── config.py
    └── yolo_net.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.swp

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Peng Zhang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## YOLO_tensorflow

TensorFlow implementation of [YOLO](https://arxiv.org/pdf/1506.02640.pdf), including both the training and test phases.

### Installation

1. Clone the yolo_tensorflow repository
   ```Shell
   $ git clone https://github.com/hizhangp/yolo_tensorflow.git
   $ cd yolo_tensorflow
   ```

2. Download the Pascal VOC 2007 dataset and create the required directories
   ```Shell
   $ ./download_data.sh
   ```

3. Download the [YOLO_small](https://drive.google.com/file/d/0B5aC8pI-akZUNVFZMmhmcVRpbTA/view?usp=sharing)
   weight file and put it in `data/weights`

4. Modify the configuration in `yolo/config.py` as needed

5. Training
   ```Shell
   $ python train.py
   ```

6. Test
   ```Shell
   $ python test.py
   ```

### Requirements
1. TensorFlow

2. OpenCV

--------------------------------------------------------------------------------
/download_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash

echo "Creating data directory..."
mkdir -p data && cd data
mkdir -p weights
mkdir -p pascal_voc

echo "Downloading Pascal VOC 2007 data..."
wget http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar

echo "Extracting VOC data..."
tar xf VOCtrainval_06-Nov-2007.tar

mv VOCdevkit pascal_voc/.

echo "Done."
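The scripts below assume the layout created by `download_data.sh` plus a manually downloaded weight file. A minimal sanity-check sketch (not part of the repo; the paths are taken from `download_data.sh`, `yolo/config.py`, and `utils/pascal_voc.py`):

```python
import os

# Sketch: verify the directory layout that train.py / pascal_voc.py rely on.
for path in [
    os.path.join('data', 'weights'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'JPEGImages'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'Annotations'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'ImageSets', 'Main'),
]:
    print('{:<60} {}'.format(path, 'ok' if os.path.isdir(path) else 'MISSING'))
```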
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os
import cv2
import argparse
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer


class Detector(object):

    def __init__(self, net, weight_file):
        self.net = net
        self.weights_file = weight_file

        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.threshold = cfg.THRESHOLD
        self.iou_threshold = cfg.IOU_THRESHOLD
        # The flat network output is laid out as
        # [class probs | box confidences | box coordinates];
        # these offsets mark the two split points.
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        print('Restoring weights from: ' + self.weights_file)
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, self.weights_file)

    def draw_result(self, img, result):
        for i in range(len(result)):
            x = int(result[i][1])
            y = int(result[i][2])
            w = int(result[i][3] / 2)
            h = int(result[i][4] / 2)
            cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
            cv2.rectangle(img, (x - w, y - h - 20),
                          (x + w, y - h), (125, 125, 125), -1)
            lineType = cv2.LINE_AA if cv2.__version__ > '3' else cv2.CV_AA
            cv2.putText(
                img, result[i][0] + ' : %.2f' % result[i][5],
                (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 0), 1, lineType)

    def detect(self, img):
        img_h, img_w, _ = img.shape
        inputs = cv2.resize(img, (self.image_size, self.image_size))
        inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
        inputs = (inputs / 255.0) * 2.0 - 1.0
        inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))

        result = self.detect_from_cvmat(inputs)[0]

        # Scale the boxes back from network coordinates to the original image.
        for i in range(len(result)):
            result[i][1] *= (1.0 * img_w / self.image_size)
            result[i][2] *= (1.0 * img_h / self.image_size)
            result[i][3] *= (1.0 * img_w / self.image_size)
            result[i][4] *= (1.0 * img_h / self.image_size)

        return result

    def detect_from_cvmat(self, inputs):
        net_output = self.sess.run(self.net.logits,
                                   feed_dict={self.net.images: inputs})
        results = []
        for i in range(net_output.shape[0]):
            results.append(self.interpret_output(net_output[i]))

        return results

    def interpret_output(self, output):
        probs = np.zeros((self.cell_size, self.cell_size,
                          self.boxes_per_cell, self.num_class))
        class_probs = np.reshape(
            output[0:self.boundary1],
            (self.cell_size, self.cell_size, self.num_class))
        scales = np.reshape(
            output[self.boundary1:self.boundary2],
            (self.cell_size, self.cell_size, self.boxes_per_cell))
        boxes = np.reshape(
            output[self.boundary2:],
            (self.cell_size, self.cell_size, self.boxes_per_cell, 4))
        offset = np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
        offset = np.transpose(
            np.reshape(
                offset,
                [self.boxes_per_cell, self.cell_size, self.cell_size]),
            (1, 2, 0))

        # Box centers are predicted relative to their grid cell; add the cell
        # offsets and normalize to [0, 1]. Widths and heights are predicted
        # as square roots, so square them back.
        boxes[:, :, :, 0] += offset
        boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
        boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
        boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])

        boxes *= self.image_size

        for i in range(self.boxes_per_cell):
            for j in range(self.num_class):
                probs[:, :, i, j] = np.multiply(
                    class_probs[:, :, j], scales[:, :, i])

        filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
        filter_mat_boxes = np.nonzero(filter_mat_probs)
        boxes_filtered = boxes[filter_mat_boxes[0],
                               filter_mat_boxes[1], filter_mat_boxes[2]]
        probs_filtered = probs[filter_mat_probs]
        classes_num_filtered = np.argmax(
            filter_mat_probs, axis=3)[
            filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]

        argsort = np.array(np.argsort(probs_filtered))[::-1]
        boxes_filtered = boxes_filtered[argsort]
        probs_filtered = probs_filtered[argsort]
        classes_num_filtered = classes_num_filtered[argsort]

        # Non-maximum suppression: zero out any box that overlaps a
        # higher-scoring box by more than iou_threshold.
        for i in range(len(boxes_filtered)):
            if probs_filtered[i] == 0:
                continue
            for j in range(i + 1, len(boxes_filtered)):
                if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
                    probs_filtered[j] = 0.0

        filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
        boxes_filtered = boxes_filtered[filter_iou]
        probs_filtered = probs_filtered[filter_iou]
        classes_num_filtered = classes_num_filtered[filter_iou]

        result = []
        for i in range(len(boxes_filtered)):
            result.append(
                [self.classes[classes_num_filtered[i]],
                 boxes_filtered[i][0],
                 boxes_filtered[i][1],
                 boxes_filtered[i][2],
                 boxes_filtered[i][3],
                 probs_filtered[i]])

        return result

    def iou(self, box1, box2):
        # Boxes are (x_center, y_center, w, h).
        tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
            max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
        lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
            max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
        inter = 0 if tb < 0 or lr < 0 else tb * lr
        return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter)

    def camera_detector(self, cap, wait=10):
        detect_timer = Timer()
        # Read exactly one frame per iteration (the original code read
        # twice per loop, dropping every other frame).
        ret, frame = cap.read()

        while ret:
            detect_timer.tic()
            result = self.detect(frame)
            detect_timer.toc()
            print('Average detecting time: {:.3f}s'.format(
                detect_timer.average_time))

            self.draw_result(frame, result)
            cv2.imshow('Camera', frame)
            cv2.waitKey(wait)

            ret, frame = cap.read()

    def image_detector(self, imname, wait=0):
        detect_timer = Timer()
        image = cv2.imread(imname)

        detect_timer.tic()
        result = self.detect(image)
        detect_timer.toc()
        print('Average detecting time: {:.3f}s'.format(
            detect_timer.average_time))

        self.draw_result(image, result)
        cv2.imshow('Image', image)
        cv2.waitKey(wait)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    parser.add_argument('--weight_dir', default='weights', type=str)
    parser.add_argument('--data_dir', default="data", type=str)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    yolo = YOLONet(False)
    weight_file = os.path.join(args.data_dir, args.weight_dir, args.weights)
    detector = Detector(yolo, weight_file)

    # detect from camera
    # cap = cv2.VideoCapture(-1)
    # detector.camera_detector(cap)

    # detect from image file
    imname = 'test/person.jpg'
    detector.image_detector(imname)


if __name__ == '__main__':
    main()
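`interpret_output` above depends entirely on the flat layout of the network output: with the default config (`CELL_SIZE=7`, `BOXES_PER_CELL=2`, 20 classes) each image yields 1470 values, split at `boundary1` and `boundary2`. A small NumPy sketch of that split (illustrative only, not repo code):

```python
import numpy as np

# Sketch of the output layout interpret_output assumes, using the defaults
# from yolo/config.py: CELL_SIZE=7, BOXES_PER_CELL=2, 20 classes.
cell_size, boxes_per_cell, num_class = 7, 2, 20
boundary1 = cell_size * cell_size * num_class                    # 980
boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell   # 1078

output = np.zeros(boundary2 + cell_size * cell_size * boxes_per_cell * 4,
                  dtype=np.float32)                              # 1470 values
class_probs = output[:boundary1].reshape(cell_size, cell_size, num_class)
confidences = output[boundary1:boundary2].reshape(
    cell_size, cell_size, boxes_per_cell)
boxes = output[boundary2:].reshape(cell_size, cell_size, boxes_per_cell, 4)
print(class_probs.shape, confidences.shape, boxes.shape)
# (7, 7, 20) (7, 7, 2) (7, 7, 2, 4)
```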
--------------------------------------------------------------------------------
/test/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/test/cat.jpg

--------------------------------------------------------------------------------
/test/person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/test/person.jpg

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc

slim = tf.contrib.slim


class Solver(object):

    def __init__(self, net, data):
        self.net = net
        self.data = data
        self.weights_file = cfg.WEIGHTS_FILE
        self.max_iter = cfg.MAX_ITER
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.summary_iter = cfg.SUMMARY_ITER
        self.save_iter = cfg.SAVE_ITER
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.save_cfg()

        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        self.ckpt_file = os.path.join(self.output_dir, 'yolo')
        self.summary_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)

        self.global_step = tf.train.create_global_step()
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        if self.weights_file is not None:
            print('Restoring weights from: ' + self.weights_file)
            self.saver.restore(self.sess, self.weights_file)

        self.writer.add_graph(self.sess.graph)

    def train(self):

        train_timer = Timer()
        load_timer = Timer()

        for step in range(1, self.max_iter + 1):

            load_timer.tic()
            images, labels = self.data.get()
            load_timer.toc()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}

            if step % self.summary_iter == 0:
                if step % (self.summary_iter * 10) == 0:

                    train_timer.tic()
                    summary_str, loss, _ = self.sess.run(
                        [self.summary_op, self.net.total_loss, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                    # Note: the original code assigned only the first string
                    # literal and left the rest as dangling statements;
                    # parenthesized concatenation fixes that.
                    log_str = (
                        '{} Epoch: {}, Step: {}, Learning rate: {},'
                        ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter,'
                        ' Load: {:.3f}s/iter, Remain: {}').format(
                        datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                        self.data.epoch,
                        int(step),
                        round(self.learning_rate.eval(session=self.sess), 6),
                        loss,
                        train_timer.average_time,
                        load_timer.average_time,
                        train_timer.remain(step, self.max_iter))
                    print(log_str)

                else:
                    train_timer.tic()
                    summary_str, _ = self.sess.run(
                        [self.summary_op, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                self.writer.add_summary(summary_str, step)

            else:
                train_timer.tic()
                self.sess.run(self.train_op, feed_dict=feed_dict)
                train_timer.toc()

            if step % self.save_iter == 0:
                print('{} Saving checkpoint file to: {}'.format(
                    datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                    self.output_dir))
                self.saver.save(
                    self.sess, self.ckpt_file, global_step=self.global_step)

    def save_cfg(self):

        with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:
            cfg_dict = cfg.__dict__
            for key in sorted(cfg_dict.keys()):
                if key[0].isupper():
                    cfg_str = '{}: {}\n'.format(key, cfg_dict[key])
                    f.write(cfg_str)


def update_config_paths(data_dir, weights_file):
    cfg.DATA_PATH = data_dir
    cfg.PASCAL_PATH = os.path.join(data_dir, 'pascal_voc')
    cfg.CACHE_PATH = os.path.join(cfg.PASCAL_PATH, 'cache')
    cfg.OUTPUT_DIR = os.path.join(cfg.PASCAL_PATH, 'output')
    cfg.WEIGHTS_DIR = os.path.join(cfg.PASCAL_PATH, 'weights')

    cfg.WEIGHTS_FILE = os.path.join(cfg.WEIGHTS_DIR, weights_file)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    parser.add_argument('--data_dir', default="data", type=str)
    parser.add_argument('--threshold', default=0.2, type=float)
    parser.add_argument('--iou_threshold', default=0.5, type=float)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    if args.gpu is not None:
        cfg.GPU = args.gpu

    if args.data_dir != cfg.DATA_PATH:
        update_config_paths(args.data_dir, args.weights)

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU

    yolo = YOLONet()
    pascal = pascal_voc('train')

    solver = Solver(yolo, pascal)

    print('Start training ...')
    solver.train()
    print('Done training.')


if __name__ == '__main__':

    # python train.py --weights YOLO_small.ckpt --gpu 0
    main()
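For reference, `tf.train.exponential_decay` as configured above reduces to a simple closed form; the sketch below (not repo code) mirrors it with the default config values. Note that with `MAX_ITER = 15000` and `DECAY_STEPS = 30000`, the learning rate never actually decays during a default run.

```python
# Sketch of the schedule tf.train.exponential_decay produces with the
# defaults from yolo/config.py (LEARNING_RATE=0.0001, DECAY_STEPS=30000,
# DECAY_RATE=0.1, STAIRCASE=True).
def learning_rate_at(step, initial=0.0001, decay_steps=30000,
                     decay_rate=0.1, staircase=True):
    exponent = float(step) / decay_steps
    if staircase:
        exponent = step // decay_steps  # integer division: stepwise decay
    return initial * decay_rate ** exponent

for step in [0, 15000, 30000, 60000]:
    print(step, learning_rate_at(step))
# 0 0.0001 / 15000 0.0001 / 30000 1e-05 / 60000 1e-06
```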
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/utils/__init__.py

--------------------------------------------------------------------------------
/utils/pascal_voc.py:
--------------------------------------------------------------------------------
import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg


class pascal_voc(object):
    def __init__(self, phase, rebuild=False):
        self.devkit_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkit_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = cfg.FLIPPED
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size, 25))
        count = 0
        while count < self.batch_size:
            imname = self.gt_labels[self.cursor]['imname']
            flipped = self.gt_labels[self.cursor]['flipped']
            images[count, :, :, :] = self.image_read(imname, flipped)
            labels[count, :, :, :] = self.gt_labels[self.cursor]['label']
            count += 1
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        image = cv2.imread(imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for idx in range(len(gt_labels_cp)):
                gt_labels_cp[idx]['flipped'] = True
                gt_labels_cp[idx]['label'] =\
                    gt_labels_cp[idx]['label'][:, ::-1, :]
                for i in range(self.cell_size):
                    for j in range(self.cell_size):
                        if gt_labels_cp[idx]['label'][i, j, 0] == 1:
                            # Mirror the stored x-coordinate as well.
                            gt_labels_cp[idx]['label'][i, j, 1] = \
                                self.image_size - 1 -\
                                gt_labels_cp[idx]['label'][i, j, 1]
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)

        if not os.path.exists(self.cache_path):
            os.makedirs(self.cache_path)

        if self.phase == 'train':
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            self.image_index = [x.strip() for x in f.readlines()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """
        Load image and bounding boxes info from XML file in the PASCAL VOC
        format.
        """

        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        im = cv2.imread(imname)
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]
        # im = cv2.resize(im, [self.image_size, self.image_size])

        # Per cell: [objectness, x, y, w, h, 20 one-hot class indicators].
        label = np.zeros((self.cell_size, self.cell_size, 25))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')

        for obj in objs:
            bbox = obj.find('bndbox')
            # Make pixel indexes 0-based and scale them to the network input size
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            # Each cell can hold only one ground-truth object.
            if label[y_ind, x_ind, 0] == 1:
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1

        return label, len(objs)
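A worked example of the label encoding built by `load_pascal_annotation` (a sketch, not repo code): with `IMAGE_SIZE = 448` and `CELL_SIZE = 7`, each cell spans 64 pixels, so a box centered at (200, 300) is assigned to grid column 3, row 4.

```python
import numpy as np

# Sketch of the 7x7x25 label grid, assuming a hypothetical 'dog' box with
# center (200, 300), width 120, height 80 (already in 448x448 input pixels).
image_size, cell_size = 448, 7
classes = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']

label = np.zeros((cell_size, cell_size, 25))
boxes = [200.0, 300.0, 120.0, 80.0]                # x_center, y_center, w, h
x_ind = int(boxes[0] * cell_size / image_size)     # 200 / 64 -> column 3
y_ind = int(boxes[1] * cell_size / image_size)     # 300 / 64 -> row 4
label[y_ind, x_ind, 0] = 1                         # objectness flag
label[y_ind, x_ind, 1:5] = boxes                   # box in input-image pixels
label[y_ind, x_ind, 5 + classes.index('dog')] = 1  # one-hot class
```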
--------------------------------------------------------------------------------
/utils/timer.py:
--------------------------------------------------------------------------------
import time
import datetime


class Timer(object):
    '''
    A simple timer.
    '''

    def __init__(self):
        self.init_time = time.time()
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

    def remain(self, iters, max_iters):
        # Estimate remaining wall-clock time from elapsed time per iteration.
        if iters == 0:
            self.remain_time = 0
        else:
            self.remain_time = (time.time() - self.init_time) * \
                (max_iters - iters) / iters
        return str(datetime.timedelta(seconds=int(self.remain_time)))

--------------------------------------------------------------------------------
/yolo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/yolo/__init__.py

--------------------------------------------------------------------------------
/yolo/config.py:
--------------------------------------------------------------------------------
import os

#
# path and dataset parameters
#

DATA_PATH = 'data'

PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')

CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')

OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')

WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')

WEIGHTS_FILE = None
# WEIGHTS_FILE = os.path.join(DATA_PATH, 'weights', 'YOLO_small.ckpt')

CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']

FLIPPED = True


#
# model parameters
#

IMAGE_SIZE = 448

CELL_SIZE = 7

BOXES_PER_CELL = 2

ALPHA = 0.1

DISP_CONSOLE = False

OBJECT_SCALE = 1.0
NOOBJECT_SCALE = 1.0
CLASS_SCALE = 2.0
COORD_SCALE = 5.0


#
# solver parameters
#

GPU = ''

LEARNING_RATE = 0.0001

DECAY_STEPS = 30000

DECAY_RATE = 0.1

STAIRCASE = True

BATCH_SIZE = 45

MAX_ITER = 15000

SUMMARY_ITER = 10

SAVE_ITER = 1000


#
# test parameters
#

THRESHOLD = 0.2

IOU_THRESHOLD = 0.5
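The model parameters above fix the size of the network's flat output vector; the arithmetic (computed as `output_size` in `YOLONet` below) works out as follows:

```python
# Sketch: how the config values determine YOLONet's flat output size.
CELL_SIZE, BOXES_PER_CELL, NUM_CLASS = 7, 2, 20

output_size = (CELL_SIZE * CELL_SIZE) * (NUM_CLASS + BOXES_PER_CELL * 5)
# 49 * (20 + 10) = 1470 values per image: 980 class probabilities,
# 98 box confidences, and 392 box coordinates.
print(output_size)  # 1470
```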
--------------------------------------------------------------------------------
/yolo/yolo_net.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
import yolo.config as cfg

slim = tf.contrib.slim


class YOLONet(object):

    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.output_size = (self.cell_size * self.cell_size) *\
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE

        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA

        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))

        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
                net = slim.conv2d(net, 192, 3, scope='conv_4')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
                net = slim.conv2d(net, 128, 1, scope='conv_6')
                net = slim.conv2d(net, 256, 3, scope='conv_7')
                net = slim.conv2d(net, 256, 1, scope='conv_8')
                net = slim.conv2d(net, 512, 3, scope='conv_9')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
                net = slim.conv2d(net, 256, 1, scope='conv_11')
                net = slim.conv2d(net, 512, 3, scope='conv_12')
                net = slim.conv2d(net, 256, 1, scope='conv_13')
                net = slim.conv2d(net, 512, 3, scope='conv_14')
                net = slim.conv2d(net, 256, 1, scope='conv_15')
                net = slim.conv2d(net, 512, 3, scope='conv_16')
                net = slim.conv2d(net, 256, 1, scope='conv_17')
                net = slim.conv2d(net, 512, 3, scope='conv_18')
                net = slim.conv2d(net, 512, 1, scope='conv_19')
                net = slim.conv2d(net, 1024, 3, scope='conv_20')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
                net = slim.conv2d(net, 512, 1, scope='conv_22')
                net = slim.conv2d(net, 1024, 3, scope='conv_23')
                net = slim.conv2d(net, 512, 1, scope='conv_24')
                net = slim.conv2d(net, 1024, 3, scope='conv_25')
                net = slim.conv2d(net, 1024, 3, scope='conv_26')
                net = tf.pad(
                    net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                    name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')
                net = slim.conv2d(net, 1024, 3, scope='conv_29')
                net = slim.conv2d(net, 1024, 3, scope='conv_30')
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Calculate IoUs between two sets of boxes.
        Args:
            boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
            boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
        Return:
            iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
        """
        with tf.variable_scope(scope):
            # transform (x_center, y_center, w, h) to (x1, y1, x2, y2)
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)

            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)

            # calculate the upper-left and lower-right corners of the intersection
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

            # intersection
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]

            # calculate the areas of boxes1 and boxes2
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]

            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)

        return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        with tf.variable_scope(scope):
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

            response = tf.reshape(
                labels[..., 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[..., 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            classes = labels[..., 5:]

            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))
            # Transform predicted boxes from cell-relative coordinates to the
            # normalized image frame so they can be compared with the labels.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

            # calculate I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
            object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response

            # calculate no_I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask

            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # class_loss
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale

            # object_loss
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale

            # noobject_loss
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale

            # coord_loss
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)

            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)


def leaky_relu(alpha):
    def op(inputs):
        return tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
    return op
--------------------------------------------------------------------------------
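As a closing sanity check of the center-format IoU used by both `Detector.iou` and `YOLONet.calc_iou` (a plain-Python sketch, not repo code): two unit squares whose centers are half a side apart intersect in area 0.5, giving IoU 1/3.

```python
# Sketch: the same center-format IoU formula that Detector.iou computes.
def iou(box1, box2):
    # Boxes are (x_center, y_center, w, h).
    tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
        max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
    lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
        max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
    inter = 0 if tb < 0 or lr < 0 else tb * lr
    return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter)

print(iou([0.0, 0.0, 1.0, 1.0], [0.5, 0.0, 1.0, 1.0]))
# intersection 0.5, union 1.5 -> 0.333...
```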