├── .gitignore
├── LICENSE
├── README.md
├── download_data.sh
├── test.py
├── test
│   ├── cat.jpg
│   └── person.jpg
├── train.py
├── utils
│   ├── __init__.py
│   ├── pascal_voc.py
│   └── timer.py
└── yolo
    ├── __init__.py
    ├── config.py
    └── yolo_net.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.swp

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Peng Zhang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## YOLO_tensorflow

TensorFlow implementation of [YOLO](https://arxiv.org/pdf/1506.02640.pdf), including both the training and test phases.

### Installation

1. Clone the yolo_tensorflow repository
   ```Shell
   $ git clone https://github.com/hizhangp/yolo_tensorflow.git
   $ cd yolo_tensorflow
   ```

2. Download the Pascal VOC 2007 dataset and create the required directories
   ```Shell
   $ ./download_data.sh
   ```

3. Download the [YOLO_small](https://drive.google.com/file/d/0B5aC8pI-akZUNVFZMmhmcVRpbTA/view?usp=sharing)
   weight file and put it in `data/weights`

4. Modify the configuration in `yolo/config.py` as needed

5. Training
   ```Shell
   $ python train.py
   ```

6. Test
   ```Shell
   $ python test.py
   ```

### Requirements
1. TensorFlow

2. OpenCV

--------------------------------------------------------------------------------
/download_data.sh:
--------------------------------------------------------------------------------
#!/bin/bash

echo "Creating data directory..."
mkdir -p data && cd data
mkdir -p weights
mkdir -p pascal_voc

echo "Downloading Pascal VOC 2007 data..."
wget http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar

echo "Extracting VOC data..."
tar xf VOCtrainval_06-Nov-2007.tar

mv VOCdevkit pascal_voc/.

echo "Done."
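The scripts below assume the layout created by `download_data.sh` plus a manually downloaded weight file. A minimal sanity-check sketch (not part of the repo; the paths are taken from `download_data.sh`, `yolo/config.py`, and `utils/pascal_voc.py`):

```python
import os

# Sketch: verify the directory layout that train.py / pascal_voc.py rely on.
for path in [
    os.path.join('data', 'weights'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'JPEGImages'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'Annotations'),
    os.path.join('data', 'pascal_voc', 'VOCdevkit', 'VOC2007', 'ImageSets', 'Main'),
]:
    print('{:<60} {}'.format(path, 'ok' if os.path.isdir(path) else 'MISSING'))
```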
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os
import cv2
import argparse
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer


class Detector(object):

    def __init__(self, net, weight_file):
        self.net = net
        self.weights_file = weight_file

        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.threshold = cfg.THRESHOLD
        self.iou_threshold = cfg.IOU_THRESHOLD
        # The flat network output is laid out as
        # [class probs | box confidences | box coordinates];
        # these offsets mark the two split points.
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        print('Restoring weights from: ' + self.weights_file)
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, self.weights_file)

    def draw_result(self, img, result):
        for i in range(len(result)):
            x = int(result[i][1])
            y = int(result[i][2])
            w = int(result[i][3] / 2)
            h = int(result[i][4] / 2)
            cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
            cv2.rectangle(img, (x - w, y - h - 20),
                          (x + w, y - h), (125, 125, 125), -1)
            lineType = cv2.LINE_AA if cv2.__version__ > '3' else cv2.CV_AA
            cv2.putText(
                img, result[i][0] + ' : %.2f' % result[i][5],
                (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 0), 1, lineType)

    def detect(self, img):
        img_h, img_w, _ = img.shape
        inputs = cv2.resize(img, (self.image_size, self.image_size))
        inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
        inputs = (inputs / 255.0) * 2.0 - 1.0
        inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))

        result = self.detect_from_cvmat(inputs)[0]

        # Scale the boxes back from network coordinates to the original image.
        for i in range(len(result)):
            result[i][1] *= (1.0 * img_w / self.image_size)
            result[i][2] *= (1.0 * img_h / self.image_size)
            result[i][3] *= (1.0 * img_w / self.image_size)
            result[i][4] *= (1.0 * img_h / self.image_size)

        return result

    def detect_from_cvmat(self, inputs):
        net_output = self.sess.run(self.net.logits,
                                   feed_dict={self.net.images: inputs})
        results = []
        for i in range(net_output.shape[0]):
            results.append(self.interpret_output(net_output[i]))

        return results

    def interpret_output(self, output):
        probs = np.zeros((self.cell_size, self.cell_size,
                          self.boxes_per_cell, self.num_class))
        class_probs = np.reshape(
            output[0:self.boundary1],
            (self.cell_size, self.cell_size, self.num_class))
        scales = np.reshape(
            output[self.boundary1:self.boundary2],
            (self.cell_size, self.cell_size, self.boxes_per_cell))
        boxes = np.reshape(
            output[self.boundary2:],
            (self.cell_size, self.cell_size, self.boxes_per_cell, 4))
        offset = np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
        offset = np.transpose(
            np.reshape(
                offset,
                [self.boxes_per_cell, self.cell_size, self.cell_size]),
            (1, 2, 0))

        # Box centers are predicted relative to their grid cell; add the cell
        # offsets and normalize to [0, 1]. Widths and heights are predicted
        # as square roots, so square them back.
        boxes[:, :, :, 0] += offset
        boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
        boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
        boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])

        boxes *= self.image_size

        for i in range(self.boxes_per_cell):
            for j in range(self.num_class):
                probs[:, :, i, j] = np.multiply(
                    class_probs[:, :, j], scales[:, :, i])

        filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
        filter_mat_boxes = np.nonzero(filter_mat_probs)
        boxes_filtered = boxes[filter_mat_boxes[0],
                               filter_mat_boxes[1], filter_mat_boxes[2]]
        probs_filtered = probs[filter_mat_probs]
        classes_num_filtered = np.argmax(
            filter_mat_probs, axis=3)[
            filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]

        argsort = np.array(np.argsort(probs_filtered))[::-1]
        boxes_filtered = boxes_filtered[argsort]
        probs_filtered = probs_filtered[argsort]
        classes_num_filtered = classes_num_filtered[argsort]

        # Non-maximum suppression: zero out any box that overlaps a
        # higher-scoring box by more than iou_threshold.
        for i in range(len(boxes_filtered)):
            if probs_filtered[i] == 0:
                continue
            for j in range(i + 1, len(boxes_filtered)):
                if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
                    probs_filtered[j] = 0.0

        filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
        boxes_filtered = boxes_filtered[filter_iou]
        probs_filtered = probs_filtered[filter_iou]
        classes_num_filtered = classes_num_filtered[filter_iou]

        result = []
        for i in range(len(boxes_filtered)):
            result.append(
                [self.classes[classes_num_filtered[i]],
                 boxes_filtered[i][0],
                 boxes_filtered[i][1],
                 boxes_filtered[i][2],
                 boxes_filtered[i][3],
                 probs_filtered[i]])

        return result

    def iou(self, box1, box2):
        # Boxes are (x_center, y_center, w, h).
        tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
            max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
        lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
            max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
        inter = 0 if tb < 0 or lr < 0 else tb * lr
        return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter)

    def camera_detector(self, cap, wait=10):
        detect_timer = Timer()
        # Read exactly one frame per iteration (the original code read
        # twice per loop, dropping every other frame).
        ret, frame = cap.read()

        while ret:
            detect_timer.tic()
            result = self.detect(frame)
            detect_timer.toc()
            print('Average detecting time: {:.3f}s'.format(
                detect_timer.average_time))

            self.draw_result(frame, result)
            cv2.imshow('Camera', frame)
            cv2.waitKey(wait)

            ret, frame = cap.read()

    def image_detector(self, imname, wait=0):
        detect_timer = Timer()
        image = cv2.imread(imname)

        detect_timer.tic()
        result = self.detect(image)
        detect_timer.toc()
        print('Average detecting time: {:.3f}s'.format(
            detect_timer.average_time))

        self.draw_result(image, result)
        cv2.imshow('Image', image)
        cv2.waitKey(wait)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    parser.add_argument('--weight_dir', default='weights', type=str)
    parser.add_argument('--data_dir', default="data", type=str)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    yolo = YOLONet(False)
    weight_file = os.path.join(args.data_dir, args.weight_dir, args.weights)
    detector = Detector(yolo, weight_file)

    # detect from camera
    # cap = cv2.VideoCapture(-1)
    # detector.camera_detector(cap)

    # detect from image file
    imname = 'test/person.jpg'
    detector.image_detector(imname)


if __name__ == '__main__':
    main()
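`interpret_output` above depends entirely on the flat layout of the network output: with the default config (`CELL_SIZE=7`, `BOXES_PER_CELL=2`, 20 classes) each image yields 1470 values, split at `boundary1` and `boundary2`. A small NumPy sketch of that split (illustrative only, not repo code):

```python
import numpy as np

# Sketch of the output layout interpret_output assumes, using the defaults
# from yolo/config.py: CELL_SIZE=7, BOXES_PER_CELL=2, 20 classes.
cell_size, boxes_per_cell, num_class = 7, 2, 20
boundary1 = cell_size * cell_size * num_class                    # 980
boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell   # 1078

output = np.zeros(boundary2 + cell_size * cell_size * boxes_per_cell * 4,
                  dtype=np.float32)                              # 1470 values
class_probs = output[:boundary1].reshape(cell_size, cell_size, num_class)
confidences = output[boundary1:boundary2].reshape(
    cell_size, cell_size, boxes_per_cell)
boxes = output[boundary2:].reshape(cell_size, cell_size, boxes_per_cell, 4)
print(class_probs.shape, confidences.shape, boxes.shape)
# (7, 7, 20) (7, 7, 2) (7, 7, 2, 4)
```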
--------------------------------------------------------------------------------
/test/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/test/cat.jpg

--------------------------------------------------------------------------------
/test/person.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/test/person.jpg

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc

slim = tf.contrib.slim


class Solver(object):

    def __init__(self, net, data):
        self.net = net
        self.data = data
        self.weights_file = cfg.WEIGHTS_FILE
        self.max_iter = cfg.MAX_ITER
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.summary_iter = cfg.SUMMARY_ITER
        self.save_iter = cfg.SAVE_ITER
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.save_cfg()

        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        self.ckpt_file = os.path.join(self.output_dir, 'yolo')
        self.summary_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)

        self.global_step = tf.train.create_global_step()
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        if self.weights_file is not None:
            print('Restoring weights from: ' + self.weights_file)
            self.saver.restore(self.sess, self.weights_file)

        self.writer.add_graph(self.sess.graph)

    def train(self):

        train_timer = Timer()
        load_timer = Timer()

        for step in range(1, self.max_iter + 1):

            load_timer.tic()
            images, labels = self.data.get()
            load_timer.toc()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}

            if step % self.summary_iter == 0:
                if step % (self.summary_iter * 10) == 0:

                    train_timer.tic()
                    summary_str, loss, _ = self.sess.run(
                        [self.summary_op, self.net.total_loss, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                    # Note: the original code assigned only the first string
                    # literal and left the rest as dangling statements;
                    # parenthesized concatenation fixes that.
                    log_str = (
                        '{} Epoch: {}, Step: {}, Learning rate: {},'
                        ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter,'
                        ' Load: {:.3f}s/iter, Remain: {}').format(
                        datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                        self.data.epoch,
                        int(step),
                        round(self.learning_rate.eval(session=self.sess), 6),
                        loss,
                        train_timer.average_time,
                        load_timer.average_time,
                        train_timer.remain(step, self.max_iter))
                    print(log_str)

                else:
                    train_timer.tic()
                    summary_str, _ = self.sess.run(
                        [self.summary_op, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                self.writer.add_summary(summary_str, step)

            else:
                train_timer.tic()
                self.sess.run(self.train_op, feed_dict=feed_dict)
                train_timer.toc()

            if step % self.save_iter == 0:
                print('{} Saving checkpoint file to: {}'.format(
                    datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                    self.output_dir))
                self.saver.save(
                    self.sess, self.ckpt_file, global_step=self.global_step)

    def save_cfg(self):

        with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:
            cfg_dict = cfg.__dict__
            for key in sorted(cfg_dict.keys()):
                if key[0].isupper():
                    cfg_str = '{}: {}\n'.format(key, cfg_dict[key])
                    f.write(cfg_str)


def update_config_paths(data_dir, weights_file):
    cfg.DATA_PATH = data_dir
    cfg.PASCAL_PATH = os.path.join(data_dir, 'pascal_voc')
    cfg.CACHE_PATH = os.path.join(cfg.PASCAL_PATH, 'cache')
    cfg.OUTPUT_DIR = os.path.join(cfg.PASCAL_PATH, 'output')
    cfg.WEIGHTS_DIR = os.path.join(cfg.PASCAL_PATH, 'weights')

    cfg.WEIGHTS_FILE = os.path.join(cfg.WEIGHTS_DIR, weights_file)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    parser.add_argument('--data_dir', default="data", type=str)
    parser.add_argument('--threshold', default=0.2, type=float)
    parser.add_argument('--iou_threshold', default=0.5, type=float)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    if args.gpu is not None:
        cfg.GPU = args.gpu

    if args.data_dir != cfg.DATA_PATH:
        update_config_paths(args.data_dir, args.weights)

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU

    yolo = YOLONet()
    pascal = pascal_voc('train')

    solver = Solver(yolo, pascal)

    print('Start training ...')
    solver.train()
    print('Done training.')


if __name__ == '__main__':

    # python train.py --weights YOLO_small.ckpt --gpu 0
    main()
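For reference, `tf.train.exponential_decay` as configured above reduces to a simple closed form; the sketch below (not repo code) mirrors it with the default config values. Note that with `MAX_ITER = 15000` and `DECAY_STEPS = 30000`, the learning rate never actually decays during a default run.

```python
# Sketch of the schedule tf.train.exponential_decay produces with the
# defaults from yolo/config.py (LEARNING_RATE=0.0001, DECAY_STEPS=30000,
# DECAY_RATE=0.1, STAIRCASE=True).
def learning_rate_at(step, initial=0.0001, decay_steps=30000,
                     decay_rate=0.1, staircase=True):
    exponent = float(step) / decay_steps
    if staircase:
        exponent = step // decay_steps  # integer division: stepwise decay
    return initial * decay_rate ** exponent

for step in [0, 15000, 30000, 60000]:
    print(step, learning_rate_at(step))
# 0 0.0001 / 15000 0.0001 / 30000 1e-05 / 60000 1e-06
```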
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/utils/__init__.py

--------------------------------------------------------------------------------
/utils/pascal_voc.py:
--------------------------------------------------------------------------------
import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg


class pascal_voc(object):
    def __init__(self, phase, rebuild=False):
        self.devkit_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkit_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = cfg.FLIPPED
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size, 25))
        count = 0
        while count < self.batch_size:
            imname = self.gt_labels[self.cursor]['imname']
            flipped = self.gt_labels[self.cursor]['flipped']
            images[count, :, :, :] = self.image_read(imname, flipped)
            labels[count, :, :, :] = self.gt_labels[self.cursor]['label']
            count += 1
            self.cursor += 1
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        image = cv2.imread(imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for idx in range(len(gt_labels_cp)):
                gt_labels_cp[idx]['flipped'] = True
                gt_labels_cp[idx]['label'] =\
                    gt_labels_cp[idx]['label'][:, ::-1, :]
                for i in range(self.cell_size):
                    for j in range(self.cell_size):
                        if gt_labels_cp[idx]['label'][i, j, 0] == 1:
                            # Mirror the stored x-coordinate as well.
                            gt_labels_cp[idx]['label'][i, j, 1] = \
                                self.image_size - 1 -\
                                gt_labels_cp[idx]['label'][i, j, 1]
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)

        if not os.path.exists(self.cache_path):
            os.makedirs(self.cache_path)

        if self.phase == 'train':
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'trainval.txt')
        else:
            txtname = os.path.join(
                self.data_path, 'ImageSets', 'Main', 'test.txt')
        with open(txtname, 'r') as f:
            self.image_index = [x.strip() for x in f.readlines()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """
        Load image and bounding boxes info from XML file in the PASCAL VOC
        format.
        """

        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        im = cv2.imread(imname)
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]
        # im = cv2.resize(im, [self.image_size, self.image_size])

        # Per cell: [objectness, x, y, w, h, 20 one-hot class indicators].
        label = np.zeros((self.cell_size, self.cell_size, 25))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')

        for obj in objs:
            bbox = obj.find('bndbox')
            # Make pixel indexes 0-based and scale them to the network input size
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            # Each cell can hold only one ground-truth object.
            if label[y_ind, x_ind, 0] == 1:
                continue
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1

        return label, len(objs)
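A worked example of the label encoding built by `load_pascal_annotation` (a sketch, not repo code): with `IMAGE_SIZE = 448` and `CELL_SIZE = 7`, each cell spans 64 pixels, so a box centered at (200, 300) is assigned to grid column 3, row 4.

```python
import numpy as np

# Sketch of the 7x7x25 label grid, assuming a hypothetical 'dog' box with
# center (200, 300), width 120, height 80 (already in 448x448 input pixels).
image_size, cell_size = 448, 7
classes = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']

label = np.zeros((cell_size, cell_size, 25))
boxes = [200.0, 300.0, 120.0, 80.0]                # x_center, y_center, w, h
x_ind = int(boxes[0] * cell_size / image_size)     # 200 / 64 -> column 3
y_ind = int(boxes[1] * cell_size / image_size)     # 300 / 64 -> row 4
label[y_ind, x_ind, 0] = 1                         # objectness flag
label[y_ind, x_ind, 1:5] = boxes                   # box in input-image pixels
label[y_ind, x_ind, 5 + classes.index('dog')] = 1  # one-hot class
```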
--------------------------------------------------------------------------------
/utils/timer.py:
--------------------------------------------------------------------------------
import time
import datetime


class Timer(object):
    '''
    A simple timer.
    '''

    def __init__(self):
        self.init_time = time.time()
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

    def remain(self, iters, max_iters):
        # Estimate remaining wall-clock time from elapsed time per iteration.
        if iters == 0:
            self.remain_time = 0
        else:
            self.remain_time = (time.time() - self.init_time) * \
                (max_iters - iters) / iters
        return str(datetime.timedelta(seconds=int(self.remain_time)))

--------------------------------------------------------------------------------
/yolo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hizhangp/yolo_tensorflow/88aba9d5569c04170f294a093455390a90f2686e/yolo/__init__.py

--------------------------------------------------------------------------------
/yolo/config.py:
--------------------------------------------------------------------------------
import os

#
# path and dataset parameters
#

DATA_PATH = 'data'

PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')

CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')

OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')

WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')

WEIGHTS_FILE = None
# WEIGHTS_FILE = os.path.join(DATA_PATH, 'weights', 'YOLO_small.ckpt')

CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']

FLIPPED = True


#
# model parameters
#

IMAGE_SIZE = 448

CELL_SIZE = 7

BOXES_PER_CELL = 2

ALPHA = 0.1

DISP_CONSOLE = False

OBJECT_SCALE = 1.0
NOOBJECT_SCALE = 1.0
CLASS_SCALE = 2.0
COORD_SCALE = 5.0


#
# solver parameters
#

GPU = ''

LEARNING_RATE = 0.0001

DECAY_STEPS = 30000

DECAY_RATE = 0.1

STAIRCASE = True

BATCH_SIZE = 45

MAX_ITER = 15000

SUMMARY_ITER = 10

SAVE_ITER = 1000


#
# test parameters
#

THRESHOLD = 0.2

IOU_THRESHOLD = 0.5
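The model parameters above fix the size of the network's flat output vector; the arithmetic (computed as `output_size` in `YOLONet` below) works out as follows:

```python
# Sketch: how the config values determine YOLONet's flat output size.
CELL_SIZE, BOXES_PER_CELL, NUM_CLASS = 7, 2, 20

output_size = (CELL_SIZE * CELL_SIZE) * (NUM_CLASS + BOXES_PER_CELL * 5)
# 49 * (20 + 10) = 1470 values per image: 980 class probabilities,
# 98 box confidences, and 392 box coordinates.
print(output_size)  # 1470
```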
--------------------------------------------------------------------------------
/yolo/yolo_net.py:
--------------------------------------------------------------------------------
import numpy as np
import tensorflow as tf
import yolo.config as cfg

slim = tf.contrib.slim


class YOLONet(object):

    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.output_size = (self.cell_size * self.cell_size) *\
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell

        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE

        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA

        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))

        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
                net = slim.conv2d(net, 192, 3, scope='conv_4')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
                net = slim.conv2d(net, 128, 1, scope='conv_6')
                net = slim.conv2d(net, 256, 3, scope='conv_7')
                net = slim.conv2d(net, 256, 1, scope='conv_8')
                net = slim.conv2d(net, 512, 3, scope='conv_9')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
                net = slim.conv2d(net, 256, 1, scope='conv_11')
                net = slim.conv2d(net, 512, 3, scope='conv_12')
                net = slim.conv2d(net, 256, 1, scope='conv_13')
                net = slim.conv2d(net, 512, 3, scope='conv_14')
                net = slim.conv2d(net, 256, 1, scope='conv_15')
                net = slim.conv2d(net, 512, 3, scope='conv_16')
                net = slim.conv2d(net, 256, 1, scope='conv_17')
                net = slim.conv2d(net, 512, 3, scope='conv_18')
                net = slim.conv2d(net, 512, 1, scope='conv_19')
                net = slim.conv2d(net, 1024, 3, scope='conv_20')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
                net = slim.conv2d(net, 512, 1, scope='conv_22')
                net = slim.conv2d(net, 1024, 3, scope='conv_23')
                net = slim.conv2d(net, 512, 1, scope='conv_24')
                net = slim.conv2d(net, 1024, 3, scope='conv_25')
                net = slim.conv2d(net, 1024, 3, scope='conv_26')
                net = tf.pad(
                    net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                    name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')
                net = slim.conv2d(net, 1024, 3, scope='conv_29')
                net = slim.conv2d(net, 1024, 3, scope='conv_30')
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Calculate IoUs between two sets of boxes.
        Args:
            boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
            boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
        Return:
            iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
        """
        with tf.variable_scope(scope):
            # transform (x_center, y_center, w, h) to (x1, y1, x2, y2)
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)

            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)

            # calculate the upper-left and lower-right corners of the intersection
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

            # intersection
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]

            # calculate the areas of boxes1 and boxes2
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]

            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)

        return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        with tf.variable_scope(scope):
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [self.batch_size, self.cell_size, self.cell_size, self.num_class])
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

            response = tf.reshape(
                labels[..., 0],
                [self.batch_size, self.cell_size, self.cell_size, 1])
            boxes = tf.reshape(
                labels[..., 1:5],
                [self.batch_size, self.cell_size, self.cell_size, 1, 4])
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            classes = labels[..., 5:]

            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))
            # Transform predicted boxes from cell-relative coordinates to the
            # normalized image frame so they can be compared with the labels.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

            # calculate I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
            object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response

            # calculate no_I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask

            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # class_loss
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale

            # object_loss
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale

            # noobject_loss
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale

            # coord_loss
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)

            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)


def leaky_relu(alpha):
    def op(inputs):
        return tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
    return op
--------------------------------------------------------------------------------
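As a closing sanity check of the center-format IoU used by both `Detector.iou` and `YOLONet.calc_iou` (a plain-Python sketch, not repo code): two unit squares whose centers are half a side apart intersect in area 0.5, giving IoU 1/3.

```python
# Sketch: the same center-format IoU formula that Detector.iou computes.
def iou(box1, box2):
    # Boxes are (x_center, y_center, w, h).
    tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
        max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
    lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
        max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
    inter = 0 if tb < 0 or lr < 0 else tb * lr
    return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter)

print(iou([0.0, 0.0, 1.0, 1.0], [0.5, 0.0, 1.0, 1.0]))
# intersection 0.5, union 1.5 -> 0.333...
```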