├── LICENSE
├── README.md
├── demo_yolo_v1.py
├── demo_yolo_v2.py
├── models
│   └── yolov2-coco.meta
├── test
│   ├── cat.jpg
│   ├── person.jpg
│   ├── sample_computer.jpg
│   ├── sample_dog.jpg
│   ├── sample_eagle.jpg
│   ├── sample_giraffe.jpg
│   ├── sample_horses.jpg
│   ├── sample_office.jpg
│   ├── sample_person.jpg
│   ├── sample_scream.jpg
│   └── test.mp4
├── train.py
├── utils
│   ├── __init__.py
│   ├── box.py
│   ├── im_transform.py
│   ├── pascal_voc.py
│   ├── timer.py
│   └── tool.py
└── yolo
    ├── __init__.py
    ├── config.py
    └── yolo_net.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Peng Zhang
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## yolov2_tensorflow
2 | 
3 | ### Requirements
4 | 1. TensorFlow
5 | 2. OpenCV
6 | 
7 | A TensorFlow implementation of [YOLO](https://pjreddie.com/darknet/yolo/), including YOLOv1 and YOLOv2 demos.
8 | 
9 | 
10 | ### Installation
11 | 
12 | 1. Clone the yolov2-tensorflow repository
13 | ```Shell
14 | $ git clone https://github.com/shishichang/yolov2-tensorflow.git
15 | $ cd yolov2-tensorflow
16 | ```
17 | 
18 | 2. Download [YOLO_v1](http://pan.baidu.com/s/1cGV694), [YOLO_v2_pb](http://pan.baidu.com/s/1hrRszrA) and [YOLO_v2_meta](http://pan.baidu.com/s/1dEOaGPr)
19 | and put them in `models`
20 | 
21 | 3. Modify the configuration in `yolo/config.py` for YOLOv1
22 | 
23 | 4. Run the demos
24 | ```Shell
25 | $ python demo_yolo_v1.py
26 | $ python demo_yolo_v2.py
27 | ```
28 | 
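For use from another script, rather than editing `main()`, a minimal sketch along the lines of `demo_yolo_v1.py` (assuming the YOLO_v1 checkpoint from step 2 was saved as `models/YOLO_small.ckpt`):

```python
from yolo.yolo_net import YOLONet
from demo_yolo_v1 import Detector

net = YOLONet(False)  # build the network in inference mode (no loss layer)
detector = Detector(net, 'models/YOLO_small.ckpt')  # opens a session and restores the weights
detector.image_detector('test/person.jpg')  # draws detections and shows the image
```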
--------------------------------------------------------------------------------
/demo_yolo_v1.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import os
4 | import cv2
5 | import argparse
6 | import yolo.config as cfg
7 | from yolo.yolo_net import YOLONet
8 | from utils.timer import Timer
9 | 
10 | 
11 | class Detector(object):
12 | 
13 |     def __init__(self, net, weight_file):
14 |         self.net = net
15 |         self.weights_file = weight_file
16 | 
17 |         self.classes = cfg.CLASSES
18 |         self.num_class = len(self.classes)
19 |         self.image_size = cfg.IMAGE_SIZE
20 |         self.cell_size = cfg.CELL_SIZE
21 |         self.boxes_per_cell = cfg.BOXES_PER_CELL
22 |         self.threshold = cfg.THRESHOLD
23 |         self.iou_threshold = cfg.IOU_THRESHOLD
24 |         self.boundary1 = self.cell_size * self.cell_size * self.num_class  # end of the class probabilities in the flat output
25 |         self.boundary2 = self.boundary1 + self.cell_size * self.cell_size * self.boxes_per_cell  # end of the box confidences
26 | 
27 |         self.sess = tf.Session()
28 |         self.sess.run(tf.global_variables_initializer())
29 | 
30 |         print('Restoring weights from: ' + self.weights_file)
31 |         self.saver = tf.train.Saver()
32 |         self.saver.restore(self.sess, self.weights_file)
33 | 
34 |     def draw_result(self, img, result):
35 |         for i in range(len(result)):
36 |             x = int(result[i][1])
37 |             y = int(result[i][2])
38 |             w = int(result[i][3] / 2)
39 |             h = int(result[i][4] / 2)
40 |             cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
41 |             cv2.rectangle(img, (x - w, y - h - 20),
42 |                           (x + w, y - h), (125, 125, 125), -1)
43 |             cv2.putText(img, result[i][0] + ' : %.2f' % result[i][5], (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
44 | 
45 |     def detect(self, img):
46 |         img_h, img_w, _ = img.shape
47 |         inputs = cv2.resize(img, (self.image_size, self.image_size))
48 |         inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
49 |         inputs = (inputs / 255.0) * 2.0 - 1.0  # scale pixels to [-1, 1]
50 |         inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))
51 | 
52 |         result = self.detect_from_cvmat(inputs)[0]
53 | 
54 |         for i in range(len(result)):  # map boxes back to the original image scale
55 |             result[i][1] *= (1.0 * img_w / self.image_size)
56 |             result[i][2] *= (1.0 * img_h / self.image_size)
57 |             result[i][3] *= (1.0 * img_w / self.image_size)
58 |             result[i][4] *= (1.0 * img_h / self.image_size)
59 | 
60 |         return result
61 | 
62 |     def detect_from_cvmat(self, inputs):
63 |         net_output = self.sess.run(self.net.logits,
64 |                                    feed_dict={self.net.images: inputs})
65 |         results = []
66 |         for i in range(net_output.shape[0]):
67 |             results.append(self.interpret_output(net_output[i]))
68 | 
69 |         return results
70 | 
71 |     def interpret_output(self, output):
72 |         probs = np.zeros((self.cell_size, self.cell_size,
73 |                           self.boxes_per_cell, self.num_class))
74 |         class_probs = np.reshape(output[0:self.boundary1], (self.cell_size, self.cell_size, self.num_class))
75 |         scales = np.reshape(output[self.boundary1:self.boundary2], (self.cell_size, self.cell_size, self.boxes_per_cell))
76 |         boxes = np.reshape(output[self.boundary2:], (self.cell_size, self.cell_size, self.boxes_per_cell, 4))
77 |         offset = np.transpose(np.reshape(np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
78 |                                          [self.boxes_per_cell, self.cell_size, self.cell_size]), (1, 2, 0))
79 | 
80 |         boxes[:, :, :, 0] += offset  # add per-cell grid offsets to the x centres
81 |         boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))  # and to the y centres
82 |         boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
83 |         boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])  # w and h are predicted as square roots
84 | 
85 |         boxes *= self.image_size
86 | 
87 |         for i in range(self.boxes_per_cell):
88 |             for j in range(self.num_class):
89 |                 probs[:, :, i, j] = np.multiply(
90 |                     class_probs[:, :, j], scales[:, :, i])
91 | 
92 |         filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
93 |         filter_mat_boxes = np.nonzero(filter_mat_probs)
94 |         boxes_filtered = boxes[filter_mat_boxes[0],
95 |                                filter_mat_boxes[1], filter_mat_boxes[2]]
96 |         probs_filtered = probs[filter_mat_probs]
97 |         classes_num_filtered = np.argmax(probs, axis=3)[filter_mat_boxes[
98 |             0], filter_mat_boxes[1], filter_mat_boxes[2]]  # highest-scoring class, not the first thresholded one
99 | 
100 |         argsort = np.array(np.argsort(probs_filtered))[::-1]
101 |         boxes_filtered = boxes_filtered[argsort]
102 |         probs_filtered = probs_filtered[argsort]
103 |         classes_num_filtered = classes_num_filtered[argsort]
104 | 
105 |         for i in range(len(boxes_filtered)):
106 |             if probs_filtered[i] == 0:
107 |                 continue
108 |             for j in range(i + 1, len(boxes_filtered)):
109 |                 if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
110 |                     probs_filtered[j] = 0.0
111 | 
112 |         filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
113 |         boxes_filtered = boxes_filtered[filter_iou]
114 |         probs_filtered = probs_filtered[filter_iou]
115 |         classes_num_filtered = classes_num_filtered[filter_iou]
116 | 
117 |         result = []
118 |         for i in range(len(boxes_filtered)):
119 |             result.append([self.classes[classes_num_filtered[i]], boxes_filtered[i][0], boxes_filtered[
120 |                 i][1], boxes_filtered[i][2], boxes_filtered[i][3], probs_filtered[i]])
121 | 
122 |         return result
123 | 
124 |     def iou(self, box1, box2):
125 |         tb = min(box1[0] + 0.5 * box1[2], box2[0] + 0.5 * box2[2]) - \
126 |             max(box1[0] - 0.5 * box1[2], box2[0] - 0.5 * box2[2])
127 |         lr = min(box1[1] + 0.5 * box1[3], box2[1] + 0.5 * box2[3]) - \
128 |             max(box1[1] - 0.5 * box1[3], box2[1] - 0.5 * box2[3])
129 |         if tb < 0 or lr < 0:
130 |             intersection = 0
131 |         else:
132 |             intersection = tb * lr
133 |         return intersection / (box1[2] * box1[3] + box2[2] * box2[3] - intersection)
134 | 
135 |     def camera_detector(self, cap, wait=10):
136 |         detect_timer = Timer()
137 |         ret, frame = cap.read()  # prime the first frame
138 | 
139 |         while ret:
140 |             # (the next frame is read once, at the bottom of the loop)
141 |             detect_timer.tic()
142 |             result = self.detect(frame)
143 |             detect_timer.toc()
144 |             print('Average detecting time: {:.3f}s'.format(detect_timer.average_time))
145 | 
146 |             self.draw_result(frame, result)
147 |             cv2.imshow('Camera', frame)
148 |             cv2.waitKey(wait)
149 | 
150 |             ret, frame = cap.read()
151 | 
152 |     def image_detector(self, imname, wait=0):
153 |         detect_timer = Timer()
154 |         image = cv2.imread(imname)
155 | 
156 |         detect_timer.tic()
157 |         result = self.detect(image)
158 |         detect_timer.toc()
159 |         print('Average detecting time: {:.3f}s'.format(detect_timer.average_time))
160 | 
161 |         self.draw_result(image, result)
162 |         cv2.imshow('Image', image)
163 |         cv2.waitKey(wait)
164 | 
165 | 
166 | def main():
167 |     parser = argparse.ArgumentParser()
168 |     parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
169 |     parser.add_argument('--weight_dir', default='models', type=str)
170 |     parser.add_argument('--data_dir', default="data", type=str)
171 |     parser.add_argument('--gpu', default='2', type=str)
172 |     args = parser.parse_args()
173 | 
174 |     os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
175 | 
176 |     yolo = YOLONet(False)
177 |     weight_file = os.path.join(args.weight_dir, args.weights)
178 |     detector = Detector(yolo, weight_file)
179 | 
180 |     select = 2  # 1: video demo, 2: single-image demo
181 |     if select == 1:
182 |         # detect from a video stream (here a file, but a camera index also works)
183 |         cap = cv2.VideoCapture('test/test.mp4')
184 |         detector.camera_detector(cap)
185 |     if select == 2:
186 |         # detect from an image file
187 |         imname = 'test/person.jpg'
188 |         detector.image_detector(imname)
189 | 
190 | 
191 | if __name__ == '__main__':
192 |     main()
193 | 
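The score computation in `interpret_output` above is the heart of the v1 decoder: per-cell class-conditional probabilities are multiplied by per-box confidences before thresholding and NMS. A standalone NumPy sketch of just that step, using the default sizes from `yolo/config.py` (7x7 grid, 2 boxes per cell, 20 classes); the array values here are random stand-ins, not real network output:

```python
import numpy as np

cell_size, boxes_per_cell, num_class = 7, 2, 20  # cfg.CELL_SIZE, cfg.BOXES_PER_CELL, len(cfg.CLASSES)
class_probs = np.random.rand(cell_size, cell_size, num_class)   # P(class | object), per cell
scales = np.random.rand(cell_size, cell_size, boxes_per_cell)   # per-box confidence

# class-specific score for every (cell, box, class), as in the nested loops of interpret_output
probs = class_probs[:, :, None, :] * scales[:, :, :, None]      # shape (7, 7, 2, 20)
print(probs.shape, (probs >= 0.3).sum())                        # 0.3 = cfg.THRESHOLD
```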
--------------------------------------------------------------------------------
/demo_yolo_v2.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("./")
3 | from utils.im_transform import imcv2_recolor, imcv2_affine_trans
4 | from utils import box
5 | import math
6 | import random
7 | import time
8 | import os
9 | 
10 | import numpy as np
11 | import tensorflow as tf
12 | import cv2
13 | slim = tf.contrib.slim
14 | import matplotlib.pyplot as plt
15 | from multiprocessing.pool import ThreadPool
16 | from utils import tool
17 | from collections import Counter
18 | import json
19 | 
20 | pool = ThreadPool()
21 | os.environ["CUDA_VISIBLE_DEVICES"] = '3'  # pin the process to one physical GPU
22 | 
23 | class YOLO_detector(object):
24 | 
25 |     def __init__(self):
26 |         model_name = 'yolov2-coco'
27 |         model_dir = './models'
28 |         gpu_id = 4
29 |         self.gpu_utility = 0.9
30 | 
31 |         self.pb_file = '{}/{}.pb'.format(model_dir, model_name)
32 |         self.meta_file = '{}/{}.meta'.format(model_dir, model_name)
33 |         self.batch = 4
34 | 
35 |         self.graph = tf.Graph()
36 |         with tf.device('/gpu:1'):  # NOTE: only one GPU is visible after line 21, so '/gpu:0' may be what is meant
37 |             with self.graph.as_default() as g:
38 |                 self.build_from_pb()
39 |                 gpu_options = tf.GPUOptions(allow_growth=True)
40 |                 sess_config = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)
41 |                 self.sess = tf.Session(config=sess_config)  # session over the imported graph
42 |                 self.sess.run(tf.global_variables_initializer())
43 |         return
44 | 
45 |     def build_from_pb(self):
46 |         with tf.gfile.FastGFile(self.pb_file, "rb") as f:
47 |             graph_def = tf.GraphDef()
48 |             graph_def.ParseFromString(f.read())
49 |             tf.import_graph_def(graph_def, name="")
50 | 
51 |         with open(self.meta_file, "r") as fp:
52 |             self.meta = json.load(fp)
53 |         # placeholders: input/output tensors of the imported graph
54 |         self.inp = tf.get_default_graph().get_tensor_by_name('input:0')
55 |         self.out = tf.get_default_graph().get_tensor_by_name('output:0')
56 | 
57 |         #self.setup_meta_ops()
58 | 
59 |     def setup_meta_ops(self):
60 |         cfg = dict({
61 |             'allow_soft_placement': False,
62 |             'log_device_placement': False
63 |         })
64 |         utility = min(self.gpu_utility, 1.0)
65 |         if utility > 0.0:
66 |             print('GPU model with {} usage'.format(utility))
67 |             cfg['gpu_options'] = tf.GPUOptions(per_process_gpu_memory_fraction=utility)
68 |             cfg['allow_soft_placement'] = True
69 |         else:
70 |             print('Run totally on CPU')
71 |             cfg['device_count'] = {'GPU': 0}
72 | 
73 |         self.sess = tf.Session(config=tf.ConfigProto(**cfg))
74 |         self.sess.run(tf.global_variables_initializer())
75 | 
76 |     def resize_input(self, im):
77 |         h, w, c = self.meta['inp_size']
78 |         imsz = cv2.resize(im, (w, h))
79 |         imsz = imsz / 255.       # scale to [0, 1]
80 |         imsz = imsz[:, :, ::-1]  # BGR -> RGB
81 |         return imsz
82 | 
83 |     def process_box(self, b, h, w, threshold):
84 |         max_indx = np.argmax(b.probs)
85 |         max_prob = b.probs[max_indx]
86 |         label = self.meta['labels'][max_indx]
87 |         if max_prob > threshold:
88 |             left = int((b.x - b.w / 2.) * w)
89 |             right = int((b.x + b.w / 2.) * w)
90 |             top = int((b.y - b.h / 2.) * h)
91 |             bot = int((b.y + b.h / 2.) * h)
92 |             if left < 0: left = 0
93 |             if right > w - 1: right = w - 1
94 |             if top < 0: top = 0
95 |             if bot > h - 1: bot = h - 1
96 |             mess = '{}'.format(label)
97 |             return (left, right, top, bot, mess, max_indx, max_prob)
98 |         return None
99 | 
100 |     def preprocess(self, im, allobj=None):
101 |         """Resize an image for the network; in training mode
102 |         (allobj given), augment and recolor it first."""
103 |         if type(im) is not np.ndarray:
104 |             im = cv2.imread(im)
105 | 
106 |         if allobj is not None:  # in training mode
107 |             result = imcv2_affine_trans(im)
108 |             im, dims, trans_param = result
109 |             scale, offs, flip = trans_param
110 |             for obj in allobj:
111 |                 _fix(obj, dims, scale, offs)  # NOTE: _fix (a darkflow box-rescaling helper) is not defined in this repo
112 |                 if not flip: continue
113 |                 obj_1_ = obj[1]
114 |                 obj[1] = dims[0] - obj[3]
115 |                 obj[3] = dims[0] - obj_1_
116 |             im = imcv2_recolor(im)
117 | 
118 |         im = self.resize_input(im)
119 |         if allobj is None: return im
120 |         return im  #, np.array(im) # for unit testing
121 | 
122 |     def postprocess(self, net_out):
123 |         meta = self.meta
124 |         result = box.box_constructor(meta, net_out)
125 |         return result
126 | 
127 | 
128 |     def detect_object(self, im):
129 |         this_inp = self.preprocess(im)
130 |         expanded = np.expand_dims(this_inp, 0)  # add the batch dimension
131 |         # feed the single-image batch to the imported graph
132 |         feed_dict = {self.inp: expanded}
133 | 
134 | 
135 | 
136 |         print("Forwarding the image input.")
137 |         start = time.time()
138 |         out = self.sess.run(self.out, feed_dict)
139 | 
140 |         time_value = time.time()
141 |         last = time_value - start
142 |         print('Cost time of run = {}s.'.format(last))
143 |         result = self.postprocess(out[0])
144 |         last = time.time() - time_value
145 | 
146 |         print('Cost time of postprocess = {}s.'.format(last))
147 |         return result
148 | 
149 | def demo_image():
150 |     yolo = YOLO_detector()
151 |     colors = yolo.meta['colors']
152 |     img_dir = "./test"
153 |     image_names = tool.find_files(img_dir)
154 |     for filename in image_names:
155 |         im = cv2.imread(filename)
156 |         h, w, _ = im.shape
157 |         results = yolo.detect_object(im)
158 |         thick = int((h + w) // 300)  # line thickness scaled to the image size
159 |         draw = im.copy()
160 |         h, w, _ = draw.shape
161 |         for i in range(len(results)):
162 |             cv2.putText(draw, str(results[i]['category']), (int(w*results[i]['x1']), int(h*results[i]['y1'])-12), 0, 1e-3*h, colors[results[i]['label']], thick//3)
163 |             cv2.rectangle(draw, (int(w*results[i]['x1']), int(h*results[i]['y1'])), (int(w*results[i]['x2']), int(h*results[i]['y2'])), colors[results[i]['label']], thick)
164 |         cv2.imshow("result", draw)
165 |         cv2.waitKey()
166 | 
167 | def demo_video():
168 |     yolo = YOLO_detector()
169 |     colors = yolo.meta['colors']
170 |     video_name = 'test.mp4'
171 |     data_dir = "./test"
172 |     video_file = os.path.join(data_dir, video_name)
173 | 
174 |     print(video_file)
175 |     vcap = cv2.VideoCapture(video_file)
176 |     if not vcap.isOpened():
177 |         print("video cannot be opened!")
178 |         return -1
179 |     idx = 0
180 |     while True:
181 |         idx += 1
182 |         ret, img = vcap.read()
183 |         if not ret:
184 |             break
185 |         print('video is read')
186 |         im = img
187 |         h, w, _ = im.shape
188 |         start = time.time()
189 |         results = yolo.detect_object(im)
190 |         last = (time.time() - start)
191 |         thick = int((h + w) // 300)
192 |         draw = im.copy()
193 |         h, w, _ = draw.shape
194 |         for i in range(len(results)):
195 |             cv2.putText(draw, "fps:{}".format(1/last), (1, 18), 0, 1e-3*h, colors[results[i]['label']], thick//3)  # detection-only fps
196 |             cv2.putText(draw, "{},{}".format(str(results[i]['category']), results[i]['score']), (int(w*results[i]['x1']), int(h*results[i]['y1'])-12), 0, 1e-3*h, colors[results[i]['label']], thick//3)
197 | 
cv2.rectangle(draw,(int(w*results[i]['x1']),int(h*results[i]['y1'])),(int(w*results[i]['x2']),int(h*results[i]['y2'])), colors[results[i]['label']], thick) 198 | cv2.imshow("result", draw) 199 | cv2.waitKey() 200 | 201 | if __name__ == '__main__': 202 | print("run demo_video...") 203 | demo_image() 204 | 205 | -------------------------------------------------------------------------------- /models/yolov2-coco.meta: -------------------------------------------------------------------------------- 1 | {"jitter": 0.3, "object_scale": 5, "model": "../cfg/yolo-coco.cfg", "bias_match": 1, "absolute": 1, "thresh": 0.6, "random": 1, "net": {"exposure": 1.5, "momentum": 0.9, "saturation": 1.5, "batch": 1, "hue": 0.1, "width": 608, "channels": 3, "scales": ".1,.1", "type": "[net]", "learning_rate": 0.001, "max_batches": 500200, "policy": "steps", "steps": "400000,450000", "burn_in": 1000, "height": 608, "subdivisions": 1, "decay": 0.0005, "angle": 0}, "colors": [[254.0, 254.0, 254], [248.92, 228.6, 127], [243.84, 203.20000000000002, 0], [238.76, 177.79999999999998, -127], [233.68, 152.4, -254], [228.6, 127.0, 254], [223.52, 101.60000000000001, 127], [218.44, 76.20000000000002, 0], [213.35999999999999, 50.79999999999999, -127], [208.28000000000003, 25.399999999999995, -254], [203.20000000000002, 0.0, 254], [198.12, -25.400000000000023, 127], [193.04, -50.79999999999999, 0], [187.96, -76.20000000000002, -127], [182.88, -101.59999999999998, -254], [177.79999999999998, -127.0, 254], [172.71999999999997, -152.40000000000003, 127], [167.64, -177.79999999999998, 0], [162.56, -203.20000000000002, -127], [157.48, -228.59999999999997, -254], [152.4, -254.0, 254], [147.32000000000002, -279.40000000000003, 127], [142.24, -304.80000000000007, 0], [137.16, -330.19999999999993, -127], [132.08, -355.59999999999997, -254], [127.0, 254.0, 254], [121.92, 228.6, 127], [116.83999999999999, 203.20000000000002, 0], [111.75999999999999, 177.79999999999998, -127], [106.68, 152.4, -254], [101.60000000000001, 127.0, 254], [96.52, 101.60000000000001, 127], [91.44, 76.20000000000002, 0], [86.35999999999999, 50.79999999999999, -127], [81.27999999999999, 25.399999999999995, -254], [76.20000000000002, 0.0, 254], [71.12, -25.400000000000023, 127], [66.04, -50.79999999999999, 0], [60.96, -76.20000000000002, -127], [55.879999999999995, -101.59999999999998, -254], [50.79999999999999, -127.0, 254], [45.72000000000001, -152.40000000000003, 127], [40.64000000000001, -177.79999999999998, 0], [35.56, -203.20000000000002, -127], [30.48, -228.59999999999997, -254], [25.399999999999995, -254.0, 254], [20.31999999999999, -279.40000000000003, 127], [15.240000000000013, -304.80000000000007, 0], [10.160000000000009, -330.19999999999993, -127], [5.0800000000000045, -355.59999999999997, -254], [0.0, 254.0, 254], [-5.0800000000000045, 228.6, 127], [-10.160000000000009, 203.20000000000002, 0], [-15.240000000000013, 177.79999999999998, -127], [-20.320000000000018, 152.4, -254], [-25.400000000000023, 127.0, 254], [-30.480000000000025, 101.60000000000001, 127], [-35.559999999999974, 76.20000000000002, 0], [-40.63999999999998, 50.79999999999999, -127], [-45.719999999999985, 25.399999999999995, -254], [-50.79999999999999, 0.0, 254], [-55.879999999999995, -25.400000000000023, 127], [-60.96, -50.79999999999999, 0], [-66.04, -76.20000000000002, -127], [-71.12, -101.59999999999998, -254], [-76.20000000000002, -127.0, 254], [-81.28000000000002, -152.40000000000003, 127], [-86.36000000000001, -177.79999999999998, 0], [-91.44000000000003, -203.20000000000002, 
-127], [-96.51999999999997, -228.59999999999997, -254], [-101.59999999999998, -254.0, 254], [-106.67999999999998, -279.40000000000003, 127], [-111.75999999999999, -304.80000000000007, 0], [-116.83999999999999, -330.19999999999993, -127], [-121.92, -355.59999999999997, -254], [-127.0, 254.0, 254], [-132.08, 228.6, 127], [-137.16, 203.20000000000002, 0], [-142.24, 177.79999999999998, -127], [-147.32000000000002, 152.4, -254]], "class_scale": 1, "type": "[region]", "rescore": 1, "anchors": [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828], "coords": 4, "classes": 80, "noobject_scale": 1, "inp_size": [608, 608, 3], "num": 5, "coord_scale": 1, "out_size": [19, 19, 425], "labels": ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"], "softmax": 1, "name": "yolo-coco"} -------------------------------------------------------------------------------- /test/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/cat.jpg -------------------------------------------------------------------------------- /test/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/person.jpg -------------------------------------------------------------------------------- /test/sample_computer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_computer.jpg -------------------------------------------------------------------------------- /test/sample_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_dog.jpg -------------------------------------------------------------------------------- /test/sample_eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_eagle.jpg -------------------------------------------------------------------------------- /test/sample_giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_giraffe.jpg 
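A note on the `anchors` field of `models/yolov2-coco.meta` above: they are prior box sizes in grid-cell units, read pairwise as (width, height) by `utils/box.py` further below, which multiplies them with `exp` of the raw width/height outputs. A minimal sketch of that decoding, with made-up raw outputs `tw`, `th`:

```python
import numpy as np

# anchor pairs and grid size taken from yolov2-coco.meta ("out_size": [19, 19, 425])
anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
           5.47434, 7.88282, 3.52778, 9.77052, 9.16828]
W = H = 19

tw, th = 0.2, -0.5  # toy raw network outputs for one box
for b in range(5):  # "num": 5 anchor boxes per cell
    w = np.exp(tw) * anchors[2 * b] / W      # box width as a fraction of image width
    h = np.exp(th) * anchors[2 * b + 1] / H  # box height as a fraction of image height
    print('anchor %d: w=%.3f, h=%.3f' % (b, w, h))
```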
-------------------------------------------------------------------------------- /test/sample_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_horses.jpg -------------------------------------------------------------------------------- /test/sample_office.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_office.jpg -------------------------------------------------------------------------------- /test/sample_person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_person.jpg -------------------------------------------------------------------------------- /test/sample_scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/sample_scream.jpg -------------------------------------------------------------------------------- /test/test.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/test/test.mp4 -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import datetime 3 | import os 4 | import argparse 5 | import yolo.config as cfg 6 | from yolo.yolo_net import YOLONet 7 | from utils.timer import Timer 8 | from utils.pascal_voc import pascal_voc 9 | 10 | 11 | class Solver(object): 12 | 13 | def __init__(self, net, data): 14 | self.net = net 15 | self.data = data 16 | self.weights_file = cfg.WEIGHTS_FILE 17 | self.max_iter = cfg.MAX_ITER 18 | self.initial_learning_rate = cfg.LEARNING_RATE 19 | self.decay_steps = cfg.DECAY_STEPS 20 | self.decay_rate = cfg.DECAY_RATE 21 | self.staircase = cfg.STAIRCASE 22 | self.summary_iter = cfg.SUMMARY_ITER 23 | self.save_iter = cfg.SAVE_ITER 24 | self.output_dir = os.path.join( 25 | cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')) 26 | if not os.path.exists(self.output_dir): 27 | os.makedirs(self.output_dir) 28 | self.save_cfg() 29 | 30 | self.variable_to_restore = tf.global_variables() 31 | self.restorer = tf.train.Saver(self.variable_to_restore, max_to_keep=None) 32 | self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None) 33 | self.ckpt_file = os.path.join(self.output_dir, 'save.ckpt') 34 | self.summary_op = tf.summary.merge_all() 35 | self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60) 36 | 37 | self.global_step = tf.get_variable( 38 | 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) 39 | self.learning_rate = tf.train.exponential_decay( 40 | self.initial_learning_rate, self.global_step, self.decay_steps, 41 | self.decay_rate, self.staircase, name='learning_rate') 42 | self.optimizer = tf.train.GradientDescentOptimizer( 43 | learning_rate=self.learning_rate).minimize( 44 | self.net.total_loss, global_step=self.global_step) 45 | self.ema = tf.train.ExponentialMovingAverage(decay=0.9999) 46 | 
self.averages_op = self.ema.apply(tf.trainable_variables()) 47 | with tf.control_dependencies([self.optimizer]): 48 | self.train_op = tf.group(self.averages_op) 49 | 50 | gpu_options = tf.GPUOptions() 51 | config = tf.ConfigProto(gpu_options=gpu_options) 52 | self.sess = tf.Session(config=config) 53 | self.sess.run(tf.global_variables_initializer()) 54 | 55 | if self.weights_file is not None: 56 | print('Restoring weights from: ' + self.weights_file) 57 | self.restorer.restore(self.sess, self.weights_file) 58 | 59 | self.writer.add_graph(self.sess.graph) 60 | 61 | def train(self): 62 | 63 | train_timer = Timer() 64 | load_timer = Timer() 65 | 66 | for step in xrange(1, self.max_iter + 1): 67 | 68 | load_timer.tic() 69 | images, labels = self.data.get() 70 | load_timer.toc() 71 | feed_dict = {self.net.images: images, self.net.labels: labels} 72 | 73 | if step % self.summary_iter == 0: 74 | if step % (self.summary_iter * 10) == 0: 75 | 76 | train_timer.tic() 77 | summary_str, loss, _ = self.sess.run( 78 | [self.summary_op, self.net.total_loss, self.train_op], 79 | feed_dict=feed_dict) 80 | train_timer.toc() 81 | 82 | log_str = ('{} Epoch: {}, Step: {}, Learning rate: {},' 83 | ' Loss: {:5.3f}\nSpeed: {:.3f}s/iter,' 84 | ' Load: {:.3f}s/iter, Remain: {}').format( 85 | datetime.datetime.now().strftime('%m/%d %H:%M:%S'), 86 | self.data.epoch, 87 | int(step), 88 | round(self.learning_rate.eval(session=self.sess), 6), 89 | loss, 90 | train_timer.average_time, 91 | load_timer.average_time, 92 | train_timer.remain(step, self.max_iter)) 93 | print(log_str) 94 | 95 | else: 96 | train_timer.tic() 97 | summary_str, _ = self.sess.run( 98 | [self.summary_op, self.train_op], 99 | feed_dict=feed_dict) 100 | train_timer.toc() 101 | 102 | self.writer.add_summary(summary_str, step) 103 | 104 | else: 105 | train_timer.tic() 106 | self.sess.run(self.train_op, feed_dict=feed_dict) 107 | train_timer.toc() 108 | 109 | if step % self.save_iter == 0: 110 | print('{} Saving checkpoint file to: {}'.format( 111 | datetime.datetime.now().strftime('%m/%d %H:%M:%S'), 112 | self.output_dir)) 113 | self.saver.save(self.sess, self.ckpt_file, 114 | global_step=self.global_step) 115 | 116 | def save_cfg(self): 117 | 118 | with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f: 119 | cfg_dict = cfg.__dict__ 120 | for key in sorted(cfg_dict.keys()): 121 | if key[0].isupper(): 122 | cfg_str = '{}: {}\n'.format(key, cfg_dict[key]) 123 | f.write(cfg_str) 124 | 125 | 126 | def update_config_paths(data_dir, weights_file): 127 | cfg.DATA_PATH = data_dir 128 | cfg.PASCAL_PATH = os.path.join(data_dir, 'pascal_voc') 129 | cfg.CACHE_PATH = os.path.join(cfg.PASCAL_PATH, 'cache') 130 | cfg.OUTPUT_DIR = os.path.join(cfg.PASCAL_PATH, 'output') 131 | cfg.WEIGHTS_DIR = os.path.join(cfg.PASCAL_PATH, 'weights') 132 | 133 | cfg.WEIGHTS_FILE = os.path.join(cfg.WEIGHTS_DIR, weights_file) 134 | 135 | 136 | def main(): 137 | parser = argparse.ArgumentParser() 138 | parser.add_argument('--weights', default="YOLO_small.ckpt", type=str) 139 | parser.add_argument('--data_dir', default="data", type=str) 140 | parser.add_argument('--threshold', default=0.2, type=float) 141 | parser.add_argument('--iou_threshold', default=0.5, type=float) 142 | parser.add_argument('--gpu', default='', type=str) 143 | args = parser.parse_args() 144 | 145 | if args.gpu is not None: 146 | cfg.GPU = args.gpu 147 | 148 | if args.data_dir != cfg.DATA_PATH: 149 | update_config_paths(args.data_dir, args.weights) 150 | 151 | os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU 152 
|
153 |     yolo = YOLONet()
154 |     pascal = pascal_voc('train')
155 | 
156 |     solver = Solver(yolo, pascal)
157 | 
158 |     print('Start training ...')
159 |     solver.train()
160 |     print('Done training.')
161 | 
162 | if __name__ == '__main__':
163 | 
164 |     # python train.py --weights YOLO_small.ckpt --gpu 0
165 |     main()
166 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/utils/__init__.py
--------------------------------------------------------------------------------
/utils/box.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | class BoundBox:
4 |     def __init__(self, classes):
5 |         self.x, self.y = float(), float()
6 |         self.w, self.h = float(), float()
7 |         self.c = float()
8 |         self.class_num = classes
9 |         self.probs = np.zeros((classes,))
10 | 
11 | def overlap(x1, w1, x2, w2):
12 |     l1 = x1 - w1 / 2.
13 |     l2 = x2 - w2 / 2.
14 |     left = max(l1, l2)
15 |     r1 = x1 + w1 / 2.
16 |     r2 = x2 + w2 / 2.
17 |     right = min(r1, r2)
18 |     return right - left
19 | 
20 | def box_intersection(a, b):
21 |     w = overlap(a.x, a.w, b.x, b.w)
22 |     h = overlap(a.y, a.h, b.y, b.h)
23 |     if w < 0 or h < 0: return 0
24 |     area = w * h
25 |     return area
26 | 
27 | def box_union(a, b):
28 |     i = box_intersection(a, b)
29 |     u = a.w * a.h + b.w * b.h - i
30 |     return u
31 | 
32 | def box_iou(a, b):
33 |     return box_intersection(a, b) / box_union(a, b)
34 | 
35 | def iou(box1, box2):
36 |     tb = min(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - max(box1[0]-0.5*box1[2], box2[0]-0.5*box2[2])
37 |     lr = min(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - max(box1[1]-0.5*box1[3], box2[1]-0.5*box2[3])
38 |     if tb < 0 or lr < 0: intersection = 0
39 |     else: intersection = tb*lr
40 |     return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection)
41 | 
42 | def explit_c_mine(x):
43 |     y = 1.0 / (1.0 + np.exp(-x))  # logistic sigmoid
44 |     return y
45 | 
46 | def box_constructor(meta, net_out_in):
47 |     threshold = meta['thresh']
48 |     classes = meta['labels']
49 |     anchors = np.asarray(meta['anchors'])
50 |     H, W, _ = meta['out_size']
51 | 
52 |     C = int(meta['classes'])
53 |     B = int(meta['num'])
54 |     net_out = net_out_in.reshape([H, W, B, int(net_out_in.shape[2]/B)])
55 |     Classes = net_out[:, :, :, 5:]    # per-class logits
56 |     Bbox_pred = net_out[:, :, :, :5]  # tx, ty, tw, th, objectness
57 |     probs = np.zeros((H, W, B, C), dtype=np.float32)
58 |     probs_filtered = np.zeros((H, W, B, C), dtype=np.float32)
59 |     Bbox_pred[:, :, :, 4] = explit_c_mine(Bbox_pred[:, :, :, 4])
60 |     offset = np.transpose(np.reshape(np.array([np.arange(W)] * H * B), (B, H, W)), (1, 2, 0))  # per-cell grid x-offsets
61 |     Bbox_pred[:, :, :, 0] = (offset + explit_c_mine(Bbox_pred[:, :, :, 0])) / W
62 |     Bbox_pred[:, :, :, 1] = (np.transpose(offset, (1, 0, 2)) + explit_c_mine(Bbox_pred[:, :, :, 1])) / H
63 |     for box_loop in range(B):
64 |         Bbox_pred[:, :, box_loop, 2] = np.exp(Bbox_pred[:, :, box_loop, 2]) * anchors[2*box_loop + 0] / W
65 |         Bbox_pred[:, :, box_loop, 3] = np.exp(Bbox_pred[:, :, box_loop, 3]) * anchors[2*box_loop + 1] / H
66 | 
67 | 
68 |     class_probs = np.ascontiguousarray(Classes).reshape([H*W*B, C])
69 |     max_all = np.max(class_probs, 1)
70 |     max_all = np.expand_dims(max_all, 0)
71 |     max_all = np.tile(max_all.T, (1, class_probs.shape[1]))
72 | 
73 |     class_probs = np.exp(class_probs - max_all)  # numerically stable softmax numerator
74 |     sum_all = np.sum(class_probs, 1)
75 | 
76 | 
77 |     temp_pred = np.ascontiguousarray(Bbox_pred[:, :, :, 4]).reshape([H*W*B, 1])
78 |     temp_pred = 
np.tile(temp_pred, (1, class_probs.shape[1])) 79 | sum_all = np.expand_dims(sum_all, 0) 80 | sum_all = np.tile(sum_all.T, (1, class_probs.shape[1])) 81 | probs = class_probs * temp_pred /sum_all 82 | probs = np.ascontiguousarray(probs).reshape([H, W, B, C]) 83 | 84 | 85 | 86 | #apply score threshold 87 | bboxes = Bbox_pred[:,:,:,:4] 88 | filter_mat_probs = np.array(probs > threshold, dtype = 'bool') 89 | probs_filtered = probs[filter_mat_probs] 90 | filter_mat_bboxes = np.nonzero(filter_mat_probs) 91 | bboxes_filtered = bboxes[filter_mat_bboxes[0], filter_mat_bboxes[1], filter_mat_bboxes[2]] 92 | probs_filtered = probs[filter_mat_probs] 93 | classes_num_filtered = np.argmax(probs, axis=3)[filter_mat_bboxes[0], filter_mat_bboxes[1], filter_mat_bboxes[2]] 94 | 95 | #NMS 96 | argsort = np.array(np.argsort(probs_filtered))[::-1] 97 | bboxes_filtered = bboxes_filtered[argsort] 98 | probs_filtered = probs_filtered[argsort] 99 | classes_num_filtered = classes_num_filtered[argsort] 100 | 101 | for i in range(len(probs_filtered)): 102 | if probs_filtered[i] == 0: continue 103 | for j in range(i+1, len(bboxes_filtered)): 104 | a = BoundBox(0) 105 | b = BoundBox(0) 106 | a.x = bboxes_filtered[i, 0] 107 | a.y = bboxes_filtered[i, 1] 108 | a.w = bboxes_filtered[i, 2] 109 | a.h = bboxes_filtered[i, 3] 110 | b.x = bboxes_filtered[j, 0] 111 | b.y = bboxes_filtered[j, 1] 112 | b.w = bboxes_filtered[j, 2] 113 | b.h = bboxes_filtered[j, 3] 114 | 115 | if box_iou(a, b) > 0.4: 116 | probs_filtered[j] = 0 117 | filter_iou = np.array(probs_filtered>0.0,dtype='bool') 118 | bboxes_filtered = bboxes_filtered[filter_iou] 119 | probs_filtered = probs_filtered[filter_iou] 120 | classes_num_filtered = classes_num_filtered[filter_iou] 121 | 122 | results = [] 123 | numbox = len(bboxes_filtered) 124 | for i in range(len(bboxes_filtered)): 125 | result = dict() 126 | result['score'] = probs_filtered[i] 127 | result['x1'] = bboxes_filtered[i][0] - bboxes_filtered[i][2]/2.0 128 | result['y1'] = bboxes_filtered[i][1] - bboxes_filtered[i][3]/2.0 129 | result['x2'] = bboxes_filtered[i][0] + bboxes_filtered[i][2]/2.0 130 | result['y2'] = bboxes_filtered[i][1] + bboxes_filtered[i][3]/2.0 131 | result['x1'] = max(0.0, result['x1']) 132 | result['y1'] = max(0.0, result['y1']) 133 | result['x2'] = min(1.0, result['x2']) 134 | result['y2'] = min(1.0, result['y2']) 135 | result['label'] = classes_num_filtered[i] 136 | result['category'] = classes[classes_num_filtered[i]] 137 | results.append(result) 138 | #print(results) 139 | 140 | 141 | return results 142 | 143 | 144 | -------------------------------------------------------------------------------- /utils/im_transform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | def imcv2_recolor(im, a = .1): 5 | t = [np.random.uniform()] 6 | t += [np.random.uniform()] 7 | t += [np.random.uniform()] 8 | t = np.array(t) * 2. - 1. 9 | 10 | # random amplify each channel 11 | im = im * (1 + t * a) 12 | mx = 255. * (1 + a) 13 | up = np.random.uniform() * 2 - 1 14 | im = np.power(im/mx, 1. + up * .5) 15 | return np.array(im * 255., np.uint8) 16 | 17 | def imcv2_affine_trans(im): 18 | # Scale and translate 19 | h, w, c = im.shape 20 | scale = np.random.uniform() / 10. + 1. 21 | max_offx = (scale-1.) * w 22 | max_offy = (scale-1.) 
* h 23 | offx = int(np.random.uniform() * max_offx) 24 | offy = int(np.random.uniform() * max_offy) 25 | 26 | im = cv2.resize(im, (0,0), fx = scale, fy = scale) 27 | im = im[offy : (offy + h), offx : (offx + w)] 28 | flip = np.random.binomial(1, .5) 29 | if flip: im = cv2.flip(im, 1) 30 | return im, [w, h, c], [scale, [offx, offy], flip] -------------------------------------------------------------------------------- /utils/pascal_voc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | import numpy as np 4 | import cv2 5 | import cPickle 6 | import copy 7 | import yolo.config as cfg 8 | 9 | 10 | class pascal_voc(object): 11 | def __init__(self, phase, rebuild=False): 12 | self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit') 13 | self.data_path = os.path.join(self.devkil_path, 'VOC2007') 14 | self.cache_path = cfg.CACHE_PATH 15 | self.batch_size = cfg.BATCH_SIZE 16 | self.image_size = cfg.IMAGE_SIZE 17 | self.cell_size = cfg.CELL_SIZE 18 | self.classes = cfg.CLASSES 19 | self.class_to_ind = dict(zip(self.classes, xrange(len(self.classes)))) 20 | self.flipped = cfg.FLIPPED 21 | self.phase = phase 22 | self.rebuild = rebuild 23 | self.cursor = 0 24 | self.epoch = 1 25 | self.gt_labels = None 26 | self.prepare() 27 | 28 | def get(self): 29 | images = np.zeros((self.batch_size, self.image_size, self.image_size, 3)) 30 | labels = np.zeros((self.batch_size, self.cell_size, self.cell_size, 25)) 31 | count = 0 32 | while count < self.batch_size: 33 | imname = self.gt_labels[self.cursor]['imname'] 34 | flipped = self.gt_labels[self.cursor]['flipped'] 35 | images[count, :, :, :] = self.image_read(imname, flipped) 36 | labels[count, :, :, :] = self.gt_labels[self.cursor]['label'] 37 | count += 1 38 | self.cursor += 1 39 | if self.cursor >= len(self.gt_labels): 40 | np.random.shuffle(self.gt_labels) 41 | self.cursor = 0 42 | self.epoch += 1 43 | return images, labels 44 | 45 | def image_read(self, imname, flipped=False): 46 | image = cv2.imread(imname) 47 | image = cv2.resize(image, (self.image_size, self.image_size)) 48 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) 49 | image = (image / 255.0) * 2.0 - 1.0 50 | if flipped: 51 | image = image[:, ::-1, :] 52 | return image 53 | 54 | def prepare(self): 55 | gt_labels = self.load_labels() 56 | if self.flipped: 57 | print('Appending horizontally-flipped training examples ...') 58 | gt_labels_cp = copy.deepcopy(gt_labels) 59 | for idx in range(len(gt_labels_cp)): 60 | gt_labels_cp[idx]['flipped'] = True 61 | gt_labels_cp[idx]['label'] = gt_labels_cp[idx]['label'][:, ::-1, :] 62 | for i in xrange(self.cell_size): 63 | for j in xrange(self.cell_size): 64 | if gt_labels_cp[idx]['label'][i, j, 0] == 1: 65 | gt_labels_cp[idx]['label'][i, j, 1] = self.image_size - 1 - gt_labels_cp[idx]['label'][i, j, 1] 66 | gt_labels += gt_labels_cp 67 | np.random.shuffle(gt_labels) 68 | self.gt_labels = gt_labels 69 | return gt_labels 70 | 71 | def load_labels(self): 72 | cache_file = os.path.join(self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl') 73 | 74 | if os.path.isfile(cache_file) and not self.rebuild: 75 | print('Loading gt_labels from: ' + cache_file) 76 | with open(cache_file, 'rb') as f: 77 | gt_labels = cPickle.load(f) 78 | return gt_labels 79 | 80 | print('Processing gt_labels from: ' + self.data_path) 81 | 82 | if not os.path.exists(self.cache_path): 83 | os.makedirs(self.cache_path) 84 | 85 | if self.phase == 'train': 86 | txtname = 
os.path.join(self.data_path, 'ImageSets', 'Main', 87 | 'trainval.txt') 88 | else: 89 | txtname = os.path.join(self.data_path, 'ImageSets', 'Main', 90 | 'test.txt') 91 | with open(txtname, 'r') as f: 92 | self.image_index = [x.strip() for x in f.readlines()] 93 | 94 | gt_labels = [] 95 | for index in self.image_index: 96 | label, num = self.load_pascal_annotation(index) 97 | if num == 0: 98 | continue 99 | imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg') 100 | gt_labels.append({'imname': imname, 'label': label, 'flipped': False}) 101 | print('Saving gt_labels to: ' + cache_file) 102 | with open(cache_file, 'wb') as f: 103 | cPickle.dump(gt_labels, f) 104 | return gt_labels 105 | 106 | def load_pascal_annotation(self, index): 107 | """ 108 | Load image and bounding boxes info from XML file in the PASCAL VOC 109 | format. 110 | """ 111 | 112 | imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg') 113 | im = cv2.imread(imname) 114 | h_ratio = 1.0 * self.image_size / im.shape[0] 115 | w_ratio = 1.0 * self.image_size / im.shape[1] 116 | # im = cv2.resize(im, [self.image_size, self.image_size]) 117 | 118 | label = np.zeros((self.cell_size, self.cell_size, 25)) 119 | filename = os.path.join(self.data_path, 'Annotations', index + '.xml') 120 | tree = ET.parse(filename) 121 | objs = tree.findall('object') 122 | 123 | for obj in objs: 124 | bbox = obj.find('bndbox') 125 | # Make pixel indexes 0-based 126 | x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0) 127 | y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0) 128 | x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0) 129 | y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0) 130 | cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()] 131 | boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1] 132 | x_ind = int(boxes[0] * self.cell_size / self.image_size) 133 | y_ind = int(boxes[1] * self.cell_size / self.image_size) 134 | if label[y_ind, x_ind, 0] == 1: 135 | continue 136 | label[y_ind, x_ind, 0] = 1 137 | label[y_ind, x_ind, 1:5] = boxes 138 | label[y_ind, x_ind, 5 + cls_ind] = 1 139 | 140 | return label, len(objs) 141 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | import time, datetime 2 | 3 | class Timer(object): 4 | ''' 5 | A simple timer. 6 | ''' 7 | def __init__(self): 8 | self.init_time = time.time() 9 | self.total_time = 0. 10 | self.calls = 0 11 | self.start_time = 0. 12 | self.diff = 0. 13 | self.average_time = 0. 14 | self.remain_time = 0. 
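    # tic()/toc() below bracket one timed call: toc() keeps a running average
    # over all calls, and remain() extrapolates the wall time elapsed since
    # __init__ to estimate how long the remaining iterations will take.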
15 | 
16 |     def tic(self):
17 |         # using time.time instead of time.clock because time.clock
18 |         # does not normalize for multithreading
19 |         self.start_time = time.time()
20 | 
21 |     def toc(self, average=True):
22 |         self.diff = time.time() - self.start_time
23 |         self.total_time += self.diff
24 |         self.calls += 1
25 |         self.average_time = self.total_time / self.calls
26 |         if average:
27 |             return self.average_time
28 |         else:
29 |             return self.diff
30 | 
31 |     def remain(self, iters, max_iters):
32 |         if iters == 0:
33 |             self.remain_time = 0
34 |         else:
35 |             self.remain_time = (time.time() - self.init_time) * \
36 |                 (max_iters - iters) / iters
37 |         return str(datetime.timedelta(seconds=int(self.remain_time)))
38 | 
--------------------------------------------------------------------------------
/utils/tool.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | 
4 | 
5 | PATTERN = ('.jpg', '.jpeg')
6 | def find_files(directory, pattern=PATTERN):
7 |     files = []
8 |     for path, d, filelist in os.walk(directory):
9 |         for filename in filelist:
10 |             if filename.lower().endswith(pattern):
11 |                 files.append(os.path.join(path, filename))
12 |     return files
13 | 
14 | def map2classnames(labelmap_file):
15 |     classes = []
16 |     pat = 'display_name'
17 |     with open(labelmap_file, 'r') as f:
18 |         for line in f.readlines():
19 |             if re.search(pat, line):
20 |                 line_strs = line.split('"')
21 |                 class_name = line_strs[-2]
22 |                 classes.append(class_name)
23 | 
24 |     return classes
--------------------------------------------------------------------------------
/yolo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shishichang/yolov2-tensorflow/a71a57465422c4806e1dcbcf8b40a36463a10272/yolo/__init__.py
--------------------------------------------------------------------------------
/yolo/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | #
4 | # path and dataset parameter
5 | #
6 | 
7 | DATA_PATH = 'data'
8 | 
9 | PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')
10 | 
11 | CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')
12 | 
13 | OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')
14 | 
15 | WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')
16 | 
17 | # WEIGHTS_FILE = None
18 | WEIGHTS_FILE = os.path.join(DATA_PATH, 'weights', 'YOLO_small.ckpt')
19 | 
20 | CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
21 |            'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
22 |            'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
23 |            'train', 'tvmonitor']
24 | 
25 | FLIPPED = True
26 | 
27 | 
28 | #
29 | # model parameter
30 | #
31 | 
32 | IMAGE_SIZE = 448
33 | 
34 | CELL_SIZE = 7
35 | 
36 | BOXES_PER_CELL = 2
37 | 
38 | ALPHA = 0.1
39 | 
40 | DISP_CONSOLE = False
41 | 
42 | OBJECT_SCALE = 1.0
43 | NOOBJECT_SCALE = 1.0
44 | CLASS_SCALE = 2.0
45 | COORD_SCALE = 5.0
46 | 
47 | 
48 | #
49 | # solver parameter
50 | #
51 | 
52 | GPU = '2'
53 | 
54 | LEARNING_RATE = 0.0001
55 | 
56 | DECAY_STEPS = 30000
57 | 
58 | DECAY_RATE = 0.1
59 | 
60 | STAIRCASE = True
61 | 
62 | BATCH_SIZE = 45
63 | 
64 | MAX_ITER = 15000
65 | 
66 | SUMMARY_ITER = 10
67 | 
68 | SAVE_ITER = 1000
69 | 
70 | 
71 | #
72 | # test parameter
73 | #
74 | 
75 | 
76 | THRESHOLD = 0.3
77 | 
78 | IOU_THRESHOLD = 0.5
79 | 
--------------------------------------------------------------------------------
/yolo/yolo_net.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import yolo.config as cfg 4 | 5 | slim = tf.contrib.slim 6 | 7 | 8 | class YOLONet(object): 9 | 10 | def __init__(self, is_training=True): 11 | self.classes = cfg.CLASSES 12 | self.num_class = len(self.classes) 13 | self.image_size = cfg.IMAGE_SIZE 14 | self.cell_size = cfg.CELL_SIZE 15 | self.boxes_per_cell = cfg.BOXES_PER_CELL 16 | self.output_size = (self.cell_size * self.cell_size) * (self.num_class + self.boxes_per_cell * 5) 17 | self.scale = 1.0 * self.image_size / self.cell_size 18 | self.boundary1 = self.cell_size * self.cell_size * self.num_class 19 | self.boundary2 = self.boundary1 + self.cell_size * self.cell_size * self.boxes_per_cell 20 | 21 | self.object_scale = cfg.OBJECT_SCALE 22 | self.noobject_scale = cfg.NOOBJECT_SCALE 23 | self.class_scale = cfg.CLASS_SCALE 24 | self.coord_scale = cfg.COORD_SCALE 25 | 26 | self.learning_rate = cfg.LEARNING_RATE 27 | self.batch_size = cfg.BATCH_SIZE 28 | self.alpha = cfg.ALPHA 29 | 30 | self.offset = np.transpose(np.reshape(np.array( 31 | [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell), 32 | (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0)) 33 | 34 | self.images = tf.placeholder(tf.float32, [None, self.image_size, self.image_size, 3], name='images') 35 | self.logits = self.build_network(self.images, num_outputs=self.output_size, alpha=self.alpha, is_training=is_training) 36 | 37 | if is_training: 38 | self.labels = tf.placeholder(tf.float32, [None, self.cell_size, self.cell_size, 5 + self.num_class]) 39 | self.loss_layer(self.logits, self.labels) 40 | self.total_loss = tf.losses.get_total_loss() 41 | tf.summary.scalar('total_loss', self.total_loss) 42 | 43 | def build_network(self, 44 | images, 45 | num_outputs, 46 | alpha, 47 | keep_prob=0.5, 48 | is_training=True, 49 | scope='yolo'): 50 | with tf.variable_scope(scope): 51 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 52 | activation_fn=leaky_relu(alpha), 53 | weights_initializer=tf.truncated_normal_initializer(0.0, 0.01), 54 | weights_regularizer=slim.l2_regularizer(0.0005)): 55 | net = tf.pad(images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]), name='pad_1') 56 | net = slim.conv2d(net, 64, 7, 2, padding='VALID', scope='conv_2') 57 | net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3') 58 | net = slim.conv2d(net, 192, 3, scope='conv_4') 59 | net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5') 60 | net = slim.conv2d(net, 128, 1, scope='conv_6') 61 | net = slim.conv2d(net, 256, 3, scope='conv_7') 62 | net = slim.conv2d(net, 256, 1, scope='conv_8') 63 | net = slim.conv2d(net, 512, 3, scope='conv_9') 64 | net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10') 65 | net = slim.conv2d(net, 256, 1, scope='conv_11') 66 | net = slim.conv2d(net, 512, 3, scope='conv_12') 67 | net = slim.conv2d(net, 256, 1, scope='conv_13') 68 | net = slim.conv2d(net, 512, 3, scope='conv_14') 69 | net = slim.conv2d(net, 256, 1, scope='conv_15') 70 | net = slim.conv2d(net, 512, 3, scope='conv_16') 71 | net = slim.conv2d(net, 256, 1, scope='conv_17') 72 | net = slim.conv2d(net, 512, 3, scope='conv_18') 73 | net = slim.conv2d(net, 512, 1, scope='conv_19') 74 | net = slim.conv2d(net, 1024, 3, scope='conv_20') 75 | net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21') 76 | net = slim.conv2d(net, 512, 1, scope='conv_22') 77 | net = slim.conv2d(net, 1024, 3, scope='conv_23') 78 | net = slim.conv2d(net, 
512, 1, scope='conv_24')
79 |                 net = slim.conv2d(net, 1024, 3, scope='conv_25')
80 |                 net = slim.conv2d(net, 1024, 3, scope='conv_26')
81 |                 net = tf.pad(net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]), name='pad_27')
82 |                 net = slim.conv2d(net, 1024, 3, 2, padding='VALID', scope='conv_28')
83 |                 net = slim.conv2d(net, 1024, 3, scope='conv_29')
84 |                 net = slim.conv2d(net, 1024, 3, scope='conv_30')
85 |                 net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
86 |                 net = slim.flatten(net, scope='flat_32')
87 |                 net = slim.fully_connected(net, 512, scope='fc_33')
88 |                 net = slim.fully_connected(net, 4096, scope='fc_34')
89 |                 net = slim.dropout(net, keep_prob=keep_prob,
90 |                                    is_training=is_training, scope='dropout_35')
91 |                 net = slim.fully_connected(net, num_outputs,
92 |                                            activation_fn=None, scope='fc_36')
93 |         return net
94 | 
95 |     def calc_iou(self, boxes1, boxes2, scope='iou'):
96 |         """calculate ious
97 |         Args:
98 |           boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ====> (x_center, y_center, w, h)
99 |           boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
100 |         Return:
101 |           iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
102 |         """
103 |         with tf.variable_scope(scope):
104 |             boxes1 = tf.stack([boxes1[:, :, :, :, 0] - boxes1[:, :, :, :, 2] / 2.0,
105 |                                boxes1[:, :, :, :, 1] - boxes1[:, :, :, :, 3] / 2.0,
106 |                                boxes1[:, :, :, :, 0] + boxes1[:, :, :, :, 2] / 2.0,
107 |                                boxes1[:, :, :, :, 1] + boxes1[:, :, :, :, 3] / 2.0])
108 |             boxes1 = tf.transpose(boxes1, [1, 2, 3, 4, 0])
109 | 
110 |             boxes2 = tf.stack([boxes2[:, :, :, :, 0] - boxes2[:, :, :, :, 2] / 2.0,
111 |                                boxes2[:, :, :, :, 1] - boxes2[:, :, :, :, 3] / 2.0,
112 |                                boxes2[:, :, :, :, 0] + boxes2[:, :, :, :, 2] / 2.0,
113 |                                boxes2[:, :, :, :, 1] + boxes2[:, :, :, :, 3] / 2.0])
114 |             boxes2 = tf.transpose(boxes2, [1, 2, 3, 4, 0])
115 | 
116 |             # calculate the upper-left and lower-right corners of the intersection
117 |             lu = tf.maximum(boxes1[:, :, :, :, :2], boxes2[:, :, :, :, :2])
118 |             rd = tf.minimum(boxes1[:, :, :, :, 2:], boxes2[:, :, :, :, 2:])
119 | 
120 |             # intersection
121 |             intersection = tf.maximum(0.0, rd - lu)
122 |             inter_square = intersection[:, :, :, :, 0] * intersection[:, :, :, :, 1]
123 | 
124 |             # calculate the boxes1 and boxes2 areas
125 |             square1 = (boxes1[:, :, :, :, 2] - boxes1[:, :, :, :, 0]) * \
126 |                 (boxes1[:, :, :, :, 3] - boxes1[:, :, :, :, 1])
127 |             square2 = (boxes2[:, :, :, :, 2] - boxes2[:, :, :, :, 0]) * \
128 |                 (boxes2[:, :, :, :, 3] - boxes2[:, :, :, :, 1])
129 | 
130 |             union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)
131 | 
132 |             return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
133 | 
134 |     def loss_layer(self, predicts, labels, scope='loss_layer'):
135 |         with tf.variable_scope(scope):
136 |             predict_classes = tf.reshape(predicts[:, :self.boundary1], [self.batch_size, self.cell_size, self.cell_size, self.num_class])
137 |             predict_scales = tf.reshape(predicts[:, self.boundary1:self.boundary2], [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
138 |             predict_boxes = tf.reshape(predicts[:, self.boundary2:], [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])
139 | 
140 |             response = tf.reshape(labels[:, :, :, 0], [self.batch_size, self.cell_size, self.cell_size, 1])
141 |             boxes = tf.reshape(labels[:, :, :, 1:5], [self.batch_size, self.cell_size, self.cell_size, 1, 4])
142 |             boxes = tf.tile(boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
143 |             classes = labels[:, :, :, 5:]
144 | 
145 |             offset = 
tf.constant(self.offset, dtype=tf.float32) 146 | offset = tf.reshape(offset, [1, self.cell_size, self.cell_size, self.boxes_per_cell]) 147 | offset = tf.tile(offset, [self.batch_size, 1, 1, 1]) 148 | predict_boxes_tran = tf.stack([(predict_boxes[:, :, :, :, 0] + offset) / self.cell_size, 149 | (predict_boxes[:, :, :, :, 1] + tf.transpose(offset, (0, 2, 1, 3))) / self.cell_size, 150 | tf.square(predict_boxes[:, :, :, :, 2]), 151 | tf.square(predict_boxes[:, :, :, :, 3])]) 152 | predict_boxes_tran = tf.transpose(predict_boxes_tran, [1, 2, 3, 4, 0]) 153 | 154 | iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes) 155 | 156 | # calculate I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] 157 | object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True) 158 | object_mask = tf.cast((iou_predict_truth >= object_mask), tf.float32) * response 159 | 160 | # calculate no_I tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL] 161 | noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask 162 | 163 | boxes_tran = tf.stack([boxes[:, :, :, :, 0] * self.cell_size - offset, 164 | boxes[:, :, :, :, 1] * self.cell_size - tf.transpose(offset, (0, 2, 1, 3)), 165 | tf.sqrt(boxes[:, :, :, :, 2]), 166 | tf.sqrt(boxes[:, :, :, :, 3])]) 167 | boxes_tran = tf.transpose(boxes_tran, [1, 2, 3, 4, 0]) 168 | 169 | # class_loss 170 | class_delta = response * (predict_classes - classes) 171 | class_loss = tf.reduce_mean(tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]), name='class_loss') * self.class_scale 172 | 173 | # object_loss 174 | object_delta = object_mask * (predict_scales - iou_predict_truth) 175 | object_loss = tf.reduce_mean(tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]), name='object_loss') * self.object_scale 176 | 177 | # noobject_loss 178 | noobject_delta = noobject_mask * predict_scales 179 | noobject_loss = tf.reduce_mean(tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]), name='noobject_loss') * self.noobject_scale 180 | 181 | # coord_loss 182 | coord_mask = tf.expand_dims(object_mask, 4) 183 | boxes_delta = coord_mask * (predict_boxes - boxes_tran) 184 | coord_loss = tf.reduce_mean(tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]), name='coord_loss') * self.coord_scale 185 | 186 | tf.losses.add_loss(class_loss) 187 | tf.losses.add_loss(object_loss) 188 | tf.losses.add_loss(noobject_loss) 189 | tf.losses.add_loss(coord_loss) 190 | 191 | tf.summary.scalar('class_loss', class_loss) 192 | tf.summary.scalar('object_loss', object_loss) 193 | tf.summary.scalar('noobject_loss', noobject_loss) 194 | tf.summary.scalar('coord_loss', coord_loss) 195 | 196 | tf.summary.histogram('boxes_delta_x', boxes_delta[:, :, :, :, 0]) 197 | tf.summary.histogram('boxes_delta_y', boxes_delta[:, :, :, :, 1]) 198 | tf.summary.histogram('boxes_delta_w', boxes_delta[:, :, :, :, 2]) 199 | tf.summary.histogram('boxes_delta_h', boxes_delta[:, :, :, :, 3]) 200 | tf.summary.histogram('iou', iou_predict_truth) 201 | 202 | 203 | def leaky_relu(alpha): 204 | def op(inputs): 205 | return tf.maximum(alpha * inputs, inputs, name='leaky_relu') 206 | return op 207 | --------------------------------------------------------------------------------
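The corner-based IoU that `calc_iou` computes above (and that `iou` in `demo_yolo_v1.py` and `utils/box.py` mirror in plain Python) is easy to sanity-check by hand. A tiny sketch with toy boxes in (x_center, y_center, w, h) form; the same arithmetic, but not part of the repo's API:

```python
def iou_xywh(b1, b2):
    # centre/size -> corners, then intersection over union
    iw = max(0.0, min(b1[0] + b1[2] / 2, b2[0] + b2[2] / 2) - max(b1[0] - b1[2] / 2, b2[0] - b2[2] / 2))
    ih = max(0.0, min(b1[1] + b1[3] / 2, b2[1] + b2[3] / 2) - max(b1[1] - b1[3] / 2, b2[1] - b2[3] / 2))
    inter = iw * ih
    return inter / (b1[2] * b1[3] + b2[2] * b2[3] - inter)

print(iou_xywh([0.5, 0.5, 0.4, 0.4], [0.6, 0.6, 0.4, 0.4]))  # ~0.391
```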