├── .gitignore
├── README.md
├── YOLOtiny.py
├── YOLOtiny_v2.model
├── input_video.mp4
├── lib
│   └── preprocess.py
├── predictor.py
├── sample_images
│   └── sample.jpg
├── tiny-yolo-voc.weights
├── yolo_camera.py
├── yolo_predict.py
└── yolo_video.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Chainer version of YOLOtiny V2


## Object detection
The following command downloads the image at the given URL and runs object detection on it.

```
python yolo_predict.py -u '<image URL>'
```

## Real-time detection with a camera
The following command opens the camera and performs real-time object detection.

```
python yolo_camera.py
```

## Object detection on a video
The following command reads the video file input_video.mp4 and runs object detection on it. Because the people in this video appear small and are often crowded together, detection accuracy is rather poor.

```
python yolo_video.py
```


## Converting the darknet weights
The converted model is already included, so this step is not required; however, the following command reads the darknet weight file `tiny-yolo-voc.weights` and converts it into the Chainer model file `YOLOtiny_v2.model`.

```
python YOLOtiny.py
```

## References
[darknet](http://pjreddie.com/darknet/yolo/)

[PPAP with YOLO](http://qiita.com/ashitani/items/566cf9234682cb5f2d60)

[PPAP with YOLO, implementation](https://github.com/ashitani)
--------------------------------------------------------------------------------
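yolo_predict.py and yolo_camera.py follow the same pattern: load YOLOtiny_v2.model, call predictor.predict() on a BGR image, and draw the returned boxes. A minimal usage sketch for running detection on a local image instead of a downloaded one (illustrative only, not a file in this repository; it assumes YOLOtiny_v2.model and sample_images/sample.jpg are present):

```
# Hypothetical example script (not part of the repository).
import cv2
from chainer import serializers
from YOLOtiny import YOLOtiny
from predictor import predict

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

img = cv2.imread("sample_images/sample.jpg")
# predict() returns a list of [(x0, y0, x1, y1), label, probability-in-percent] entries
for (x0, y0, x1, y1), label, prob in predict(model, img):
    print(label, prob, (x0, y0, x1, y1))
```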
/YOLOtiny.py:
--------------------------------------------------------------------------------
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions

def darknetConv2D(in_channel, out_channel, bn=True):
    if bn:
        return Chain(
            c=L.Convolution2D(in_channel, out_channel, ksize=3, pad=1, nobias=True),
            n=L.BatchNormalization(out_channel, use_beta=False, eps=0.000001),
            b=L.Bias(shape=[out_channel]),
        )
    else:
        return Chain(
            c=L.Convolution2D(in_channel, out_channel, ksize=3, pad=1, nobias=True),
            b=L.Bias(shape=[out_channel]),
        )

# Convolution -> BatchNormalization -> Bias -> leaky ReLU -> (optional) MaxPooling
def CRP(c, h, stride=2, pooling=True):
    h = c.b(c.n(c.c(h), test=True))
    h = F.leaky_relu(h, slope=0.1)
    if pooling:
        h = F.max_pooling_2d(h, ksize=2, stride=stride, pad=0)
    return h

class YOLOtiny(Chain):
    def __init__(self):
        super(YOLOtiny, self).__init__(
            c1=darknetConv2D(3, 16),
            c2=darknetConv2D(None, 32),
            c3=darknetConv2D(None, 64),
            c4=darknetConv2D(None, 128),
            c5=darknetConv2D(None, 256),
            c6=darknetConv2D(None, 512),
            c7=darknetConv2D(None, 1024),
            c8=darknetConv2D(None, 1024),
            c9=darknetConv2D(None, 125, bn=False)
        )

    def __call__(self, x):
        return self.predict(x)

    def predict(self, x):
        h = CRP(self.c1, x)
        h = CRP(self.c2, h)
        h = CRP(self.c3, h)
        h = CRP(self.c4, h)
        h = CRP(self.c5, h)
        h = CRP(self.c6, h, stride=1)
        h = F.get_item(h, (slice(None), slice(None), slice(1, 14), slice(1, 14)))  # equivalent to h[:, :, 1:14, 1:14]
        h = CRP(self.c7, h, pooling=False)
        h = CRP(self.c8, h, pooling=False)
        h = self.c9.b(self.c9.c(h))  # last layer: no leaky ReLU, no BN
        return h

    def loadCoef(self, filename):
        print("loading", filename)
        file = open(filename, "rb")
        dat = np.fromfile(file, dtype=np.float32)[4:]  # skip the header (4 x int32)

        layers = [[3, 16], [16, 32], [32, 64], [64, 128], [128, 256], [256, 512], [512, 1024], [1024, 1024]]

        offset = 0
        for i, l in enumerate(layers):
            in_ch = l[0]
            out_ch = l[1]

            # load the convolution bias (Bias.b has out_ch elements)
            txt = "self.c%d.b.b.data = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the batch normalization scale (BatchNormalization.gamma has out_ch elements)
            txt = "self.c%d.n.gamma.data = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the moving mean (BatchNormalization.avg_mean has out_ch elements)
            txt = "self.c%d.n.avg_mean = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the moving variance (BatchNormalization.avg_var has out_ch elements)
            txt = "self.c%d.n.avg_var = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the convolution weights (Convolution2D.W holds out_ch * in_ch * kernel-size values, reshaped to (out_ch, in_ch, 3, 3))
            txt = "self.c%d.c.W.data = dat[%d:%d].reshape(%d,%d,3,3)" % (i + 1, offset, offset + (out_ch * in_ch * 9), out_ch, in_ch)
            offset += (out_ch * in_ch * 9)
            exec(txt)
            print(offset)

        # load the last convolution layer (only Bias and Convolution2D are loaded)
        in_ch = 1024
        out_ch = 125

        txt = "self.c9.b.b.data = dat[%d:%d]" % (offset, offset + out_ch)
        offset += out_ch
        exec(txt)

        txt = "self.c9.c.W.data = dat[%d:%d].reshape(%d,%d,1,1)" % (offset, offset + out_ch * in_ch * 1, out_ch, in_ch)
        offset += out_ch * in_ch * 1
        exec(txt)
        print(offset)

if __name__ == '__main__':
    c = YOLOtiny()
    im = np.zeros((1, 3, 416, 416), dtype=np.float32)  # the input channel counts are None, so run one dummy forward pass to fix the parameter shapes
    c.predict(im)

    c.loadCoef("tiny-yolo-voc.weights")  # fill in the darknet parameters
    serializers.save_npz('YOLOtiny_v2.model', c)
--------------------------------------------------------------------------------
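loadCoef() builds each assignment as a string and runs it with exec(). The same per-layer bookkeeping can be written without exec() by reaching each child chain through getattr; a minimal sketch (load_layer and its signature are illustrative, not part of the repository):

```
# Hypothetical exec-free variant of the per-layer assignments in loadCoef().
def load_layer(model, index, dat, offset, in_ch, out_ch):
    layer = getattr(model, "c%d" % index)                # e.g. model.c1, model.c2, ...
    layer.b.b.data = dat[offset:offset + out_ch]         # convolution bias
    offset += out_ch
    layer.n.gamma.data = dat[offset:offset + out_ch]     # batch norm scale
    offset += out_ch
    layer.n.avg_mean = dat[offset:offset + out_ch]       # batch norm moving mean
    offset += out_ch
    layer.n.avg_var = dat[offset:offset + out_ch]        # batch norm moving variance
    offset += out_ch
    n_weights = out_ch * in_ch * 9                       # 3x3 kernels
    layer.c.W.data = dat[offset:offset + n_weights].reshape(out_ch, in_ch, 3, 3)
    return offset + n_weights
```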
/YOLOtiny_v2.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/YOLOtiny_v2.model
--------------------------------------------------------------------------------
/input_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/input_video.mp4
--------------------------------------------------------------------------------
/lib/preprocess.py:
--------------------------------------------------------------------------------
import argparse
import urllib.request

def download_image():
    parser = argparse.ArgumentParser(description='Download an image from the web for processing.')
    parser.add_argument('--url', '-u', default='https://images-na.ssl-images-amazon.com/images/G/01/img15/pet-products/small-tiles/23695_pets_vertical_store_dogs_small_tile_8._CB312176604_.jpg', help='URL of the image to download')
    args = parser.parse_args()

    print('Download Image From {0} ....'.format(args.url))
    image_file_path = './sample_images/sample.jpg'
    urllib.request.urlretrieve(args.url, image_file_path)

    return image_file_path
--------------------------------------------------------------------------------
/predictor.py:
--------------------------------------------------------------------------------
import numpy as np
import chainer
import chainer.functions as F
import cv2

# Simple container for the four box parameters x, y, w, h (center coordinates plus width and height)
class Box():
    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

# Given two 1-D segments (center position and length), return the length of their overlapping part
def overlap(x1, len1, x2, len2):
    len1_half = len1 / 2
    len2_half = len2 / 2

    left = max(x1 - len1_half, x2 - len2_half)
    right = min(x1 + len1_half, x2 + len2_half)

    return right - left

# Intersection area of two boxes
def box_intersection(a, b):
    w = overlap(a.x, a.w, b.x, b.w)
    h = overlap(a.y, a.h, b.y, b.h)
    if w < 0 or h < 0:
        return 0

    area = w * h
    return area

# Union area of two boxes
def box_union(a, b):
    i = box_intersection(a, b)
    u = a.w * a.h + b.w * b.h - i
    return u

# compute IoU (intersection over union)
def box_iou(a, b):
    return box_intersection(a, b) / box_union(a, b)

def sigmoid(x):
    return 1.0 / (np.exp(-x) + 1.0)

def softmax(x):
    x = np.array([x])  # reshape (len(x),) to (1, len(x))
    return F.softmax(x).data

def forward_cnn(model, im_org, img_width, img_height, n_grid_x, n_grid_y, n_bbox, n_classes):
    img = cv2.cvtColor(im_org, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (img_width, img_height))  # (416, 416, 3)
    img = np.asarray(img, dtype=np.float32) / 255.0

    ans = model.predict(img.transpose(2, 0, 1).reshape(1, 3, img_height, img_width)).data[0]  # (125, 13, 13)
    ans = ans.transpose(1, 2, 0)  # (13, 13, 125)
    ans = ans.reshape(n_grid_y, n_grid_x, n_bbox, (n_classes + 5))  # (13, 13, 5, 25)
    return ans

# Decode the raw CNN output into detected box candidates
def get_detected_boxes(ans, n_grid_x, n_grid_y, n_bbox, n_classes, prob_thresh, img_width, img_height, biases):
    detected_boxes = []
    grid_width = img_width / float(n_grid_x)
    grid_height = img_height / float(n_grid_y)

    for grid_y in range(n_grid_y):
        for grid_x in range(n_grid_x):
            for i in range(n_bbox):
                box = ans[grid_y, grid_x, i, 0:4]  # (4,)
                conf = sigmoid(ans[grid_y, grid_x, i, 4])
                probs = softmax(ans[grid_y, grid_x, i, 5:])[0]  # (20,)

                p_class = probs * conf  # (20,)
                if np.max(p_class) < prob_thresh:
                    continue

                class_id = np.argmax(p_class)
                x = (grid_x + sigmoid(box[0])) * grid_width
                y = (grid_y + sigmoid(box[1])) * grid_height
                w = np.exp(box[2]) * biases[i][0] * grid_width
                h = np.exp(box[3]) * biases[i][1] * grid_height
                b = Box(x, y, w, h)

                detected_boxes.append([b, class_id, max(p_class)])

    return detected_boxes

def sort_boxes(boxes):
    from operator import itemgetter
    return sorted(boxes, key=itemgetter(1, 2), reverse=True)  # sort by class_id, then by descending probability within each class

# non maximum suppression
def nms(sorted_boxes, iou_thresh):
    import itertools
    import copy

    nms_boxes = copy.copy(sorted_boxes)
    for a, b in itertools.combinations(sorted_boxes, 2):  # every pair of boxes
        # when two boxes predict the same class id, their IoU exceeds the threshold,
        # and a has the higher probability, drop the lower-probability box b
        if a[1] == b[1] and box_iou(a[0], b[0]) > iou_thresh and a[2] > b[2]:
            if b in nms_boxes:
                nms_boxes.remove(b)

    return nms_boxes
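# Worked example of the IoU computation above (illustrative comment, not in the
# original source): Box(0, 0, 2, 2) covers [-1, 1] x [-1, 1] and Box(1, 1, 2, 2)
# covers [0, 2] x [0, 2]; their intersection is 1 * 1 = 1 and their union is
# 4 + 4 - 1 = 7, so box_iou() returns 1/7, roughly 0.14. With iou_thresh = 0.05
# (see predict() below), nms() keeps only the higher-probability of two such
# overlapping boxes when both predict the same class.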
# Clip each box to the image (coordinates that stick out beyond the edges are clamped to the edge) and attach a class label
def clip_objects(boxes, img_width, img_height):
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow",
               "diningtable", "dog", "horse", "motorbike", "person",
               "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

    clipped_objects = []
    for box in boxes:
        b, class_id, p_class = box
        label = classes[class_id]
        prob = p_class * 100
        half_box_width = b.w / 2.0
        half_box_height = b.h / 2.0
        x0, y0, x1, y1 = (
            int(np.clip(b.x - half_box_width, 0, img_width)),
            int(np.clip(b.y - half_box_height, 0, img_height)),
            int(np.clip(b.x + half_box_width, 0, img_width)),
            int(np.clip(b.y + half_box_height, 0, img_height))
        )
        clipped_objects.append([(x0, y0, x1, y1), label, prob])

    return clipped_objects

def predict(model, im_org):
    img_width = 416
    img_height = 416
    n_grid_x = 13
    n_grid_y = 13
    n_classes = 20
    n_bbox = 5
    biases = [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]
    prob_thresh = 0.2
    iou_thresh = 0.05
    org_img_height, org_img_width = im_org.shape[0:2]

    # forward
    ans = forward_cnn(model, im_org, img_width, img_height, n_grid_x, n_grid_y, n_bbox, n_classes)

    # compute detected boxes
    detected_boxes = get_detected_boxes(ans, n_grid_x, n_grid_y, n_bbox, n_classes, prob_thresh, org_img_width, org_img_height, biases)

    # sort boxes by class_id
    sorted_boxes = sort_boxes(detected_boxes)

    # non maximum suppression
    boxes = nms(sorted_boxes, iou_thresh)

    # clip objects
    clipped_objects = clip_objects(boxes, org_img_width, org_img_height)

    return clipped_objects
--------------------------------------------------------------------------------
/sample_images/sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/sample_images/sample.jpg
--------------------------------------------------------------------------------
/tiny-yolo-voc.weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/tiny-yolo-voc.weights
--------------------------------------------------------------------------------
/yolo_camera.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture(0)
cap.set(3, 640)  # frame width
cap.set(4, 480)  # frame height

while True:
    ret, img = cap.read()
    if not ret:  # stop if no frame could be read
        break
    objects = predict(model, img)
    print(objects)

    for obj in objects:
        cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
        cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    cv2.imshow('image', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
--------------------------------------------------------------------------------
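Neither yolo_camera.py nor yolo_video.py saves its output; the annotated frames are only shown with cv2.imshow(). If you also want to write them to disk, a cv2.VideoWriter can be added. A minimal sketch, not part of the repository (the output filename, codec, and 30 fps are assumptions):

```
# Hypothetical variant of yolo_video.py that also writes annotated frames to output.mp4.
import cv2
from chainer import serializers
from YOLOtiny import YOLOtiny
from predictor import predict

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture("input_video.mp4")
fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # codec choice is an assumption
writer = cv2.VideoWriter("output.mp4", fourcc, 30.0,
                         (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                          int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

while True:
    ret, img = cap.read()
    if not ret:
        break
    for (x0, y0, x1, y1), label, prob in predict(model, img):
        cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 2)
    writer.write(img)

writer.release()
cap.release()
```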
/yolo_predict.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)
file_path = download_image()
img = cv2.imread(file_path)
objects = predict(model, img)
print(objects)

for obj in objects:
    cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
    cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

cv2.imshow('image', img)
cv2.waitKey()
--------------------------------------------------------------------------------
/yolo_video.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture("input_video.mp4")

while True:
    ret, img = cap.read()
    if not ret:  # stop when the video ends
        break
    objects = predict(model, img)
    print(objects)

    for obj in objects:
        cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
        cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    cv2.imshow('image', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
--------------------------------------------------------------------------------
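Each script prints and draws the list returned by predictor.predict(): entries of the form [(x0, y0, x1, y1), label, probability-in-percent], with labels drawn from the 20 PASCAL VOC classes listed in clip_objects(). If only one class is of interest, the list can be filtered before drawing; a small sketch, not part of the repository (the helper name, the "person" label choice, and the 50% cutoff are illustrative):

```
# Hypothetical post-processing helper: keep only "person" detections above 50%.
def filter_objects(objects, wanted_label="person", min_prob=50.0):
    # objects is the list returned by predictor.predict()
    return [o for o in objects if o[1] == wanted_label and o[2] >= min_prob]
```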