├── .gitignore
├── README.md
├── YOLOtiny.py
├── YOLOtiny_v2.model
├── input_video.mp4
├── lib
│   └── preprocess.py
├── predictor.py
├── sample_images
│   └── sample.jpg
├── tiny-yolo-voc.weights
├── yolo_camera.py
├── yolo_predict.py
└── yolo_video.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Chainer version of YOLOtiny V2


## Object detection
The following command downloads the image at the given URL and runs object detection on it.

```
python yolo_predict.py -u '<image URL>'
```

## Real-time detection with a camera
The following command opens the camera and performs real-time object detection.

```
python yolo_camera.py
```

## Object detection on a video
The following command reads the video file input_video.mp4 and runs object detection on it. Because the people in this video appear small and are often crowded together, detection accuracy is rather poor.

```
python yolo_video.py
```


## Converting the darknet weights
The converted model is already included, so this step is not required; however, the following command reads the darknet weight file `tiny-yolo-voc.weights` and converts it into the Chainer model file `YOLOtiny_v2.model`.

```
python YOLOtiny.py
```

## References
[darknet](http://pjreddie.com/darknet/yolo/)

[PPAP with YOLO](http://qiita.com/ashitani/items/566cf9234682cb5f2d60)

[PPAP with YOLO, implementation](https://github.com/ashitani)
--------------------------------------------------------------------------------
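yolo_predict.py and yolo_camera.py follow the same pattern: load YOLOtiny_v2.model, call predictor.predict() on a BGR image, and draw the returned boxes. A minimal usage sketch for running detection on a local image instead of a downloaded one (illustrative only, not a file in this repository; it assumes YOLOtiny_v2.model and sample_images/sample.jpg are present):

```
# Hypothetical example script (not part of the repository).
import cv2
from chainer import serializers
from YOLOtiny import YOLOtiny
from predictor import predict

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

img = cv2.imread("sample_images/sample.jpg")
# predict() returns a list of [(x0, y0, x1, y1), label, probability-in-percent] entries
for (x0, y0, x1, y1), label, prob in predict(model, img):
    print(label, prob, (x0, y0, x1, y1))
```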
/YOLOtiny.py:
--------------------------------------------------------------------------------
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions

def darknetConv2D(in_channel, out_channel, bn=True):
    if bn:
        return Chain(
            c=L.Convolution2D(in_channel, out_channel, ksize=3, pad=1, nobias=True),
            n=L.BatchNormalization(out_channel, use_beta=False, eps=0.000001),
            b=L.Bias(shape=[out_channel]),
        )
    else:
        return Chain(
            c=L.Convolution2D(in_channel, out_channel, ksize=3, pad=1, nobias=True),
            b=L.Bias(shape=[out_channel]),
        )

# Convolution -> BatchNormalization -> Bias -> leaky ReLU -> (optional) MaxPooling
def CRP(c, h, stride=2, pooling=True):
    h = c.b(c.n(c.c(h), test=True))
    h = F.leaky_relu(h, slope=0.1)
    if pooling:
        h = F.max_pooling_2d(h, ksize=2, stride=stride, pad=0)
    return h

class YOLOtiny(Chain):
    def __init__(self):
        super(YOLOtiny, self).__init__(
            c1=darknetConv2D(3, 16),
            c2=darknetConv2D(None, 32),
            c3=darknetConv2D(None, 64),
            c4=darknetConv2D(None, 128),
            c5=darknetConv2D(None, 256),
            c6=darknetConv2D(None, 512),
            c7=darknetConv2D(None, 1024),
            c8=darknetConv2D(None, 1024),
            c9=darknetConv2D(None, 125, bn=False)
        )

    def __call__(self, x):
        return self.predict(x)

    def predict(self, x):
        h = CRP(self.c1, x)
        h = CRP(self.c2, h)
        h = CRP(self.c3, h)
        h = CRP(self.c4, h)
        h = CRP(self.c5, h)
        h = CRP(self.c6, h, stride=1)
        h = F.get_item(h, (slice(None), slice(None), slice(1, 14), slice(1, 14)))  # equivalent to h[:, :, 1:14, 1:14]
        h = CRP(self.c7, h, pooling=False)
        h = CRP(self.c8, h, pooling=False)
        h = self.c9.b(self.c9.c(h))  # last layer: no leaky ReLU, no BN
        return h

    def loadCoef(self, filename):
        print("loading", filename)
        file = open(filename, "rb")
        dat = np.fromfile(file, dtype=np.float32)[4:]  # skip the header (4 x int32)

        layers = [[3, 16], [16, 32], [32, 64], [64, 128], [128, 256], [256, 512], [512, 1024], [1024, 1024]]

        offset = 0
        for i, l in enumerate(layers):
            in_ch = l[0]
            out_ch = l[1]

            # load the convolution bias (Bias.b has out_ch elements)
            txt = "self.c%d.b.b.data = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the batch normalization scale (BatchNormalization.gamma has out_ch elements)
            txt = "self.c%d.n.gamma.data = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the moving mean (BatchNormalization.avg_mean has out_ch elements)
            txt = "self.c%d.n.avg_mean = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the moving variance (BatchNormalization.avg_var has out_ch elements)
            txt = "self.c%d.n.avg_var = dat[%d:%d]" % (i + 1, offset, offset + out_ch)
            offset += out_ch
            exec(txt)

            # load the convolution weights (Convolution2D.W holds out_ch * in_ch * kernel-size values, reshaped to (out_ch, in_ch, 3, 3))
            txt = "self.c%d.c.W.data = dat[%d:%d].reshape(%d,%d,3,3)" % (i + 1, offset, offset + (out_ch * in_ch * 9), out_ch, in_ch)
            offset += (out_ch * in_ch * 9)
            exec(txt)
            print(offset)

        # load the last convolution layer (only Bias and Convolution2D are loaded)
        in_ch = 1024
        out_ch = 125

        txt = "self.c9.b.b.data = dat[%d:%d]" % (offset, offset + out_ch)
        offset += out_ch
        exec(txt)

        txt = "self.c9.c.W.data = dat[%d:%d].reshape(%d,%d,1,1)" % (offset, offset + out_ch * in_ch * 1, out_ch, in_ch)
        offset += out_ch * in_ch * 1
        exec(txt)
        print(offset)

if __name__ == '__main__':
    c = YOLOtiny()
    im = np.zeros((1, 3, 416, 416), dtype=np.float32)  # the input channel counts are None, so run one dummy forward pass to fix the parameter shapes
    c.predict(im)

    c.loadCoef("tiny-yolo-voc.weights")  # fill in the darknet parameters
    serializers.save_npz('YOLOtiny_v2.model', c)
--------------------------------------------------------------------------------
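loadCoef() builds each assignment as a string and runs it with exec(). The same per-layer bookkeeping can be written without exec() by reaching each child chain through getattr; a minimal sketch (load_layer and its signature are illustrative, not part of the repository):

```
# Hypothetical exec-free variant of the per-layer assignments in loadCoef().
def load_layer(model, index, dat, offset, in_ch, out_ch):
    layer = getattr(model, "c%d" % index)                # e.g. model.c1, model.c2, ...
    layer.b.b.data = dat[offset:offset + out_ch]         # convolution bias
    offset += out_ch
    layer.n.gamma.data = dat[offset:offset + out_ch]     # batch norm scale
    offset += out_ch
    layer.n.avg_mean = dat[offset:offset + out_ch]       # batch norm moving mean
    offset += out_ch
    layer.n.avg_var = dat[offset:offset + out_ch]        # batch norm moving variance
    offset += out_ch
    n_weights = out_ch * in_ch * 9                       # 3x3 kernels
    layer.c.W.data = dat[offset:offset + n_weights].reshape(out_ch, in_ch, 3, 3)
    return offset + n_weights
```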
/YOLOtiny_v2.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/YOLOtiny_v2.model
--------------------------------------------------------------------------------
/input_video.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/input_video.mp4
--------------------------------------------------------------------------------
/lib/preprocess.py:
--------------------------------------------------------------------------------
import argparse
import urllib.request

def download_image():
    parser = argparse.ArgumentParser(description='Download an image from the web for processing.')
    parser.add_argument('--url', '-u', default='https://images-na.ssl-images-amazon.com/images/G/01/img15/pet-products/small-tiles/23695_pets_vertical_store_dogs_small_tile_8._CB312176604_.jpg', help='URL of the image to download')
    args = parser.parse_args()

    print('Download Image From {0} ....'.format(args.url))
    image_file_path = './sample_images/sample.jpg'
    urllib.request.urlretrieve(args.url, image_file_path)

    return image_file_path
--------------------------------------------------------------------------------
/predictor.py:
--------------------------------------------------------------------------------
import numpy as np
import chainer
import chainer.functions as F
import cv2

# Simple container for the four box parameters x, y, w, h (center coordinates plus width and height)
class Box():
    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

# Given two 1-D segments (center position and length), return the length of their overlapping part
def overlap(x1, len1, x2, len2):
    len1_half = len1 / 2
    len2_half = len2 / 2

    left = max(x1 - len1_half, x2 - len2_half)
    right = min(x1 + len1_half, x2 + len2_half)

    return right - left

# Intersection area of two boxes
def box_intersection(a, b):
    w = overlap(a.x, a.w, b.x, b.w)
    h = overlap(a.y, a.h, b.y, b.h)
    if w < 0 or h < 0:
        return 0

    area = w * h
    return area

# Union area of two boxes
def box_union(a, b):
    i = box_intersection(a, b)
    u = a.w * a.h + b.w * b.h - i
    return u

# compute IoU (intersection over union)
def box_iou(a, b):
    return box_intersection(a, b) / box_union(a, b)

def sigmoid(x):
    return 1.0 / (np.exp(-x) + 1.0)

def softmax(x):
    x = np.array([x])  # reshape (len(x),) to (1, len(x))
    return F.softmax(x).data

def forward_cnn(model, im_org, img_width, img_height, n_grid_x, n_grid_y, n_bbox, n_classes):
    img = cv2.cvtColor(im_org, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (img_width, img_height))  # (416, 416, 3)
    img = np.asarray(img, dtype=np.float32) / 255.0

    ans = model.predict(img.transpose(2, 0, 1).reshape(1, 3, img_height, img_width)).data[0]  # (125, 13, 13)
    ans = ans.transpose(1, 2, 0)  # (13, 13, 125)
    ans = ans.reshape(n_grid_y, n_grid_x, n_bbox, (n_classes + 5))  # (13, 13, 5, 25)
    return ans

# Decode the raw CNN output into detected box candidates
def get_detected_boxes(ans, n_grid_x, n_grid_y, n_bbox, n_classes, prob_thresh, img_width, img_height, biases):
    detected_boxes = []
    grid_width = img_width / float(n_grid_x)
    grid_height = img_height / float(n_grid_y)

    for grid_y in range(n_grid_y):
        for grid_x in range(n_grid_x):
            for i in range(n_bbox):
                box = ans[grid_y, grid_x, i, 0:4]  # (4,)
                conf = sigmoid(ans[grid_y, grid_x, i, 4])
                probs = softmax(ans[grid_y, grid_x, i, 5:])[0]  # (20,)

                p_class = probs * conf  # (20,)
                if np.max(p_class) < prob_thresh:
                    continue

                class_id = np.argmax(p_class)
                x = (grid_x + sigmoid(box[0])) * grid_width
                y = (grid_y + sigmoid(box[1])) * grid_height
                w = np.exp(box[2]) * biases[i][0] * grid_width
                h = np.exp(box[3]) * biases[i][1] * grid_height
                b = Box(x, y, w, h)

                detected_boxes.append([b, class_id, max(p_class)])

    return detected_boxes

def sort_boxes(boxes):
    from operator import itemgetter
    return sorted(boxes, key=itemgetter(1, 2), reverse=True)  # sort by class_id, then by descending probability within each class

# non maximum suppression
def nms(sorted_boxes, iou_thresh):
    import itertools
    import copy

    nms_boxes = copy.copy(sorted_boxes)
    for a, b in itertools.combinations(sorted_boxes, 2):  # every pair of boxes
        # when two boxes predict the same class id, their IoU exceeds the threshold,
        # and a has the higher probability, drop the lower-probability box b
        if a[1] == b[1] and box_iou(a[0], b[0]) > iou_thresh and a[2] > b[2]:
            if b in nms_boxes:
                nms_boxes.remove(b)

    return nms_boxes
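# Worked example of the IoU computation above (illustrative comment, not in the
# original source): Box(0, 0, 2, 2) covers [-1, 1] x [-1, 1] and Box(1, 1, 2, 2)
# covers [0, 2] x [0, 2]; their intersection is 1 * 1 = 1 and their union is
# 4 + 4 - 1 = 7, so box_iou() returns 1/7, roughly 0.14. With iou_thresh = 0.05
# (see predict() below), nms() keeps only the higher-probability of two such
# overlapping boxes when both predict the same class.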
# Clip each box to the image (coordinates that stick out beyond the edges are clamped to the edge) and attach a class label
def clip_objects(boxes, img_width, img_height):
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow",
               "diningtable", "dog", "horse", "motorbike", "person",
               "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

    clipped_objects = []
    for box in boxes:
        b, class_id, p_class = box
        label = classes[class_id]
        prob = p_class * 100
        half_box_width = b.w / 2.0
        half_box_height = b.h / 2.0
        x0, y0, x1, y1 = (
            int(np.clip(b.x - half_box_width, 0, img_width)),
            int(np.clip(b.y - half_box_height, 0, img_height)),
            int(np.clip(b.x + half_box_width, 0, img_width)),
            int(np.clip(b.y + half_box_height, 0, img_height))
        )
        clipped_objects.append([(x0, y0, x1, y1), label, prob])

    return clipped_objects

def predict(model, im_org):
    img_width = 416
    img_height = 416
    n_grid_x = 13
    n_grid_y = 13
    n_classes = 20
    n_bbox = 5
    biases = [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]
    prob_thresh = 0.2
    iou_thresh = 0.05
    org_img_height, org_img_width = im_org.shape[0:2]

    # forward
    ans = forward_cnn(model, im_org, img_width, img_height, n_grid_x, n_grid_y, n_bbox, n_classes)

    # compute detected boxes
    detected_boxes = get_detected_boxes(ans, n_grid_x, n_grid_y, n_bbox, n_classes, prob_thresh, org_img_width, org_img_height, biases)

    # sort boxes by class_id
    sorted_boxes = sort_boxes(detected_boxes)

    # non maximum suppression
    boxes = nms(sorted_boxes, iou_thresh)

    # clip objects
    clipped_objects = clip_objects(boxes, org_img_width, org_img_height)

    return clipped_objects
--------------------------------------------------------------------------------
/sample_images/sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/sample_images/sample.jpg
--------------------------------------------------------------------------------
/tiny-yolo-voc.weights:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leetenki/YOLOtiny_v2_chainer/820d8f5a15397104774db40d516da12c25212847/tiny-yolo-voc.weights
--------------------------------------------------------------------------------
/yolo_camera.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture(0)
cap.set(3, 640)  # frame width
cap.set(4, 480)  # frame height

while True:
    ret, img = cap.read()
    if not ret:  # stop if no frame could be read
        break
    objects = predict(model, img)
    print(objects)

    for obj in objects:
        cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
        cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    cv2.imshow('image', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
--------------------------------------------------------------------------------
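Neither yolo_camera.py nor yolo_video.py saves its output; the annotated frames are only shown with cv2.imshow(). If you also want to write them to disk, a cv2.VideoWriter can be added. A minimal sketch, not part of the repository (the output filename, codec, and 30 fps are assumptions):

```
# Hypothetical variant of yolo_video.py that also writes annotated frames to output.mp4.
import cv2
from chainer import serializers
from YOLOtiny import YOLOtiny
from predictor import predict

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture("input_video.mp4")
fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # codec choice is an assumption
writer = cv2.VideoWriter("output.mp4", fourcc, 30.0,
                         (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                          int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))

while True:
    ret, img = cap.read()
    if not ret:
        break
    for (x0, y0, x1, y1), label, prob in predict(model, img):
        cv2.rectangle(img, (x0, y0), (x1, y1), (0, 0, 255), 2)
    writer.write(img)

writer.release()
cap.release()
```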
/yolo_predict.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)
file_path = download_image()
img = cv2.imread(file_path)
objects = predict(model, img)
print(objects)

for obj in objects:
    cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
    cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

cv2.imshow('image', img)
cv2.waitKey()
--------------------------------------------------------------------------------
/yolo_video.py:
--------------------------------------------------------------------------------
from YOLOtiny import *
from predictor import *
from lib.preprocess import *
import cv2

model = YOLOtiny()
serializers.load_npz("YOLOtiny_v2.model", model)

cap = cv2.VideoCapture("input_video.mp4")

while True:
    ret, img = cap.read()
    if not ret:  # stop when the video ends
        break
    objects = predict(model, img)
    print(objects)

    for obj in objects:
        cv2.rectangle(img, obj[0][0:2], obj[0][2:4], (0, 0, 255), 2)
        cv2.putText(img, "%s:%.2f%%" % (obj[1], obj[2]), (obj[0][0], obj[0][1] - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    cv2.imshow('image', img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
--------------------------------------------------------------------------------
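Each script prints and draws the list returned by predictor.predict(): entries of the form [(x0, y0, x1, y1), label, probability-in-percent], with labels drawn from the 20 PASCAL VOC classes listed in clip_objects(). If only one class is of interest, the list can be filtered before drawing; a small sketch, not part of the repository (the helper name, the "person" label choice, and the 50% cutoff are illustrative):

```
# Hypothetical post-processing helper: keep only "person" detections above 50%.
def filter_objects(objects, wanted_label="person", min_prob=50.0):
    # objects is the list returned by predictor.predict()
    return [o for o in objects if o[1] == wanted_label and o[2] >= min_prob]
```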