├── core
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── config.cpython-37.pyc
│   │   └── __init__.cpython-37.pyc
│   └── config.py
├── models
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-37.pyc
│   ├── config
│   │   └── config.yaml
│   └── rknnlite_rk3588_tracker.py
├── utils
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── bbox.cpython-37.pyc
│   │   └── __init__.cpython-37.pyc
│   └── bbox.py
├── weights
│   ├── head.rknn
│   ├── track_backbone_T.rknn
│   └── track_backbone_X.rknn
├── README.md
└── main.py

/core/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/weights/head.rknn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/weights/head.rknn
--------------------------------------------------------------------------------
/weights/track_backbone_T.rknn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/weights/track_backbone_T.rknn
--------------------------------------------------------------------------------
/weights/track_backbone_X.rknn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/weights/track_backbone_X.rknn
--------------------------------------------------------------------------------
/core/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/core/__pycache__/config.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/bbox.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/utils/__pycache__/bbox.cpython-37.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/core/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/utils/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Try2ChangeX/NanoTrack_RK3588_python/HEAD/models/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NanoTrack_RK3588_python

The NanoTrack tracking algorithm on the Rockchip RK3588 NPU; runs on RK3588 development boards at up to 120 FPS.


## dependencies

```
numpy
opencv
rknn_toolkit_lite2 == 1.3
```

For the official rknn_toolkit_lite2 library and documentation for the RK3588, see [rknn-toolkit2](https://github.com/rockchip-linux/rknn-toolkit2).


## demo

The models must first be converted to the .rknn format with rknn-toolkit2.

```
python3 main.py
```

- video_name is the path of the target video
- init_rect is the initial detection bbox (see the sketch below)
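Both are currently hard-coded in `main.py`; a minimal sketch of the edit (the video path below is a placeholder, not a file shipped with the repo):

```python
# in main(): point video_name at your clip and set the first-frame target box
video_name = './data/test.mp4'    # placeholder path
init_rect = [280, 472, 70, 47]    # (x, y, w, h) of the target in the first frame
```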

## reference

[rknn-toolkit2](https://github.com/rockchip-linux/rknn-toolkit2)
[SiamTracker](https://github.com/HonglinChu/SiamTrackers)
--------------------------------------------------------------------------------
/models/config/config.yaml:
--------------------------------------------------------------------------------
META_ARC: "nanotrack"

BACKBONE:
  TYPE: "mobilenetv3_small"
  KWARGS:
    used_layers: [4]
  PRETRAINED: './models/pretrained/mobilenetv3_small_1.0.pth'
  TRAIN_LAYERS: ['features']
  TRAIN_EPOCH: 10
  LAYERS_LR: 0.1

ADJUST:
  ADJUST: True
  TYPE: 'AdjustLayer'
  KWARGS:
    in_channels: 64
    out_channels: 64

BAN:
  BAN: True
  TYPE: DepthwiseBAN
  KWARGS:
    in_channels: 64
    out_channels: 64

CUDA: True

POINT:
  STRIDE: 16

TRACK:
  TYPE: 'NanoTracker'
  WINDOW_INFLUENCE: 0.455
  PENALTY_K: 0.15
  LR: 0.37
  EXEMPLAR_SIZE: 127
  INSTANCE_SIZE: 255
  BASE_SIZE: 7
  CONTEXT_AMOUNT: 0.5

TRAIN:
  EPOCH: 50
  START_EPOCH: 0
  BATCH_SIZE: 32
  NUM_WORKERS: 8
  BASE_LR: 0.005
  CLS_WEIGHT: 1.0
  LOC_WEIGHT: 1.0
  NUM_CONVS: 4
  BASE_SIZE: 7
  OUTPUT_SIZE: 16
  RESUME: ''
  PRETRAINED: ''
  SNAPSHOT_DIR: './models/snapshot'

  LR:
    TYPE: 'log'
    KWARGS:
      start_lr: 0.005
      end_lr: 0.0005
  LR_WARMUP:
    TYPE: 'step'
    EPOCH: 5
    KWARGS:
      start_lr: 0.001
      end_lr: 0.005
      step: 1

DATASET:
  NAMES:
    - 'GOT'

  VIDEOS_PER_EPOCH: 100000

  TEMPLATE:
    SHIFT: 4
    SCALE: 0.05
    BLUR: 0.0
    FLIP: 0.0
    COLOR: 1.0

  SEARCH:
    SHIFT: 64
    SCALE: 0.18
    BLUR: 0.2
    FLIP: 0.0
    COLOR: 1.0

  NEG: 0.2
  GRAY: 0.0
--------------------------------------------------------------------------------
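These values do not stand alone: at startup `main.py` merges them over the defaults declared in `core/config.py` (further below) through yacs. A minimal sketch of that mechanism, run from the repo root:

```python
from core.config import cfg  # yacs CfgNode holding the library defaults

cfg.merge_from_file('./models/config/config.yaml')
print(cfg.POINT.STRIDE)      # 16 (the default in core/config.py is 8)
print(cfg.TRACK.BASE_SIZE)   # 7  (the default is 8)
```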
/main.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import argparse

import cv2
import numpy as np
from glob import glob

import sys

sys.path.append(os.getcwd())
from core.config import cfg

from models.rknnlite_rk3588_tracker import NanoTracker_RKNNLite

parser = argparse.ArgumentParser(description='tracking demo')

parser.add_argument('--config', default='./models/config/config.yaml', type=str, help='config file')

parser.add_argument('--save', action='store_true', help='whether to save the visualized result')

args = parser.parse_args()


def get_frames(video_name):
    if not video_name:
        cap = cv2.VideoCapture(0)
        # warmup
        for i in range(5):
            cap.read()
        while True:
            ret, frame = cap.read()
            if ret:
                yield frame
            else:
                break

    elif video_name.endswith('avi') or \
            video_name.endswith('mp4') or \
            video_name.endswith('mov'):
        cap = cv2.VideoCapture(video_name)

        # warmup
        for i in range(50):
            cap.read()

        while True:
            ret, frame = cap.read()
            if ret:
                yield frame
            else:
                break
    else:
        images = glob(os.path.join(video_name, '*.jp*'))
        images = sorted(images,
                        key=lambda x: int(x.split('/')[-1].split('.')[0]))
        for img in images:
            frame = cv2.imread(img)
            yield frame


def main():
    # load config
    cfg.merge_from_file(args.config)

    # load weights
    Tback_weight = './weights/track_backbone_T.rknn'
    Xback_weight = './weights/track_backbone_X.rknn'
    Head_weight = './weights/head.rknn'

    video_name = './data/{your_video}'
    tracker = NanoTracker_RKNNLite(Tback_weight, Xback_weight, Head_weight)
    first_frame = True

    # img_savedir = './data/debug_img/'
    # count = 0

    for frame in get_frames(video_name):
        if first_frame:
            # build video writer

            init_rect = [280, 472, 70, 47]
            tracker.init(frame, init_rect)
            first_frame = False
        else:
            t1 = time.time()
            outputs = tracker.track(frame)
            print('fps:', 1. / (time.time() - t1))
            # The polygon/mask branch never triggers here: the RKNN tracker
            # returns only 'bbox' and 'best_score', so the else branch runs.
            if 'polygon' in outputs:
                polygon = np.array(outputs['polygon']).astype(np.int32)
                cv2.polylines(frame, [polygon.reshape((-1, 1, 2))],
                              True, (0, 255, 0), 3)
                mask = ((outputs['mask'] > cfg.TRACK.MASK_THERSHOLD) * 255)
                mask = mask.astype(np.uint8)
                mask = np.stack([mask, mask * 255, mask]).transpose(1, 2, 0)
                frame = cv2.addWeighted(frame, 0.77, mask, 0.23, -1)
            else:
                bbox = list(map(int, outputs['bbox']))
                cv2.rectangle(frame, (bbox[0], bbox[1]),
                              (bbox[0] + bbox[2], bbox[1] + bbox[3]),
                              (0, 255, 0), 3)
            # cv2.imshow(video_name, frame)
            # cv2.waitKey(30)
            # cv2.imwrite(os.path.join(img_savedir, '%03d.jpg'%count), frame)
            # count += 1


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
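Note that the `--save` flag above is parsed but never consumed; only the commented-out `cv2.imwrite` lines hint at saving output. A minimal sketch of honoring it with OpenCV's `VideoWriter` (the output path, codec, and frame rate are assumptions, not part of the repo):

```python
import cv2

def make_writer(first_frame, path='./data/result.mp4', fps=30.0):
    """Create an mp4 writer sized to the incoming frames (assumed codec/path)."""
    h, w = first_frame.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    return cv2.VideoWriter(path, fourcc, fps, (w, h))

# inside main(): writer = make_writer(frame) if args.save else None,
# call writer.write(frame) after drawing each bbox, then writer.release().
```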
/utils/bbox.py:
--------------------------------------------------------------------------------
# Copyright (c) SenseTime. All Rights Reserved.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple

import numpy as np


Corner = namedtuple('Corner', 'x1 y1 x2 y2')
# alias
BBox = Corner
Center = namedtuple('Center', 'x y w h')


def corner2center(corner):
    """ convert (x1, y1, x2, y2) to (cx, cy, w, h)
    Args:
        corner: Corner or np.array (4 * N)
    Return:
        Center or np.array (4 * N)
    """
    if isinstance(corner, Corner):
        x1, y1, x2, y2 = corner
        return Center((x1 + x2) * 0.5, (y1 + y2) * 0.5, (x2 - x1), (y2 - y1))
    else:
        x1, y1, x2, y2 = corner[0], corner[1], corner[2], corner[3]
        x = (x1 + x2) * 0.5
        y = (y1 + y2) * 0.5
        w = x2 - x1
        h = y2 - y1
        return x, y, w, h


def center2corner(center):
    """ convert (cx, cy, w, h) to (x1, y1, x2, y2)
    Args:
        center: Center or np.array (4 * N)
    Return:
        Corner or np.array (4 * N)
    """
    if isinstance(center, Center):
        x, y, w, h = center
        return Corner(x - w * 0.5, y - h * 0.5, x + w * 0.5, y + h * 0.5)
    else:
        x, y, w, h = center[0], center[1], center[2], center[3]
        x1 = x - w * 0.5
        y1 = y - h * 0.5
        x2 = x + w * 0.5
        y2 = y + h * 0.5
        return x1, y1, x2, y2


def IoU(rect1, rect2):
    """ calculate intersection over union
    Args:
        rect1: (x1, y1, x2, y2)
        rect2: (x1, y1, x2, y2)
    Returns:
        iou
    """
    # overlap
    x1, y1, x2, y2 = rect1[0], rect1[1], rect1[2], rect1[3]
    tx1, ty1, tx2, ty2 = rect2[0], rect2[1], rect2[2], rect2[3]

    xx1 = np.maximum(tx1, x1)
    yy1 = np.maximum(ty1, y1)
    xx2 = np.minimum(tx2, x2)
    yy2 = np.minimum(ty2, y2)

    ww = np.maximum(0, xx2 - xx1)
    hh = np.maximum(0, yy2 - yy1)

    area = (x2 - x1) * (y2 - y1)
    target_a = (tx2 - tx1) * (ty2 - ty1)
    inter = ww * hh
    iou = inter / (area + target_a - inter)
    return iou
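
# Worked example (values chosen for illustration, not from the repo):
# rect1 = (0, 0, 10, 10) and rect2 = (5, 5, 15, 15) overlap in a 5 x 5 square,
# so IoU = 25 / (100 + 100 - 25) ~= 0.143:
#
#     >>> IoU((0, 0, 10, 10), (5, 5, 15, 15))
#     0.14285714285714285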

def cxy_wh_2_rect(pos, sz):
    """ convert (cx, cy, w, h) to (x1, y1, w, h), 0-index
    """
    return np.array([pos[0] - sz[0] / 2, pos[1] - sz[1] / 2, sz[0], sz[1]])


def rect_2_cxy_wh(rect):
    """ convert (x1, y1, w, h) to (cx, cy, w, h), 0-index
    """
    return np.array([rect[0] + rect[2] / 2, rect[1] + rect[3] / 2]), \
        np.array([rect[2], rect[3]])


def cxy_wh_2_rect1(pos, sz):
    """ convert (cx, cy, w, h) to (x1, y1, w, h), 1-index
    """
    return np.array([pos[0] - sz[0] / 2 + 1, pos[1] - sz[1] / 2 + 1, sz[0], sz[1]])


def rect1_2_cxy_wh(rect):
    """ convert (x1, y1, w, h) to (cx, cy, w, h), 1-index
    """
    return np.array([rect[0] + rect[2] / 2 - 1, rect[1] + rect[3] / 2 - 1]), \
        np.array([rect[2], rect[3]])


def get_axis_aligned_bbox(region):
    """ convert region to (cx, cy, w, h) represented by an axis-aligned box
    """
    nv = region.size
    if nv == 8:
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1 = min(region[0::2])
        x2 = max(region[0::2])
        y1 = min(region[1::2])
        y2 = max(region[1::2])
        A1 = np.linalg.norm(region[0:2] - region[2:4]) * \
            np.linalg.norm(region[2:4] - region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1 / A2)
        w = s * (x2 - x1) + 1
        h = s * (y2 - y1) + 1
    else:
        x = region[0]
        y = region[1]
        w = region[2]
        h = region[3]
        cx = x + w / 2
        cy = y + h / 2
    return cx, cy, w, h


def get_min_max_bbox(region):
    """ convert region to (cx, cy, w, h) represented by the min-max box
    """
    nv = region.size
    if nv == 8:
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1 = min(region[0::2])
        x2 = max(region[0::2])
        y1 = min(region[1::2])
        y2 = max(region[1::2])
        w = x2 - x1
        h = y2 - y1
    else:
        x = region[0]
        y = region[1]
        w = region[2]
        h = region[3]
        cx = x + w / 2
        cy = y + h / 2
    return cx, cy, w, h
--------------------------------------------------------------------------------
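A note on the 8-value branch of `get_axis_aligned_bbox`: it handles VOT-style polygon annotations (x1, y1, ..., x4, y4) and rescales the axis-aligned hull by the polygon's area. A quick usage sketch with a made-up rotated box:

```python
import numpy as np
from utils.bbox import get_axis_aligned_bbox

region = np.array([10., 20., 50., 10., 60., 50., 20., 60.])  # hypothetical corners
cx, cy, w, h = get_axis_aligned_bbox(region)  # axis-aligned (cx, cy, w, h)
```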
/core/config.py:
--------------------------------------------------------------------------------
# Copyright (c) SenseTime. All Rights Reserved.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from yacs.config import CfgNode as CN

__C = CN()

cfg = __C

__C.META_ARC = ""

__C.CUDA = True

# ------------------------------------------------------------------------ #
# Training options
# ------------------------------------------------------------------------ #
__C.TRAIN = CN()

# Number of negatives
__C.TRAIN.NEG_NUM = 16

# Number of positives
__C.TRAIN.POS_NUM = 16

# Number of anchors per image
__C.TRAIN.TOTAL_NUM = 64


__C.TRAIN.EXEMPLAR_SIZE = 127

__C.TRAIN.SEARCH_SIZE = 255

__C.TRAIN.BASE_SIZE = 8

__C.TRAIN.OUTPUT_SIZE = 25

__C.TRAIN.RESUME = ''

__C.TRAIN.PRETRAINED = ''

__C.TRAIN.LOG_DIR = './logs'

__C.TRAIN.SNAPSHOT_DIR = './snapshot'

__C.TRAIN.EPOCH = 20

__C.TRAIN.START_EPOCH = 0

__C.TRAIN.NUM_CONVS = 4

__C.TRAIN.BATCH_SIZE = 32

__C.TRAIN.NUM_WORKERS = 8

__C.TRAIN.MOMENTUM = 0.9

__C.TRAIN.WEIGHT_DECAY = 0.0001

__C.TRAIN.CLS_WEIGHT = 1.0

__C.TRAIN.LOC_WEIGHT = 1.0

__C.TRAIN.PRINT_FREQ = 20

__C.TRAIN.LOG_GRADS = False

__C.TRAIN.GRAD_CLIP = 10.0

__C.TRAIN.BASE_LR = 0.005

__C.TRAIN.LR = CN()

__C.TRAIN.LR.TYPE = 'log'

__C.TRAIN.LR.KWARGS = CN(new_allowed=True)

__C.TRAIN.LR_WARMUP = CN()

__C.TRAIN.LR_WARMUP.WARMUP = True

__C.TRAIN.LR_WARMUP.TYPE = 'step'

__C.TRAIN.LR_WARMUP.EPOCH = 5

__C.TRAIN.LR_WARMUP.KWARGS = CN(new_allowed=True)

__C.MASK = CN()

__C.MASK.MASK = False

# ------------------------------------------------------------------------ #
# Dataset options
# ------------------------------------------------------------------------ #
__C.DATASET = CN(new_allowed=True)

# Augmentation
# for template
__C.DATASET.TEMPLATE = CN()

# for detail discussion
__C.DATASET.TEMPLATE.SHIFT = 4

__C.DATASET.TEMPLATE.SCALE = 0.05

__C.DATASET.TEMPLATE.BLUR = 0.0

__C.DATASET.TEMPLATE.FLIP = 0.0

__C.DATASET.TEMPLATE.COLOR = 1.0

__C.DATASET.SEARCH = CN()

__C.DATASET.SEARCH.SHIFT = 64

__C.DATASET.SEARCH.SCALE = 0.18

__C.DATASET.SEARCH.BLUR = 0.0

__C.DATASET.SEARCH.FLIP = 0.0

__C.DATASET.SEARCH.COLOR = 1.0

# for detail discussion
__C.DATASET.NEG = 0.2

# improve tracking performance for otb100
__C.DATASET.GRAY = 0.0

__C.DATASET.NAMES = ('VID', 'YOUTUBEBB', 'DET', 'COCO', 'GOT', 'LASOT')

__C.DATASET.VID = CN()
__C.DATASET.VID.ROOT = ''  # VID dataset path
__C.DATASET.VID.ANNO = ''
__C.DATASET.VID.FRAME_RANGE = 100
__C.DATASET.VID.NUM_USE = 100000

__C.DATASET.YOUTUBEBB = CN()
__C.DATASET.YOUTUBEBB.ROOT = ''
__C.DATASET.YOUTUBEBB.ANNO = ''
__C.DATASET.YOUTUBEBB.FRAME_RANGE = 3
__C.DATASET.YOUTUBEBB.NUM_USE = 100000

__C.DATASET.COCO = CN()
__C.DATASET.COCO.ROOT = ''
__C.DATASET.COCO.ANNO = ''
__C.DATASET.COCO.FRAME_RANGE = 1
__C.DATASET.COCO.NUM_USE = 100000

__C.DATASET.DET = CN()
__C.DATASET.DET.ROOT = ''
__C.DATASET.DET.ANNO = ''
__C.DATASET.DET.FRAME_RANGE = 1
__C.DATASET.DET.NUM_USE = 100000

__C.DATASET.GOT = CN()
__C.DATASET.GOT.ROOT = 'data/GOT-10k/crop511'
__C.DATASET.GOT.ANNO = 'data/GOT-10k/train.json'
__C.DATASET.GOT.FRAME_RANGE = 100
__C.DATASET.GOT.NUM_USE = 100000

__C.DATASET.LASOT = CN()
__C.DATASET.LASOT.ROOT = ''
__C.DATASET.LASOT.ANNO = ''
__C.DATASET.LASOT.FRAME_RANGE = 100
__C.DATASET.LASOT.NUM_USE = 100000

__C.DATASET.VIDEOS_PER_EPOCH = 600000

# ------------------------------------------------------------------------ #
# Backbone options
# ------------------------------------------------------------------------ #
__C.BACKBONE = CN()

__C.BACKBONE.TYPE = 'res50'

__C.BACKBONE.KWARGS = CN(new_allowed=True)

# Pretrained backbone weights
__C.BACKBONE.PRETRAINED = ''

# Train layers
__C.BACKBONE.TRAIN_LAYERS = []

# Layer LR
__C.BACKBONE.LAYERS_LR = 0.1

# Switch to train layer
__C.BACKBONE.TRAIN_EPOCH = 10

# ------------------------------------------------------------------------ #
# Adjust layer options
# ------------------------------------------------------------------------ #
__C.ADJUST = CN()

# Adjust layer
__C.ADJUST.ADJUST = True

__C.ADJUST.KWARGS = CN(new_allowed=True)

# Adjust layer type
__C.ADJUST.TYPE = "AdjustAllLayer"

# ------------------------------------------------------------------------ #
# BAN options
# ------------------------------------------------------------------------ #
__C.BAN = CN()

# Whether to use ban head
__C.BAN.BAN = False

# BAN type
__C.BAN.TYPE = 'MultiBAN'

__C.BAN.KWARGS = CN(new_allowed=True)

# ------------------------------------------------------------------------ #
# Point options
# ------------------------------------------------------------------------ #
__C.POINT = CN()

# Point stride
__C.POINT.STRIDE = 8

# ------------------------------------------------------------------------ #
# Tracker options
# ------------------------------------------------------------------------ #
__C.TRACK = CN()

__C.TRACK.TYPE = 'NanoTracker'

# Scale penalty
__C.TRACK.PENALTY_K = 0.16

# Window influence
__C.TRACK.WINDOW_INFLUENCE = 0.46

# Interpolation learning rate
__C.TRACK.LR = 0.34

# Exemplar size
__C.TRACK.EXEMPLAR_SIZE = 127

# Instance size
__C.TRACK.INSTANCE_SIZE = 255

# Base size
__C.TRACK.BASE_SIZE = 8

# Context amount
__C.TRACK.CONTEXT_AMOUNT = 0.5
--------------------------------------------------------------------------------
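One derived quantity is worth spelling out before the tracker code: the side length of the square response map, as computed in the tracker's `__init__`. A sketch of the arithmetic using the merged YAML values (INSTANCE_SIZE 255, EXEMPLAR_SIZE 127, STRIDE 16, BASE_SIZE 7) rather than the defaults above:

```python
# (INSTANCE_SIZE - EXEMPLAR_SIZE) // STRIDE + 1 + BASE_SIZE
score_size = (255 - 127) // 16 + 1 + 7   # = 16, matching TRAIN.OUTPUT_SIZE in config.yaml
```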
/models/rknnlite_rk3588_tracker.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import cv2

from core.config import cfg
from utils.bbox import corner2center

from rknnlite.api import RKNNLite


class NanoTracker_RKNNLite(object):
    def __init__(self, Tback_weight, Xback_weight, Head_weight):

        self.score_size = (cfg.TRACK.INSTANCE_SIZE - cfg.TRACK.EXEMPLAR_SIZE) // \
            cfg.POINT.STRIDE + 1 + cfg.TRACK.BASE_SIZE
        hanning = np.hanning(self.score_size)
        window = np.outer(hanning, hanning)
        self.cls_out_channels = 2
        self.window = window.flatten()

        self.points = self.generate_points(cfg.POINT.STRIDE, self.score_size)

        # --------------------------------------------------------#
        # --------------modify environment------------------------#
        # 1. T init
        self.rknn_Tback = RKNNLite()

        # load RKNN model
        print('--> Load RKNN model')
        ret = self.rknn_Tback.load_rknn(Tback_weight)
        if ret != 0:
            print('Load RKNN model failed')
            exit(ret)
        print('done')

        # init runtime environment
        print('--> Init runtime environment')

        ret = self.rknn_Tback.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
        if ret != 0:
            print('Init runtime environment failed')
            exit(ret)
        print('done')

        # 2. X init
        self.rknn_Xback = RKNNLite()

        # Load model
        print('--> rknn_Xback: Loading model')
        ret = self.rknn_Xback.load_rknn(Xback_weight)
        if ret != 0:
            print('rknn_Xback: Load model failed!')
            exit(ret)
        print('rknn_Xback: done')

        # Init runtime environment
        print('--> Init runtime environment')
        ret = self.rknn_Xback.init_runtime(core_mask=RKNNLite.NPU_CORE_1)
        if ret != 0:
            print('Init runtime environment failed!')
            exit(ret)
        print('done')

        # 3. Head init
        self.rknn_Head = RKNNLite()

        # Load model
        print('--> rknn_Head: Loading model')
        ret = self.rknn_Head.load_rknn(Head_weight)
        if ret != 0:
            print('rknn_Head: Load model failed!')
            exit(ret)
        print('rknn_Head: done')

        # Init runtime environment
        print('--> Init runtime environment')
        ret = self.rknn_Head.init_runtime(core_mask=RKNNLite.NPU_CORE_2)
        if ret != 0:
            print('Init runtime environment failed!')
            exit(ret)
        print('done')
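
    # Note: the three graphs are deliberately pinned to different cores of the
    # RK3588's tri-core NPU (NPU_CORE_0/1/2 above), so the template backbone,
    # search backbone and head do not contend for the same core.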

    def generate_points(self, stride, size):
        ori = - (size // 2) * stride
        x, y = np.meshgrid([ori + stride * dx for dx in np.arange(0, size)],
                           [ori + stride * dy for dy in np.arange(0, size)])
        points = np.zeros((size * size, 2), dtype=np.float32)
        points[:, 0], points[:, 1] = x.astype(np.float32).flatten(), y.astype(np.float32).flatten()

        return points

    # _convert_bbox/_convert_score below are the original PyTorch versions;
    # the RKNN pipeline uses the *_numpy variants instead.
    def _convert_bbox(self, delta, point):
        delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1)
        delta = delta.detach().cpu().numpy()

        delta[0, :] = point[:, 0] - delta[0, :]  # x1
        delta[1, :] = point[:, 1] - delta[1, :]  # y1
        delta[2, :] = point[:, 0] + delta[2, :]  # x2
        delta[3, :] = point[:, 1] + delta[3, :]  # y2
        delta[0, :], delta[1, :], delta[2, :], delta[3, :] = corner2center(delta)
        return delta

    def _convert_score(self, score):
        if self.cls_out_channels == 1:
            score = score.permute(1, 2, 3, 0).contiguous().view(-1)
            score = score.sigmoid().detach().cpu().numpy()
        else:
            score = score.permute(1, 2, 3, 0).contiguous().view(self.cls_out_channels, -1).permute(1, 0)
            score = score.softmax(1).detach()[:, 1].cpu().numpy()
        return score

    def _convert_bbox_numpy(self, delta, point):
        delta = delta.transpose((1, 2, 3, 0)).reshape(4, -1)

        delta[0, :] = point[:, 0] - delta[0, :]  # x1
        delta[1, :] = point[:, 1] - delta[1, :]  # y1
        delta[2, :] = point[:, 0] + delta[2, :]  # x2
        delta[3, :] = point[:, 1] + delta[3, :]  # y2
        delta[0, :], delta[1, :], delta[2, :], delta[3, :] = corner2center(delta)
        return delta

    def _convert_score_numpy(self, score):
        def softmax(logits):
            e_x = np.exp(logits)
            probs = e_x / np.sum(e_x, axis=-1, keepdims=True)
            return probs

        score = score.transpose((1, 2, 3, 0)).reshape(self.cls_out_channels, -1).transpose((1, 0))
        score = softmax(score)[:, 1]

        return score

    def _bbox_clip(self, cx, cy, width, height, boundary):
        cx = max(0, min(cx, boundary[1]))
        cy = max(0, min(cy, boundary[0]))
        width = max(10, min(width, boundary[1]))
        height = max(10, min(height, boundary[0]))
        return cx, cy, width, height

    def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
        """
        args:
            im: bgr based image
            pos: center position
            model_sz: exemplar size
            original_sz: original size
            avg_chans: channel average
        """
        if isinstance(pos, float):
            pos = [pos, pos]
        sz = original_sz
        im_sz = im.shape
        c = (original_sz + 1) / 2
        # context_xmin = round(pos[0] - c)  # py2 and py3 round
        context_xmin = np.floor(pos[0] - c + 0.5)
        context_xmax = context_xmin + sz - 1
        # context_ymin = round(pos[1] - c)
        context_ymin = np.floor(pos[1] - c + 0.5)
        context_ymax = context_ymin + sz - 1
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_sz[1] + 1))
        bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))

        context_xmin = context_xmin + left_pad
        context_xmax = context_xmax + left_pad
        context_ymin = context_ymin + top_pad
        context_ymax = context_ymax + top_pad

        r, c, k = im.shape
        if any([top_pad, bottom_pad, left_pad, right_pad]):
            size = (r + top_pad + bottom_pad, c + left_pad + right_pad, k)
            te_im = np.zeros(size, np.uint8)
            te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + c, :] = avg_chans
            if bottom_pad:
                te_im[r + top_pad:, left_pad:left_pad + c, :] = avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = avg_chans
            if right_pad:
                te_im[:, c + left_pad:, :] = avg_chans
            im_patch = te_im[int(context_ymin):int(context_ymax + 1),
                             int(context_xmin):int(context_xmax + 1), :]
        else:
            im_patch = im[int(context_ymin):int(context_ymax + 1),
                          int(context_xmin):int(context_xmax + 1), :]

        if not np.array_equal(model_sz, original_sz):
            im_patch = cv2.resize(im_patch, (model_sz, model_sz))
        im_patch = im_patch.transpose(2, 0, 1)
        im_patch = im_patch[np.newaxis, :, :, :]
        im_patch = im_patch.astype(np.float32)

        return im_patch

    def init(self, img, bbox):
        """
        args:
            img(np.ndarray): BGR image
            bbox: (x, y, w, h) bbox
        """
        self.center_pos = np.array([bbox[0] + (bbox[2] - 1) / 2,
                                    bbox[1] + (bbox[3] - 1) / 2])
        self.size = np.array([bbox[2], bbox[3]])

        # calculate z crop size
        w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
        h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
        s_z = round(np.sqrt(w_z * h_z))

        # calculate channel average
        self.channel_average = np.mean(img, axis=(0, 1))

        # get crop
        z_crop = self.get_subwindow(img, self.center_pos,
                                    cfg.TRACK.EXEMPLAR_SIZE,
                                    s_z, self.channel_average)

        back_T_in = z_crop.transpose((0, 2, 3, 1))

        # self.Toutput = self.rknn_Tback.inference(inputs=[z_crop], data_format='nchw')
        self.Toutput = self.rknn_Tback.inference(inputs=[back_T_in])

        self.rknn_Tback.release()
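
    # The template features (self.Toutput) are computed once in init() and
    # reused for every frame, which is why rknn_Tback is released immediately
    # above; only the search backbone and head run per frame.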

    def track(self, img):
        """
        args:
            img(np.ndarray): BGR image
        return:
            bbox(list): [x, y, width, height]
        """
        w_z = self.size[0] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
        h_z = self.size[1] + cfg.TRACK.CONTEXT_AMOUNT * np.sum(self.size)
        s_z = np.sqrt(w_z * h_z)
        scale_z = cfg.TRACK.EXEMPLAR_SIZE / s_z
        s_x = s_z * (cfg.TRACK.INSTANCE_SIZE / cfg.TRACK.EXEMPLAR_SIZE)
        x_crop = self.get_subwindow(img, self.center_pos,
                                    cfg.TRACK.INSTANCE_SIZE,
                                    round(s_x), self.channel_average)

        # predict
        back_X_in = x_crop.transpose((0, 2, 3, 1))
        # self.Xoutput = self.rknn_Xback.inference(inputs=[x_crop], data_format='nchw')
        self.Xoutput = self.rknn_Xback.inference(inputs=[back_X_in])

        head_T_in = self.Toutput[0].transpose((0, 2, 3, 1))
        head_X_in = self.Xoutput[0].transpose((0, 2, 3, 1))

        # outputs = self.rknn_Head.inference(inputs=[self.Toutput[0], self.Xoutput[0]], data_format='nchw')
        outputs = self.rknn_Head.inference(inputs=[head_T_in, head_X_in])

        score = self._convert_score_numpy(outputs[0])
        pred_bbox = self._convert_bbox_numpy(outputs[1], self.points)

        # score = self._convert_score(outputs['cls'])
        # pred_bbox = self._convert_bbox(outputs['loc'], self.points)

        def change(r):
            return np.maximum(r, 1. / r)

        def sz(w, h):
            pad = (w + h) * 0.5
            return np.sqrt((w + pad) * (h + pad))

        # scale penalty
        s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
                     (sz(self.size[0] * scale_z, self.size[1] * scale_z)))

        # aspect ratio penalty
        r_c = change((self.size[0] / self.size[1]) /
                     (pred_bbox[2, :] / pred_bbox[3, :]))
        penalty = np.exp(-(r_c * s_c - 1) * cfg.TRACK.PENALTY_K)

        # score
        pscore = penalty * score

        # window penalty
        pscore = pscore * (1 - cfg.TRACK.WINDOW_INFLUENCE) + \
            self.window * cfg.TRACK.WINDOW_INFLUENCE

        best_idx = np.argmax(pscore)

        bbox = pred_bbox[:, best_idx] / scale_z

        lr = penalty[best_idx] * score[best_idx] * cfg.TRACK.LR

        cx = bbox[0] + self.center_pos[0]
        cy = bbox[1] + self.center_pos[1]

        # smooth bbox
        width = self.size[0] * (1 - lr) + bbox[2] * lr
        height = self.size[1] * (1 - lr) + bbox[3] * lr

        # clip boundary
        cx, cy, width, height = self._bbox_clip(cx, cy, width,
                                                height, img.shape[:2])

        # update state
        self.center_pos = np.array([cx, cy])
        self.size = np.array([width, height])

        bbox = [cx - width / 2,
                cy - height / 2,
                width,
                height]

        best_score = score[best_idx]
        return {
            'bbox': bbox,
            'best_score': best_score
        }
--------------------------------------------------------------------------------