├── README.md ├── cpp ├── Tokenizer.hpp ├── images │ ├── 000000008826.jpg │ ├── 000000049490.jpg │ ├── ssd_horse.jpg │ └── test.jpg ├── main.cpp └── vocab.txt └── python ├── bpe_simple_vocab_16e6.txt.gz ├── images ├── 000000008826.jpg ├── 000000049490.jpg ├── ssd_horse.jpg └── test.jpg ├── main.py ├── simple_tokenizer.py └── tokenizer.py /README.md: -------------------------------------------------------------------------------- 1 | 本套程序是OWL-ViT,它是谷歌于 22 年 5 月提出的一种新的 OVD(Open Vocabulary Detection)算法。 2 | 传统的检测算法会收到训练时标注类别的限制,无法在推理时检测出训练集中未出现的类别。 3 | 而 OVD 算法,在推理时可以检测由开放词表定义的任意新类。 4 | 训练源码在 https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit 5 | 文章在 https://arxiv.org/abs/2205.06230 6 | 7 | onnx文件在百度云盘,链接:https://pan.baidu.com/s/1vlMRc9Pi8dSgU3kpGWvK9g?pwd=ts1j 8 | 提取码:ts1j 9 | -------------------------------------------------------------------------------- /cpp/Tokenizer.hpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/Tokenizer.hpp -------------------------------------------------------------------------------- /cpp/images/000000008826.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/000000008826.jpg -------------------------------------------------------------------------------- /cpp/images/000000049490.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/000000049490.jpg -------------------------------------------------------------------------------- /cpp/images/ssd_horse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/ssd_horse.jpg -------------------------------------------------------------------------------- /cpp/images/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/test.jpg -------------------------------------------------------------------------------- /cpp/main.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/main.cpp -------------------------------------------------------------------------------- /python/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /python/images/000000008826.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/000000008826.jpg -------------------------------------------------------------------------------- /python/images/000000049490.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/000000049490.jpg -------------------------------------------------------------------------------- /python/images/ssd_horse.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/ssd_horse.jpg -------------------------------------------------------------------------------- /python/images/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/test.jpg -------------------------------------------------------------------------------- /python/main.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import onnxruntime as ort 3 | import numpy as np 4 | from tokenizer import build_tokenizer 5 | print(ort.__version__) ###onnxruntime1.11.1加载onnx文件会报错, onnxruntime1.14.1能正常运行 6 | class OWLVIT(): 7 | def __init__(self, image_modelpath, text_modelpath, post_modelpath, box_thresh = 0.2, text_thresh = 0.25): 8 | self.image_model = cv2.dnn.readNet(image_modelpath) 9 | self.input_height, self.input_width = 768, 768 10 | 11 | self.mean = np.array([0.48145466, 0.4578275, 0.40821073], 12 | dtype=np.float32).reshape((1, 1, 3)) 13 | self.std = np.array([0.26862954, 0.26130258, 0.27577711], 14 | dtype=np.float32).reshape((1, 1, 3)) 15 | 16 | so = ort.SessionOptions() 17 | so.log_severity_level = 3 18 | self.bert = ort.InferenceSession(text_modelpath, so) 19 | self.bert_input_names = [] 20 | for i in range(len(self.bert.get_inputs())): 21 | self.bert_input_names.append(self.bert.get_inputs()[i].name) 22 | 23 | self.transformer = ort.InferenceSession(post_modelpath, so) 24 | self.transformer_input_names = [] 25 | for i in range(len(self.transformer.get_inputs())): 26 | self.transformer_input_names.append(self.transformer.get_inputs()[i].name) 27 | 28 | self.box_thresh = box_thresh 29 | self.text_thresh = text_thresh 30 | self.tokenizer = build_tokenizer('bpe_simple_vocab_16e6.txt.gz') 31 | 32 | def preprocess(self, srcimg): 33 | img = cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB) 34 | img = cv2.resize(img, (self.input_width, self.input_height)) 35 | img = (img.astype(np.float32)/255.0 - self.mean) / self.std 36 | return img 37 | 38 | def encode_image(self, srcimg): 39 | img = self.preprocess(srcimg) 40 | blob = cv2.dnn.blobFromImage(img) 41 | self.image_model.setInput(blob) 42 | image_features, pred_boxes = self.image_model.forward(self.image_model.getUnconnectedOutLayersNames()) 43 | return image_features, pred_boxes.reshape(-1,4) 44 | 45 | def encode_texts(self, text_prompt): 46 | token_ids = [self.tokenizer.encode(t) for t in text_prompt] 47 | input_ids, text_features = [], [] 48 | for ids in token_ids: 49 | input_id = np.pad([49406, *ids, 49407],(0,16-len(ids)-2)).astype(np.int64) 50 | input_ids.append(input_id) 51 | mask = (input_id > 0).astype(np.int64) 52 | 53 | text_feature = self.bert.run(None, {self.bert_input_names[0]:input_id.reshape(1,16), self.bert_input_names[1]:mask.reshape(1,16)})[0].reshape(1,-1) 54 | text_features.append(text_feature) 55 | return text_features, input_ids 56 | 57 | def decode(self, image_feature, text_feature, input_id): 58 | logits = self.transformer.run(None, {self.transformer_input_names[0]:image_feature[0].reshape(1,24,24,768), self.transformer_input_names[1]:text_feature, self.transformer_input_names[2]:input_id.reshape(1,16)})[0] 59 | logits = 1/(1+np.exp(-logits)).reshape(-1) ###sigmoid 60 | return logits 61 | 62 | def detect(self, srcimg, text_prompt): 63 | if isinstance(text_prompt, str): 64 | text_prompt = [text_prompt] 65 | srch, srcw = srcimg.shape[:2] 66 | image_features, pred_boxes = self.encode_image(srcimg) 67 | text_features, input_ids = self.encode_texts(text_prompt) 68 | objects = [] 69 | for i,input_id in enumerate(input_ids): 70 | logits = self.decode(image_features, text_features[i], input_id) 71 | boxes = pred_boxes[logits > self.box_thresh] ###形状nx4 72 | score = logits[logits > self.box_thresh] 73 | for j in range(boxes.shape[0]): 74 | #cx,cy,w,h = boxes[j, :] 75 | xmin = int((boxes[j, 0]-0.5*boxes[j, 2])*srcw) 76 | ymin = int((boxes[j, 1]-0.5*boxes[j, 3])*srch) 77 | xmax = int((boxes[j, 0]+0.5*boxes[j, 2])*srcw) 78 | ymax = int((boxes[j, 1]+0.5*boxes[j, 3])*srch) 79 | objects.append({'xmin':xmin, 'ymin':ymin, 'xmax':xmax, 'ymax':ymax,'name':text_prompt[i],'score':score[j]}) 80 | return objects 81 | 82 | if __name__=='__main__': 83 | mynet = OWLVIT('weights/owlvit-image.onnx', 'weights/owlvit-text.onnx', 'weights/owlvit-post.onnx') 84 | 85 | imgpath = 'images/test.jpg' 86 | srcimg = cv2.imread(imgpath) 87 | text_prompt = ["football", "a photo of person"] ###人,不能直接写person,要写成a photo of person 88 | 89 | objects = mynet.detect(srcimg, text_prompt) 90 | 91 | for obj in objects: 92 | cv2.rectangle(srcimg, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), (0,0,255), 2) 93 | cv2.putText(srcimg, obj['name'], (obj['xmin'], obj['ymin']-5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 1, cv2.LINE_AA) 94 | 95 | # cv2.imwrite('result.jpg', srcimg) 96 | winName = 'Simple Open-Vocabulary Object Detection with Vision Transformers use OpenCV' 97 | cv2.namedWindow(winName, 0) 98 | cv2.imshow(winName, srcimg) 99 | cv2.waitKey(0) 100 | cv2.destroyAllWindows() 101 | -------------------------------------------------------------------------------- /python/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text -------------------------------------------------------------------------------- /python/tokenizer.py: -------------------------------------------------------------------------------- 1 | """Simple CLIP tokenizer wrapper.""" 2 | 3 | import functools 4 | from typing import List, Optional 5 | 6 | import simple_tokenizer 7 | # from scenic.projects.baselines.clip import download 8 | 9 | 10 | # pylint: disable=line-too-long 11 | DEFAULT_BPE_PATH = "bpe_simple_vocab_16e6.txt.gz" 12 | DEFAULT_BPE_URL = 'https://github.com/openai/CLIP/blob/main/clip/bpe_simple_vocab_16e6.txt.gz?raw=true' 13 | # pylint: enable=line-too-long 14 | 15 | 16 | def tokenize(text: str, max_token_len: int = 77) -> List[int]: 17 | tokenizer = build_tokenizer() 18 | sot_token = tokenizer.encoder['<|startoftext|>'] 19 | eot_token = tokenizer.encoder['<|endoftext|>'] 20 | tokens = [sot_token] + tokenizer.encode(text) + [eot_token] 21 | output = [0] * max_token_len 22 | output[:min(max_token_len, len(tokens))] = tokens[:max_token_len] 23 | return output 24 | 25 | 26 | @functools.lru_cache(maxsize=1) 27 | def build_tokenizer( 28 | bpe_path: Optional[str] = DEFAULT_BPE_PATH 29 | ) -> simple_tokenizer.SimpleTokenizer: 30 | 31 | return simple_tokenizer.SimpleTokenizer(bpe_path) 32 | --------------------------------------------------------------------------------