├── README.md
├── cpp
    ├── Tokenizer.hpp
    ├── images
    │   ├── 000000008826.jpg
    │   ├── 000000049490.jpg
    │   ├── ssd_horse.jpg
    │   └── test.jpg
    ├── main.cpp
    └── vocab.txt
└── python
    ├── bpe_simple_vocab_16e6.txt.gz
    ├── images
        ├── 000000008826.jpg
        ├── 000000049490.jpg
        ├── ssd_horse.jpg
        └── test.jpg
    ├── main.py
    ├── simple_tokenizer.py
    └── tokenizer.py


/README.md:
--------------------------------------------------------------------------------
1 | 本套程序是OWL-ViT，它是谷歌于 22 年 5 月提出的一种新的 OVD（Open Vocabulary Detection）算法。
2 | 传统的检测算法会受到训练时标注类别的限制，无法在推理时检测出训练集中未出现的类别。
3 | 而 OVD 算法，在推理时可以检测由开放词表定义的任意新类。
4 | 训练源码在 https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit
5 | 文章在 https://arxiv.org/abs/2205.06230
6 | 
7 | onnx文件在百度云盘，链接：https://pan.baidu.com/s/1vlMRc9Pi8dSgU3kpGWvK9g?pwd=ts1j 
8 | 提取码：ts1j 
9 | 


--------------------------------------------------------------------------------
/cpp/Tokenizer.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/Tokenizer.hpp


--------------------------------------------------------------------------------
/cpp/images/000000008826.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/000000008826.jpg


--------------------------------------------------------------------------------
/cpp/images/000000049490.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/000000049490.jpg


--------------------------------------------------------------------------------
/cpp/images/ssd_horse.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/ssd_horse.jpg


--------------------------------------------------------------------------------
/cpp/images/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/images/test.jpg


--------------------------------------------------------------------------------
/cpp/main.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/cpp/main.cpp


--------------------------------------------------------------------------------
/python/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/bpe_simple_vocab_16e6.txt.gz


--------------------------------------------------------------------------------
/python/images/000000008826.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/000000008826.jpg


--------------------------------------------------------------------------------
/python/images/000000049490.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/000000049490.jpg


--------------------------------------------------------------------------------
/python/images/ssd_horse.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/ssd_horse.jpg


--------------------------------------------------------------------------------
/python/images/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc203/Open-Vocabulary-Object-Detection-opencv-onnxrun/853e1cc2487c9b388435ecae46e87e986daa1096/python/images/test.jpg


--------------------------------------------------------------------------------
/python/main.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | import onnxruntime as ort
  3 | import numpy as np
  4 | from tokenizer import build_tokenizer
print(ort.__version__)  # onnxruntime 1.11.1 fails to load these ONNX files; onnxruntime 1.14.1 runs them correctly
  6 | class OWLVIT():
  7 |     def __init__(self, image_modelpath, text_modelpath, post_modelpath, box_thresh = 0.2, text_thresh = 0.25):
  8 |         self.image_model = cv2.dnn.readNet(image_modelpath)
  9 |         self.input_height, self.input_width = 768, 768
 10 |         
 11 |         self.mean = np.array([0.48145466, 0.4578275, 0.40821073],
 12 |                              dtype=np.float32).reshape((1, 1, 3))
 13 |         self.std = np.array([0.26862954, 0.26130258, 0.27577711],
 14 |                             dtype=np.float32).reshape((1, 1, 3))
 15 |         
 16 |         so = ort.SessionOptions()
 17 |         so.log_severity_level = 3
 18 |         self.bert = ort.InferenceSession(text_modelpath, so)
 19 |         self.bert_input_names = []
 20 |         for i in range(len(self.bert.get_inputs())):
 21 |             self.bert_input_names.append(self.bert.get_inputs()[i].name)
 22 | 
 23 |         self.transformer = ort.InferenceSession(post_modelpath, so)
 24 |         self.transformer_input_names = []
 25 |         for i in range(len(self.transformer.get_inputs())):
 26 |             self.transformer_input_names.append(self.transformer.get_inputs()[i].name)
 27 |         
 28 |         self.box_thresh = box_thresh
 29 |         self.text_thresh = text_thresh
 30 |         self.tokenizer = build_tokenizer('bpe_simple_vocab_16e6.txt.gz')
 31 |     
 32 |     def preprocess(self, srcimg):
 33 |         img = cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB)
 34 |         img = cv2.resize(img, (self.input_width, self.input_height))
 35 |         img = (img.astype(np.float32)/255.0 - self.mean) / self.std
 36 |         return img
 37 |     
 38 |     def encode_image(self, srcimg):
 39 |         img = self.preprocess(srcimg)
 40 |         blob = cv2.dnn.blobFromImage(img)
 41 |         self.image_model.setInput(blob)
 42 |         image_features, pred_boxes = self.image_model.forward(self.image_model.getUnconnectedOutLayersNames())
 43 |         return image_features, pred_boxes.reshape(-1,4)
 44 |     
 45 |     def encode_texts(self, text_prompt):
 46 |         token_ids = [self.tokenizer.encode(t) for t in text_prompt]
 47 |         input_ids, text_features = [], []
 48 |         for ids in token_ids:
 49 |             input_id = np.pad([49406, *ids, 49407],(0,16-len(ids)-2)).astype(np.int64)
 50 |             input_ids.append(input_id)
 51 |             mask = (input_id > 0).astype(np.int64)
 52 | 
 53 |             text_feature = self.bert.run(None, {self.bert_input_names[0]:input_id.reshape(1,16), self.bert_input_names[1]:mask.reshape(1,16)})[0].reshape(1,-1)
 54 |             text_features.append(text_feature)
 55 |         return text_features, input_ids
 56 |     
 57 |     def decode(self, image_feature, text_feature, input_id):
 58 |         logits = self.transformer.run(None, {self.transformer_input_names[0]:image_feature[0].reshape(1,24,24,768), self.transformer_input_names[1]:text_feature, self.transformer_input_names[2]:input_id.reshape(1,16)})[0]
 59 |         logits = 1/(1+np.exp(-logits)).reshape(-1)  ###sigmoid
 60 |         return logits
 61 | 
 62 |     def detect(self, srcimg, text_prompt):
 63 |         if isinstance(text_prompt, str):
 64 |             text_prompt = [text_prompt]
 65 |         srch, srcw = srcimg.shape[:2]
 66 |         image_features, pred_boxes = self.encode_image(srcimg)
 67 |         text_features, input_ids = self.encode_texts(text_prompt)
 68 |         objects = []
 69 |         for i,input_id in enumerate(input_ids):
 70 |             logits = self.decode(image_features, text_features[i], input_id)
 71 |             boxes = pred_boxes[logits > self.box_thresh]  ###形状nx4
 72 |             score = logits[logits > self.box_thresh]
 73 |             for j in range(boxes.shape[0]):
 74 |                 #cx,cy,w,h = boxes[j, :]
 75 |                 xmin = int((boxes[j, 0]-0.5*boxes[j, 2])*srcw)
 76 |                 ymin = int((boxes[j, 1]-0.5*boxes[j, 3])*srch)
 77 |                 xmax = int((boxes[j, 0]+0.5*boxes[j, 2])*srcw)
 78 |                 ymax = int((boxes[j, 1]+0.5*boxes[j, 3])*srch)
 79 |                 objects.append({'xmin':xmin, 'ymin':ymin, 'xmax':xmax, 'ymax':ymax,'name':text_prompt[i],'score':score[j]})
 80 |         return objects
 81 |     
 82 | if __name__=='__main__':
 83 |     mynet = OWLVIT('weights/owlvit-image.onnx', 'weights/owlvit-text.onnx', 'weights/owlvit-post.onnx')
 84 | 
 85 |     imgpath = 'images/test.jpg'
 86 |     srcimg = cv2.imread(imgpath)
 87 |     text_prompt = ["football", "a photo of person"]  ###人，不能直接写person,要写成a photo of person
 88 |     
 89 |     objects = mynet.detect(srcimg, text_prompt)
 90 | 
 91 |     for obj in objects:
 92 |         cv2.rectangle(srcimg, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), (0,0,255), 2)
 93 |         cv2.putText(srcimg, obj['name'], (obj['xmin'], obj['ymin']-5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 1, cv2.LINE_AA)
 94 | 
 95 |     # cv2.imwrite('result.jpg', srcimg)
 96 |     winName = 'Simple Open-Vocabulary Object Detection with Vision Transformers use OpenCV'
 97 |     cv2.namedWindow(winName, 0)
 98 |     cv2.imshow(winName, srcimg)
 99 |     cv2.waitKey(0)
100 |     cv2.destroyAllWindows()
101 | 


--------------------------------------------------------------------------------
/python/simple_tokenizer.py:
--------------------------------------------------------------------------------
  1 | import gzip
  2 | import html
  3 | import os
  4 | from functools import lru_cache
  5 | 
  6 | import ftfy
  7 | import regex as re
  8 | 
  9 | 
 10 | @lru_cache()
 11 | def default_bpe():
 12 |     return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
 13 | 
 14 | 
 15 | @lru_cache()
 16 | def bytes_to_unicode():
 17 |     """
 18 |     Returns list of utf-8 byte and a corresponding list of unicode strings.
 19 |     The reversible bpe codes work on unicode strings.
 20 |     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
 21 |     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
 22 |     This is a signficant percentage of your normal, say, 32K bpe vocab.
 23 |     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
 24 |     And avoids mapping to whitespace/control characters the bpe code barfs on.
 25 |     """
 26 |     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
 27 |     cs = bs[:]
 28 |     n = 0
 29 |     for b in range(2**8):
 30 |         if b not in bs:
 31 |             bs.append(b)
 32 |             cs.append(2**8+n)
 33 |             n += 1
 34 |     cs = [chr(n) for n in cs]
 35 |     return dict(zip(bs, cs))
 36 | 
 37 | 
 38 | def get_pairs(word):
 39 |     """Return set of symbol pairs in a word.
 40 |     Word is represented as tuple of symbols (symbols being variable-length strings).
 41 |     """
 42 |     pairs = set()
 43 |     prev_char = word[0]
 44 |     for char in word[1:]:
 45 |         pairs.add((prev_char, char))
 46 |         prev_char = char
 47 |     return pairs
 48 | 
 49 | 
 50 | def basic_clean(text):
 51 |     text = ftfy.fix_text(text)
 52 |     text = html.unescape(html.unescape(text))
 53 |     return text.strip()
 54 | 
 55 | 
 56 | def whitespace_clean(text):
 57 |     text = re.sub(r'\s+', ' ', text)
 58 |     text = text.strip()
 59 |     return text
 60 | 
 61 | 
 62 | class SimpleTokenizer(object):
 63 |     def __init__(self, bpe_path: str = default_bpe()):
 64 |         self.byte_encoder = bytes_to_unicode()
 65 |         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
 66 |         merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
 67 |         merges = merges[1:49152-256-2+1]
 68 |         merges = [tuple(merge.split()) for merge in merges]
 69 |         vocab = list(bytes_to_unicode().values())
 70 |         vocab = vocab + [v+'</w>' for v in vocab]
 71 |         for merge in merges:
 72 |             vocab.append(''.join(merge))
 73 |         vocab.extend(['<|startoftext|>', '<|endoftext|>'])
 74 |         self.encoder = dict(zip(vocab, range(len(vocab))))
 75 |         self.decoder = {v: k for k, v in self.encoder.items()}
 76 |         self.bpe_ranks = dict(zip(merges, range(len(merges))))
 77 |         self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
 78 |         self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
 79 | 
 80 |     def bpe(self, token):
 81 |         if token in self.cache:
 82 |             return self.cache[token]
 83 |         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
 84 |         pairs = get_pairs(word)
 85 | 
 86 |         if not pairs:
 87 |             return token+'</w>'
 88 | 
 89 |         while True:
 90 |             bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
 91 |             if bigram not in self.bpe_ranks:
 92 |                 break
 93 |             first, second = bigram
 94 |             new_word = []
 95 |             i = 0
 96 |             while i < len(word):
 97 |                 try:
 98 |                     j = word.index(first, i)
 99 |                     new_word.extend(word[i:j])
100 |                     i = j
101 |                 except:
102 |                     new_word.extend(word[i:])
103 |                     break
104 | 
105 |                 if word[i] == first and i < len(word)-1 and word[i+1] == second:
106 |                     new_word.append(first+second)
107 |                     i += 2
108 |                 else:
109 |                     new_word.append(word[i])
110 |                     i += 1
111 |             new_word = tuple(new_word)
112 |             word = new_word
113 |             if len(word) == 1:
114 |                 break
115 |             else:
116 |                 pairs = get_pairs(word)
117 |         word = ' '.join(word)
118 |         self.cache[token] = word
119 |         return word
120 | 
121 |     def encode(self, text):
122 |         bpe_tokens = []
123 |         text = whitespace_clean(basic_clean(text)).lower()
124 |         for token in re.findall(self.pat, text):
125 |             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126 |             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127 |         return bpe_tokens
128 | 
129 |     def decode(self, tokens):
130 |         text = ''.join([self.decoder[token] for token in tokens])
131 |         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132 |         return text


--------------------------------------------------------------------------------
/python/tokenizer.py:
--------------------------------------------------------------------------------
 1 | """Simple CLIP tokenizer wrapper."""
 2 | 
 3 | import functools
 4 | from typing import List, Optional
 5 | 
 6 | import simple_tokenizer
 7 | # from scenic.projects.baselines.clip import download
 8 | 
 9 | 
# pylint: disable=line-too-long
# Default merges archive for build_tokenizer(); a relative path, so it is
# resolved against the current working directory when opened.
DEFAULT_BPE_PATH = "bpe_simple_vocab_16e6.txt.gz"
# Download location of the same archive; not referenced elsewhere in this module.
DEFAULT_BPE_URL = 'https://github.com/openai/CLIP/blob/main/clip/bpe_simple_vocab_16e6.txt.gz?raw=true'
# pylint: enable=line-too-long
14 | 
15 | 
def tokenize(text: str, max_token_len: int = 77) -> List[int]:
  """Encode `text` as a fixed-length list of token ids.

  The ids are wrapped in the start/end-of-text tokens, cut to
  `max_token_len` entries (which can drop the end token of an over-long
  text) and right-padded with zeros.
  """
  tok = build_tokenizer()
  start_id = tok.encoder['<|startoftext|>']
  end_id = tok.encoder['<|endoftext|>']
  ids = [start_id, *tok.encode(text), end_id][:max_token_len]
  padding = [0] * (max_token_len - len(ids))
  return ids + padding
24 | 
25 | 
@functools.lru_cache(maxsize=1)
def build_tokenizer(
    bpe_path: Optional[str] = DEFAULT_BPE_PATH
) -> simple_tokenizer.SimpleTokenizer:
  """Create a SimpleTokenizer for `bpe_path`.

  Only the most recent result is memoised (cache size 1), so repeated calls
  with the same arguments reuse one tokenizer instance.
  """
  tokenizer = simple_tokenizer.SimpleTokenizer(bpe_path)
  return tokenizer
32 | 


--------------------------------------------------------------------------------