├── .gitignore
├── README.md
├── code
│   ├── ocr
│   │   ├── dataloader.py
│   │   ├── densenet.py
│   │   ├── main.py
│   │   ├── resnet.py
│   │   └── tools
│   │       ├── __init__.py
│   │       ├── measures.py
│   │       ├── parse.py
│   │       ├── plot.py
│   │       ├── py_op.py
│   │       ├── segmentation.py
│   │       └── utils.py
│   └── preprocessing
│       ├── analysis_dataset.py
│       ├── map_word_to_index.py
│       └── show_black.py
├── files
│   ├── alphabet_count_dict.json
│   ├── alphabet_index_dict.json
│   ├── black.json
│   ├── image_hw_ratio_dict.json
│   ├── src
│   │   ├── A81.png
│   │   └── B1000_0.png
│   ├── train.csv
│   ├── train_alphabet.json
│   └── ttf
│       └── simsun.ttf
└── requirement.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | data/
106 | result/
107 | results/
108 | tmp.py
109 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OCR
2 | Winner of the [1st Xi'an Jiaotong University AI Practice Competition (2018 AI Practice Competition -- Image Text Recognition)](http://competition.heils.cn/main.html)
3 |
4 |
5 | # Results
6 | The competition computes an f1score for each entry and averages it over all entries; the exact metric is described [here](http://competition.heils.cn/main.html). The computation here does not count repeated characters within a sentence more than once, so the f1score is lower than the final submitted score:
7 |
8 | | - | train | val |
9 | | :----------------: | :----------------: | :----------------: |
10 | | f1score | 0.9911 | 0.9582 |
11 | | recall | 0.9943 | 0.9574 |
12 | | precision | 0.9894 | 0.9637 |
13 |
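For reference, a minimal sketch of the set-based per-entry metric described above (my reading of the scoring rule, not the official scorer):

```
def entry_scores(pred, gt):
    # repeated characters within one entry are counted only once
    pred, gt = set(pred), set(gt)
    tp = len(pred & gt)
    precision = float(tp) / max(len(pred), 1)
    recall = float(tp) / max(len(gt), 1)
    f1score = 2 * precision * recall / max(precision + recall, 1e-20)
    return f1score, recall, precision
```
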
14 | # Model description
15 | 1. Model
16 |
17 | A densenet backbone is used. The model takes a (64×512) image as input and outputs an (8×64×2159) probability map.
18 |
19 | The image is divided into (8×8) cells, and in each cell the probabilities of the 2159 characters are predicted.
20 |
21 | 2. Loss
22 |
23 | The (8×64×2159) probabilities are max-pooled along the length and width directions to get a (2159) vector: the probability that each character appears somewhere in the image (see the sketch at the end of this section).
24 |
25 | balance: the loss is computed separately for positive and negative examples so that the total positive weight equals the total negative weight, which handles the class imbalance.
26 |
27 | hard-mining: an additional loss term on the hardest positive and negative predictions (see Loss in code/ocr/main.py).
28 |
29 | 3. Text detection
30 | The (8×64×2159) probabilities are max-pooled along the width (the 64-pixel side) to get (64×2159) probabilities.
31 | Characters are then predicted cell by cell along the length direction and concatenated into the full sentence.
32 |
33 | Known issue: two consecutive occurrences of the same character cannot be detected twice.
34 |
35 | Below is an example where the text is recognized correctly: 的长为半径作圆
36 |
37 |
38 |
39 | Below is an example where the text is recognized incorrectly: 为10元;经粗加工后销售,每
40 |
41 |
42 |
43 |
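The loss and the per-cell decoding can be summarized with a minimal sketch (simplified from code/ocr/main.py; the per-class balancing weights, hard-mining and the extra rules for repeated characters and overlapping patches used in the real code are omitted, and the shapes are illustrative defaults for a 64×512 input):

```
import numpy as np
import torch.nn.functional as F

def image_level_loss(cell_probs, target):
    # cell_probs: (N, 2159, 8, 64) per-cell character probabilities after the sigmoid
    # target:     (N, 2159) multi-label ground truth
    probs = cell_probs.view(cell_probs.size(0), cell_probs.size(1), -1).max(2)[0]
    return F.binary_cross_entropy(probs, target)

def decode(cell_probs, index_word_dict, th=0.5):
    # cell_probs: numpy array of shape (8, 64, 2159)
    col_probs = cell_probs.max(axis=0)          # pool over the 8 vertical cells -> (64, 2159)
    result, last = u'', set()
    for col in col_probs:                       # read the 64 columns left to right
        new = set(index_word_dict[int(i)] for i in np.where(col > th)[0])
        result += u''.join(w for w in new if w not in last)
        last = new
    return result
```
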
44 | # Directory layout
45 | ocr
46 | |
47 | |--code
48 | |
49 | |--files
50 | | |
51 | | |--train.csv
52 | |
53 | |--data
54 | |
55 | |--dataset
56 | | |
57 | | |--train
58 | | |
59 | | |--test
60 | |
61 | |--result
62 | | |
63 | | |--test_result.csv
64 | |
65 | |--images  any images can go in this folder; I used the celebA dataset for pretraining
66 |
67 | # Environment
68 | Ubuntu16.04, python2.7, CUDA9.0
69 |
70 | Install [pytorch](https://pytorch.org/), recommended version: 0.2.0_3
71 | ```
72 | pip install -r requirement.txt
73 | ```
74 |
75 | # Download the data
76 | Download the preliminary-round and final-round data and the models from [here](https://pan.baidu.com/s/1w0iEE7q84IolmZXwttOxVw), then merge the training sets and the test sets.
77 |
78 |
79 | # Preprocessing
80 | If you keep the provided dataset, this step is not needed.
81 |
82 | If you switch to a different dataset, replace files/train.csv as well (a rough sketch of the mapping step follows the commands below).
83 | ```
84 | cd code/preprocessing
85 | python map_word_to_index.py
86 | python analysis_dataset.py
87 | ```
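For a new dataset, the character-to-index mapping amounts to something like the rough sketch below (hypothetical: it assumes train.csv rows look like `name,content` with UTF-8 text; check code/preprocessing/map_word_to_index.py for the actual logic and output files):

```
import csv, json

counts = {}
with open('../../files/train.csv') as f:
    for row in csv.reader(f):
        for ch in row[1].decode('utf8'):      # Python 2.7, as used by the rest of the repo
            counts[ch] = counts.get(ch, 0) + 1

word_index = {ch: i for i, ch in enumerate(sorted(counts))}
with open('../../files/alphabet_index_dict.json', 'w') as f:
    json.dump(word_index, f)
with open('../../files/alphabet_count_dict.json', 'w') as f:
    json.dump(counts, f)
```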
88 |
89 | # Training
90 | ```
91 | cd code/ocr
92 | python main.py
93 | ```
94 |
95 | # Testing
96 | While the f1score is below 0.9, train with lr=0.001 and without hard-mining;
97 |
98 | once the f1score is above 0.9, train with lr=0.0001 and with hard-mining.
99 |
100 | The resulting models are saved in different folders.
101 | ```
102 | cd code/ocr
103 | python main.py --phase test --resume ../../data/models-small/densenet/eval-16-1/best_f1score.ckpt
104 | ```
105 |
--------------------------------------------------------------------------------
/code/ocr/dataloader.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | """
4 | Read images and corresponding labels.
5 | """
6 |
7 | import numpy as np
8 | import os
9 | import json
10 | # import skimage
11 | # from skimage import io
12 | from PIL import Image,ImageDraw,ImageFont,ImageFilter
13 | from torch.utils.data import Dataset
14 | import time
15 |
16 | filters = [
17 | ImageFilter.SMOOTH, # smoothing, usable when the font size is above 16
18 | ImageFilter.SMOOTH_MORE, # smoothing, usable when the font size is above 16
19 | ImageFilter.GaussianBlur(radius=1), # usable when the font size is above 16
20 |
21 | ImageFilter.GaussianBlur(radius=2), # usable when the font size is above 32
22 | ImageFilter.BLUR, # usable when the font size is above 32
23 | ]
24 |
25 | def histeq (im,nbr_bins =256):
26 | # histogram equalization of a grayscale image
27 | # inputs: a grayscale image and the number of histogram bins
28 | # returns the equalized image and the cumulative distribution function used for the pixel mapping
29 | # compute the image histogram
30 | imhist,bins =np.histogram(im.flatten(),nbr_bins,normed=True)
31 | cdf =imhist.cumsum() #cumulative distribution function
32 | cdf =255*cdf/cdf[-1] # normalize with the last element of the cdf (index -1)
33 | # so the mapping covers the 0-255 range
34 | # use linear interpolation of the cdf to compute the new pixel values
35 | im2=np.interp(im.flatten(),bins[:-1],cdf) # im2 is an array
36 | return im2.reshape(im.shape),cdf
37 |
38 |
39 | class DataSet(Dataset):
40 | def __init__(self,
41 | image_names,
42 | image_label_dict,
43 | class_num,
44 | transform=None,
45 | image_size=None, # final generated image size
46 | word_index_dict=None, # mapping from character to index
47 | phase='train', # phase
48 | args=None, # global arguments
49 | font_range=None, # range of generated font sizes
50 | rotate_range=None, # image rotation range
51 | margin=None # keep characters away from the image border so they are not lost when rotating
52 | ):
53 |
54 | self.font_range = font_range
55 | self.rotate_range = rotate_range
56 | self.margin = margin
57 | self.image_names = image_names
58 | self.image_label_dict = image_label_dict
59 | self.transform = transform
60 | self.phase = phase
61 | self.class_num = class_num
62 | self.word_labels = { }
63 | self.image_size = image_size
64 | self.word_index_dict = word_index_dict
65 | self.args = args
66 | if self.phase != 'pretrain':
67 | for image_name in image_names:
68 | image_name = image_name.split('/')[-1]
69 | if image_name not in image_label_dict:
70 | try:
71 | image_label_dict[image_name] = image_label_dict[image_name.replace('seg.','').split('.png')[0]+'.png']
72 | except:
73 | image_label_dict[image_name] = ''
74 | word_label = np.zeros(class_num)
75 | label = image_label_dict[image_name]
76 | for l in label.split():
77 | word_label[int(l)] = 1
78 | self.word_labels[image_name] = word_label.astype(np.float32)
79 |
80 | def __getitem__(self, index):
81 | image_name = self.image_names[index]
82 | # print self.image_size
83 | if self.phase == 'pretrain':
84 | image = Image.open(image_name).convert('RGB')
85 | # change the gray level
86 | image = np.array(image)
87 | r = get_random(index)
88 | # the background is usually a bright color
89 | if r < 0.3:
90 | min_rgb = 192.
91 | elif r < 0.7:
92 | min_rgb = 128.
93 | else:
94 | min_rgb = 64.
95 | if self.args.model == 'resnet':
96 | pass
97 | elif index % 2 == 0:
98 | image = image / (255. - min_rgb) + min_rgb
99 | else:
100 | image[image 0.5 and self.args.epoch > 35:
133 | noise_level = 10
134 | noise = np.random.random(image.shape) * noise_level - noise_level / 2.
135 | image = image + noise
136 | '''
137 | image = (image / 128. - 1).astype(np.float32)
138 |
139 | if font_size > 32:
140 | size_label = 1
141 | elif font_size < 16:
142 | size_label = 0
143 | else:
144 | size_label = 11
145 | size_label = np.array([size_label]).astype(np.float32)
146 |
147 | return image_name, image.astype(np.float32), label, bbox_label, seg_label, size_label
148 |
149 | elif self.phase == 'seg':
150 | # keep the same resolution as the original image
151 | image = Image.open(image_name).convert('RGB')
152 | # image_name = image_name.split('/')[-1]
153 | # image = image.resize(self.image_size)
154 | image = np.transpose(np.array(image), [2,0,1]).astype(np.float32)
155 | min_size = 32
156 | shape = (np.array(image.shape).astype(np.int32) / min_size) * min_size + min_size # * 2
157 | new_image = np.zeros([3, shape[1], shape[2]], dtype=np.float32)
158 | '''
159 | for i in range(3):
160 | gray = sorted(image[i].reshape(-1))
161 | gray = gray[len(gray)/2]
162 | new_image[i] = gray
163 | '''
164 | # new_image[:, min_size/2:image.shape[1]+min_size/2, min_size/2:image.shape[2]+min_size/2] = image
165 | new_image[:, :image.shape[1], :image.shape[2]] = image
166 | image = new_image
167 | # word_label = self.word_labels[image_name]
168 | image = (image / 128. - 1).astype(np.float32)
169 | return image_name, image, np.zeros(self.class_num, dtype=np.float32)
170 | else:
171 | seg_name = image_name.replace('train','seg.train').replace('test','seg.test') + '.seg.crop.png'
172 | no_aug = self.args.no_aug
173 | if os.path.exists(seg_name):
174 | # image, word_label = random_crop_image(seg_name, self.image_label_dict[image_name.split('/')[-1]], self.image_size, self.class_num, self.phase, index, no_aug)
175 | image, word_label = random_crop_image(image_name, self.image_label_dict[image_name.split('/')[-1]], self.image_size, self.class_num, self.phase, index, no_aug, self.args)
176 | else:
177 | image, word_label = random_crop_image(image_name, self.image_label_dict[image_name.split('/')[-1]], self.image_size, self.class_num, self.phase, index, no_aug, self.args)
178 |
179 | # invert the gray levels: black background, white text
180 | if self.phase == 'train':
181 | r = get_random(index+111)
182 | if r < 0.1:
183 | image[0,:,:] = 255 - image[0,:,:]
184 | elif r < 0.2:
185 | image[1,:,:] = 255 - image[1,:,:]
186 | elif r < 0.3:
187 | image[2,:,:] = 255 - image[2,:,:]
188 | if get_random(index+112) < 0.2:
189 | image = 255. - image
190 |
191 | image = (image / 128. - 1).astype(np.float32)
192 | return image_name, image, word_label
193 |
194 | def __len__(self):
195 | return len(self.image_names)
196 |
197 | last_random = 10
198 | def get_random(idx):
199 | global last_random
200 | if last_random < 1:
201 | np.random.seed(int(last_random * 1000000 + time.time()) + idx)
202 | else:
203 | np.random.seed(int((time.time())))
204 | x = np.random.random()
205 | while np.abs(last_random - x) < 0.1:
206 | x = np.random.random()
207 | last_random = x
208 | return x
209 |
210 | def comput_iou(font, proposal):
211 | fx,fy,fh,fw = font
212 | px,py,pd = proposal
213 | overlap_x = max(min(pd, fh) - np.abs(fx - px), 0)
214 | overlap_y = max(min(pd, fw) - np.abs(fy - py), 0)
215 | # areas
216 | sf = fh * fw
217 | sp = pd * pd
218 | so = overlap_x * overlap_y
219 | iou = float(so) / (sf + sp - so)
220 | return iou
221 |
222 | def generate_bbox_label(image, font_place, font_size, font_num, args, image_size):
223 | imgh,imgw = image.size
224 | seg_label = np.zeros((image_size[0]/2, image_size[1]/2), dtype=np.float32)
225 | sx = float(font_place[0]) / image.size[0] * image_size[0]
226 | ex = sx + float(font_size) / image.size[0] * image_size[0] * font_num
227 | sy = float(font_place[1]) / image.size[1] * image_size[1]
228 | ey = sy + float(font_size) / image.size[1] * image_size[1]
229 | seg_label[int(sx)/2:int(ex)/2, int(sy)/2:int(ey)/2] = 1
230 | seg_label = seg_label.transpose((1,0))
231 |
232 | bbox_label = np.zeros((
233 | image_size[0]/args.stride, # 16
234 | image_size[1]/args.stride, # 16
235 | len(args.anchors), # 4
236 | 4 # dx,dy,dd,c
237 | ), dtype=np.float32)
238 | fonts= []
239 | for i in range(font_num):
240 | x = font_place[0] + font_size/2. + i * font_size
241 | y = font_place[1] + font_size/2.
242 | h = font_size
243 | w = font_size
244 |
245 | x = float(x) * image_size[0] / imgh
246 | h = float(h) * image_size[0] / imgh
247 | y = float(y) * image_size[1] / imgw
248 | w = float(w) * image_size[1] / imgw
249 | fonts.append([x,y,h,w])
250 |
251 | # print bbox_label.shape
252 | for ix in range(bbox_label.shape[0]):
253 | for iy in range(bbox_label.shape[1]):
254 | for ia in range(bbox_label.shape[2]):
255 | proposal = [ix*args.stride + args.stride/2, iy*args.stride + args.stride/2, args.anchors[ia]]
256 | iou_fi = []
257 | for fi, font in enumerate(fonts):
258 | iou = comput_iou(font, proposal)
259 | iou_fi.append((iou, fi))
260 | max_iou, max_fi = sorted(iou_fi)[-1]
261 | if max_iou > 0.5:
262 | # positive example
263 | dx = (font[0] - proposal[0]) / float(proposal[2])
264 | dy = (font[1] - proposal[1]) / float(proposal[2])
265 | fd = max(font[2:])
266 | dd = np.log(fd / float(proposal[2]))
267 | # bbox_label[ix,iy,ia] = [dx, dy, dd, 1]
268 | bbox_label[ix,iy,ia] = [dx, dy, dd, 1]
269 | elif max_iou > 0.25:
270 | # ignore
271 | bbox_label[ix,iy,ia,3] = 0
272 | else:
273 | # negative example
274 | bbox_label[ix,iy,ia,3] = -1
275 | # note the transpose here
276 | bbox_label = bbox_label.transpose((1,0,2,3))
277 |
278 |
279 | # compute the anchor information
280 | return bbox_label, seg_label
281 |
282 | def get_resize_para(size, idx):
283 | if size > 48:
284 | rh, rw = 4,4
285 | elif size > 32:
286 | if idx % 2:
287 | rh, rw = 2,4
288 | else:
289 | rh, rw = 4,2
290 | elif size > 16:
291 | if idx % 2:
292 | rh, rw = 1,2
293 | else:
294 | rh, rw = 2,1
295 | else:
296 | return 1,1
297 |
298 | rhs = range(rh)
299 | np.random.seed(int(time.time()) + idx + 1)
300 | np.random.shuffle(rhs)
301 | rh = rhs[0] + 1
302 |
303 | rws = range(rw)
304 | np.random.seed(int(time.time()) + idx + 2)
305 | np.random.shuffle(rws)
306 | rw = rws[0] + 1
307 |
308 | return rh, rw
309 |
310 | # def generate_image(idx, image, word_index_dict, class_num, args, image_size, no_aug, epoch):
311 | def generate_image( idx, image, no_aug, dataset):
312 | '''
313 | When args.model == 'resnet' this is only used to train the segmentation network, and most of the augmentation is skipped.
314 | The comments below assume the default parameters:
315 | image_size [512, 64]
316 | rotate_range [-5, 5]
317 | font_range [8,32]
318 | '''
319 |
320 | word_index_dict = dataset.word_index_dict
321 | class_num = dataset.class_num
322 | args = dataset.args
323 | image_size = dataset.image_size
324 | font_range = dataset.font_range
325 | rotate_range = dataset.rotate_range
326 | epoch = args.epoch
327 | margin = dataset.margin
328 |
329 | # choose the text background
330 | image = image.resize((1024,1024))
331 | h,w = image.size
332 | # randomly crop a region and resize it to a fixed size, which stretches the text a bit horizontally/vertically
333 | h_crop = int(get_random(idx + 10) * image_size[0] * 2 / 8) + image_size[0] * 6 / 8 # length range [384, 512]
334 | w_crop = int(get_random(idx + 11) * image_size[1] * 2 / 8) + image_size[1] * 6 / 8 # width range [48, 64]
335 | if args.model == 'resnet' or no_aug or epoch < 60:
336 | # resnet: the segmentation network uses a fixed-size crop
337 | # epoch<60: use a fixed size early in training to speed up convergence
338 | h_crop = image_size[0]
339 | w_crop = image_size[1]
340 | # choose the text background: pick a random crop start position
341 | x = int(get_random(idx+12) * (h - h_crop))
342 | y = int(get_random(idx+13) * (w - w_crop))
343 | image = image.crop((x,y,x+h_crop,y+w_crop))
344 |
345 |
346 | # the font size is the variable most likely to cause errors; it must not exceed the central region of the image
347 | size = font_range[0] + int(get_random(idx+20) * (font_range[1] - font_range[0]))
348 | size = min(size, h_crop - 2*margin - 2, w_crop - 2*margin - 2)
349 |
350 | # number of characters: more than half of what fits, and at least one character
351 | large_num = max(0, (h_crop - 2 * margin)/ size - 1)
352 | word_num = int(min(large_num / 2, 5) + get_random(idx+21) * large_num / 2) + 1
353 | # word_num = int(large_num / 2 + get_random(idx+21) * large_num / 2) + 1
354 | word_num = max(1, word_num)
355 |
356 | # choose where to place the text and generate the label information
357 | place_x = int(get_random(idx+22) * (h_crop - word_num * size - margin)) + margin
358 | if margin == 0:
359 | # used to add two rows of text
360 | place_y = int(get_random(idx+23) * (w_crop/2 - size - margin)) + margin
361 | else:
362 | place_y = int(get_random(idx+23) * (w_crop - size - margin)) + margin
363 | place = (place_x, place_y)
364 | label = np.zeros(class_num).astype(np.float32)
365 |
366 | text = u''
367 | words = word_index_dict.keys()
368 |
369 | if margin == 0:
370 | # two rows of text
371 | word_num *= 2
372 | while len(text) < word_num:
373 | np.random.shuffle(words)
374 | w = words[len(text)]
375 | if w in u'"(),':
376 | # some characters are better not generated
377 | continue
378 | text = text + w
379 | index = word_index_dict[w]
380 | label[index] = 1
381 |
382 | # build the bbox_label
383 | if args.model == 'resnet':
384 | bbox_label, seg_label = generate_bbox_label(image, place, size, word_num, args, image_size)
385 | else:
386 | bbox_label, seg_label = 0,0
387 |
388 | # font; other fonts can be added here
389 | fonts = ['../../files/ttf/simsun.ttf']
390 | np.random.shuffle(fonts)
391 | font = fonts[0]
392 |
393 | # color
394 | r = get_random(idx+24)
395 | if no_aug or r < 0.7:
396 | # choose different shades of black
397 | if r < 0.3:
398 | c = int(get_random(idx + 25) * 64)
399 | color = (c,c,c)
400 | else:
401 | rgb = 64
402 | r = int(get_random(idx + 27) * rgb)
403 | g = int(get_random(idx + 28) * rgb)
404 | b = int(get_random(idx + 29) * rgb)
405 | color = (r,g,b)
406 | else:
407 | # random color, but prefer darker colors
408 | rgb = 256
409 | r = int(get_random(idx + 27) * rgb)
410 | g = int(get_random(idx + 28) * rgb)
411 | b = int(get_random(idx + 29) * rgb)
412 | ra = get_random(idx + 30)
413 | if ra < 0.5:
414 | ra = int(1000 * ra) % 3
415 | if ra == 0:
416 | r = 0
417 | elif ra == 1:
418 | g = 0
419 | else:
420 | b = 0
421 | color = (r,g,b)
422 |
423 | # draw the text onto the image
424 | if margin == 0:
425 | image = add_text_to_img(image, text[:word_num/2], size, font, color, place)
426 | image = add_text_to_img(image, text[word_num/2:], size, font, color, (place[0], place[1]+image_size[1]/2))
427 | else:
428 | image = add_text_to_img(image, text, size, font, color, place)
429 |
430 | '''
431 | # random flips to improve generalization
432 | if args.model != 'resnet':
433 | if get_random(idx+130) < 0.3:
434 | image = image.transpose(Image.FLIP_LEFT_RIGHT)
435 | if get_random(idx+131) < 0.3:
436 | image = image.transpose(Image.FLIP_TOP_BOTTOM)
437 |
438 | # rotate first, then stretch the image
439 | h,w = image.size
440 | max_hw, min_hw = float(max(h,w)), float(min(h,w))
441 | if max_hw / min_hw >= 5:
442 | rotate_size = 5
443 | elif max_hw / min_hw >= 3:
444 | rotate_size = 10
445 | elif max_hw / min_hw >= 1.5:
446 | rotate_size = 30
447 | else:
448 | rotate_size = 50
449 | if args.model != 'resnet' and not no_aug and epoch>70 and get_random(idx+50) < 0.8:
450 | theta = int(rotate_size * 2 * get_random(idx+32)) - rotate_size
451 | image = image.rotate(theta)
452 | else:
453 | theta = 0
454 | '''
455 |
456 |
457 | # resize back to [512, 64]
458 | image = image.resize(image_size)
459 |
460 |
461 | # after the image is generated, rotate it once more and blur it
462 | if args.model == 'resnet' or (get_random(idx+50) < 0.8 and not no_aug):
463 |
464 | # rotation
465 | if args.model == 'resnet' :
466 | rotate_size = 10
467 | else:
468 | rotate_size = rotate_range[0] + int(get_random(idx+32) * (rotate_range[1] - rotate_range[0]))
469 | theta = int(rotate_size * 2 * get_random(idx+33)) - rotate_size
470 | image = image.rotate(theta)
471 | if args.model == 'resnet':
472 | # for segmentation, the label must be rotated together with the image
473 | seg_label = np.array([seg_label, seg_label, seg_label]) * 255
474 | seg_label = np.array(Image.fromarray(seg_label.transpose([1,2,0]).astype(np.uint8)).rotate(theta))
475 | seg_label = (seg_label[:,:,0] > 128).astype(np.float32)
476 |
477 | filters = [
478 | ImageFilter.SMOOTH, # smoothing, usable when the font size is above 16
479 | ImageFilter.SMOOTH_MORE, # smoothing, usable when the font size is above 16
480 | ImageFilter.GaussianBlur(radius=1), # usable when the font size is above 16
481 |
482 | ImageFilter.GaussianBlur(radius=2), # usable when the font size is above 32
483 | ImageFilter.BLUR, # usable when the font size is above 32
484 | ImageFilter.GaussianBlur(radius=2), # add these two once more
485 | ImageFilter.BLUR, # add these two once more
486 | ]
487 |
488 | # add some blur when the text is fairly large
489 | if size > 16:
490 | if size < 32:
491 | filters = filters[:3]
492 | np.random.shuffle(filters)
493 | image = image.filter(filters[idx % len(filters)])
494 |
495 | if args.model == 'resnet':
496 | # add noise
497 | noise_level = 32
498 | image = np.array(image)
499 | noise = np.random.random(image.shape) * noise_level - noise_level / 2.
500 | image = image + noise
501 | image = image.astype(np.uint8)
502 | image = Image.fromarray(image)
503 |
504 |
505 | # sometimes low-resolution images are needed
506 | resize_0, resize_1 = get_resize_para(size, idx)
507 | image = image.resize([image_size[0]/resize_0, image_size[1]/resize_1])
508 |
509 | # resize back to [512, 64]
510 | image = image.resize(image_size)
511 |
512 | return image, label, bbox_label, seg_label, size
513 |
514 | def add_text_to_img(img, text, size, font, color, place):
515 | imgdraw = ImageDraw.Draw(img)
516 | imgfont = ImageFont.truetype(font,size=size)
517 | imgdraw.text(place, text, fill=color, font=imgfont)
518 | return img
519 |
520 | def random_crop_image(image_name, text, image_size, class_num, phase, idx, no_aug, args):
521 | # label
522 | text = text.split()
523 | word_label = np.zeros(class_num, dtype=np.float32)
524 |
525 |
526 | if args.hist:
527 | if get_random(idx+34) < 0.4 and phase == 'train':
528 | image = Image.open(image_name).convert('RGB')
529 | else:
530 | # histogram equalization
531 | image = Image.open(image_name).convert('YCbCr')
532 | image = np.array(image)
533 | imy = image[:,:,0]
534 | imy,_ = histeq(imy)
535 | image[:,:,0] = imy
536 | image = Image.fromarray(image, mode='YCbCr').convert('RGB')
537 | else:
538 | image = Image.open(image_name).convert('RGB')
539 | x = np.array(image)
540 | assert x.min() >= 0
541 | assert x.max() < 256
542 |
543 | if phase == 'train' and not no_aug:
544 | # rotation
545 | if get_random(idx+11) < 0.8:
546 | theta = int(6 * get_random(idx+1)) - 3
547 | image = image.rotate(theta)
548 |
549 | # blurring
550 | if get_random(idx+2) < 0.3:
551 | np.random.shuffle(filters)
552 | image = image.filter(filters[0])
553 |
554 | # if the short side is below 64, just zero-pad
555 | h,w = image.size
556 | if w < image_size[1] and h > 64:
557 | if get_random(idx+3) < 0.3:
558 | image = np.array(image)
559 | start_index = (image_size[1] - w)/2
560 | new_image = np.zeros((image_size[1], h, 3), dtype=np.uint8)
561 | new_image[start_index:start_index+w, :, :] = image
562 | image = Image.fromarray(new_image)
563 |
564 |
565 | # first resize to an X * 64 image
566 | h,w = image.size
567 | h = int(float(h) * image_size[1] / w)
568 | image = image.resize((h, image_size[1]))
569 |
570 | if phase == 'train' and not no_aug:
571 |
572 | # rescale by 0.8~1.2
573 | h,w = image.size
574 | r = get_random(idx+4) / 4. + 0.8
575 | image = image.resize((int(h*r), int(w*r)))
576 |
577 | # crop
578 | if min(h,w) > 32:
579 | crop_size = 20
580 | x = int((crop_size * get_random(idx+5) - crop_size/2) * r)
581 | y = int((crop_size * get_random(idx+6) - crop_size/2) * r)
582 | image = image.crop((max(0,x),max(0,y),min(0,x)+h,min(0,y)+w))
583 |
584 | # occasionally generate some low-resolution images
585 | h,w = image.size
586 | r = get_random(idx+7)
587 |
588 | '''
589 | if r < 0.01 and min(h,w) > 64:
590 | image = image.resize((h/8, w/8))
591 | elif r < 0.1 and min(h,w) > 64:
592 | image = image.resize((h/4, w/4))
593 | elif r < 0.3 and min(h,w) > 32:
594 | image = image.resize((h/2, w/2))
595 | '''
596 |
597 | # resize back to an X * 64 image
598 | h = int(float(h) * image_size[1] / w)
599 | image = image.resize((h, image_size[1]))
600 |
601 | # pad to a fixed size
602 | image = np.transpose(np.array(image), [2,0,1]).astype(np.float32)
603 | if image.shape[2] < image_size[0]:
604 | # aspect ratio below 8 (16): just pad
605 | if phase == 'test':
606 | # place in the exact center
607 | start = np.abs(image_size[0] - image.shape[2])/2
608 | else:
609 | start = int(np.random.random() * np.abs(image_size[0] - image.shape[2]))
610 | new_image = np.zeros((3, image_size[1], image_size[0]), dtype=np.float32)
611 | new_image[:,:,start:start+image.shape[2]] = image
612 | if phase == 'test':
613 | new_image = np.array([new_image]).astype(np.float32)
614 | for w in text:
615 | word_label[int(w)] = 1
616 | else:
617 | # aspect ratio above 16: take a random crop
618 | if phase == 'test':
619 | # at test time, split into patches and merge directly
620 | crop_num = image.shape[2] * 2 / image_size[0] + 1
621 | new_image = np.zeros((crop_num, 3, image_size[1], image_size[0]), dtype=np.float32)
622 | for i in range(crop_num):
623 | start_index = i * image_size[0] / 2
624 | end_index = start_index + image_size[0]
625 | if end_index > image.shape[2]:
626 | new_image[i,:,:,:image.shape[2] - start_index] = image[:,:,start_index:end_index]
627 | else:
628 | new_image[i] = image[:,:,start_index:end_index]
629 | for w in text:
630 | word_label[int(w)] = 1
631 | else:
632 | # during training, do not count the negative-example loss
633 | start = int(np.random.random() * np.abs(image_size[0] - image.shape[2]))
634 | new_image = image[:,:,start:start+image_size[0]]
635 | for w in text:
636 | word_label[int(w)] = -1
637 |
638 | image = new_image
639 | if phase == 'train':
640 | image = image.astype(np.float32)
641 | '''
642 | # vary the gray level of each column
643 | if get_random(idx+9) < 0.3:
644 | change_level = 256. / image.shape[1]
645 | gray_change = 0
646 | for j in range(image.shape[1]):
647 | gray_change += change_level * get_random(j+idx) - change_level / 2
648 | image[:,j,:] += gray_change
649 | # vary the gray level of each row
650 | if get_random(idx+10) < 0.3:
651 | change_level = 256. / image.shape[2]
652 | gray_change = 0
653 | for k in range(image.shape[2]):
654 | gray_change += change_level * get_random(10+k+idx) - change_level / 2
655 | image[:,:,k] += gray_change
656 | '''
657 | # add noise
658 | if get_random(idx+8) < 0.1:
659 | noise_level = 64
660 | noise = np.random.random(image.shape) * noise_level - noise_level / 2.
661 | image = image + noise
662 | # noise = np.random.random(image.shape[1:]) * noise_level - noise_level / 2.
663 | # image = image + np.array([noise, noise, noise])
664 | image = image.astype(np.float32)
665 |
666 | return image, word_label
667 |
--------------------------------------------------------------------------------
/code/ocr/densenet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.utils.model_zoo as model_zoo
5 | from collections import OrderedDict
6 |
7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']
8 |
9 |
10 | model_urls = {
11 | 'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',
12 | 'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',
13 | 'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',
14 | 'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',
15 | }
16 |
17 |
18 | def densenet121(pretrained=False, small=0,**kwargs):
19 | r"""Densenet-121 model from
20 | `"Densely Connected Convolutional Networks" `_
21 |
22 | Args:
23 | pretrained (bool): If True, returns a model pre-trained on ImageNet
24 | """
25 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), small=small,
26 | **kwargs)
27 | if pretrained:
28 | model.load_state_dict(model_zoo.load_url(model_urls['densenet121']))
29 | return model
30 |
31 |
32 | def densenet169(pretrained=False, **kwargs):
33 | r"""Densenet-169 model from
34 | `"Densely Connected Convolutional Networks" `_
35 |
36 | Args:
37 | pretrained (bool): If True, returns a model pre-trained on ImageNet
38 | """
39 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
40 | **kwargs)
41 | if pretrained:
42 | model.load_state_dict(model_zoo.load_url(model_urls['densenet169']))
43 | return model
44 |
45 |
46 | def densenet201(pretrained=False, **kwargs):
47 | r"""Densenet-201 model from
48 | `"Densely Connected Convolutional Networks" `_
49 |
50 | Args:
51 | pretrained (bool): If True, returns a model pre-trained on ImageNet
52 | """
53 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
54 | **kwargs)
55 | if pretrained:
56 | model.load_state_dict(model_zoo.load_url(model_urls['densenet201']))
57 | return model
58 |
59 |
60 | def densenet161(pretrained=False, **kwargs):
61 | r"""Densenet-161 model from
62 | `"Densely Connected Convolutional Networks" `_
63 |
64 | Args:
65 | pretrained (bool): If True, returns a model pre-trained on ImageNet
66 | """
67 | model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
68 | **kwargs)
69 | if pretrained:
70 | model.load_state_dict(model_zoo.load_url(model_urls['densenet161']))
71 | return model
72 |
73 |
74 | class _DenseLayer(nn.Sequential):
75 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
76 | super(_DenseLayer, self).__init__()
77 | self.add_module('norm.1', nn.BatchNorm2d(num_input_features)),
78 | self.add_module('relu.1', nn.ReLU(inplace=True)),
79 | self.add_module('conv.1', nn.Conv2d(num_input_features, bn_size *
80 | growth_rate, kernel_size=1, stride=1, bias=False)),
81 | self.add_module('norm.2', nn.BatchNorm2d(bn_size * growth_rate)),
82 | self.add_module('relu.2', nn.ReLU(inplace=True)),
83 | self.add_module('conv.2', nn.Conv2d(bn_size * growth_rate, growth_rate,
84 | kernel_size=3, stride=1, padding=1, bias=False)),
85 | self.drop_rate = drop_rate
86 |
87 | def forward(self, x):
88 | new_features = super(_DenseLayer, self).forward(x)
89 | if self.drop_rate > 0:
90 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
91 | return torch.cat([x, new_features], 1)
92 |
93 |
94 | class _DenseBlock(nn.Sequential):
95 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
96 | super(_DenseBlock, self).__init__()
97 | for i in range(num_layers):
98 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
99 | self.add_module('denselayer%d' % (i + 1), layer)
100 |
101 |
102 | class _Transition(nn.Sequential):
103 | def __init__(self, num_input_features, num_output_features, use_pool):
104 | super(_Transition, self).__init__()
105 | self.add_module('norm', nn.BatchNorm2d(num_input_features))
106 | self.add_module('relu', nn.ReLU(inplace=True))
107 | self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
108 | kernel_size=1, stride=1, bias=False))
109 | if use_pool:
110 | self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
111 |
112 |
113 | class DenseNet(nn.Module):
114 | r"""Densenet-BC model class, based on
115 | `"Densely Connected Convolutional Networks" `_
116 |
117 | Args:
118 | growth_rate (int) - how many filters to add each layer (`k` in paper)
119 | block_config (list of 4 ints) - how many layers in each pooling block
120 | num_init_features (int) - the number of filters to learn in the first convolution layer
121 | bn_size (int) - multiplicative factor for number of bottle neck layers
122 | (i.e. bn_size * k features in the bottleneck layer)
123 | drop_rate (float) - dropout rate after each dense layer
124 | num_classes (int) - number of classification classes
125 | """
126 | def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), small=0,
127 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000):
128 |
129 | super(DenseNet, self).__init__()
130 |
131 | # First convolution
132 | self.features = nn.Sequential(OrderedDict([
133 | ('conv0', nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
134 | ('norm0', nn.BatchNorm2d(num_init_features)),
135 | ('relu0', nn.ReLU(inplace=True)),
136 | ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
137 | ]))
138 |
139 | # Each denseblock
140 | num_features = num_init_features
141 | for i, num_layers in enumerate(block_config):
142 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
143 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
144 | self.features.add_module('denseblock%d' % (i + 1), block)
145 | num_features = num_features + num_layers * growth_rate
146 | if i != len(block_config) - 1:
147 | if small and i > 0:
148 | use_pool = 0
149 | else:
150 | use_pool = 1
151 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2, use_pool=use_pool)
152 | self.features.add_module('transition%d' % (i + 1), trans)
153 | num_features = num_features // 2
154 |
155 | # Final batch norm
156 | self.features.add_module('norm5', nn.BatchNorm2d(num_features))
157 |
158 | # Linear layer
159 | self.classifier = nn.Linear(num_features, num_classes)
160 |
161 | def forward(self, x):
162 | features = self.features(x)
163 | return features
164 | att_feats = features
165 | out = F.relu(features, inplace=True)
166 | out = F.avg_pool2d(out, kernel_size=7, stride=1).view(features.size(0), -1)
167 | # out = F.avg_pool2d(out, kernel_size=3, stride=1).view(features.size(0), -1)
168 | fc_feats = out
169 | out = self.classifier(out)
170 | return att_feats, fc_feats, out
171 |
--------------------------------------------------------------------------------
/code/ocr/main.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 |
17 | """ResNet Train/Eval module.
18 | """
19 | import time
20 | import sys
21 | import os
22 |
23 | import numpy as np
24 | import dataloader
25 | import json
26 | from tqdm import tqdm
27 |
28 | import densenet
29 | import resnet
30 | from PIL import Image
31 |
32 | import torchvision
33 |
34 | import torch
35 | import torch.nn as nn
36 | import torch.backends.cudnn as cudnn
37 | from torch.autograd import Variable
38 | from torch.utils.data import DataLoader
39 | import torch.nn.functional as F
40 |
41 | from sklearn.metrics import roc_auc_score
42 |
43 | from tools import parse
44 | from glob import glob
45 | from skimage import measure
46 | import sys
47 | reload(sys)
48 | sys.setdefaultencoding('utf8')
49 | import traceback
50 |
51 | args = parse.args
52 | # anchor sizes
53 | args.anchors = [8, 12, 18, 27, 40, 60]
54 | args.stride = 8
55 | args.image_size = [512,64]
56 |
57 |
58 | class DenseNet121(nn.Module):
59 | """Model modified.
60 |
61 | The architecture of our model is the same as standard DenseNet121
62 | except the classifier layer which has an additional sigmoid function.
63 |
64 | """
65 | def __init__(self, out_size):
66 | super(DenseNet121, self).__init__()
67 | self.inplanes = 1024
68 | self.densenet121 = densenet.densenet121(pretrained=True, small=args.small)
69 | num_ftrs = self.densenet121.classifier.in_features
70 | self.classifier_font = nn.Sequential(
71 | # a fully connected layer could be used for classification here
72 | # nn.Linear(num_ftrs, out_size)
73 | # or a 1×1 convolution, as used below
74 | nn.Conv2d(num_ftrs, out_size, kernel_size=1, bias=False)
75 | )
76 | self.train_params = []
77 | self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2)
78 |
79 | def _make_layer(self, block, planes, blocks, stride=1):
80 | downsample = None
81 | if stride != 1 or self.inplanes != planes * block.expansion:
82 | downsample = nn.Sequential(
83 | nn.Conv2d(self.inplanes, planes * block.expansion,
84 | kernel_size=1, stride=stride, bias=False),
85 | nn.BatchNorm2d(planes * block.expansion),
86 | )
87 |
88 | layers = []
89 | layers.append(block(self.inplanes, planes, stride, downsample))
90 | self.inplanes = planes * block.expansion
91 | for i in range(1, blocks):
92 | layers.append(block(self.inplanes, planes))
93 |
94 | return nn.Sequential(*layers)
95 |
96 | def forward(self, x, phase='train'):
97 | feats = self.densenet121(x) # (32, 1024, 2, 16)
98 | if not args.small:
99 | feats = F.max_pool2d(feats, kernel_size=2, stride=2) # (32, 1024, 1, 8)
100 | out = self.classifier_font(feats) # (32, 1824, 1, 8)
101 | out_size = out.size()
102 | # print out.size()
103 | out = out.view(out.size(0),out.size(1),-1) # (32, 1824, 8)
104 | # print out.size()
105 | if phase == 'train':
106 | out = F.adaptive_max_pool1d(out, output_size=(1)).view(out.size(0),-1) # (32, 1824)
107 | return out
108 | else:
109 | out = out.transpose(1,2).contiguous()
110 | out = out.view(out_size[0],out_size[2], out_size[3], out_size[1]) # (32, 1, 8, 1824)
111 | return out, feats
112 |
113 | class Loss(nn.Module):
114 | def __init__(self):
115 | super(Loss, self).__init__()
116 | self.classify_loss = nn.BCELoss()
117 | self.sigmoid = nn.Sigmoid()
118 | self.regress_loss = nn.SmoothL1Loss()
119 |
120 | def forward(self, font_output, font_target, weight=None, use_hard_mining=False):
121 | font_output = self.sigmoid(font_output)
122 | font_loss = F.binary_cross_entropy(font_output, font_target, weight)
123 |
124 | # hard_mining
125 | if use_hard_mining:
126 | font_output = font_output.view(-1)
127 | font_target = font_target.view(-1)
128 | pos_index = font_target > 0.5
129 | neg_index = font_target == 0
130 |
131 | # pos
132 | pos_output = font_output[pos_index]
133 | pos_target = font_target[pos_index]
134 | num_hard_pos = max(len(pos_output)/4, min(5, len(pos_output)))
135 | if len(pos_output) > 5:
136 | pos_output, pos_target = hard_mining(pos_output, pos_target, num_hard_pos, largest=False)
137 | pos_loss = self.classify_loss(pos_output, pos_target) * 0.5
138 |
139 |
140 | # neg
141 | num_hard_neg = len(pos_output) * 2
142 | neg_output = font_output[neg_index]
143 | neg_target = font_target[neg_index]
144 | neg_output, neg_target = hard_mining(neg_output, neg_target, num_hard_neg, largest=True)
145 | neg_loss = self.classify_loss(neg_output, neg_target) * 0.5
146 |
147 | font_loss += pos_loss + neg_loss
148 |
149 | else:
150 | pos_loss, neg_loss = font_loss, font_loss
151 | return [font_loss, pos_loss, neg_loss]
152 |
153 | def _forward(self, font_output, font_target, weight, bbox_output=None, bbox_label=None, seg_output=None, seg_labels=None):
154 | font_output = self.sigmoid(font_output)
155 | font_loss = F.binary_cross_entropy(font_output, font_target, weight)
156 |
157 | acc = []
158 | if bbox_output is not None:
159 | # bbox_loss = 0
160 | bbox_output = bbox_output.view((-1, 4))
161 | bbox_label = bbox_label.view((-1, 4))
162 | pos_index = bbox_label[:,-1] >= 0.5
163 | pos_index = pos_index.unsqueeze(1).expand(pos_index.size(0), 4)
164 | neg_index = bbox_label[:,-1] <= -0.5
165 | neg_index = neg_index.unsqueeze(1).expand(neg_index.size(0), 4)
166 |
167 | # positive examples
168 | pos_label = bbox_label[pos_index].view((-1,4))
169 | pos_output = bbox_output[pos_index].view((-1,4))
170 | lx,ly,ld,lc = pos_label[:,0],pos_label[:,1],pos_label[:,2],pos_label[:,3]
171 | ox,oy,od,oc = pos_output[:,0],pos_output[:,1],pos_output[:,2],pos_output[:,3]
172 | regress_loss = [
173 | self.regress_loss(ox, lx),
174 | self.regress_loss(oy, ly),
175 | self.regress_loss(od, ld),
176 | ]
177 | pc = self.sigmoid(oc)
178 | acc.append((pc>=0.5).data.cpu().numpy().astype(np.float32).sum())
179 | acc.append(len(pc))
180 | # print pc.size(), lc.size()
181 | classify_loss = self.classify_loss(pc, lc) * 0.5
182 |
183 | # negative examples
184 | neg_label = bbox_label[neg_index].view((-1,4))
185 | neg_output = bbox_output[neg_index].view((-1,4))
186 | lc = neg_label[:, 3]
187 | oc = neg_output[:, 3]
188 | pc = self.sigmoid(oc)
189 | acc.append((pc<=0.5).data.cpu().numpy().astype(np.float32).sum())
190 | acc.append(len(pc))
191 | # print pc.size(), lc.size()
192 | classify_loss += self.classify_loss(pc, lc+1) * 0.5
193 |
194 | # seg_loss
195 | seg_output = seg_output.view(-1)
196 | seg_labels = seg_labels.view(-1)
197 | pos_index = seg_labels > 0.5
198 | neg_index = seg_labels < 0.5
199 | seg_loss = 0.5 * self.classify_loss(seg_output[pos_index], seg_labels[pos_index]) + \
200 | 0.5 * self.classify_loss(seg_output[neg_index], seg_labels[neg_index])
201 | seg_tpr = (seg_output[pos_index] > 0.5).data.cpu().numpy().astype(np.float32).sum() / len(seg_labels[pos_index])
202 | seg_tnr = (seg_output[neg_index] < 0.5).data.cpu().numpy().astype(np.float32).sum() / len(seg_labels[neg_index])
203 | # print seg_output[neg_index]
204 | # print seg_labels[neg_index]
205 |
206 |
207 |
208 |
209 | else:
210 | return font_loss
211 |
212 | if args.model == 'resnet':
213 | loss = font_loss + classify_loss + seg_loss
214 | else:
215 | loss = font_loss + classify_loss + seg_loss
216 | for reg in regress_loss:
217 | loss += reg
218 | # if args.model == 'resnet':
219 | # loss = seg_loss
220 |
221 | return [loss, font_loss, seg_loss, classify_loss] + regress_loss + acc + [seg_tpr, seg_tnr]
222 |
223 | font_num = font_target.sum(0).data.cpu().numpy()
224 | font_loss = 0
225 | for di in range(font_num.shape[0]):
226 | if font_num[di] > 0:
227 | font_output_i = font_output[:,di]
228 | font_target_i = font_target[:,di]
229 | pos_font_index = font_target_i > 0.5
230 | font_loss += 0.5 * self.classify_loss(font_output_i[pos_font_index], font_target_i[pos_font_index])
231 | neg_font_index = font_target_i < 0.5
232 | if len(font_target_i[neg_font_index]) > 0:
233 | font_loss += 0.5 * self.classify_loss(font_output_i[neg_font_index], font_target_i[neg_font_index])
234 | font_loss = font_loss / (font_num>0).sum()
235 |
236 | return font_loss
237 | # '''
238 |
239 | def hard_mining(neg_output, neg_labels, num_hard, largest=True):
240 | num_hard = min(max(num_hard, 10), len(neg_output))
241 | _, idcs = torch.topk(neg_output, min(num_hard, len(neg_output)), largest=largest)
242 | neg_output = torch.index_select(neg_output, 0, idcs)
243 | neg_labels = torch.index_select(neg_labels, 0, idcs)
244 | return neg_output, neg_labels
245 |
246 | def save_model(save_dir, phase, name, epoch, f1score, model):
247 | if not os.path.exists(save_dir):
248 | os.mkdir(save_dir)
249 | save_dir = os.path.join(save_dir, args.model)
250 | if not os.path.exists(save_dir):
251 | os.mkdir(save_dir)
252 | save_dir = os.path.join(save_dir, phase)
253 | if not os.path.exists(save_dir):
254 | os.mkdir(save_dir)
255 | state_dict = model.state_dict()
256 | for key in state_dict.keys():
257 | state_dict[key] = state_dict[key].cpu()
258 | state_dict_all = {
259 | 'state_dict': state_dict,
260 | 'epoch': epoch,
261 | 'f1score': f1score,
262 | }
263 | torch.save( state_dict_all , os.path.join(save_dir, '{:s}.ckpt'.format(name)))
264 | if 'best' in name and f1score > 0.3:
265 | torch.save( state_dict_all , os.path.join(save_dir, '{:s}_{:s}.ckpt'.format(name, str(epoch))))
266 |
267 | def mkdir(path):
268 | if not os.path.exists(path):
269 | os.mkdir(path)
270 |
271 | def test(epoch, model, train_loader, phase='test'):
272 | print '\ntest {:s}_files, epoch: {:d}'.format(phase, epoch)
273 | mkdir('../../data/result')
274 | model.eval()
275 | f1score_list = []
276 | recall_list = []
277 | precision_list = []
278 | word_index_dict = json.load(open(args.word_index_json))
279 | index_word_dict = { v:k for k,v in word_index_dict.items() }
280 | result_file = open('../../data/result/{:d}_{:s}_result.csv'.format(epoch, phase), 'w')
281 | result_file.write('name,content\n')
282 | name_f1score_dict = dict()
283 |
284 | # save the features produced by densenet
285 | feat_dir = args.data_dir.replace('dataset', 'feats')
286 | mkdir(feat_dir)
287 | feat_dir = os.path.join(feat_dir, phase)
288 | print feat_dir
289 | mkdir(feat_dir)
290 |
291 | names = []
292 | if phase != 'test':
293 | gt_file = open('../../data/result/{:d}_{:s}_gt.csv'.format(epoch, phase), 'w')
294 | gt_file.write('name,content\n')
295 | analysis_file = open('../../data/result/{:s}_{:s}_gt.csv'.format('analysis', phase), 'w')
296 | os.system('rm -r ../../data/analysis/{:s}'.format(phase))
297 | labels_all = []
298 | probs_all = []
299 | for i,data in enumerate(tqdm(train_loader)):
300 | name = data[0][0].split('/')[-1].split('.seg')[0]
301 | names.append(name)
302 | images, labels = [Variable(x.cuda(async=True)) for x in data[1:3]]
303 | if len(images.size()) == 5:
304 | images = images[0]
305 |
306 | probs, feats = model(images, 'test')
307 | probs_all.append(probs.data.cpu().numpy().max(2).max(1).max(0))
308 |
309 | preds = probs.data.cpu().numpy() > 0.5 # (-1, 8, 1824)
310 |
311 | # result_file.write(name+',')
312 | result = u''
313 | last_set = set()
314 | all_set = set()
315 |
316 | if args.feat:
317 | # save all the feats
318 | feats = feats.data.cpu().numpy()
319 | if i == 0:
320 | print feats.shape
321 | np.save(os.path.join(feat_dir, name.replace('.png','.npy')), feats)
322 | if len(feats) > 1: # feats: [-1, 1024, 1, 8]
323 | # multiple patches
324 | new_feats = []
325 | for i,feat in enumerate(feats):
326 | if i == 0:
327 | # first patch: keep the first 6
328 | new_feats.append(feat[:,:,:6])
329 | elif i == len(feats) - 1:
330 | # last patch: keep the last 6
331 | new_feats.append(feat[:,:,2:])
332 | else:
333 | # keep the middle 4
334 | new_feats.append(feat[:,:,2:6])
335 | feats = np.concatenate(new_feats, 2)
336 |
337 | # this detects the same character across different regions; if the same character appears more than once within the same region, it may not be detected multiple times
338 | preds = preds.max(1) # pool along the vertical direction
339 | # if len(preds) > 1:
340 | # print name
341 | for patch_i, patch_pred in enumerate(preds):
342 | for part_i, part_pred in enumerate(patch_pred):
343 | new_set = set()
344 | for idx,p in enumerate(part_pred):
345 | if p:
346 | # this character is present
347 | w = index_word_dict[idx]
348 | new_set.add(w)
349 | if w not in all_set:
350 | # a character not seen before
351 | all_set.add(w)
352 | result += w
353 | elif w not in last_set:
354 | # has appeared before
355 | if patch_i == 0:
356 | # first patch # the previous part did not contain this character
357 | result += w
358 | elif part_i >= preds.shape[1]/2 :
359 | # latter half of a later patch # the previous part did not contain this character
360 | result += w
361 | last_set = new_set
362 | # if len(result) > len(set(result)):
363 | # print name
364 |
365 |
366 |
367 |
368 | '''
369 | for idx,p in enumerate(preds.reshape(-1)):
370 | if p:
371 | # result_file.write(index_word_dict[idx])
372 | result = result + index_word_dict[idx]
373 | '''
374 |
375 | result = result.replace(u'"', u'')
376 | if u',' in result:
377 | result = '"' + result + '"'
378 | if len(result) == 0:
379 | global_prob = probs.data.cpu().numpy().max(0).max(0).max(0)
380 | max_index = global_prob.argmax()
381 | result = index_word_dict[max_index]
382 | print name
383 |
384 | result_file.write(name+','+result+'\n')
385 | # result_file.write('\n')
386 |
387 | if phase == 'test':
388 | continue
389 | labels = labels.data.cpu().numpy()
390 | gt_file.write(name+',')
391 | gt = u''
392 | for idx,l in enumerate(labels.reshape(-1)):
393 | if l:
394 | gt = gt + index_word_dict[idx]
395 | gt_file.write(index_word_dict[idx])
396 | gt_file.write('\n')
397 |
398 |
399 | labels_all.append(labels[0])
400 | # global pooling
401 | preds = np.array([preds.max(1).max(0)])
402 | # print preds.shape
403 | for pred, label in zip(preds, labels):
404 | tp = (pred + label == 2).sum()
405 | tn = (pred + label == 0).sum()
406 | fp = (pred - label == 1).sum()
407 | fn = (pred - label ==-1).sum()
408 | precision = 1.0 * tp / max(tp + fp , 10e-20)
409 | recall = 1.0 * tp / max(tp + fn , 10e-20)
410 | f1score = 2. * precision * recall / max(precision + recall , 10e-20)
411 | precision_list.append(precision)
412 | recall_list.append(recall)
413 | f1score_list.append(f1score)
414 | name_f1score_dict[name] = f1score
415 |
416 | # analyze the poor results
417 | if phase == 'train_val':
418 | th = 0.8
419 | elif phase == 'train':
420 | th = 0.95
421 | else:
422 | th = 0.6
423 | if f1score < th:
424 | save_dir = '../../data/analysis'
425 | if not os.path.exists(save_dir):
426 | os.mkdir(save_dir)
427 | save_dir = os.path.join(save_dir, phase)
428 | if not os.path.exists(save_dir):
429 | os.mkdir(save_dir)
430 | os.system('cp ../../data/dataset/train/{:s} {:s}/{:d}_{:s}'.format(name, save_dir, 100000+i, name))
431 | analysis_file.write(name+'\t\t')
432 | gt = set(gt)
433 | result = set(result.strip('"'))
434 | analysis_file.write(''.join(sorted(gt - result))+'\t\t')
435 | analysis_file.write(''.join(sorted(result - gt))+'\t\n')
436 |
437 |
438 |
439 | if phase != 'test':
440 | # f1score = np.mean(f1score_list)
441 | # print 'f1score all', f1score
442 | # f1score_list = sorted(f1score_list)[500:]
443 | f1score = np.mean(f1score_list)
444 | recall = np.mean(recall_list)
445 | precision = np.mean(precision_list)
446 | print 'f1score', f1score
447 | print 'recall', recall
448 | print 'precision', precision
449 | gt_file.write('f1score,' + str(f1score))
450 | gt_file.write('recall,' + str(recall))
451 | gt_file.write('precision,' + str(precision))
452 | gt_file.close()
453 | result_file.write('f1score,' + str(f1score))
454 | result_file.write('recall,' + str(recall))
455 | result_file.write('precision,' + str(precision))
456 | with open('../../data/result/name_f1score_dict.json','w') as f:
457 | f.write(json.dumps(name_f1score_dict, indent=4))
458 | np.save('../../data/result/{:d}_{:s}_labels.npy'.format(epoch, phase), labels_all)
459 | result_file.close()
460 | os.system('cp ../../data/result/{:d}_{:s}_result.csv ../../data/result/{:s}_result.csv'.format(epoch, phase, phase))
461 |
462 | np.save('../../data/result/{:d}_{:s}_probs.npy'.format(epoch, phase), probs_all)
463 | with open('../../data/result/{:s}_names.json'.format(phase), 'w') as f:
464 | f.write(json.dumps(names, indent=4))
465 |
466 | def get_weight(labels):
467 | labels = labels.data.cpu().numpy()
468 | weights = np.zeros_like(labels)
469 | # weight_false = 1.0 / ((labels<0.5).sum() + 10e-20)
470 | # weight_true = 1.0 / ((labels>0.5).sum() + 10e-20)
471 | weight_false = 1.0 / ((labels<0.5).sum(0) + 10e-20)
472 | label_true = (labels>0.5).sum(0)
473 | for i in range(labels.shape[1]):
474 | label_i = labels[:,i]
475 | weight_i = np.ones(labels.shape[0]) * weight_false[i]
476 | # weight_i = np.ones(labels.shape[0]) * weight_false
477 | if label_true[i] > 0:
478 | weight_i[label_i>0.5] = 1.0 / label_true[i]
479 | weights[:,i] = weight_i
480 | weights *= np.ones_like(labels).sum() / (weights.sum() + 10e-20)
481 | weights[labels<-0.5] = 0
482 | return weights
483 |
484 | def train_eval(epoch, model, train_loader, loss, optimizer, best_f1score=0, phase='train'):
485 | print '\n',epoch, phase
486 | if 'train' in phase:
487 | model.train()
488 | else:
489 | model.eval()
490 | loss_list = []
491 | f1score_list = []
492 | recall_list = []
493 | precision_list = []
494 | for i,data in enumerate(tqdm(train_loader)):
495 | images, labels = [Variable(x.cuda(async=True)) for x in data[1:3]]
496 | weights = torch.from_numpy(get_weight(labels)).cuda(async=True)
497 | probs = model(images)
498 |
499 | # training phase
500 | if 'train' in phase:
501 | loss_output = loss(probs, labels, weights, args.hard_mining)
502 | try:
503 | optimizer.zero_grad()
504 | loss_output[0].backward()
505 | optimizer.step()
506 | loss_list.append([x.data.cpu().numpy()[0] for x in loss_output])
507 | except:
508 | # pass
509 | traceback.print_exc()
510 |
511 |
512 | # compute f1score, recall, precision
513 | '''
514 | x = probs.data.cpu().numpy()
515 | l = labels.data.cpu().numpy()
516 | print (get_weight(labels) * l).sum()
517 | l = 1 - l
518 | print (get_weight(labels) * l).sum()
519 | print x.max()
520 | print x.min()
521 | print x.mean()
522 | print
523 | # '''
524 | preds = probs.data.cpu().numpy() > 0
525 | labels = labels.data.cpu().numpy()
526 | for pred, label in zip(preds, labels):
527 | pred[label<0] = -1
528 | if label.sum() < 0.5:
529 | continue
530 | tp = (pred + label == 2).sum()
531 | tn = (pred + label == 0).sum()
532 | fp = (pred - label == 1).sum()
533 | fn = (pred - label ==-1).sum()
534 | precision = 1.0 * tp / (tp + fp + 10e-20)
535 | recall = 1.0 * tp / (tp + fn + 10e-20)
536 | f1score = 2. * precision * recall / (precision + recall + 10e-20)
537 | precision_list.append(precision)
538 | recall_list.append(recall)
539 | f1score_list.append(f1score)
540 |
541 |
542 | # save intermediate results to data/middle_result for analysis
543 | if i == 0:
544 | images = images.data.cpu().numpy() * 128 + 128
545 | if phase == 'pretrain':
546 | bbox_labels = bbox_labels.data.cpu().numpy()
547 | seg_labels = seg_labels.data.cpu().numpy()
548 | seg_output = seg_output.data.cpu().numpy()
549 | for ii in range(len(images)):
550 | middle_dir = os.path.join(args.save_dir, 'middle_result')
551 | if not os.path.exists(middle_dir):
552 | os.mkdir(middle_dir)
553 | middle_dir = os.path.join(middle_dir, phase)
554 | if not os.path.exists(middle_dir):
555 | os.mkdir(middle_dir)
556 | Image.fromarray(images[ii].astype(np.uint8).transpose(1,2,0)).save(os.path.join(middle_dir, str(ii)+'.image.png'))
557 | if phase == 'pretrain':
558 | segi = seg_labels[ii]
559 | _segi = np.array([segi, segi, segi]) * 255
560 | segi = np.zeros([3, _segi.shape[1]*2, _segi.shape[2]*2])
561 | for si in range(segi.shape[1]):
562 | for sj in range(segi.shape[2]):
563 | segi[:,si,sj] = _segi[:,si/2,sj/2]
564 | Image.fromarray(segi.transpose(1,2,0).astype(np.uint8)).save(os.path.join(middle_dir, str(ii)+'.seg.png'))
565 | segi = seg_output[ii]
566 | _segi = np.array([segi, segi, segi]) * 255
567 | segi = np.zeros([3, _segi.shape[1]*2, _segi.shape[2]*2])
568 | for si in range(segi.shape[1]):
569 | for sj in range(segi.shape[2]):
570 | segi[:,si,sj] = _segi[:,si/2,sj/2]
571 | Image.fromarray(segi.transpose(1,2,0).astype(np.uint8)).save(os.path.join(middle_dir, str(ii)+'.seg.out.png'))
572 |
573 | f1score = np.mean(f1score_list)
574 | print 'f1score', f1score
575 | print 'recall', np.mean(recall_list)
576 | print 'precision', np.mean(precision_list)
577 | if 'train' in phase:
578 | loss_mean = np.array(loss_list).mean(0)
579 | print 'loss: {:3.4f} pos loss: {:3.4f} neg loss: {:3.4f}'.format(loss_mean[0], loss_mean[1], loss_mean[2])
580 |
581 | # save the model
582 | if ('eval' in phase or 'pretrain' in phase)and best_f1score < 2:
583 | if args.small:
584 | save_dir = os.path.join(args.save_dir, 'models-small')
585 | else:
586 | save_dir = os.path.join(args.save_dir, 'models')
587 | if not os.path.exists(save_dir):
588 | os.mkdir(save_dir)
589 | if epoch % 5 == 0:
590 | save_model(save_dir, phase, str(epoch), epoch, f1score, model)
591 | if f1score > best_f1score:
592 | save_model(save_dir, phase, 'best_f1score', epoch, f1score, model)
593 | if args.model == 'resnet':
594 | tpnr = loss[11] + loss[12]
595 | # reuse best_f1score as the tpnr here; too lazy to rename it
596 | if tpnr > best_f1score:
597 | best_f1score = tpnr
598 | save_model(save_dir, phase, 'best_tpnr', epoch, f1score, model)
599 | print 'best tpnr', best_f1score
600 | else:
601 | best_f1score = max(best_f1score, f1score)
602 | if best_f1score < 1:
603 | print '\n\t{:s}\tbest f1score {:3.4f}\n'.format(phase, best_f1score)
604 | return best_f1score
605 |
606 |
607 | def main():
608 | word_index_dict = json.load(open(args.word_index_json))
609 | num_classes = len(word_index_dict)
610 | image_label_dict = json.load(open(args.image_label_json))
611 |
612 | cudnn.benchmark = True
613 | if args.model == 'densenet':
614 | # 2000+ character classes, multi-label classification
615 | model = DenseNet121(num_classes).cuda()
616 | elif args.model == 'resnet':
617 | # resnet is mainly used for text-region segmentation and object detection
618 | model = resnet.ResNet(num_classes=num_classes, args=args).cuda()
619 | else:
620 | return
621 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
622 | # model = torch.nn.DataParallel(model).cuda()
623 | loss = Loss().cuda()
624 |
625 | if args.resume:
626 | state_dict = torch.load(args.resume)
627 | model.load_state_dict(state_dict['state_dict'])
628 | best_f1score = state_dict['f1score']
629 | start_epoch = state_dict['epoch'] + 1
630 | else:
631 | best_f1score = 0
632 | if args.model == 'resnet':
633 | start_epoch = 100
634 | else:
635 | start_epoch = 1
636 | args.epoch = start_epoch
637 | print 'best_f1score', best_f1score
638 |
639 |
640 | # split the dataset
641 | test_filelist = sorted(glob(os.path.join(args.data_dir,'test','*')))
642 | trainval_filelist = sorted(glob(os.path.join(args.data_dir,'train','*')))
643 |
644 | # train with two input sizes
645 | # train_filelist1: images with aspect ratio below 8:1, padded to 64*512 inputs
646 | # train_filelist2: images with aspect ratio above 8:1, padded and cropped to 64*1024 inputs
647 | train_filelist1, train_filelist2 = [],[]
648 |
649 | # blacklist: these images have problematic labels
650 | black_list = set(json.load(open(args.black_json))['black_list'])
651 | image_hw_ratio_dict = json.load(open(args.image_hw_ratio_json))
652 | for f in trainval_filelist:
653 | image = f.split('/')[-1]
654 | if image in black_list:
655 | continue
656 | r = image_hw_ratio_dict[image]
657 | if r == 0:
658 | train_filelist1.append(f)
659 | else:
660 | train_filelist2.append(f)
661 | train_val_filelist = train_filelist1 + train_filelist2
662 | val_filelist = train_filelist1[-2048:]
663 | train_filelist1 = train_filelist1[:-2048]
664 |
665 | train_filelist2 = train_filelist2
666 | image_size = [512, 64]
667 |
668 | if args.phase in ['test', 'val', 'train_val']:
669 |         # test: output the text recognition results
670 | test_dataset = dataloader.DataSet(
671 | test_filelist,
672 | image_label_dict,
673 | num_classes,
674 | # transform=train_transform,
675 | args=args,
676 | image_size=image_size,
677 | phase='test')
678 | test_loader = DataLoader(
679 | dataset=test_dataset,
680 | batch_size=1,
681 | shuffle=False,
682 | num_workers=8,
683 | pin_memory=True)
684 | train_filelist = train_filelist1[-2048:]
685 | train_dataset = dataloader.DataSet(
686 | train_filelist,
687 | image_label_dict,
688 | num_classes,
689 | image_size=image_size,
690 | args=args,
691 | phase='test')
692 | train_loader = DataLoader(
693 | dataset=train_dataset,
694 | batch_size=1,
695 | shuffle=False,
696 | num_workers=8,
697 | pin_memory=True)
698 |
699 | val_dataset = dataloader.DataSet(
700 | val_filelist,
701 | image_label_dict,
702 | num_classes,
703 | image_size=image_size,
704 | args=args,
705 | phase='test')
706 | val_loader = DataLoader(
707 | dataset=val_dataset,
708 | batch_size=1,
709 | shuffle=False,
710 | num_workers=8,
711 | pin_memory=True)
712 |
713 | train_val_dataset = dataloader.DataSet(
714 | train_val_filelist,
715 | image_label_dict,
716 | num_classes,
717 | image_size=image_size,
718 | args=args,
719 | phase='test')
720 |         train_val_loader = DataLoader(
721 | dataset=train_val_dataset,
722 | batch_size=1,
723 | shuffle=False,
724 | num_workers=8,
725 | pin_memory=True)
726 |
727 | if args.phase == 'test':
728 | test(start_epoch - 1, model, val_loader, 'val')
729 | test(start_epoch - 1, model, test_loader, 'test')
730 | # test(start_epoch - 1, model, train_val_loader, 'train_val')
731 | elif args.phase == 'val':
732 | test(start_epoch - 1, model, train_loader, 'train')
733 | test(start_epoch - 1, model, val_loader, 'val')
734 | elif args.phase == 'train_val':
735 | test(start_epoch - 1, model, train_val_loader, 'train_val')
736 | return
737 |
738 | elif args.phase == 'train':
739 |
740 | train_dataset1 = dataloader.DataSet(
741 | train_filelist1,
742 | image_label_dict,
743 | num_classes,
744 | image_size=image_size,
745 | args=args,
746 | phase='train')
747 | train_loader1 = DataLoader(
748 | dataset=train_dataset1,
749 | batch_size=args.batch_size,
750 | shuffle=True,
751 | num_workers=8,
752 | pin_memory=True)
753 | train_dataset2 = dataloader.DataSet(
754 | train_filelist2,
755 | image_label_dict,
756 | num_classes,
757 | image_size=(1024,64),
758 | args=args,
759 | phase='train')
760 | train_loader2 = DataLoader(
761 | dataset=train_dataset2,
762 | batch_size=args.batch_size / 2,
763 | shuffle=True,
764 | num_workers=8,
765 | pin_memory=True)
766 | val_dataset = dataloader.DataSet(
767 | val_filelist,
768 | image_label_dict,
769 | num_classes,
770 | image_size=image_size,
771 | args=args,
772 | phase='val')
773 | val_loader = DataLoader(
774 | dataset=val_dataset,
775 | batch_size=min(8,args.batch_size),
776 | shuffle=False,
777 | num_workers=8,
778 | pin_memory=True)
779 | filelist = glob(os.path.join(args.bg_dir,'*'))
780 | pretrain_dataset1 = dataloader.DataSet(
781 | filelist,
782 | image_label_dict,
783 | num_classes,
784 | image_size=args.image_size,
785 | word_index_dict = word_index_dict,
786 | args=args,
787 | font_range=[8,32],
788 | margin=10,
789 | rotate_range=[-10., 10. ],
790 | phase='pretrain')
791 | pretrain_loader1 = DataLoader(
792 | dataset=pretrain_dataset1,
793 | batch_size=args.batch_size,
794 | shuffle=True,
795 | num_workers=8,
796 | pin_memory=True)
797 | pretrain_dataset2 = dataloader.DataSet(
798 | filelist,
799 | image_label_dict,
800 | num_classes,
801 | image_size=(256, 128),
802 | word_index_dict = word_index_dict,
803 | args=args,
804 | font_range=[24,64],
805 | margin=20,
806 | rotate_range=[-20., 20.],
807 | phase='pretrain')
808 | pretrain_loader2 = DataLoader(
809 | dataset=pretrain_dataset2,
810 | batch_size=args.batch_size,
811 | shuffle=True,
812 | num_workers=8,
813 | pin_memory=True)
814 |
815 | best_f1score = 0
816 | # eval_mode = 'pretrain-2'
817 | eval_mode = 'eval'
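    |         # training schedule: with eval_mode = 'eval' each epoch trains on train_loader1
    |         # (plus train_loader2 and hard mining once the val f1score exceeds 0.9, with the
    |         # learning rate dropped to 1e-4) and then evaluates on val_loader; the quoted-out
    |         # block further down is the synthetic pretraining curriculum (pretrain-2 on
    |         # 256x128 crops with large fonts, then pretrain-1 at the normal input size).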
818 | for epoch in range(start_epoch, args.epochs):
819 |
820 | args.epoch = epoch
821 |
822 | if eval_mode == 'eval':
823 | if best_f1score > 0.9:
824 | args.lr = 0.0001
825 | if best_f1score > 0.9:
826 | args.hard_mining = 1
827 |
828 | for param_group in optimizer.param_groups:
829 | param_group['lr'] = args.lr
830 |
831 | train_eval(epoch, model, train_loader1, loss, optimizer, 2., 'train-1')
832 | if best_f1score > 0.9:
833 | train_eval(epoch, model, train_loader2, loss, optimizer, 2., 'train-2')
834 | best_f1score = train_eval(epoch, model, val_loader, loss, optimizer, best_f1score, 'eval-{:d}-{:d}'.format(args.batch_size, args.hard_mining))
835 | continue
836 | '''
837 |
838 | if eval_mode == 'pretrain-2':
839 | args.epoch = 1
840 | best_f1score = train_eval(epoch, model, pretrain_loader2, loss, optimizer, best_f1score, 'pretrain-2')
841 | if best_f1score > 0.8:
842 | eval_mode = 'pretrain-1'
843 | best_f1score = 0
844 | elif eval_mode == 'pretrain-1':
845 | args.epoch = max(100, epoch)
846 | train_eval(epoch, model, pretrain_loader2, loss, optimizer, 2.0 , 'pretrain-2')
847 | best_f1score = train_eval(epoch, model, pretrain_loader1, loss, optimizer, best_f1score, 'pretrain-1')
848 | if best_f1score > 0.5:
849 | eval_mode = 'eval'
850 | best_f1score = 0
851 | else:
852 | train_eval(epoch, model, train_loader1, loss, optimizer, 2., 'train-1')
853 | train_eval(epoch, model, train_loader2, loss, optimizer, 2., 'train-2')
854 | best_f1score = train_eval(epoch, model, val_loader, loss, optimizer, best_f1score, 'eval-{:d}-{:d}'.format(args.batch_size, args.hard_mining))
855 |
856 | '''
857 |
858 |
859 |
860 |
861 |
862 |
863 | if __name__ == '__main__':
864 | main()
865 |
--------------------------------------------------------------------------------
/code/ocr/resnet.py:
--------------------------------------------------------------------------------
1 | # Implementation of https://arxiv.org/pdf/1512.03385.pdf.
2 | # See section 4.2 for model architecture on CIFAR-10.
3 | # Some part of the code was referenced below.
4 | # https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
5 | import torch
6 | import torch.nn as nn
7 | import torchvision.datasets as dsets
8 | import torchvision.transforms as transforms
9 | from torch.autograd import Variable
10 | import torch.nn.functional as F
11 |
12 | # 3x3 Convolution
13 | def conv3x3(in_channels, out_channels, stride=1):
14 | return nn.Conv2d(in_channels, out_channels, kernel_size=3,
15 | stride=stride, padding=1, bias=False)
16 |
17 | # Residual Block
18 | class ResidualBlock(nn.Module):
19 | def __init__(self, in_channels, out_channels, stride=1, downsample=None):
20 | super(ResidualBlock, self).__init__()
21 | self.conv1 = conv3x3(in_channels, out_channels, stride)
22 | self.bn1 = nn.BatchNorm2d(out_channels)
23 | self.relu = nn.ReLU(inplace=True)
24 | self.conv2 = conv3x3(out_channels, out_channels)
25 | self.bn2 = nn.BatchNorm2d(out_channels)
26 | self.downsample = downsample
27 |
28 | def forward(self, x):
29 | residual = x
30 | out = self.conv1(x)
31 | out = self.bn1(out)
32 | out = self.relu(out)
33 | out = self.conv2(out)
34 | out = self.bn2(out)
35 | if self.downsample:
36 | residual = self.downsample(x)
37 | out += residual
38 | out = self.relu(out)
39 | return out
40 |
41 | # ResNet Module
42 | class ResNet(nn.Module):
43 | def __init__(self, block=ResidualBlock, layers=[2,3], num_classes=10, args=None):
44 | super(ResNet, self).__init__()
45 | self.in_channels = 16
46 | self.conv = conv3x3(3, 16)
47 | self.bn = nn.BatchNorm2d(16)
48 | self.relu = nn.ReLU(inplace=True)
49 | self.layer1 = self.make_layer(block, 32, layers[0], 2)
50 | self.layer2 = self.make_layer(block, 64, layers[0], 2)
51 | self.layer3 = self.make_layer(block, 128, layers[0], 2)
52 | self.layer4 = self.make_layer(block, 128, layers[0], 2)
53 | self.layer5 = self.make_layer(block, 128, layers[0], 2)
54 | self.fc = nn.Linear(128, num_classes)
55 |
56 | # detect
57 | self.convt1 = nn.Sequential(
58 | nn.ConvTranspose2d(128,128,kernel_size=2, stride=2),
59 | nn.BatchNorm2d(128),
60 | nn.ReLU(inplace=True))
61 | self.convt2 = nn.Sequential(
62 | nn.ConvTranspose2d(128,128,kernel_size=2, stride=2),
63 | nn.BatchNorm2d(128),
64 | nn.ReLU(inplace=True))
65 | self.convt3 = nn.Sequential(
66 | nn.ConvTranspose2d(128,128,kernel_size=2, stride=2),
67 | nn.BatchNorm2d(128),
68 | nn.ReLU(inplace=True))
69 | self.convt4 = nn.Sequential(
70 | nn.ConvTranspose2d(128,128,kernel_size=2, stride=2),
71 | nn.BatchNorm2d(128),
72 | nn.ReLU(inplace=True))
73 | self.in_channels = 256
74 | self.dec1 = self.make_layer(block, 128, layers[0])
75 | self.in_channels = 256
76 | self.dec2 = self.make_layer(block, 128, layers[0])
77 | self.in_channels = 192
78 | self.dec3 = self.make_layer(block, 128, layers[0])
79 | self.in_channels = 160
80 | # self.dec4 = self.make_layer(block, 1, layers[0])
81 | self.dec4 = nn.Sequential(
82 | nn.Conv2d(160, 256, kernel_size=3, padding=1),
83 | nn.BatchNorm2d(256),
84 | nn.ReLU(inplace=True),
85 | nn.Conv2d(256, 1, kernel_size=1, bias=True)
86 | )
87 | self.in_channels = 256
88 | # self.dec2 = self.make_layer(block, 256, layers[0])
89 | # self.output = conv3x3(256, 4 * len(args.anchors))
90 | self.bbox = nn.Sequential(
91 | nn.Conv2d(256, 256, kernel_size=3, padding=1),
92 | nn.BatchNorm2d(256),
93 | nn.ReLU(inplace=True),
94 | nn.Conv2d(256, 4 * len(args.anchors), kernel_size=1, bias=True)
95 | )
96 | self.sigmoid = nn.Sigmoid()
97 |
98 |
99 | def make_layer(self, block, out_channels, blocks, stride=1):
100 | downsample = None
101 | if (stride != 1) or (self.in_channels != out_channels):
102 | downsample = nn.Sequential(
103 | conv3x3(self.in_channels, out_channels, stride=stride),
104 | nn.BatchNorm2d(out_channels))
105 | layers = []
106 | layers.append(block(self.in_channels, out_channels, stride, downsample))
107 | self.in_channels = out_channels
108 | for i in range(1, blocks):
109 | layers.append(block(out_channels, out_channels))
110 | return nn.Sequential(*layers)
111 |
112 | def forward(self, x, phase='train'):
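    |         # phase == 'seg': global max-pool of the deepest feature map, then a single fc
    |         #                 -> per-image character probabilities.
    |         # other phases:   apply the fc at every spatial position of the pooled feature
    |         #                 map and take the max over positions (multi-label output).
    |         # for phase in ('seg', 'pretrain', 'pretrain2') the decoder branch below also
    |         # returns a bbox regression map and a sigmoid segmentation map.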
113 | out = self.conv(x)
114 | # print out.size()
115 | out = self.bn(out)
116 | # print out.size()
117 | out = self.relu(out)
118 | # print out.size()
119 | out1 = self.layer1(out) # 64
120 | # print out1.size()
121 | out2 = self.layer2(out1) # 32
122 | # print out2.size()
123 | out3 = self.layer3(out2) # 16
124 | # print out3.size()
125 | out4 = self.layer4(out3) # 8
126 | # print out4.size()
127 | out5 = self.layer5(out4) # 4
128 | # print out5.size()
129 |
130 | # out = F.adaptive_max_pool2d(out5, output_size=(1,1)).view(out.size(0), -1) # 128
131 | # out = out.view(out.size(0), -1)
132 |
133 | if phase == 'seg':
134 | out = F.adaptive_max_pool2d(out5, output_size=(1,1)).view(out.size(0), -1) # 128
135 | out = self.fc(out)
136 | out = out.view(out.size(0), -1)
137 | else:
138 | out = F.max_pool2d(out5, 2)
139 | out_size = out.size()
140 | # out = out.view(out_size[0],out_size[1],out_size[3]).transpose(1,2).contiguous().view(-1, out_size[1])
141 | out = out.view(out_size[0],out_size[1],out_size[2] * out_size[3]).transpose(1,2).contiguous().view(-1, out_size[1])
142 | out = self.fc(out)
143 | out = out.view(out_size[0], out_size[2] * out_size[3], -1).transpose(1,2).contiguous()
144 | out = F.adaptive_max_pool1d(out, output_size=(1)).view(out_size[0], -1)
145 |
146 | # print out.size()
147 | if phase not in ['seg', 'pretrain', 'pretrain2']:
148 | return out
149 |
150 | # detect
151 | cat1 = torch.cat([self.convt1(out5), out4], 1)
152 | # print cat1.size()
153 | dec1 = self.dec1(cat1)
154 | # print dec1.size()
155 | # print out3.size()
156 | cat2 = torch.cat([self.convt2(dec1), out3], 1)
157 | # print cat2.size()
158 | dec2 = self.dec2(cat2)
159 | cat3 = torch.cat([self.convt3(dec2), out2], 1)
160 | dec3 = self.dec3(cat3)
161 | cat4 = torch.cat([self.convt4(dec3), out1], 1)
162 | seg = self.dec4(cat4)
163 | seg = seg.view((seg.size(0), seg.size(2), seg.size(3)))
164 | seg = self.sigmoid(seg)
165 |
166 | bbox = self.bbox(cat2)
167 | # dec2 = self.output(dec2)
168 | # print dec2.size()
169 | size = bbox.size()
170 | bbox = bbox.view((size[0], size[1], -1)).transpose(1,2).contiguous()
171 | bbox = bbox.view((size[0], size[2],size[3],-1, 4))
172 |
173 | return out, bbox, seg
174 |
175 | # resnet = ResNet(ResidualBlock, [2, 2, 2, 2])
176 |
--------------------------------------------------------------------------------
/code/ocr/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinchangchang/ocr_densenet/a31f57e006f73b52b3881fd4a771320f02df2147/code/ocr/tools/__init__.py
--------------------------------------------------------------------------------
/code/ocr/tools/measures.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | import os
3 | import numpy as np
4 | from sklearn import metrics
5 | from PIL import Image
6 | import traceback
7 |
8 | def stati_class_number_true_flase(label, pred):
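  |     # per-class counts: 'number' = ground-truth occurrences of the class,
  |     # 'true' = samples of that class predicted correctly, 'pred' = predicted occurrences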
9 | label = np.array(label)
10 | pred = np.array(pred)
11 |
12 | cls_list = set(label) | set(pred)
13 | d = dict()
14 | for cls in cls_list:
15 | d[cls] = dict()
16 | d[cls]['number'] = np.sum(label==cls)
17 | d[cls]['true'] = np.sum(label[label==cls]==pred[label==cls])
18 | d[cls]['pred'] = np.sum(pred==cls)
19 | return d
20 |
21 | def stati_class_number_true_flase_multi_label_margin(labels, preds):
22 |
23 | d = dict()
24 | for label, pred in zip(labels, preds):
25 | label = set(label[label>=0])
26 | for cls in range(len(pred)):
27 | if cls not in d:
28 | d[cls] = dict()
29 | d[cls]['number'] = 0
30 | d[cls]['true'] = 0
31 | d[cls]['pred'] = 0
32 | if cls in label:
33 | d[cls]['number'] += 1
34 | if pred[cls] > 0.5:
35 | d[cls]['true'] += 1
36 | if pred[cls] > 0.5:
37 | d[cls]['pred'] += 1
38 | return d
39 |
40 | def stati_class_number_true_flase_bce(labels, preds):
41 | d = dict()
42 | labels = labels.astype(np.int64).reshape(-1)
43 | preds = preds.reshape(-1) > 0
44 | index = labels >= 0
45 | labels = labels[index]
46 | preds = preds[index]
47 |
48 | preds_num = preds.sum(0)
49 | true_num = (labels+preds==2).sum(0)
50 | for cls in range(2):
51 | d[cls] = dict()
52 | d[cls]['number'] = (labels==cls).sum()
53 | d[cls]['true'] = (labels+preds==2*cls).sum()
54 |         d[cls]['pred'] = (preds==cls).sum()
55 | return d
56 |
57 | def measures(d_list):
58 |     # merge the statistics of every prediction batch
59 | d_all = dict()
60 | for d in d_list:
61 | for cls in d.keys():
62 | if cls not in d_all:
63 | d_all[cls] = dict()
64 | for k in d[cls].keys():
65 | if k not in d_all[cls]:
66 | d_all[cls][k] = 0
67 | d_all[cls][k] += d[cls][k]
68 | m = dict()
69 | number = sum([d_all[cls]['number'] for cls in d_all.keys()])
70 | for cls in d_all:
71 | m[cls] = dict()
72 | m[cls]['number'] = d_all[cls]['number']
73 | m[cls]['true'] = d_all[cls]['true']
74 | m[cls]['pred'] = d_all[cls]['pred']
75 | m[cls]['ratio'] = d_all[cls]['number'] / (float(number) + 10e-10)
76 | m[cls]['accuracy'] = d_all[cls]['true'] / (float(d_all[cls]['number']) + 10e-10)
77 | m[cls]['precision'] = d_all[cls]['true'] /(float(d_all[cls]['pred']) + 10e-10)
78 | return m
79 |
80 | def print_measures(m, s = 'measures'):
81 | print s
82 | accuracy = 0
83 | for cls in sorted(m.keys()):
84 | print '\tclass: {:d}\taccuracy:{:.6f}\tprecision:{:.6f}\tratio:{:.6f}\t\tN/T/P:{:d}/{:d}/{:d}\
85 | '.format(cls, m[cls]['accuracy'],m[cls]['precision'],m[cls]['ratio'],m[cls]['number'],m[cls]['true'],m[cls]['pred'])
86 | accuracy += m[cls]['accuracy'] * m[cls]['ratio']
87 | print '\tacc:{:.6f}'.format(accuracy)
88 | return accuracy
89 |
90 | def mse(pred_image, image):
91 | pred_image = pred_image.reshape(-1).astype(np.float32)
92 | image = image.reshape(-1).astype(np.float32)
93 | mse_err = metrics.mean_squared_error(pred_image,image)
94 | return mse_err
95 |
96 | def psnr(pred_image, image):
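   |     # PSNR = 10 * log10(255^2 / MSE); e.g. an MSE of 1.0 gives 10 * log10(65025) ~= 48.13 dB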
97 | return 10 * np.log10(255*255/mse(pred_image,image))
98 |
99 |
100 | def psnr_pred(stain_vis=20, end= 10000):
101 | clean_dir = '../../data/AI/testB/'
102 | psnr_list = []
103 | f = open('../../data/result.csv','w')
104 | for i,clean in enumerate(os.listdir(clean_dir)):
105 | clean = os.path.join(clean_dir, clean)
106 | clean_file = clean
107 | pred = clean.replace('.jpg','.png').replace('data','data/test_clean')
108 | stain = clean.replace('trainB','trainA').replace('testB','testA').replace('.jpg','_.jpg')
109 |
110 | try:
111 | pred = np.array(Image.open(pred).resize((250,250))).astype(np.float32)
112 | clean = np.array(Image.open(clean).resize((250,250))).astype(np.float32)
113 | stain = np.array(Image.open(stain).resize((250,250))).astype(np.float32)
114 |
115 | # diff = np.abs(stain - pred)
116 | # vis = 20
117 | # pred[diffgray_vis] = stain[stain>gray_vis]
121 |
122 | if end < 1000:
123 | diff = np.abs(clean - stain)
124 | # stain[diff>stain_vis] = pred[diff>stain_vis]
125 | stain[diff>stain_vis] = clean[diff>stain_vis]
126 |
127 | psnr_pred = psnr(clean, pred)
128 | psnr_stain = psnr(clean, stain)
129 | psnr_list.append([psnr_stain, psnr_pred])
130 | except:
131 | continue
132 | if i>end:
133 | break
134 | print i, min(end, 1000)
135 |
136 | f.write(clean_file.split('/')[-1].split('.')[0])
137 | f.write(',')
138 | f.write(str(psnr_stain))
139 | f.write(',')
140 | f.write(str(psnr_pred))
141 | f.write(',')
142 | f.write(str(psnr_pred/psnr_stain - 1))
143 | f.write('\n')
144 |     # print 'prediction', np.mean(psnr_list)
145 | psnr_list = np.array(psnr_list)
146 | psnr_mean = ((psnr_list[:,1] - psnr_list[:,0]) / psnr_list[:,0]).mean()
147 | if end > 1000:
148 |         print 'stained-image PSNR', psnr_list[:,0].mean()
149 |         print 'predicted-image PSNR', psnr_list[:,1].mean()
150 |         print 'gain ratio', psnr_mean
151 | f.write(str(psnr_mean))
152 | f.close()
153 | return psnr_list[:,0].mean()
154 |
155 | def main():
156 | pmax = [0.,0.]
157 | for vis in range(1, 30):
158 | p = psnr_pred(vis, 10)
159 | print vis, p
160 | if p > pmax[1]:
161 | pmax = [vis, p]
162 | print '...'
163 | # print 256,psnr_pred(256)
164 | print pmax
165 | # print 10 * np.log10(255*255/metrics.mean_squared_error([3],[9]))
166 |
167 |
168 | if __name__ == '__main__':
169 | psnr_pred(4000)
170 | # main()
171 | # for v in range(1,10):
172 | # print v, 10 * np.log10(255*255/v/v)
173 |
--------------------------------------------------------------------------------
/code/ocr/tools/parse.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 |
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser(description='OCR')
6 |
7 | parser.add_argument(
8 | '--model',
9 | '-m',
10 | type=str,
11 | default='densenet',
12 | help='model'
13 | )
14 | parser.add_argument(
15 | '--data-dir',
16 | '-d',
17 | type=str,
18 | default='../../data/dataset/',
19 | help='data directory'
20 | )
21 | parser.add_argument(
22 | '--bg-dir',
23 | type=str,
24 | default='../../data/images',
25 |     help='background images directory'
26 | )
27 | parser.add_argument(
28 | '--hard-mining',
29 | type=int,
30 | default=0,
31 | help='use hard mining'
32 | )
33 | parser.add_argument('--phase',
34 | default='train',
35 | type=str,
36 | metavar='S',
37 | help='pretrain/train/test phase')
38 | parser.add_argument(
39 | '--batch-size',
40 | '-b',
41 | metavar='BATCH SIZE',
42 | type=int,
43 | default=16,
44 | help='batch size'
45 | )
46 | parser.add_argument('--save-dir',
47 | default='../../data',
48 | type=str,
49 | metavar='S',
50 | help='save dir')
51 | parser.add_argument('--word-index-json',
52 | default='../../files/alphabet_index_dict.json',
53 | type=str,
54 | metavar='S',
55 |                     help='word-to-index dict json')
56 | parser.add_argument('--black-json',
57 | default='../../files/black.json',
58 | type=str,
59 | metavar='S',
60 | help='black_list json')
61 | parser.add_argument('--image-hw-ratio-json',
62 | default='../../files/image_hw_ratio_dict.json',
63 | type=str,
64 | metavar='S',
65 | help='image h:w ratio dict')
66 | parser.add_argument('--word-count-json',
67 | default='../../files/alphabet_count_dict.json',
68 | type=str,
69 | metavar='S',
70 | help='word count file')
71 | parser.add_argument('--image-label-json',
72 | default='../../files/train_alphabet.json',
73 | type=str,
74 | metavar='S',
75 | help='image label json')
76 | parser.add_argument('--resume',
77 | default='',
78 | type=str,
79 | metavar='S',
80 | help='start from checkpoints')
81 | parser.add_argument('--no-aug',
82 | default=0,
83 | type=int,
84 | metavar='S',
85 | help='no augmentation')
86 | parser.add_argument('--small',
87 | default=1,
88 | type=int,
89 | metavar='S',
90 | help='small fonts')
91 | parser.add_argument('--difficult',
92 | default=0,
93 | type=int,
94 | metavar='S',
95 |                     help='only evaluate the harder images')
96 | parser.add_argument('--hist',
97 | default=0,
98 | type=int,
99 | metavar='S',
100 |                     help='use histogram equalization')
101 | parser.add_argument('--feat',
102 | default=0,
103 | type=int,
104 | metavar='S',
105 |                     help='generate features for an LSTM')
106 |
107 | #####
108 | parser.add_argument('-j',
109 | '--workers',
110 | default=8,
111 | type=int,
112 | metavar='N',
113 |                     help='number of data loading workers (default: 8)')
114 | parser.add_argument('--lr',
115 | '--learning-rate',
116 | default=0.001,
117 | type=float,
118 | metavar='LR',
119 | help='initial learning rate')
120 | parser.add_argument('--epochs',
121 | default=10000,
122 | type=int,
123 | metavar='N',
124 | help='number of total epochs to run')
125 | parser.add_argument('--save-freq',
126 | default='5',
127 | type=int,
128 | metavar='S',
129 | help='save frequency')
130 | parser.add_argument('--save-pred-freq',
131 | default='10',
132 | type=int,
133 | metavar='S',
134 | help='save pred clean frequency')
135 | parser.add_argument('--val-freq',
136 | default='5',
137 | type=int,
138 | metavar='S',
139 | help='val frequency')
140 | parser.add_argument('--debug',
141 | default=0,
142 | type=int,
143 | metavar='S',
144 | help='debug')
145 | parser.add_argument('--input-filter',
146 | default=7,
147 | type=int,
148 | metavar='S',
149 |                     help='input filter')
150 | parser.add_argument('--use-gan',
151 | default=0,
152 | type=int,
153 | metavar='S',
154 | help='use GAN')
155 | parser.add_argument('--write-pred',
156 | default=0,
157 | type=int,
158 | metavar='S',
159 |                     help='write predictions')
160 | parser.add_argument(
161 | '--result-file',
162 | '-r',
163 | type=str,
164 | default='../../data/result/test_result.csv',
165 | help='result file'
166 | )
167 | parser.add_argument(
168 | '--output-file',
169 | '-o',
170 | type=str,
171 | default='../../data/result/test.csv',
172 | help='output file'
173 | )
174 | args = parser.parse_args()
175 |
--------------------------------------------------------------------------------
/code/ocr/tools/plot.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | def plot_multi_graph(image_list, name_list, save_path=None, show=False):
6 | graph_place = int(np.sqrt(len(name_list) - 1)) + 1
7 | for i, (image, name) in enumerate(zip(image_list, name_list)):
8 | ax1 = plt.subplot(graph_place,graph_place,i+1)
9 | ax1.set_title(name)
10 | # plt.imshow(image,cmap='gray')
11 | plt.imshow(image)
12 | plt.axis('off')
13 | if save_path:
14 | plt.savefig(save_path)
15 | pass
16 | if show:
17 | plt.show()
18 |
19 | def plot_multi_line(x_list, y_list, name_list, save_path=None, show=False):
20 | graph_place = int(np.sqrt(len(name_list) - 1)) + 1
21 | for i, (x, y, name) in enumerate(zip(x_list, y_list, name_list)):
22 | ax1 = plt.subplot(graph_place,graph_place,i+1)
23 | ax1.set_title(name)
24 | plt.plot(x,y)
25 | # plt.imshow(image,cmap='gray')
26 | if save_path:
27 | plt.savefig(save_path)
28 | if show:
29 | plt.show()
30 |
31 |
32 |
--------------------------------------------------------------------------------
/code/ocr/tools/py_op.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Commonly used Python helper functions.
4 | """
5 | import os
6 | import json
7 | import traceback
8 | from collections import OrderedDict
9 | import random
10 | from fuzzywuzzy import fuzz
11 |
12 | import sys
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 | ################################################################################
17 | ### pre define variables
18 | #:: enumerate
19 | #:: raw_input
20 | #:: listdir
21 | #:: sorted
22 | ### pre define function
23 | def mywritejson(save_path,content):
24 | content = json.dumps(content,indent=4,ensure_ascii=False)
25 | with open(save_path,'w') as f:
26 | f.write(content)
27 |
28 | def myreadjson(load_path):
29 | with open(load_path,'r') as f:
30 | return json.loads(f.read())
31 |
32 | def mywritefile(save_path,content):
33 | with open(save_path,'w') as f:
34 | f.write(content)
35 |
36 | def myreadfile(load_path):
37 | with open(load_path,'r') as f:
38 | return f.read()
39 |
40 | def myprint(content):
41 | print json.dumps(content,indent=4,ensure_ascii=False)
42 |
43 | def rm(fi):
44 | os.system('rm ' + fi)
45 |
46 | def mystrip(s):
47 | return ''.join(s.split())
48 |
49 | def mysorteddict(d,key = lambda s:s, reverse=False):
50 | dordered = OrderedDict()
51 | for k in sorted(d.keys(),key = key,reverse=reverse):
52 | dordered[k] = d[k]
53 | return dordered
54 |
55 | def mysorteddictfile(src,obj):
56 | mywritejson(obj,mysorteddict(myreadjson(src)))
57 |
58 | def myfuzzymatch(srcs,objs,grade=80):
59 | matchDict = OrderedDict()
60 | for src in srcs:
61 | for obj in objs:
62 | value = fuzz.partial_ratio(src,obj)
63 | if value > grade:
64 | try:
65 | matchDict[src].append(obj)
66 | except:
67 | matchDict[src] = [obj]
68 | return matchDict
69 |
70 | def mydumps(x):
71 |     return json.dumps(x,indent=4,ensure_ascii=False)
72 |
73 | def get_random_list(l,num=-1,isunique=0):
74 | if isunique:
75 | l = set(l)
76 | if num < 0:
77 | num = len(l)
78 | if isunique and num > len(l):
79 | return
80 | lnew = []
81 | l = list(l)
82 | while(num>len(lnew)):
83 | x = l[int(random.random()*len(l))]
84 | if isunique and x in lnew:
85 | continue
86 | lnew.append(x)
87 | return lnew
88 |
89 | def fuzz_list(node1_list,node2_list,score_baseline=66,proposal_num=10,string_map=None):
90 | node_dict = { }
91 | for i,node1 in enumerate(node1_list):
92 | match_score_dict = { }
93 | for node2 in node2_list:
94 | if node1 != node2:
95 | if string_map is not None:
96 | n1 = string_map(node1)
97 | n2 = string_map(node2)
98 | score = fuzz.partial_ratio(n1,n2)
99 | if n1 == n2:
100 | node2_list.remove(node2)
101 | else:
102 | score = fuzz.partial_ratio(node1,node2)
103 | if score > score_baseline:
104 | match_score_dict[node2] = score
105 | else:
106 | node2_list.remove(node2)
107 | node2_sort = sorted(match_score_dict.keys(), key=lambda k:match_score_dict[k],reverse=True)
108 | node_dict[node1] = [[n,match_score_dict[n]] for n in node2_sort[:proposal_num]]
109 | print i,len(node1_list)
110 | return node_dict, node2_list
111 |
112 | def swap(a,b):
113 | return b, a
114 |
115 | def mkdir(d):
116 | path = d.split('/')
117 | for i in range(len(path)):
118 | d = '/'.join(path[:i+1])
119 | if not os.path.exists(d):
120 | os.mkdir(d)
121 |
122 |
--------------------------------------------------------------------------------
/code/ocr/tools/segmentation.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | import matplotlib.pyplot as plt
3 | from scipy import ndimage as ndi
4 | from skimage import morphology,color,data
5 | from skimage import filters
6 | import numpy as np
7 | import skimage
8 | import os
9 | from skimage import measure
10 |
11 |
12 |
13 | def watershed(image, label=None):
14 |     denoised = filters.rank.median(image, morphology.disk(2)) # remove noise
15 |     # use pixels whose gradient is below 10 as the initial markers
16 |     markers = filters.rank.gradient(denoised, morphology.disk(5)) < 10
17 |     markers = ndi.label(markers)[0]
18 |
19 |     gradient = filters.rank.gradient(denoised, morphology.disk(2)) # compute the gradient
20 |     labels = morphology.watershed(gradient, markers, mask=image) # gradient-based watershed
21 |
22 | fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(6, 6))
23 | axes = axes.ravel()
24 | ax0, ax1, ax2, ax3 = axes
25 |
26 | ax0.imshow(image, cmap=plt.cm.gray, interpolation='nearest')
27 | ax0.set_title("Original")
28 | # ax1.imshow(gradient, cmap=plt.cm.spectral, interpolation='nearest')
29 | ax1.imshow(gradient, cmap=plt.cm.gray, interpolation='nearest')
30 | ax1.set_title("Gradient")
31 | if label is not None:
32 | # ax2.imshow(markers, cmap=plt.cm.spectral, interpolation='nearest')
33 | ax2.imshow(label, cmap=plt.cm.gray, interpolation='nearest')
34 | else:
35 | ax2.imshow(markers, cmap=plt.cm.spectral, interpolation='nearest')
36 | ax2.set_title("Markers")
37 | ax3.imshow(labels, cmap=plt.cm.spectral, interpolation='nearest')
38 | ax3.set_title("Segmented")
39 |
40 | for ax in axes:
41 | ax.axis('off')
42 |
43 | fig.tight_layout()
44 | plt.show()
45 |
46 | def plot_4(image, gradient,label,segmentation, save_path=None):
47 | fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(6, 6))
48 | axes = axes.ravel()
49 | ax0, ax1, ax2, ax3 = axes
50 | ax0.imshow(image, cmap=plt.cm.gray, interpolation='nearest')
51 | ax0.set_title("Original")
52 | ax1.imshow(gradient, cmap=plt.cm.gray, interpolation='nearest')
53 | ax1.set_title("Gradient")
54 | ax2.imshow(label, cmap=plt.cm.gray, interpolation='nearest')
55 | ax2.set_title("label")
56 | ax3.imshow(segmentation, cmap=plt.cm.spectral, interpolation='nearest')
57 | ax3.set_title("Segmented")
58 |
59 | for ax in axes:
60 | ax.axis('off')
61 |
62 | fig.tight_layout()
63 | if save_path:
64 | print save_path
65 | plt.savefig(save_path)
66 | else:
67 | plt.show()
68 |
69 | def fill(image):
70 | '''
71 |     Fill the holes inside the mask.
72 |     Quick temporary implementation;
73 |     consider replacing it later.
74 | '''
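   |     # label the connected components of the zero-valued (non-mask) pixels (background=1)
   |     # and flip every component except the largest one to 1, which fills enclosed holes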
75 | label_img = measure.label(image, background=1)
76 | props = measure.regionprops(label_img)
77 | max_area = np.array([p.area for p in props]).max()
78 | for i,prop in enumerate(props):
79 | if prop.area < max_area:
80 | image[prop.coords[:,0],prop.coords[:,1]] = 1
81 | return image
82 |
83 |
84 |
85 | def my_watershed(image, label=None, min_gray=480, max_gray=708, min_gradient=5, show=False, save_path='/tmp/x.jpg'):
86 | image = image - min_gray
87 | image[image>max_gray] = 0
88 | image[image< 10] = 0
89 | image = image * 5
90 |
91 |     denoised = filters.rank.median(image, morphology.disk(2)) # remove noise
92 |     # use pixels whose gradient is below 10 as the initial markers
93 |     markers = filters.rank.gradient(denoised, morphology.disk(5)) < 10
94 |     markers = ndi.label(markers)[0]
95 |
96 |     gradient = filters.rank.gradient(denoised, morphology.disk(2)) # compute the gradient
97 | labels = gradient > min_gradient
98 |
99 | mask = gradient > min_gradient
100 | label_img = measure.label(mask, background=0)
101 | props = measure.regionprops(label_img)
102 | pred = np.zeros_like(gradient)
103 | for i,prop in enumerate(props):
104 | if prop.area > 50:
105 | region = np.array(prop.coords)
106 | vx,vy = region.var(0)
107 | v = vx + vy
108 | if v < 200:
109 | pred[prop.coords[:,0],prop.coords[:,1]] = 1
110 |
111 |     # fill the holes enclosed by the detected edges
112 | pred = fill(pred)
113 |
114 | if show:
115 | plot_4(image, gradient, label, pred)
116 | else:
117 | plot_4(image, gradient, label, pred, save_path)
118 |
119 | return pred
120 |
121 | def segmentation(image_npy, label_npy, save_path='/tmp/x.jpg'):
122 | print image_npy
123 | image = np.load(image_npy)
124 | label = np.load(label_npy)
125 | if np.sum(label) == 0:
126 | return
127 | min_gray,max_gray = 480, 708
128 | my_watershed(image,label,min_gray, max_gray,show=False, save_path=save_path)
129 |
130 | def main():
131 | data_dir = '/home/yin/all/PVL_DATA/preprocessed/2D/'
132 | save_dir = '/home/yin/all/PVL_DATA/tool_result/'
133 | os.system('rm -r ' + save_dir)
134 | os.system('mkdir ' + save_dir)
135 | for patient in os.listdir(data_dir):
136 | patient_dir = os.path.join(data_dir, patient)
137 | for f in os.listdir(patient_dir):
138 | if 'roi.npy' in f:
139 | label_npy = os.path.join(patient_dir,f)
140 | image_npy = label_npy.replace('.roi.npy','.npy')
141 | segmentation(image_npy,label_npy, os.path.join(save_dir,label_npy.strip('/').replace('/','.').replace('npy','jpg')))
142 |
143 | if __name__ == '__main__':
144 | # image =color.rgb2gray(data.camera())
145 | # watershed(image)
146 | main()
147 | image_npy = '/home/yin/all/PVL_DATA/preprocessed/2D/JD_chen_xi/23.npy'
148 | image_npy = '/home/yin/all/PVL_DATA/preprocessed/2D/JD_chen_xi/14.npy'
149 | image_npy = '/home/yin/all/PVL_DATA/preprocessed/2D/JD_zhang_yu_chen/23.npy'
150 | label_npy = image_npy.replace('.npy','.roi.npy')
151 | segmentation(image_npy,label_npy)
152 |
153 |
154 |
--------------------------------------------------------------------------------
/code/ocr/tools/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Copyright (c) 2017 www.drcubic.com, Inc. All Rights Reserved
5 | #
6 | """
7 | File: utils.py
8 | Author: shileicao(shileicao@stu.xjtu.edu.cn)
9 | Date: 2017-06-20 14:56:54
10 |
11 | **Note.** This code absorb some code from following source.
12 | 1. [DSB2017](https://github.com/lfz/DSB2017)
13 | """
14 |
15 | import os
16 | import sys
17 |
18 | import numpy as np
19 | import torch
20 |
21 |
22 | def getFreeId():
23 | import pynvml
24 |
25 | pynvml.nvmlInit()
26 |
27 | def getFreeRatio(id):
28 | handle = pynvml.nvmlDeviceGetHandleByIndex(id)
29 | use = pynvml.nvmlDeviceGetUtilizationRates(handle)
30 |         ratio = 0.5 * (float(use.gpu) + float(use.memory))
31 | return ratio
32 |
33 | deviceCount = pynvml.nvmlDeviceGetCount()
34 | available = []
35 | for i in range(deviceCount):
36 | if getFreeRatio(i) < 70:
37 | available.append(i)
38 | gpus = ''
39 | for g in available:
40 | gpus = gpus + str(g) + ','
41 | gpus = gpus[:-1]
42 | return gpus
43 |
44 |
45 | def setgpu(gpuinput):
46 | freeids = getFreeId()
47 | if gpuinput == 'all':
48 | gpus = freeids
49 | else:
50 | gpus = gpuinput
51 |         busy_gpu = [g for g in gpus.split(',') if g not in freeids]
52 |         if len(busy_gpu) > 0:
53 |             raise ValueError('gpu ' + ' '.join(busy_gpu) + ' is being used')
54 | print('using gpu ' + gpus)
55 | os.environ['CUDA_VISIBLE_DEVICES'] = gpus
56 | return len(gpus.split(','))
57 |
58 |
59 | def error_mask_stats(labels, filenames):
60 | error_f = []
61 | for i, f in enumerate(filenames):
62 | # if not np.all(labels[i] > 0):
63 | # error_f.append(f)
64 | for bbox_i in range(labels[i].shape[0]):
65 | imgs = np.load(f)
66 | if not np.all(
67 | np.array(imgs.shape[1:]) - labels[i][bbox_i][:-1] > 0):
68 | error_f.append(f)
69 | error_f = list(set(error_f))
70 | fileid_list = [os.path.split(filename)[1].split('_')[0]
71 | for filename in error_f]
72 | print("','".join(fileid_list))
73 | return error_f
74 |
75 |
76 | class Logger(object):
77 | def __init__(self, logfile):
78 | self.terminal = sys.stdout
79 | self.log = open(logfile, "a")
80 |
81 | def write(self, message):
82 | self.terminal.write(message)
83 | self.log.write(message)
84 |
85 | def flush(self):
86 | #this flush method is needed for python 3 compatibility.
87 | #this handles the flush command by doing nothing.
88 | #you might want to specify some extra behavior here.
89 | pass
90 |
91 |
92 | def split4(data, max_stride, margin):
93 | splits = []
94 | data = torch.Tensor.numpy(data)
95 | _, c, z, h, w = data.shape
96 |
97 | w_width = np.ceil(float(w / 2 + margin) /
98 | max_stride).astype('int') * max_stride
99 | h_width = np.ceil(float(h / 2 + margin) /
100 | max_stride).astype('int') * max_stride
101 | pad = int(np.ceil(float(z) / max_stride) * max_stride) - z
102 | leftpad = pad / 2
103 | pad = [[0, 0], [0, 0], [leftpad, pad - leftpad], [0, 0], [0, 0]]
104 | data = np.pad(data, pad, 'constant', constant_values=-1)
105 | data = torch.from_numpy(data)
106 | splits.append(data[:, :, :, :h_width, :w_width])
107 | splits.append(data[:, :, :, :h_width, -w_width:])
108 | splits.append(data[:, :, :, -h_width:, :w_width])
109 | splits.append(data[:, :, :, -h_width:, -w_width:])
110 |
111 | return torch.cat(splits, 0)
112 |
113 |
114 | def combine4(output, h, w):
115 | splits = []
116 | for i in range(len(output)):
117 | splits.append(output[i])
118 |
119 | output = np.zeros(
120 | (splits[0].shape[0], h, w, splits[0].shape[3],
121 | splits[0].shape[4]), np.float32)
122 |
123 | h0 = output.shape[1] / 2
124 | h1 = output.shape[1] - h0
125 | w0 = output.shape[2] / 2
126 | w1 = output.shape[2] - w0
127 |
128 | splits[0] = splits[0][:, :h0, :w0, :, :]
129 | output[:, :h0, :w0, :, :] = splits[0]
130 |
131 | splits[1] = splits[1][:, :h0, -w1:, :, :]
132 | output[:, :h0, -w1:, :, :] = splits[1]
133 |
134 | splits[2] = splits[2][:, -h1:, :w0, :, :]
135 | output[:, -h1:, :w0, :, :] = splits[2]
136 |
137 | splits[3] = splits[3][:, -h1:, -w1:, :, :]
138 | output[:, -h1:, -w1:, :, :] = splits[3]
139 |
140 | return output
141 |
142 |
143 | def split8(data, max_stride, margin):
144 | splits = []
145 | if isinstance(data, np.ndarray):
146 | c, z, h, w = data.shape
147 | else:
148 | _, c, z, h, w = data.size()
149 |
150 | z_width = np.ceil(float(z / 2 + margin) /
151 | max_stride).astype('int') * max_stride
152 | w_width = np.ceil(float(w / 2 + margin) /
153 | max_stride).astype('int') * max_stride
154 | h_width = np.ceil(float(h / 2 + margin) /
155 | max_stride).astype('int') * max_stride
156 | for zz in [[0, z_width], [-z_width, None]]:
157 | for hh in [[0, h_width], [-h_width, None]]:
158 | for ww in [[0, w_width], [-w_width, None]]:
159 | if isinstance(data, np.ndarray):
160 | splits.append(data[np.newaxis, :, zz[0]:zz[1], hh[0]:hh[1],
161 | ww[0]:ww[1]])
162 | else:
163 | splits.append(data[:, :, zz[0]:zz[1], hh[0]:hh[1], ww[0]:
164 | ww[1]])
165 |
166 | if isinstance(data, np.ndarray):
167 | return np.concatenate(splits, 0)
168 | else:
169 | return torch.cat(splits, 0)
170 |
171 |
172 | def combine8(output, z, h, w):
173 | splits = []
174 | for i in range(len(output)):
175 | splits.append(output[i])
176 |
177 | output = np.zeros(
178 | (z, h, w, splits[0].shape[3], splits[0].shape[4]), np.float32)
179 |
180 | z_width = z / 2
181 | h_width = h / 2
182 | w_width = w / 2
183 | i = 0
184 | for zz in [[0, z_width], [z_width - z, None]]:
185 | for hh in [[0, h_width], [h_width - h, None]]:
186 | for ww in [[0, w_width], [w_width - w, None]]:
187 | output[zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[1], :, :] = splits[
188 | i][zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[1], :, :]
189 | i = i + 1
190 |
191 | return output
192 |
193 |
194 | def split16(data, max_stride, margin):
195 | splits = []
196 | _, c, z, h, w = data.size()
197 |
198 | z_width = np.ceil(float(z / 4 + margin) /
199 | max_stride).astype('int') * max_stride
200 | z_pos = [z * 3 / 8 - z_width / 2, z * 5 / 8 - z_width / 2]
201 | h_width = np.ceil(float(h / 2 + margin) /
202 | max_stride).astype('int') * max_stride
203 | w_width = np.ceil(float(w / 2 + margin) /
204 | max_stride).astype('int') * max_stride
205 | for zz in [[0, z_width], [z_pos[0], z_pos[0] + z_width],
206 | [z_pos[1], z_pos[1] + z_width], [-z_width, None]]:
207 | for hh in [[0, h_width], [-h_width, None]]:
208 | for ww in [[0, w_width], [-w_width, None]]:
209 | splits.append(data[:, :, zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[
210 | 1]])
211 |
212 | return torch.cat(splits, 0)
213 |
214 |
215 | def combine16(output, z, h, w):
216 | splits = []
217 | for i in range(len(output)):
218 | splits.append(output[i])
219 |
220 | output = np.zeros(
221 | (z, h, w, splits[0].shape[3], splits[0].shape[4]), np.float32)
222 |
223 | z_width = z / 4
224 | h_width = h / 2
225 | w_width = w / 2
226 | splitzstart = splits[0].shape[0] / 2 - z_width / 2
227 | z_pos = [z * 3 / 8 - z_width / 2, z * 5 / 8 - z_width / 2]
228 | i = 0
229 | for zz, zz2 in zip(
230 | [[0, z_width], [z_width, z_width * 2], [z_width * 2, z_width * 3],
231 | [z_width * 3 - z, None]],
232 | [[0, z_width], [splitzstart, z_width + splitzstart],
233 | [splitzstart, z_width + splitzstart], [z_width * 3 - z, None]]):
234 | for hh in [[0, h_width], [h_width - h, None]]:
235 | for ww in [[0, w_width], [w_width - w, None]]:
236 | output[zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[1], :, :] = splits[
237 | i][zz2[0]:zz2[1], hh[0]:hh[1], ww[0]:ww[1], :, :]
238 | i = i + 1
239 |
240 | return output
241 |
242 |
243 | def split32(data, max_stride, margin):
244 | splits = []
245 | _, c, z, h, w = data.size()
246 |
247 | z_width = np.ceil(float(z / 2 + margin) /
248 | max_stride).astype('int') * max_stride
249 | w_width = np.ceil(float(w / 4 + margin) /
250 | max_stride).astype('int') * max_stride
251 | h_width = np.ceil(float(h / 4 + margin) /
252 | max_stride).astype('int') * max_stride
253 |
254 | w_pos = [w * 3 / 8 - w_width / 2, w * 5 / 8 - w_width / 2]
255 | h_pos = [h * 3 / 8 - h_width / 2, h * 5 / 8 - h_width / 2]
256 |
257 | for zz in [[0, z_width], [-z_width, None]]:
258 | for hh in [[0, h_width], [h_pos[0], h_pos[0] + h_width],
259 | [h_pos[1], h_pos[1] + h_width], [-h_width, None]]:
260 | for ww in [[0, w_width], [w_pos[0], w_pos[0] + w_width],
261 | [w_pos[1], w_pos[1] + w_width], [-w_width, None]]:
262 | splits.append(data[:, :, zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[
263 | 1]])
264 |
265 | return torch.cat(splits, 0)
266 |
267 |
268 | def combine32(splits, z, h, w):
269 |
270 | output = np.zeros(
271 | (z, h, w, splits[0].shape[3], splits[0].shape[4]), np.float32)
272 |
273 | z_width = int(np.ceil(float(z) / 2))
274 | h_width = int(np.ceil(float(h) / 4))
275 | w_width = int(np.ceil(float(w) / 4))
276 | splithstart = splits[0].shape[1] / 2 - h_width / 2
277 | splitwstart = splits[0].shape[2] / 2 - w_width / 2
278 |
279 | i = 0
280 | for zz in [[0, z_width], [z_width - z, None]]:
281 |
282 | for hh, hh2 in zip(
283 | [[0, h_width], [h_width, h_width * 2], [h_width * 2, h_width * 3],
284 | [h_width * 3 - h, None]],
285 | [[0, h_width], [splithstart, h_width + splithstart],
286 | [splithstart, h_width + splithstart], [h_width * 3 - h, None]]):
287 |
288 | for ww, ww2 in zip(
289 | [[0, w_width], [w_width, w_width * 2],
290 | [w_width * 2, w_width * 3], [w_width * 3 - w, None]],
291 | [[0, w_width], [splitwstart, w_width + splitwstart],
292 | [splitwstart, w_width + splitwstart],
293 | [w_width * 3 - w, None]]):
294 |
295 | output[zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[1], :, :] = splits[
296 | i][zz[0]:zz[1], hh2[0]:hh2[1], ww2[0]:ww2[1], :, :]
297 | i = i + 1
298 |
299 | return output
300 |
301 |
302 | def split64(data, max_stride, margin):
303 | splits = []
304 | _, c, z, h, w = data.size()
305 |
306 | z_width = np.ceil(float(z / 4 + margin) /
307 | max_stride).astype('int') * max_stride
308 | w_width = np.ceil(float(w / 4 + margin) /
309 | max_stride).astype('int') * max_stride
310 | h_width = np.ceil(float(h / 4 + margin) /
311 | max_stride).astype('int') * max_stride
312 |
313 | z_pos = [z * 3 / 8 - z_width / 2, z * 5 / 8 - z_width / 2]
314 | w_pos = [w * 3 / 8 - w_width / 2, w * 5 / 8 - w_width / 2]
315 | h_pos = [h * 3 / 8 - h_width / 2, h * 5 / 8 - h_width / 2]
316 |
317 | for zz in [[0, z_width], [z_pos[0], z_pos[0] + z_width],
318 | [z_pos[1], z_pos[1] + z_width], [-z_width, None]]:
319 | for hh in [[0, h_width], [h_pos[0], h_pos[0] + h_width],
320 | [h_pos[1], h_pos[1] + h_width], [-h_width, None]]:
321 | for ww in [[0, w_width], [w_pos[0], w_pos[0] + w_width],
322 | [w_pos[1], w_pos[1] + w_width], [-w_width, None]]:
323 | splits.append(data[:, :, zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[
324 | 1]])
325 |
326 | return torch.cat(splits, 0)
327 |
328 |
329 | def combine64(output, z, h, w):
330 | splits = []
331 | for i in range(len(output)):
332 | splits.append(output[i])
333 |
334 | output = np.zeros(
335 | (z, h, w, splits[0].shape[3], splits[0].shape[4]), np.float32)
336 |
337 | z_width = int(np.ceil(float(z) / 4))
338 | h_width = int(np.ceil(float(h) / 4))
339 | w_width = int(np.ceil(float(w) / 4))
340 | splitzstart = splits[0].shape[0] / 2 - z_width / 2
341 | splithstart = splits[0].shape[1] / 2 - h_width / 2
342 | splitwstart = splits[0].shape[2] / 2 - w_width / 2
343 |
344 | i = 0
345 | for zz, zz2 in zip(
346 | [[0, z_width], [z_width, z_width * 2], [z_width * 2, z_width * 3],
347 | [z_width * 3 - z, None]],
348 | [[0, z_width], [splitzstart, z_width + splitzstart],
349 | [splitzstart, z_width + splitzstart], [z_width * 3 - z, None]]):
350 |
351 | for hh, hh2 in zip(
352 | [[0, h_width], [h_width, h_width * 2], [h_width * 2, h_width * 3],
353 | [h_width * 3 - h, None]],
354 | [[0, h_width], [splithstart, h_width + splithstart],
355 | [splithstart, h_width + splithstart], [h_width * 3 - h, None]]):
356 |
357 | for ww, ww2 in zip(
358 | [[0, w_width], [w_width, w_width * 2],
359 | [w_width * 2, w_width * 3], [w_width * 3 - w, None]],
360 | [[0, w_width], [splitwstart, w_width + splitwstart],
361 | [splitwstart, w_width + splitwstart],
362 | [w_width * 3 - w, None]]):
363 |
364 | output[zz[0]:zz[1], hh[0]:hh[1], ww[0]:ww[1], :, :] = splits[
365 | i][zz2[0]:zz2[1], hh2[0]:hh2[1], ww2[0]:ww2[1], :, :]
366 | i = i + 1
367 |
368 | return output
369 |
--------------------------------------------------------------------------------
/code/preprocessing/analysis_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | #########################################################################
3 | # File Name: analysis_dataset.py
4 | # Author: ccyin
5 | # mail: ccyin04@gmail.com
6 | # Created Time: Fri 18 May 2018 04:19:58 PM CST
7 | #########################################################################
8 | '''
9 | This script analyzes the original dataset:
10 | stati_image_size: statistics on image sizes
11 | stati_label_length: statistics on label (text) lengths
12 | '''
13 |
14 | import os
15 | import json
16 | from PIL import Image
17 | import numpy as np
18 | from tqdm import tqdm
19 | import sys
20 | sys.path.append('../ocr')
21 | from tools import plot
22 |
23 | def stati_image_size(image_dir, save_dir, big_w_dir):
24 | if not os.path.exists(big_w_dir):
25 | os.mkdir(big_w_dir)
26 | if not os.path.exists(save_dir):
27 | os.mkdir(save_dir)
28 | h_count_dict, w_count_dict, r_count_dict = { }, { }, { }
29 | image_hw_ratio_dict = { }
30 | for image in os.listdir(image_dir):
31 | h,w = Image.open(os.path.join(image_dir, image)).size
32 | if w > 80:
33 | cmd = 'cp ../../data/dataset/train/{:s} {:s}'.format(image, big_w_dir)
34 | # os.system(cmd)
35 |
36 | r = int(h / 8. / w)
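   |         # note: PIL's .size is (width, height), so h above is actually the width and w the height;
   |         # e.g. a 600x60 image (hypothetical size) gives r = int(600 / 8. / 60) = 1 and goes to
   |         # the 64*1024 group, while a 300x60 image gives r = 0 and stays in the 64*512 group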
37 | h = h / 10
38 | w = w / 10
39 | r_count_dict[r] = r_count_dict.get(r, 0) + 1
40 | h_count_dict[h] = h_count_dict.get(h, 0) + 1
41 | w_count_dict[w] = w_count_dict.get(w, 0) + 1
42 | image_hw_ratio_dict[image] = r
43 |
44 | with open(os.path.join(save_dir, 'image_hw_ratio_dict.json'), 'w') as f:
45 | f.write(json.dumps(image_hw_ratio_dict, indent=4))
46 |
47 | x = range(max(h_count_dict.keys())+1)
48 | y = [0 for _ in x]
49 | for h in sorted(h_count_dict.keys()):
50 |         print 'image length {:d}~{:d}: {:d} images'.format(10*h, 10*h+10, h_count_dict[h])
51 | y[h] = h_count_dict[h]
52 | plot.plot_multi_line([x], [y], ['Length'], save_path='../../data/length.png', show=True)
53 |
54 | x = range(max(w_count_dict.keys())+1)
55 | y = [0 for _ in x]
56 | for w in sorted(w_count_dict.keys()):
57 |         print 'image width {:d}~{:d}: {:d} images'.format(10*w, 10*w+10, w_count_dict[w])
58 | y[w] = w_count_dict[w]
59 | plot.plot_multi_line([x], [y], ['Width'], save_path='../../data/width.png', show=True)
60 |
61 | x = range(max(r_count_dict.keys())+1)
62 | y = [0 for _ in x]
63 | for r in sorted(r_count_dict.keys()):
64 |         print 'aspect ratio {:d}~{:d}: {:d} images'.format(8*r, 8*r+8, r_count_dict[r])
65 | y[r] = r_count_dict[r]
66 | x = [8*(_+1) for _ in x]
67 | plot.plot_multi_line([x], [y], ['L/W'], save_path='../../data/ratio.png', show=True)
68 |
69 |     print '\nmost common length\n', sorted(h_count_dict.keys(), key=lambda h:h_count_dict[h])[-1] * 10
70 |     print '\nmost common width\n', sorted(w_count_dict.keys(), key=lambda w:w_count_dict[w])[-1] * 10
71 |
72 |     print 'suggestion: use 64 * 512 inputs'
73 |     print '            use 64 * 1024 inputs for part of the images'
74 |     print '            ignore the rest'
75 |     print 'suggestion: use an FCN and take a global max to get the final result'
76 |
77 | def stati_label_length(label_json, long_text_dir):
78 | if not os.path.exists(long_text_dir):
79 | os.mkdir(long_text_dir)
80 | image_label_json = json.load(open(label_json))
81 | l_count_dict = { }
82 | for image, label in image_label_json.items():
83 | l = len(label.split())
84 | l_count_dict[l] = l_count_dict.get(l, 0) + 1
85 | if l > 25:
86 | cmd = 'cp ../../data/dataset/train/{:s} {:s}'.format(image, long_text_dir)
87 | # os.system(cmd)
88 |
89 | word_num = 0.
90 | x = range(max(l_count_dict.keys())+1)
91 | y = [0 for _ in x]
92 | for l in sorted(l_count_dict.keys()):
93 | word_num += l * l_count_dict[l]
94 |         print 'text length {:d}: {:d} images'.format(l, l_count_dict[l])
95 | y[l] = l_count_dict[l]
96 | plot.plot_multi_line([x], [y], ['Word Number'], save_path='../../data/word_num.png', show=True)
97 |     print 'on average {:3.4f} characters per image'.format(word_num / sum(l_count_dict.values()))
98 |
99 | def stati_image_gray(image_dir):
100 | print 'eval train image gray'
101 | for image in tqdm(os.listdir(image_dir)):
102 | image = Image.open(os.path.join(image_dir, image)).convert('RGB')
103 | image = np.array(image)
104 | mi,ma = image.min(), image.max()
105 | assert mi >= 0
106 | assert ma < 256
107 |
108 | print 'eval test image gray'
109 | image_dir = image_dir.replace('train', 'test')
110 | for image in tqdm(os.listdir(image_dir)):
111 | image = Image.open(os.path.join(image_dir, image)).convert('RGB')
112 | image = np.array(image)
113 | mi,ma = image.min(), image.max()
114 | assert mi >= 0
115 | assert ma < 256
116 |
117 |
118 |
119 | def main():
120 | image_dir = '../../data/dataset/train'
121 | save_dir = '../../files/'
122 | big_w_dir = '../../data/big_w_dir'
123 | stati_image_size(image_dir, save_dir, big_w_dir)
124 |
125 | train_label_json = '../../files/train_alphabet.json'
126 | long_text_dir = '../../data/long_text_dir'
127 | stati_label_length(train_label_json, long_text_dir)
128 | # stati_image_gray(image_dir)
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/code/preprocessing/map_word_to_index.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | #########################################################################
3 | # File Name: map_word_to_index.py
4 | # Author: ccyin
5 | # mail: ccyin04@gmail.com
6 | # Created Time: Fri 18 May 2018 03:30:26 PM CST
7 | #########################################################################
8 | '''
9 | Map every character to an index. Two possible schemes:
10 | 1. map each English word to an index
11 | 2. map each English letter to an index
12 | '''
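   | # hypothetical example: with word_index_dict = {u'的': 0, u'长': 1, u'为': 2},
   | # the sentence u'的长为' would be stored as the index string "0 1 2"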
13 |
14 | import os
15 | import sys
16 | reload(sys)
17 | sys.setdefaultencoding('utf8')
18 | import json
19 | from collections import OrderedDict
20 |
21 | def map_word_to_index(train_word_file, word_index_json, word_count_json, index_label_json, alphabet_to_index=True):
22 | with open(train_word_file, 'r') as f:
23 | labels = f.read().strip().decode('utf8')
24 | word_count_dict = { }
25 | for line in labels.split('\n')[1:]:
26 | line = line.strip()
27 | image, sentence = line.strip().split('.png,')
28 | sentence = sentence.strip('"')
29 | for w in sentence:
30 | word_count_dict[w] = word_count_dict.get(w,0) + 1
31 |     print 'there are {:d} distinct characters, {:d} in total'.format(len(word_count_dict), sum(word_count_dict.values()))
32 | word_sorted = sorted(word_count_dict.keys(), key=lambda k:word_count_dict[k], reverse=True)
33 | # word_index_dict = { w:i for i,w in enumerate(word_sorted) }
34 | word_index_dict = json.load(open(word_index_json))
35 |
36 | with open(word_count_json, 'w') as f:
37 | f.write(json.dumps(word_count_dict, indent=4, ensure_ascii=False))
38 | # with open(word_index_json, 'w') as f:
39 | # f.write(json.dumps(word_index_dict, indent=4, ensure_ascii=False))
40 |
41 | image_label_dict = OrderedDict()
42 | for line in labels.split('\n')[1:]:
43 | line = line.strip()
44 | image, sentence = line.strip().split('.png,')
45 | sentence = sentence.strip('"')
46 |
47 |         # replace some visually similar symbols
48 | for c in u" ":
49 | sentence = sentence.replace(c, '')
50 | replace_words = [
51 | u'((',
52 | u'))',
53 | u',,',
54 | u"´'′",
55 |             u"″\"“",
56 | u"..",
57 | u"—-"
58 | ]
59 | for words in replace_words:
60 | for w in words[:-1]:
61 | sentence = sentence.replace(w, words[-1])
62 |
63 | index_list = []
64 | for w in sentence:
65 | index_list.append(str(word_index_dict[w]))
66 | image_label_dict[image + '.png'] = ' '.join(index_list)
67 | with open(index_label_json, 'w') as f:
68 | f.write(json.dumps(image_label_dict, indent=4))
69 |
70 |
71 | def main():
72 |
73 | # 映射字母为index
74 | train_word_file = '../../files/train.csv'
75 | word_index_json = '../../files/alphabet_index_dict.json'
76 | word_count_json = '../../files/alphabet_count_dict.json'
77 | index_label_json = '../../files/train_alphabet.json'
78 | map_word_to_index(train_word_file, word_index_json, word_count_json, index_label_json, True)
79 |
80 | if __name__ == '__main__':
81 | main()
82 |
--------------------------------------------------------------------------------
/code/preprocessing/show_black.py:
--------------------------------------------------------------------------------
1 | # coding=utf8
2 | #########################################################################
3 | # File Name: show_black.py
4 | # Author: ccyin
5 | # mail: ccyin04@gmail.com
6 | # Created Time: Thu 07 Jun 2018 01:06:22
7 | #########################################################################
8 |
9 | import os
10 | import sys
11 | import json
12 | sys.path.append('../ocr')
13 | from tools import parse, py_op
14 | args = parse.args
15 |
16 | def cp_black_list(black_json, black_dir):
17 | word_index_dict = json.load(open(args.word_index_json))
18 | index_word_dict = { v:k for k,v in word_index_dict.items() }
19 | train_word_dict = json.load(open(args.image_label_json))
20 | train_word_dict = { k:''.join([index_word_dict[int(i)] for i in v.split()]) for k,v in train_word_dict.items() }
21 |
22 | py_op.mkdir(black_dir)
23 | black_list = json.load(open(black_json))['black_list']
24 | for i,name in enumerate(black_list):
25 | cmd = 'cp {:s} {:s}'.format(os.path.join(args.data_dir, 'train', name), black_dir)
26 | if train_word_dict[name] in ['Err:501', '#NAME?', '###']:
27 | continue
28 | print name
29 | print train_word_dict[name]
30 | os.system(cmd)
31 | if i > 30:
32 | break
33 |
34 | if __name__ == '__main__':
35 | black_dir = os.path.join(args.save_dir, 'black')
36 | cp_black_list(args.black_json, black_dir)
37 |
--------------------------------------------------------------------------------
/files/alphabet_count_dict.json:
--------------------------------------------------------------------------------
1 | {
2 | "挂": 9,
3 | "退": 5,
4 | "谈": 4,
5 | "随": 139,
6 | "抗": 4,
7 | "料": 95,
8 | "微": 7,
9 | "洞": 9,
10 | "造": 61,
11 | "般": 10,
12 | "潜": 3,
13 | "河": 48,
14 | "欲": 2,
15 | "侵": 3,
16 | "临": 5,
17 | "然": 113,
18 | "吸": 7,
19 | "场": 194,
20 | "宽": 89,
21 | "线": 4480,
22 | "@": 2,
23 | "反": 244,
24 | "牌": 60,
25 | "盏": 7,
26 | "科": 26,
27 | "筒": 8,
28 | "苗": 22,
29 | "摘": 16,
30 | "话": 18,
31 | "赞": 3,
32 | "凡": 3,
33 | "知": 1291,
34 | "除": 68,
35 | "揭": 3,
36 | "扬": 6,
37 | "泳": 5,
38 | "其": 520,
39 | "闹": 1,
40 | "绿": 42,
41 | "渔": 11,
42 | "覆": 5,
43 | "沈": 4,
44 | "》": 5,
45 | "引": 14,
46 | "应": 333,
47 | "枚": 18,
48 | "灵": 2,
49 | "滤": 1,
50 | "假": 53,
51 | "鲨": 1,
52 | "+": 2226,
53 | "循": 12,
54 | "抬": 2,
55 | "是": 3714,
56 | "械": 4,
57 | "讲": 3,
58 | "刷": 23,
59 | "冶": 1,
60 | "咸": 2,
61 | "胀": 1,
62 | "视": 135,
63 | "俊": 1,
64 | "抱": 1,
65 | "契": 4,
66 | "寒": 1,
67 | "录": 13,
68 | "酸": 7,
69 | "教": 48,
70 | "也": 85,
71 | "囤": 1,
72 | "秦": 2,
73 | "峨": 1,
74 | "k": 518,
75 | "括": 30,
76 | "景": 22,
77 | "滴": 3,
78 | "铸": 2,
79 | "须": 18,
80 | "基": 28,
81 | "广": 47,
82 | "₁": 176,
83 | "暅": 2,
84 | "上": 2968,
85 | "后": 592,
86 | "频": 26,
87 | "餐": 19,
88 | "暂": 4,
89 | "底": 192,
90 | "蒙": 1,
91 | "辟": 2,
92 | "足": 436,
93 | "伴": 4,
94 | "馈": 1,
95 | "甸": 1,
96 | "离": 525,
97 | "笼": 3,
98 | "尾": 33,
99 | "框": 31,
100 | "泉": 8,
101 | "绕": 178,
102 | "V": 20,
103 | "虚": 23,
104 | "迟": 3,
105 | "郡": 1,
106 | "牢": 1,
107 | "柯": 1,
108 | "棱": 155,
109 | "跳": 23,
110 | "轴": 1757,
111 | "号": 159,
112 | "偶": 46,
113 | "啸": 1,
114 | "移": 348,
115 | "态": 15,
116 | "节": 59,
117 | "★": 8,
118 | "构": 61,
119 | "消": 18,
120 | "肖": 1,
121 | "伟": 2,
122 | "倡": 2,
123 | "冠": 4,
124 | "纪": 6,
125 | "术": 22,
126 | "精": 34,
127 | "A": 9419,
128 | "柄": 2,
129 | "汉": 13,
130 | "克": 110,
131 | "今": 34,
132 | "前": 219,
133 | "双": 169,
134 | "坏": 1,
135 | "塑": 10,
136 | "姐": 1,
137 | "幕": 2,
138 | "胖": 1,
139 | "几": 207,
140 | "巨": 1,
141 | "杯": 29,
142 | "卷": 33,
143 | "馨": 1,
144 | "固": 20,
145 | "导": 35,
146 | "齿": 7,
147 | "∀": 2,
148 | "辉": 4,
149 | "丈": 5,
150 | "再": 148,
151 | "咏": 1,
152 | "库": 35,
153 | "尔": 7,
154 | "挖": 11,
155 | "炮": 5,
156 | "沟": 1,
157 | "伞": 4,
158 | "㎡": 1,
159 | "符": 49,
160 | "爆": 3,
161 | ",": 316,
162 | "水": 414,
163 | "ρ": 10,
164 | "所": 1053,
165 | "旅": 30,
166 | "摄": 3,
167 | "么": 311,
168 | "重": 280,
169 | "灌": 5,
170 | "坎": 1,
171 | "结": 509,
172 | "×": 54,
173 | "学": 631,
174 | "臭": 2,
175 | "l": 460,
176 | "倒": 24,
177 | "践": 15,
178 | "培": 6,
179 | "持": 49,
180 | "技": 23,
181 | "标": 1525,
182 | "予": 8,
183 | "越": 18,
184 | "馒": 2,
185 | "耗": 11,
186 | "辞": 1,
187 | "加": 244,
188 | "锥": 118,
189 | "缩": 7,
190 | "悬": 3,
191 | "贵": 4,
192 | "臂": 1,
193 | "故": 10,
194 | "蓄": 10,
195 | "识": 34,
196 | "免": 12,
197 | "侣": 3,
198 | "城": 62,
199 | "筑": 20,
200 | "秒": 190,
201 | "W": 15,
202 | "蝙": 1,
203 | "江": 49,
204 | "连": 542,
205 | "卡": 61,
206 | "狠": 1,
207 | "略": 19,
208 | "彩": 15,
209 | "扫": 13,
210 | "赵": 7,
211 | "叶": 5,
212 | "相": 1124,
213 | "好": 220,
214 | "屿": 1,
215 | "争": 4,
216 | "压": 17,
217 | "谊": 1,
218 | "吃": 8,
219 | "疏": 2,
220 | "骑": 33,
221 | "或": 125,
222 | "蜘": 1,
223 | "趟": 2,
224 | "务": 40,
225 | "匠": 1,
226 | "垣": 1,
227 | "钥": 1,
228 | "斯": 2,
229 | "2": 6759,
230 | "贴": 12,
231 | "冷": 1,
232 | "制": 75,
233 | "霸": 1,
234 | "礼": 2,
235 | "B": 8729,
236 | "送": 27,
237 | "友": 15,
238 | "筐": 8,
239 | "糕": 3,
240 | "蛙": 3,
241 | "毛": 16,
242 | "玻": 13,
243 | "跟": 5,
244 | "占": 20,
245 | "啤": 1,
246 | "绩": 24,
247 | "普": 19,
248 | "泵": 1,
249 | "浴": 1,
250 | "寻": 2,
251 | "饼": 7,
252 | "₂": 116,
253 | "搅": 2,
254 | "薄": 4,
255 | "三": 1668,
256 | "倍": 92,
257 | "册": 5,
258 | "鼓": 6,
259 | "榜": 1,
260 | "负": 79,
261 | "圣": 4,
262 | "ア": 292,
263 | "大": 855,
264 | "-": 2268,
265 | "支": 130,
266 | "樱": 9,
267 | "记": 113,
268 | "状": 66,
269 | "扁": 1,
270 | "π": 68,
271 | "权": 4,
272 | "义": 108,
273 | "银": 10,
274 | "遍": 3,
275 | "里": 77,
276 | "雎": 1,
277 | "当": 817,
278 | "展": 69,
279 | "拖": 2,
280 | "牡": 1,
281 | "珠": 12,
282 | "荐": 1,
283 | "荫": 1,
284 | "m": 1596,
285 | "佳": 9,
286 | "恒": 59,
287 | "具": 70,
288 | "铺": 21,
289 | "肃": 2,
290 | "蜂": 2,
291 | "暗": 4,
292 | "依": 70,
293 | "东": 97,
294 | "那": 247,
295 | "判": 170,
296 | "级": 89,
297 | "梳": 1,
298 | "段": 802,
299 | "区": 224,
300 | "徽": 9,
301 | "社": 41,
302 | "旁": 21,
303 | "杂": 5,
304 | "υ": 1,
305 | "爷": 7,
306 | "觉": 3,
307 | "案": 102,
308 | "归": 16,
309 | "X": 8,
310 | "泛": 1,
311 | "保": 142,
312 | "面": 1955,
313 | "句": 8,
314 | "继": 23,
315 | "秩": 1,
316 | "深": 18,
317 | "汰": 3,
318 | "镶": 2,
319 | "凹": 5,
320 | "系": 912,
321 | "忽": 17,
322 | "幼": 4,
323 | "竿": 2,
324 | "考": 149,
325 | "抄": 1,
326 | "万": 84,
327 | "殊": 8,
328 | "徒": 6,
329 | "锡": 3,
330 | "英": 4,
331 | "天": 346,
332 | "墨": 4,
333 | "音": 7,
334 | "]": 1,
335 | "稿": 2,
336 | "C": 7685,
337 | "驶": 110,
338 | "野": 7,
339 | "汛": 1,
340 | "勤": 8,
341 | "左": 200,
342 | "葫": 1,
343 | "杭": 8,
344 | "良": 5,
345 | "泰": 9,
346 | "兹": 1,
347 | "类": 51,
348 | "综": 14,
349 | "穿": 3,
350 | "攀": 6,
351 | "茄": 5,
352 | "笔": 70,
353 | "林": 27,
354 | "√": 180,
355 | "孝": 3,
356 | "本": 247,
357 | ".": 5096,
358 | "串": 1,
359 | "点": 8894,
360 | "纽": 2,
361 | "宿": 5,
362 | "担": 5,
363 | "过": 1283,
364 | "豆": 7,
365 | "棉": 10,
366 | "姓": 2,
367 | "答": 138,
368 | "券": 18,
369 | "作": 791,
370 | "∑": 1,
371 | "院": 16,
372 | "票": 58,
373 | "n": 1062,
374 | "走": 91,
375 | "典": 17,
376 | "彼": 2,
377 | "顾": 20,
378 | "艇": 10,
379 | "℃": 9,
380 | "蚂": 9,
381 | "炎": 1,
382 | "少": 639,
383 | "蜗": 2,
384 | "洛": 3,
385 | "抚": 1,
386 | "丝": 23,
387 | "弧": 102,
388 | "思": 25,
389 | "振": 2,
390 | "亲": 8,
391 | "帽": 2,
392 | "览": 2,
393 | "降": 55,
394 | "协": 2,
395 | "ノ": 137,
396 | "Y": 6,
397 | "赛": 104,
398 | "九": 43,
399 | "远": 26,
400 | "团": 24,
401 | "古": 15,
402 | "姨": 1,
403 | "药": 35,
404 | "说": 368,
405 | "瓶": 25,
406 | "凸": 6,
407 | "极": 140,
408 | "漆": 4,
409 | "皋": 1,
410 | "同": 829,
411 | "帐": 4,
412 | "研": 41,
413 | "托": 5,
414 | "战": 12,
415 | "些": 70,
416 | "脚": 10,
417 | "劣": 10,
418 | "弦": 188,
419 | "太": 17,
420 | "断": 177,
421 | "搬": 10,
422 | "丰": 2,
423 | "洲": 8,
424 | "爸": 41,
425 | "麻": 1,
426 | "尼": 1,
427 | "D": 4649,
428 | "但": 49,
429 | "诉": 2,
430 | "像": 176,
431 | "华": 29,
432 | "塔": 45,
433 | "艘": 26,
434 | "距": 515,
435 | "d": 95,
436 | "碑": 1,
437 | "耽": 1,
438 | "据": 221,
439 | "买": 202,
440 | "瓷": 4,
441 | "靶": 2,
442 | "鱼": 17,
443 | "签": 6,
444 | "蚀": 1,
445 | "∃": 4,
446 | "辆": 105,
447 | "和": 977,
448 | "透": 12,
449 | "夕": 1,
450 | "折": 287,
451 | "簧": 3,
452 | "骨": 2,
453 | "/": 374,
454 | "液": 12,
455 | "宾": 8,
456 | "汇": 1,
457 | "偏": 51,
458 | "网": 75,
459 | "麦": 4,
460 | "著": 15,
461 | "诞": 2,
462 | "坡": 88,
463 | "因": 54,
464 | "入": 131,
465 | "孩": 6,
466 | "虫": 4,
467 | "息": 61,
468 | "仰": 27,
469 | "韶": 2,
470 | "出": 1592,
471 | "蚁": 9,
472 | "最": 929,
473 | "踪": 3,
474 | "鞋": 4,
475 | "锌": 1,
476 | "熏": 1,
477 | "斗": 2,
478 | "⊙": 487,
479 | "供": 49,
480 | "眠": 1,
481 | "带": 56,
482 | "播": 5,
483 | "蔬": 29,
484 | "估": 31,
485 | "喷": 12,
486 | "阶": 17,
487 | "债": 5,
488 | "粽": 2,
489 | "情": 90,
490 | "掷": 22,
491 | "淇": 2,
492 | "响": 18,
493 | "界": 16,
494 | "减": 79,
495 | "黑": 30,
496 | "罐": 4,
497 | "寓": 4,
498 | "奔": 1,
499 | "旗": 21,
500 | "进": 331,
501 | "Z": 9,
502 | "靠": 15,
503 | "口": 75,
504 | "呢": 5,
505 | "內": 2,
506 | "巧": 12,
507 | "苦": 3,
508 | "敬": 2,
509 | "棵": 38,
510 | "陶": 2,
511 | "卸": 10,
512 | "翻": 53,
513 | "姿": 1,
514 | "驾": 8,
515 | "范": 415,
516 | "者": 42,
517 | "了": 514,
518 | "看": 62,
519 | "名": 156,
520 | "徐": 8,
521 | "粒": 3,
522 | "掘": 2,
523 | "肚": 1,
524 | "钢": 24,
525 | "紧": 12,
526 | "约": 58,
527 | "末": 38,
528 | "搭": 21,
529 | "男": 21,
530 | "刹": 4,
531 | "尽": 9,
532 | "E": 3118,
533 | "浇": 4,
534 | "委": 6,
535 | "佛": 2,
536 | "陡": 2,
537 | "健": 18,
538 | "凤": 2,
539 | "乱": 1,
540 | "述": 59,
541 | "零": 141,
542 | "特": 19,
543 | "司": 88,
544 | "雾": 3,
545 | "要": 357,
546 | "辅": 8,
547 | "序": 76,
548 | "【": 44,
549 | "钓": 1,
550 | """: 2,
551 | "会": 97,
552 | "晚": 11,
553 | "焦": 170,
554 | "吨": 47,
555 | "被": 77,
556 | "训": 11,
557 | "躯": 1,
558 | "贮": 1,
559 | "0": 4856,
560 | "邻": 45,
561 | "强": 38,
562 | "弓": 1,
563 | "见": 17,
564 | "血": 2,
565 | "迅": 1,
566 | "经": 401,
567 | "金": 93,
568 | "周": 240,
569 | "坪": 4,
570 | "语": 8,
571 | "浮": 5,
572 | "p": 114,
573 | "奶": 16,
574 | "调": 223,
575 | "验": 52,
576 | "香": 8,
577 | "隔": 36,
578 | "芜": 2,
579 | "星": 46,
580 | "颠": 1,
581 | "客": 92,
582 | "饰": 4,
583 | "咨": 1,
584 | "港": 30,
585 | "脱": 1,
586 | "°": 992,
587 | "β": 56,
588 | "岸": 34,
589 | "逻": 2,
590 | "阿": 2,
591 | "检": 38,
592 | "扇": 80,
593 | "蕉": 5,
594 | "恶": 1,
595 | "鹏": 2,
596 | "浙": 12,
597 | "[": 146,
598 | "牧": 9,
599 | "哨": 1,
600 | "衫": 18,
601 | "淮": 2,
602 | "胰": 1,
603 | "更": 29,
604 | "穷": 7,
605 | "怨": 1,
606 | "黏": 1,
607 | "老": 54,
608 | "划": 99,
609 | "栖": 4,
610 | "沙": 9,
611 | "业": 51,
612 | "茜": 1,
613 | "搞": 2,
614 | "钩": 2,
615 | "用": 804,
616 | "侯": 2,
617 | "±": 6,
618 | "△": 1760,
619 | "裁": 22,
620 | "睡": 2,
621 | "仅": 27,
622 | "F": 1991,
623 | "效": 24,
624 | "始": 125,
625 | "郑": 11,
626 | "雕": 4,
627 | "诗": 1,
628 | "酥": 1,
629 | "²": 310,
630 | "奠": 1,
631 | "绥": 1,
632 | "噪": 5,
633 | "譬": 1,
634 | "俯": 34,
635 | "腰": 360,
636 | "色": 82,
637 | "篷": 3,
638 | "顶": 631,
639 | "击": 4,
640 | "矿": 5,
641 | "清": 16,
642 | "澄": 1,
643 | "指": 68,
644 | "完": 218,
645 | "式": 956,
646 | "】": 49,
647 | "o": 126,
648 | "袖": 2,
649 | "亚": 4,
650 | "期": 151,
651 | "撞": 1,
652 | "群": 12,
653 | "伯": 2,
654 | "1": 6605,
655 | "弟": 5,
656 | "爽": 3,
657 | "久": 2,
658 | "俄": 1,
659 | "奋": 1,
660 | "富": 3,
661 | "勒": 1,
662 | "陕": 6,
663 | "淘": 3,
664 | "滚": 17,
665 | "菜": 44,
666 | "束": 15,
667 | "卧": 2,
668 | "门": 65,
669 | "雪": 4,
670 | "软": 1,
671 | "q": 45,
672 | "柴": 4,
673 | "壶": 1,
674 | "养": 17,
675 | "建": 135,
676 | "链": 5,
677 | "言": 2,
678 | "超": 179,
679 | "砌": 2,
680 | ".": 27,
681 | "丘": 1,
682 | "抛": 957,
683 | "“": 280,
684 | "管": 38,
685 | "莱": 2,
686 | "舰": 7,
687 | "羹": 1,
688 | "常": 75,
689 | "值": 1567,
690 | "圾": 11,
691 | "证": 541,
692 | "捆": 6,
693 | "立": 234,
694 | "蕊": 2,
695 | "种": 520,
696 | "发": 495,
697 | "酒": 12,
698 | "痕": 49,
699 | "这": 713,
700 | "乘": 60,
701 | "招": 6,
702 | "赚": 9,
703 | "工": 298,
704 | "烧": 5,
705 | "矩": 291,
706 | "器": 104,
707 | "衬": 9,
708 | "轮": 48,
709 | "菱": 101,
710 | "幸": 2,
711 | "函": 1404,
712 | "七": 33,
713 | "概": 90,
714 | "有": 1547,
715 | "程": 854,
716 | "复": 43,
717 | "小": 1400,
718 | "美": 16,
719 | "殖": 5,
720 | "厦": 9,
721 | "椭": 231,
722 | "氯": 2,
723 | "家": 238,
724 | "弹": 11,
725 | "纸": 202,
726 | "刻": 27,
727 | "炼": 11,
728 | "观": 96,
729 | "浅": 1,
730 | "G": 427,
731 | "φ": 5,
732 | "屏": 4,
733 | "街": 5,
734 | "余": 108,
735 | "付": 73,
736 | "胜": 34,
737 | "章": 6,
738 | "另": 141,
739 | "革": 1,
740 | "雨": 6,
741 | "鸦": 3,
742 | "修": 62,
743 | "顷": 5,
744 | "较": 59,
745 | "月": 218,
746 | "宋": 3,
747 | "递": 61,
748 | "优": 82,
749 | "窄": 3,
750 | "骡": 1,
751 | "在": 3928,
752 | "袭": 3,
753 | "花": 82,
754 | "′": 390,
755 | "缸": 1,
756 | "去": 160,
757 | "尺": 71,
758 | "炽": 2,
759 | "瘾": 1,
760 | "促": 24,
761 | "桂": 3,
762 | "资": 64,
763 | "摊": 1,
764 | "仙": 1,
765 | "虞": 2,
766 | "毫": 8,
767 | "路": 350,
768 | "拱": 22,
769 | "柳": 2,
770 | "r": 174,
771 | "整": 273,
772 | "究": 96,
773 | "都": 309,
774 | "百": 39,
775 | "霓": 1,
776 | "吕": 1,
777 | "丙": 45,
778 | "”": 247,
779 | "亮": 21,
780 | "舱": 1,
781 | "育": 30,
782 | "医": 27,
783 | "难": 4,
784 | "裂": 1,
785 | "淄": 2,
786 | "拆": 10,
787 | "黎": 1,
788 | "量": 499,
789 | "乙": 491,
790 | "还": 86,
791 | "]": 150,
792 | "仑": 2,
793 | "履": 1,
794 | "照": 48,
795 | "齐": 7,
796 | "险": 21,
797 | "哪": 74,
798 | "轿": 4,
799 | "长": 1774,
800 | "图": 3271,
801 | "漂": 3,
802 | "希": 4,
803 | "疑": 1,
804 | "枝": 3,
805 | "唱": 3,
806 | "聚": 2,
807 | "隧": 28,
808 | "锦": 2,
809 | "″": 1,
810 | "▱": 8,
811 | "³": 26,
812 | "✲": 14,
813 | "∴": 10,
814 | "康": 5,
815 | "妹": 2,
816 | "势": 3,
817 | "蛇": 4,
818 | "H": 286,
819 | "青": 15,
820 | "拟": 48,
821 | "煤": 6,
822 | "巡": 6,
823 | "形": 2838,
824 | "麓": 1,
825 | "旱": 3,
826 | "想": 98,
827 | "椒": 1,
828 | "绷": 1,
829 | "勿": 1,
830 | "洁": 2,
831 | "赁": 8,
832 | "朝": 14,
833 | "舟": 5,
834 | "骤": 10,
835 | "储": 15,
836 | "京": 25,
837 | "3": 2839,
838 | "夹": 47,
839 | "流": 44,
840 | "桃": 16,
841 | "珊": 1,
842 | "潍": 4,
843 | "屋": 2,
844 | "道": 189,
845 | "单": 510,
846 | "¬": 2,
847 | "畜": 2,
848 | "部": 466,
849 | "只": 193,
850 | "们": 200,
851 | "s": 321,
852 | "拴": 1,
853 | "她": 19,
854 | "寺": 3,
855 | "悉": 2,
856 | "戊": 1,
857 | ",": 19839,
858 | "宏": 1,
859 | "锐": 62,
860 | "蜜": 1,
861 | "素": 22,
862 | "否": 379,
863 | "亭": 10,
864 | "讯": 8,
865 | "抵": 8,
866 | "德": 7,
867 | "守": 2,
868 | "眼": 10,
869 | "县": 16,
870 | "倾": 17,
871 | "摆": 28,
872 | "慈": 2,
873 | "绍": 1,
874 | "档": 3,
875 | "峡": 2,
876 | "幢": 7,
877 | "童": 9,
878 | "孤": 2,
879 | "韧": 2,
880 | "全": 232,
881 | "剪": 91,
882 | "转": 337,
883 | "误": 35,
884 | "数": 3880,
885 | "至": 145,
886 | "闻": 2,
887 | "空": 104,
888 | "国": 91,
889 | "逼": 1,
890 | "灾": 15,
891 | "谁": 17,
892 | "菁": 1,
893 | "把": 195,
894 | "碎": 5,
895 | "向": 715,
896 | "众": 18,
897 | "果": 546,
898 | "红": 64,
899 | "室": 21,
900 | "贫": 2,
901 | "中": 2874,
902 | "新": 97,
903 | "∵": 5,
904 | "画": 157,
905 | "汁": 3,
906 | "评": 5,
907 | "I": 27,
908 | "魏": 1,
909 | "风": 49,
910 | "盒": 76,
911 | "叔": 2,
912 | "%": 122,
913 | "柜": 13,
914 | "损": 13,
915 | "胞": 1,
916 | "瓦": 4,
917 | "度": 1032,
918 | "廓": 2,
919 | "森": 3,
920 | "曲": 245,
921 | "援": 9,
922 | "项": 209,
923 | "白": 55,
924 | "一": 3859,
925 | "⑩": 2,
926 | "掉": 14,
927 | "∈": 124,
928 | "阐": 1,
929 | "途": 21,
930 | "搜": 1,
931 | "御": 1,
932 | "索": 20,
933 | "堤": 4,
934 | "玩": 34,
935 | "享": 15,
936 | "梯": 84,
937 | "肉": 1,
938 | "芳": 1,
939 | "4": 1979,
940 | "容": 24,
941 | "示": 692,
942 | "匾": 1,
943 | "迁": 2,
944 | "杆": 24,
945 | "ω": 17,
946 | "≈": 46,
947 | "腊": 1,
948 | "巍": 5,
949 | "限": 196,
950 | "慢": 19,
951 | "燕": 2,
952 | "橘": 1,
953 | "蛛": 1,
954 | "竣": 1,
955 | "姥": 2,
956 | "拥": 6,
957 | "警": 4,
958 | "柱": 57,
959 | "晰": 1,
960 | "敲": 2,
961 | "t": 511,
962 | "对": 1036,
963 | "⊕": 1,
964 | "企": 20,
965 | "涂": 25,
966 | "椅": 7,
967 | "按": 267,
968 | "隐": 1,
969 | "蒜": 3,
970 | "莞": 1,
971 | "吧": 1,
972 | "动": 1346,
973 | "购": 291,
974 | "议": 4,
975 | "▲": 2,
976 | "´": 5,
977 | "阻": 8,
978 | "什": 70,
979 | "摇": 4,
980 | "捉": 2,
981 | "盐": 3,
982 | "丢": 1,
983 | "巢": 3,
984 | "祥": 2,
985 | "扳": 2,
986 | "筹": 8,
987 | "登": 5,
988 | "避": 5,
989 | "截": 102,
990 | "梅": 8,
991 | "文": 51,
992 | "昆": 3,
993 | "律": 92,
994 | "福": 13,
995 | "税": 12,
996 | "世": 6,
997 | "∞": 49,
998 | "张": 197,
999 | "宣": 1,
1000 | "助": 27,
1001 | "γ": 4,
1002 | "仁": 1,
1003 | "求": 2444,
1004 | "装": 127,
1005 | "襄": 1,
1006 | "拉": 11,
1007 | "聊": 3,
1008 | "嵌": 2,
1009 | "监": 9,
1010 | "坐": 1446,
1011 | "兔": 3,
1012 | "湖": 33,
1013 | "遥": 1,
1014 | "蓝": 11,
1015 | "杰": 5,
1016 | "并": 521,
1017 | "竹": 14,
1018 | "顺": 148,
1019 | "丁": 8,
1020 | "沂": 3,
1021 | "栅": 1,
1022 | "授": 2,
1023 | "夏": 9,
1024 | "熔": 1,
1025 | "洗": 12,
1026 | "望": 12,
1027 | "萝": 4,
1028 | "斜": 184,
1029 | "感": 11,
1030 | "鸡": 10,
1031 | "利": 279,
1032 | "厨": 5,
1033 | "漫": 6,
1034 | "5": 1720,
1035 | "桥": 38,
1036 | "儿": 7,
1037 | "激": 8,
1038 | "规": 177,
1039 | "籍": 2,
1040 | "他": 184,
1041 | "橙": 2,
1042 | "棚": 4,
1043 | "季": 27,
1044 | "剩": 44,
1045 | "u": 4,
1046 | "致": 25,
1047 | "延": 310,
1048 | "寸": 2,
1049 | "命": 85,
1050 | "勾": 12,
1051 | " ": 10,
1052 | "璃": 13,
1053 | "阄": 2,
1054 | "且": 965,
1055 | "宝": 4,
1056 | "废": 1,
1057 | " ": 16,
1058 | "爬": 25,
1059 | "售": 409,
1060 | "堰": 4,
1061 | "方": 1883,
1062 | "机": 177,
1063 | "亿": 4,
1064 | "派": 11,
1065 | "附": 9,
1066 | "≌": 36,
1067 | "狗": 2,
1068 | "员": 68,
1069 | "坚": 1,
1070 | "统": 46,
1071 | "换": 43,
1072 | "查": 87,
1073 | "晤": 4,
1074 | "逐": 4,
1075 | "班": 66,
1076 | "念": 9,
1077 | "年": 281,
1078 | "摸": 43,
1079 | "仿": 7,
1080 | "球": 314,
1081 | "厂": 74,
1082 | "的": 15843,
1083 | "鲁": 7,
1084 | "馆": 29,
1085 | "羊": 4,
1086 | "谓": 1,
1087 | "五": 80,
1088 | "列": 627,
1089 | "错": 49,
1090 | "鸟": 2,
1091 | "探": 105,
1092 | "失": 7,
1093 | "户": 57,
1094 | "改": 73,
1095 | "螺": 35,
1096 | "丿": 1,
1097 | "允": 3,
1098 | "参": 135,
1099 | "雄": 3,
1100 | "配": 51,
1101 | "K": 61,
1102 | "拌": 1,
1103 | "毒": 4,
1104 | "何": 219,
1105 | "仔": 5,
1106 | "留": 60,
1107 | "筝": 5,
1108 | "州": 80,
1109 | "Ⅱ": 36,
1110 | "惠": 64,
1111 | "④": 84,
1112 | "王": 53,
1113 | "来": 151,
1114 | "触": 6,
1115 | "注": 27,
1116 | "火": 50,
1117 | "独": 68,
1118 | "问": 275,
1119 | "声": 3,
1120 | "米": 415,
1121 | "维": 8,
1122 | "湿": 1,
1123 | "宇": 1,
1124 | "堆": 18,
1125 | "粉": 14,
1126 | "井": 8,
1127 | "演": 9,
1128 | "甘": 3,
1129 | "股": 17,
1130 | "质": 104,
1131 | "个": 2737,
1132 | "碱": 1,
1133 | "纵": 40,
1134 | "缴": 14,
1135 | "获": 112,
1136 | "6": 1348,
1137 | "便": 13,
1138 | "嘴": 1,
1139 | "剂": 1,
1140 | "雅": 2,
1141 | "拍": 9,
1142 | "受": 37,
1143 | "属": 25,
1144 | "胡": 1,
1145 | "Ⅰ": 38,
1146 | "缺": 5,
1147 | "狭": 1,
1148 | "罩": 2,
1149 | "绵": 1,
1150 | "v": 15,
1151 | "智": 11,
1152 | "汾": 1,
1153 | "、": 2143,
1154 | "冀": 2,
1155 | "阅": 40,
1156 | "领": 21,
1157 | "床": 3,
1158 | "舍": 20,
1159 | "树": 117,
1160 | "北": 86,
1161 | "宜": 2,
1162 | "?": 723,
1163 | "!": 7,
1164 | "抢": 1,
1165 | "春": 13,
1166 | "欧": 4,
1167 | "梦": 5,
1168 | "涨": 19,
1169 | "溪": 3,
1170 | "净": 18,
1171 | "际": 37,
1172 | "屉": 3,
1173 | "才": 60,
1174 | "菌": 1,
1175 | "槐": 1,
1176 | "浓": 8,
1177 | "返": 50,
1178 | "南": 76,
1179 | "a": 2002,
1180 | "散": 4,
1181 | "跨": 2,
1182 | "八": 46,
1183 | "仪": 10,
1184 | "召": 6,
1185 | "坯": 1,
1186 | "称": 391,
1187 | "荷": 3,
1188 | "胶": 2,
1189 | "使": 507,
1190 | "刀": 8,
1191 | "各": 215,
1192 | "笆": 7,
1193 | "师": 75,
1194 | "祖": 4,
1195 | "而": 122,
1196 | "适": 23,
1197 | "气": 59,
1198 | "化": 216,
1199 | ">": 305,
1200 | "∠": 1603,
1201 | "斥": 3,
1202 | "丨": 14,
1203 | "洪": 4,
1204 | "箱": 37,
1205 | "纳": 17,
1206 | "·": 189,
1207 | "茶": 9,
1208 | "格": 214,
1209 | "沿": 352,
1210 | "病": 5,
1211 | "嵊": 1,
1212 | "L": 22,
1213 | "李": 40,
1214 | "译": 1,
1215 | "仓": 15,
1216 | "卖": 43,
1217 | "锻": 10,
1218 | "~": 5,
1219 | "握": 3,
1220 | "≠": 114,
1221 | "Ⅲ": 4,
1222 | "日": 69,
1223 | "⑤": 22,
1224 | "武": 18,
1225 | "往": 69,
1226 | "型": 133,
1227 | "浪": 1,
1228 | "凭": 2,
1229 | "篱": 7,
1230 | "绳": 29,
1231 | "饲": 3,
1232 | "÷": 4,
1233 | "锅": 1,
1234 | ")": 6138,
1235 | "专": 8,
1236 | "逗": 1,
1237 | "颜": 28,
1238 | "挡": 4,
1239 | "撤": 1,
1240 | "恰": 184,
1241 | "别": 1036,
1242 | "启": 2,
1243 | "骰": 7,
1244 | "7": 678,
1245 | "达": 282,
1246 | "十": 59,
1247 | "畅": 2,
1248 | "密": 10,
1249 | "终": 91,
1250 | "手": 61,
1251 | "背": 17,
1252 | "乓": 9,
1253 | "角": 2523,
1254 | "遗": 1,
1255 | "取": 655,
1256 | "静": 11,
1257 | "盘": 23,
1258 | "祝": 1,
1259 | "详": 1,
1260 | "轩": 1,
1261 | "滨": 7,
1262 | "六": 73,
1263 | "矮": 3,
1264 | "平": 1616,
1265 | "w": 15,
1266 | "旺": 5,
1267 | "匀": 109,
1268 | "垃": 11,
1269 | "。": 205,
1270 | "(": 6089,
1271 | "明": 650,
1272 | "输": 57,
1273 | "隙": 3,
1274 | "额": 44,
1275 | "夜": 1,
1276 | "枣": 3,
1277 | "•": 117,
1278 | "亩": 2,
1279 | "舶": 2,
1280 | "携": 6,
1281 | "举": 19,
1282 | "污": 33,
1283 | "遂": 1,
1284 | "铅": 16,
1285 | "父": 9,
1286 | "藏": 2,
1287 | "虎": 1,
1288 | "法": 264,
1289 | "赔": 6,
1290 | "b": 1082,
1291 | "轨": 56,
1292 | "叫": 26,
1293 | "豪": 5,
1294 | "热": 19,
1295 | "公": 344,
1296 | "闯": 2,
1297 | "扶": 7,
1298 | "读": 39,
1299 | "楼": 65,
1300 | "湾": 2,
1301 | "圃": 7,
1302 | "善": 8,
1303 | "备": 47,
1304 | "很": 14,
1305 | "营": 40,
1306 | "温": 41,
1307 | "辨": 2,
1308 | "沪": 2,
1309 | "冬": 2,
1310 | "唯": 11,
1311 | "皮": 30,
1312 | "娱": 1,
1313 | "栽": 20,
1314 | "届": 7,
1315 | "奇": 36,
1316 | "M": 1415,
1317 | "敏": 6,
1318 | "体": 379,
1319 | "棒": 12,
1320 | "汕": 2,
1321 | "族": 1,
1322 | "硝": 1,
1323 | "⑥": 6,
1324 | "子": 255,
1325 | "卫": 5,
1326 | "灭": 2,
1327 | "目": 45,
1328 | "巴": 4,
1329 | "虹": 1,
1330 | "厅": 5,
1331 | "昌": 7,
1332 | "谐": 3,
1333 | "写": 419,
1334 | "岛": 24,
1335 | "□": 6,
1336 | "患": 3,
1337 | "接": 755,
1338 | "产": 255,
1339 | "账": 1,
1340 | "碳": 6,
1341 | "莲": 3,
1342 | "收": 71,
1343 | "8": 887,
1344 | "治": 3,
1345 | "辽": 6,
1346 | "织": 18,
1347 | "秉": 1,
1348 | "苏": 20,
1349 | "慎": 1,
1350 | "跑": 58,
1351 | "恢": 1,
1352 | "毕": 10,
1353 | "博": 9,
1354 | "※": 1,
1355 | "神": 3,
1356 | "①": 282,
1357 | "≤": 153,
1358 | "表": 614,
1359 | "短": 58,
1360 | "己": 10,
1361 | "驴": 3,
1362 | "x": 4672,
1363 | "追": 24,
1364 | "锁": 6,
1365 | "垂": 322,
1366 | "包": 72,
1367 | "岗": 8,
1368 | "服": 45,
1369 | "戏": 27,
1370 | "酬": 9,
1371 | "栓": 7,
1372 | "厚": 18,
1373 | "缝": 1,
1374 | "府": 10,
1375 | "娟": 1,
1376 | "黄": 40,
1377 | "#": 167,
1378 | "挥": 2,
1379 | "护": 18,
1380 | "润": 138,
1381 | "梨": 3,
1382 | "含": 98,
1383 | "赴": 1,
1384 | "哀": 1,
1385 | "奉": 1,
1386 | "络": 4,
1387 | "c": 968,
1388 | "副": 15,
1389 | "峰": 4,
1390 | "时": 2209,
1391 | "载": 10,
1392 | "身": 55,
1393 | "销": 329,
1394 | "将": 505,
1395 | "怎": 45,
1396 | "刚": 45,
1397 | "<": 349,
1398 | "报": 42,
1399 | "挤": 3,
1400 | "澧": 2,
1401 | "横": 128,
1402 | "环": 64,
1403 | "脸": 1,
1404 | "似": 146,
1405 | "访": 3,
1406 | "铁": 74,
1407 | "燃": 12,
1408 | "祈": 1,
1409 | "旋": 280,
1410 | "描": 8,
1411 | "N": 781,
1412 | "乒": 10,
1413 | "绝": 13,
1414 | "察": 56,
1415 | "令": 11,
1416 | "夺": 1,
1417 | "若": 1714,
1418 | "剧": 1,
1419 | "园": 39,
1420 | "缓": 4,
1421 | "可": 457,
1422 | "遮": 1,
1423 | "干": 57,
1424 | "添": 19,
1425 | "艺": 7,
1426 | "份": 74,
1427 | "轼": 1,
1428 | "玄": 2,
1429 | "妈": 34,
1430 | "民": 61,
1431 | "互": 101,
1432 | "候": 5,
1433 | "增": 165,
1434 | "■": 1,
1435 | "∥": 204,
1436 | "瞬": 3,
1437 | "邮": 15,
1438 | "就": 79,
1439 | "9": 792,
1440 | "活": 74,
1441 | "丽": 17,
1442 | "围": 474,
1443 | "元": 702,
1444 | "婷": 1,
1445 | "俩": 2,
1446 | "比": 590,
1447 | "块": 111,
1448 | "飞": 41,
1449 | "②": 265,
1450 | "无": 115,
1451 | "Ⅳ": 1,
1452 | "≥": 66,
1453 | "桩": 2,
1454 | "山": 88,
1455 | "踩": 3,
1456 | "女": 25,
1457 | "侨": 1,
1458 | "y": 2232,
1459 | "T": 61,
1460 | "内": 538,
1461 | "缆": 5,
1462 | "安": 70,
1463 | "萌": 2,
1464 | "茎": 5,
1465 | "辑": 1,
1466 | "渐": 29,
1467 | "锯": 2,
1468 | "刘": 8,
1469 | "盛": 3,
1470 | "映": 10,
1471 | "伦": 1,
1472 | "让": 18,
1473 | "听": 9,
1474 | "破": 7,
1475 | "款": 71,
1476 | "陀": 2,
1477 | "蛋": 15,
1478 | "近": 71,
1479 | "乐": 5,
1480 | "靖": 1,
1481 | "叙": 4,
1482 | "川": 11,
1483 | "食": 11,
1484 | "恤": 11,
1485 | "车": 557,
1486 | "捷": 2,
1487 | "闭": 17,
1488 | "任": 279,
1489 | "省": 67,
1490 | "喂": 1,
1491 | "宗": 2,
1492 | "簇": 1,
1493 | "砖": 12,
1494 | "休": 12,
1495 | "源": 17,
1496 | "烂": 1,
1497 | "吗": 80,
1498 | "匙": 2,
1499 | "鲜": 3,
1500 | "实": 413,
1501 | "蜡": 24,
1502 | "∣": 342,
1503 | "氧": 2,
1504 | "侦": 2,
1505 | "弱": 1,
1506 | "稳": 16,
1507 | "架": 31,
1508 | "主": 33,
1509 | "沼": 9,
1510 | "设": 545,
1511 | "雀": 2,
1512 | "充": 34,
1513 | "等": 1164,
1514 | "晋": 1,
1515 | "O": 2430,
1516 | "凌": 1,
1517 | "条": 735,
1518 | "扣": 14,
1519 | "既": 20,
1520 | "&": 3,
1521 | "灯": 73,
1522 | "影": 183,
1523 | "绰": 1,
1524 | "已": 1220,
1525 | "课": 44,
1526 | "需": 230,
1527 | "傅": 18,
1528 | "云": 5,
1529 | "疗": 21,
1530 | "通": 179,
1531 | "肥": 5,
1532 | "涧": 1,
1533 | "垫": 2,
1534 | "株": 6,
1535 | "头": 72,
1536 | ":": 79,
1537 | "着": 98,
1538 | "叠": 116,
1539 | "操": 38,
1540 | "敌": 6,
1541 | "扎": 3,
1542 | "滑": 38,
1543 | "裕": 1,
1544 | "做": 135,
1545 | "租": 63,
1546 | "蝠": 1,
1547 | "③": 124,
1548 | "书": 123,
1549 | "芽": 2,
1550 | "矫": 4,
1551 | "凯": 1,
1552 | "彰": 2,
1553 | "奴": 1,
1554 | "苹": 31,
1555 | "z": 31,
1556 | "磁": 11,
1557 | "熄": 2,
1558 | "崇": 1,
1559 | "庆": 10,
1560 | "遭": 2,
1561 | "朋": 8,
1562 | "意": 263,
1563 | "耻": 1,
1564 | "则": 1518,
1565 | "厘": 34,
1566 | ";": 1368,
1567 | "尝": 9,
1568 | "够": 29,
1569 | "急": 9,
1570 | "货": 91,
1571 | "讨": 21,
1572 | "借": 10,
1573 | "船": 93,
1574 | "θ": 50,
1575 | "º": 6,
1576 | "秋": 4,
1577 | "千": 207,
1578 | "细": 26,
1579 | "运": 696,
1580 | "盖": 18,
1581 | "变": 342,
1582 | "竞": 26,
1583 | "荣": 1,
1584 | "e": 113,
1585 | "页": 15,
1586 | "批": 121,
1587 | "轻": 7,
1588 | "汽": 91,
1589 | "西": 76,
1590 | "待": 10,
1591 | "合": 355,
1592 | "袋": 25,
1593 | "₃": 44,
1594 | "亏": 14,
1595 | "⌒": 70,
1596 | "窗": 7,
1597 | "外": 223,
1598 | ":": 1311,
1599 | "欢": 4,
1600 | "两": 1925,
1601 | "未": 14,
1602 | "箭": 4,
1603 | "纯": 6,
1604 | "钳": 3,
1605 | "阴": 107,
1606 | "永": 4,
1607 | "放": 148,
1608 | "私": 3,
1609 | "础": 12,
1610 | "必": 65,
1611 | "勇": 1,
1612 | "呈": 11,
1613 | "棋": 11,
1614 | "坊": 4,
1615 | "母": 53,
1616 | "济": 10,
1617 | "P": 2875,
1618 | "荒": 2,
1619 | "奖": 48,
1620 | "烛": 25,
1621 | "罚": 5,
1622 | "盟": 4,
1623 | "止": 111,
1624 | "拧": 1,
1625 | "早": 14,
1626 | "士": 9,
1627 | "马": 20,
1628 | "灰": 2,
1629 | "职": 8,
1630 | "蕨": 1,
1631 | "淹": 1,
1632 | "替": 4,
1633 | "礁": 3,
1634 | "陪": 1,
1635 | "庄": 10,
1636 | "切": 308,
1637 | "针": 201,
1638 | "栋": 5,
1639 | "脑": 19,
1640 | "排": 110,
1641 | "首": 19,
1642 | "粘": 4,
1643 | "匝": 4,
1644 | "队": 140,
1645 | "订": 7,
1646 | "交": 1788,
1647 | "∧": 6,
1648 | "慰": 1,
1649 | "草": 14,
1650 | "费": 212,
1651 | "总": 172,
1652 | "寂": 2,
1653 | "组": 295,
1654 | "片": 156,
1655 | "准": 74,
1656 | "哈": 4,
1657 | "插": 4,
1658 | "询": 1,
1659 | "座": 41,
1660 | "执": 13,
1661 | "填": 58,
1662 | "差": 94,
1663 | "共": 397,
1664 | "味": 5,
1665 | "史": 2,
1666 | "直": 2859,
1667 | "泸": 1,
1668 | "{": 174,
1669 | "拼": 44,
1670 | "弄": 1,
1671 | "率": 287,
1672 | "逆": 78,
1673 | "钉": 2,
1674 | "锈": 2,
1675 | "脐": 1,
1676 | "戒": 1,
1677 | "算": 219,
1678 | "高": 369,
1679 | "军": 27,
1680 | "猜": 53,
1681 | "龄": 18,
1682 | "趣": 21,
1683 | "控": 7,
1684 | "…": 68,
1685 | "第": 593,
1686 | "铜": 6,
1687 | "焰": 1,
1688 | "油": 22,
1689 | "贸": 1,
1690 | "为": 5153,
1691 | "抽": 75,
1692 | "壁": 8,
1693 | "罄": 1,
1694 | "歌": 7,
1695 | "漏": 1,
1696 | "糙": 2,
1697 | "奥": 7,
1698 | "卜": 5,
1699 | "域": 94,
1700 | "裤": 7,
1701 | "以": 894,
1702 | "f": 567,
1703 | "羽": 2,
1704 | "兰": 7,
1705 | "割": 31,
1706 | "请": 603,
1707 | "桶": 23,
1708 | "浸": 1,
1709 | "臻": 1,
1710 | "柿": 4,
1711 | "码": 26,
1712 | "径": 496,
1713 | "吉": 7,
1714 | "伏": 5,
1715 | "←": 1,
1716 | "颖": 5,
1717 | "庚": 1,
1718 | "初": 31,
1719 | "真": 33,
1720 | "校": 243,
1721 | "责": 7,
1722 | "严": 3,
1723 | "悦": 1,
1724 | "夫": 2,
1725 | "弯": 9,
1726 | "粮": 4,
1727 | "冰": 15,
1728 | "申": 2,
1729 | "劲": 1,
1730 | "阵": 4,
1731 | "土": 26,
1732 | "人": 566,
1733 | "∽": 35,
1734 | "秀": 8,
1735 | "幅": 13,
1736 | "升": 32,
1737 | "益": 6,
1738 | "住": 45,
1739 | "Q": 737,
1740 | "字": 158,
1741 | "衡": 10,
1742 | "摩": 6,
1743 | "凰": 2,
1744 | "偿": 3,
1745 | "试": 276,
1746 | "板": 105,
1747 | "宁": 18,
1748 | "稀": 2,
1749 | "射": 215,
1750 | "成": 812,
1751 | "碗": 3,
1752 | "聘": 2,
1753 | "龙": 10,
1754 | "力": 49,
1755 | "钟": 97,
1756 | "计": 340,
1757 | "昨": 1,
1758 | "☆": 3,
1759 | "到": 1039,
1760 | "吴": 5,
1761 | "边": 1956,
1762 | "游": 75,
1763 | "贺": 3,
1764 | "<": 50,
1765 | "练": 24,
1766 | "陈": 2,
1767 | "行": 781,
1768 | "低": 75,
1769 | "摔": 2,
1770 | "策": 5,
1771 | "忙": 4,
1772 | "停": 133,
1773 | "非": 35,
1774 | "代": 145,
1775 | "汤": 1,
1776 | "晨": 4,
1777 | "硬": 29,
1778 | "置": 247,
1779 | "桨": 1,
1780 | "迹": 59,
1781 | "|": 83,
1782 | "松": 6,
1783 | "布": 45,
1784 | "如": 2274,
1785 | "历": 6,
1786 | "植": 35,
1787 | "挑": 4,
1788 | "易": 16,
1789 | "→": 171,
1790 | "鹅": 1,
1791 | "创": 12,
1792 | "次": 892,
1793 | "渣": 1,
1794 | "津": 3,
1795 | "性": 101,
1796 | "Φ": 5,
1797 | "暨": 1,
1798 | "薪": 2,
1799 | "甜": 2,
1800 | "阳": 36,
1801 | "冲": 6,
1802 | "电": 186,
1803 | "样": 207,
1804 | "舞": 3,
1805 | "髀": 1,
1806 | "遇": 62,
1807 | "鼠": 12,
1808 | "盈": 23,
1809 | "迎": 7,
1810 | "拐": 14,
1811 | "⇒": 1,
1812 | "绘": 9,
1813 | "牛": 23,
1814 | "g": 172,
1815 | "饭": 7,
1816 | "危": 6,
1817 | "突": 7,
1818 | "堂": 2,
1819 | "分": 2482,
1820 | "圈": 27,
1821 | "残": 3,
1822 | "与": 2046,
1823 | "抑": 1,
1824 | "撕": 1,
1825 | "缘": 6,
1826 | "尚": 6,
1827 | "辣": 1,
1828 | "木": 38,
1829 | "宫": 3,
1830 | "帮": 29,
1831 | "励": 7,
1832 | "伸": 11,
1833 | "λ": 16,
1834 | "显": 6,
1835 | "心": 383,
1836 | "采": 49,
1837 | "告": 12,
1838 | "棍": 1,
1839 | "捐": 24,
1840 | "染": 3,
1841 | "R": 288,
1842 | "∏": 4,
1843 | "给": 117,
1844 | "衢": 4,
1845 | "泥": 5,
1846 | "赤": 1,
1847 | "杨": 11,
1848 | "印": 16,
1849 | "石": 14,
1850 | "能": 425,
1851 | "威": 4,
1852 | "简": 36,
1853 | "嘉": 6,
1854 | "于": 2477,
1855 | "我": 118,
1856 | "农": 34,
1857 | "生": 424,
1858 | "模": 89,
1859 | "梢": 1,
1860 | "芦": 1,
1861 | "累": 5,
1862 | "爱": 11,
1863 | "现": 277,
1864 | "圳": 5,
1865 | "丹": 3,
1866 | "=": 5025,
1867 | "政": 18,
1868 | "J": 5,
1869 | "居": 29,
1870 | "钠": 3,
1871 | "从": 584,
1872 | "扑": 4,
1873 | "提": 96,
1874 | "套": 42,
1875 | "竖": 22,
1876 | "忘": 6,
1877 | "回": 114,
1878 | "橡": 3,
1879 | "佣": 2,
1880 | "慧": 9,
1881 | "拦": 2,
1882 | "台": 123,
1883 | "间": 882,
1884 | "}": 173,
1885 | "曾": 4,
1886 | "境": 9,
1887 | "异": 39,
1888 | "宅": 2,
1889 | "娄": 2,
1890 | "斑": 3,
1891 | "店": 114,
1892 | "崖": 1,
1893 | "妙": 2,
1894 | "功": 8,
1895 | "鄞": 1,
1896 | "(": 75,
1897 | "央": 3,
1898 | "露": 5,
1899 | "核": 7,
1900 | "⊿": 1,
1901 | "局": 14,
1902 | "雇": 12,
1903 | "商": 264,
1904 | "凉": 5,
1905 | "半": 542,
1906 | "乌": 8,
1907 | "扔": 2,
1908 | "塘": 4,
1909 | "淡": 2,
1910 | "池": 38,
1911 | "该": 518,
1912 | "h": 137,
1913 | "端": 90,
1914 | "饮": 22,
1915 | "价": 499,
1916 | "赶": 12,
1917 | "拿": 9,
1918 | "腾": 1,
1919 | "j": 5,
1920 | "颗": 10,
1921 | "疆": 1,
1922 | "伍": 10,
1923 | "二": 499,
1924 | "题": 545,
1925 | "定": 544,
1926 | "障": 3,
1927 | "贡": 2,
1928 | "您": 1,
1929 | "措": 2,
1930 | "由": 408,
1931 | "论": 273,
1932 | "房": 50,
1933 | "诊": 3,
1934 | "魅": 1,
1935 | "光": 83,
1936 | "烈": 1,
1937 | "及": 172,
1938 | "位": 803,
1939 | "救": 9,
1940 | "S": 403,
1941 | "捕": 1,
1942 | "拔": 8,
1943 | "坝": 10,
1944 | "漠": 1,
1945 | "衣": 23,
1946 | "步": 73,
1947 | "▪": 20,
1948 | "旦": 8,
1949 | "承": 9,
1950 | "岁": 21,
1951 | "市": 259,
1952 | "理": 382,
1953 | "炉": 2,
1954 | "刊": 1,
1955 | "撑": 2,
1956 | "投": 67,
1957 | "得": 775,
1958 | "尖": 4,
1959 | "墙": 37,
1960 | "原": 407,
1961 | "没": 62,
1962 | "α": 151,
1963 | "踢": 4,
1964 | "肩": 2,
1965 | "航": 68,
1966 | "庭": 20,
1967 | "钱": 54,
1968 | "田": 3,
1969 | "决": 84,
1970 | "防": 11,
1971 | "猴": 5,
1972 | ">": 35,
1973 | "幂": 2,
1974 | "先": 111,
1975 | "削": 2,
1976 | "仍": 39,
1977 | "每": 928,
1978 | "志": 10,
1979 | "泡": 1,
1980 | "赠": 7,
1981 | "解": 543,
1982 | "此": 306,
1983 | "⑦": 1,
1984 | "续": 61,
1985 | "彬": 1,
1986 | "确": 283,
1987 | "却": 1,
1988 | "迷": 1,
1989 | "汶": 1,
1990 | "霾": 3,
1991 | "封": 15,
1992 | "震": 9,
1993 | "选": 139,
1994 | "冈": 5,
1995 | "洋": 3,
1996 | "不": 1149,
1997 | "斐": 4,
1998 | "媚": 1,
1999 | "溢": 4,
2000 | "认": 26,
2001 | ")": 75,
2002 | "膨": 1,
2003 | "紫": 7,
2004 | "荆": 1,
2005 | "鸭": 2,
2006 | "根": 324,
2007 | "攻": 1,
2008 | "寄": 3,
2009 | "均": 254,
2010 | "测": 186,
2011 | "桌": 35,
2012 | "罗": 1,
2013 | "糖": 19,
2014 | "秘": 3,
2015 | "象": 686,
2016 | "擦": 2,
2017 | "i": 151,
2018 | "关": 794,
2019 | "盲": 3,
2020 | "牵": 3,
2021 | "∪": 4,
2022 | "征": 7,
2023 | "圆": 1087,
2024 | "玉": 7,
2025 | "事": 21,
2026 | "抓": 5,
2027 | "—": 5,
2028 | "粗": 22,
2029 | "编": 25,
2030 | "多": 786,
2031 | "喝": 2,
2032 | "澡": 1,
2033 | "渠": 6,
2034 | "财": 6,
2035 | "Ω": 2,
2036 | "∨": 5,
2037 | "劳": 6,
2038 | "甲": 490,
2039 | "况": 74,
2040 | "施": 20,
2041 | "叉": 3,
2042 | "版": 3,
2043 | "介": 2,
2044 | "拓": 11,
2045 | "联": 33,
2046 | "四": 784,
2047 | "烟": 6,
2048 | "信": 60,
2049 | "习": 46,
2050 | "赢": 5,
2051 | "僧": 2,
2052 | "⊂": 3,
2053 | "韩": 2,
2054 | "!": 1,
2055 | "繁": 4,
2056 | "预": 22,
2057 | "下": 880,
2058 | "栏": 9,
2059 | "潮": 8,
2060 | "速": 531,
2061 | "萧": 1,
2062 | "盆": 2,
2063 | "谷": 5,
2064 | "?": 18,
2065 | "卉": 1,
2066 | "之": 518,
2067 | "释": 13,
2068 | "村": 30,
2069 | "打": 87,
2070 | "孙": 1,
2071 | "⊥": 481,
2072 | "姚": 2,
2073 | "瓜": 4,
2074 | "侧": 176,
2075 | "拨": 5,
2076 | "自": 177,
2077 | "篮": 26,
2078 | "困": 2,
2079 | "闲": 2,
2080 | "兵": 6,
2081 | "郴": 1,
2082 | "开": 235,
2083 | "它": 260,
2084 | "墅": 5,
2085 | "处": 318,
2086 | "例": 307,
2087 | "《": 4,
2088 | "暑": 5,
2089 | "Γ": 5,
2090 | "钝": 11,
2091 | "睛": 4,
2092 | "熟": 1,
2093 | "传": 35,
2094 | "谢": 2,
2095 | "*": 11,
2096 | "岭": 2,
2097 | "地": 484,
2098 | "层": 29,
2099 | "陆": 6,
2100 | "午": 22,
2101 | "赌": 3,
2102 | "材": 39,
2103 | "站": 46,
2104 | "存": 535,
2105 | "铝": 2,
2106 | "镜": 25,
2107 | "满": 384,
2108 | "你": 278,
2109 | "旧": 8,
2110 | "择": 46,
2111 | "快": 50,
2112 | "聪": 14,
2113 | "积": 860,
2114 | "驮": 3,
2115 | "物": 1081,
2116 | "右": 256,
2117 | "兴": 26,
2118 | "混": 16,
2119 | "找": 45,
2120 | "币": 19,
2121 | "品": 319,
2122 | "皆": 1,
2123 | "咱": 1,
2124 | "析": 259,
2125 | "官": 1,
2126 | "喜": 5,
2127 | "办": 13,
2128 | "渡": 1,
2129 | "欣": 2,
2130 | "∩": 8,
2131 | "推": 28,
2132 | "溶": 12,
2133 | "许": 7,
2134 | "落": 152,
2135 | "○": 13,
2136 | "愿": 5,
2137 | "﹁": 3,
2138 | "海": 72,
2139 | "镇": 13,
2140 | "集": 67,
2141 | "又": 78,
2142 | "郊": 4,
2143 | "跌": 1,
2144 | "虑": 15,
2145 | "某": 559,
2146 | "U": 1,
2147 | "坛": 12,
2148 | "乡": 3,
2149 | "正": 1218,
2150 | "波": 15,
2151 | "补": 38,
2152 | "'": 17,
2153 | "扩": 5,
2154 | "即": 84,
2155 | "遵": 1,
2156 | "起": 66,
2157 | "件": 471,
2158 | "诸": 2,
2159 | "腿": 4,
2160 | "孔": 5
2161 | }
--------------------------------------------------------------------------------
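The dictionary above appears to map each character to its occurrence count in the training labels. A char-to-index mapping of the kind loaded via `args.word_index_json` in the script earlier could plausibly be derived from it; the sketch below is only an illustration (the frequency-based ordering and the output filename are assumptions, not the repo's actual preprocessing):

```python
import json

# Illustrative sketch: derive a char -> index mapping from a {char: count} dictionary
# shaped like files/alphabet_count_dict.json. The repo's own preprocessing may order
# or filter characters differently; the output filename below is hypothetical.
with open('files/alphabet_count_dict.json') as f:
    count_dict = json.load(f)

# Assign consecutive indices, most frequent characters first.
alphabet = sorted(count_dict, key=count_dict.get, reverse=True)
word_index_dict = {ch: i for i, ch in enumerate(alphabet)}

with open('word_index_dict.json', 'w') as f:
    json.dump(word_index_dict, f, ensure_ascii=False)
```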
/files/black.json:
--------------------------------------------------------------------------------
1 | {
2 | "white_list": [
3 | "A4758.png",
4 | "A3905.png",
5 | "T359_0.png",
6 | "T898_10.png",
7 | "T1091_4.png",
8 | "B2258_1.png",
9 | "A2114.png",
10 | "A9393.png",
11 | "T40_11.png",
12 | "T411_4.png",
13 | "A2754.png",
14 | "A14295.png",
15 | "A14349.png",
16 | "A14364.png",
17 | "A15101.png",
18 | "A15241.png",
19 | "A15933.png",
20 | "A16323.png",
21 | "A1595.png",
22 | "A16092.png",
23 | "A17063.png",
24 | "A16761.png",
25 | "A17141.png",
26 | "A16560.png",
27 | "A16953.png",
28 | "A1745.png",
29 | "A17010.png",
30 | "A16435.png",
31 | "A17281.png",
32 | "A17980.png",
33 | "A1875.png",
34 | "A18451.png",
35 | "A18969.png",
36 | "A19152.png",
37 | "A19424.png",
38 | "A17989.png",
39 | "A18139.png",
40 | "A18891.png",
41 | "A19260.png",
42 | "A18701.png",
43 | "A19584.png",
44 | "A19012.png",
45 | "A215.png",
46 | "A20801.png",
47 | "A22541.png",
48 | "A22740.png",
49 | "A22290.png",
50 | "A21194.png",
51 | "A22899.png",
52 | "A22381.png",
53 | "A19650.png",
54 | "A26050.png",
55 | "A24554.png",
56 | "A309.png",
57 | "A26100.png",
58 | "A24084.png",
59 | "A2508.png",
60 | "A25709.png",
61 | "A26620.png",
62 | "A2594.png",
63 | "A24920.png",
64 | "A27604.png",
65 | "A27933.png",
66 | "A25019.png",
67 | "A26091.png",
68 | "A2432.png",
69 | "A2493.png",
70 | "A2755.png",
71 | "A23883.png",
72 | "A3437.png",
73 | "A27102.png",
74 | "A314.png",
75 | "A25250.png",
76 | "A25612.png",
77 | "A24169.png",
78 | "A28034.png",
79 | "A3210.png",
80 | "A26760.png",
81 | "A2538.png",
82 | "A2974.png",
83 | "A24609.png",
84 | "A20.png",
85 | "A24062.png",
86 | "A3223.png",
87 | "A24311.png",
88 | "A26699.png",
89 | "A23992.png",
90 | "A26012.png",
91 | "A2175.png",
92 | "A24603.png",
93 | "A27172.png",
94 | "A2004.png",
95 | "A20100.png",
96 | "A3808.png",
97 | "A4943.png",
98 | "A441.png",
99 | "A7366.png",
100 | "A7242.png",
101 | "A7697.png",
102 | "A5932.png",
103 | "A5298.png",
104 | "A5823.png",
105 | "A3937.png",
106 | "A6152.png",
107 | "A4141.png",
108 | "A4141.png",
109 | "A7360.png",
110 | "A5923.png",
111 | "A3945.png",
112 | "A6821.png",
113 | "A3852.png",
114 | "A6252.png",
115 | "A4188.png",
116 | "A3681.png",
117 | "A4947.png",
118 | "A6011.png",
119 | "A5304.png",
120 | "A5304.png",
121 | "A4979.png",
122 | "A7861.png",
123 | "A5400.png",
124 | "A4611.png",
125 | "A4883.png",
126 | "A368.png",
127 | "A4347.png",
128 | "A7100.png",
129 | "A4956.png",
130 | "A7133.png",
131 | "A6237.png",
132 | "A2923.png",
133 | "A3235.png",
134 | "A26992.png",
135 | "A4642.png",
136 | "A27272.png",
137 | "A5920.png",
138 | "A21381.png",
139 | "A6608.png",
140 | "A4419.png",
141 | "A23613.png",
142 | "A27283.png",
143 | "A24464.png",
144 | "A25601.png",
145 | "A7814.png",
146 | "A24509.png",
147 | "A24304.png",
148 | "A6120.png",
149 | "A3495.png",
150 | "A3949.png",
151 | "A24379.png",
152 | "A20111.png",
153 | "A22962.png",
154 | "A8877.png",
155 | "A8936.png",
156 | "A9761.png",
157 | "A8741.png",
158 | "A9064.png",
159 | "A8370.png",
160 | "A9829.png",
161 | "A9018.png",
162 | "B1879_1.png",
163 | "A9354.png",
164 | "A8201.png",
165 | "B1813_3.png",
166 | "A8350.png",
167 | "A8353.png",
168 | "A9446.png",
169 | "B1879_0.png",
170 | "A8674.png",
171 | "A9219.png",
172 | "B2404_2.png",
173 | "T1175_12.png",
174 | "B935_8.png",
175 | "T1140_5.png",
176 | "B523_1.png",
177 | "B523_5.png",
178 | "T1175_15.png",
179 | "B523_0.png",
180 | "T1058_17.png",
181 | "T1089_3.png",
182 | "B2741_0.png",
183 | "T1036_1.png",
184 | "T1184_4.png",
185 | "T129_18.png",
186 | "T134_7.png",
187 | "T142_4.png",
188 | "T144_4.png",
189 | "T169_10.png",
190 | "T169_6.png",
191 | "T174_0.png",
192 | "T200_0.png",
193 | "T20_8.png",
194 | "T217_6.png",
195 | "T217_8.png",
196 | "T230_6.png",
197 | "T235_12.png",
198 | "T23_4.png",
199 | "T244_5.png",
200 | "T247_8.png",
201 | "T261_8.png",
202 | "T270_12.png",
203 | "T294_4.png",
204 | "T300_12.png",
205 | "T302_1.png",
206 | "T311_11.png",
207 | "T321_11.png",
208 | "T321_3.png",
209 | "T321_9.png",
210 | "T324_2.png",
211 | "T327_5.png",
212 | "T328_7.png",
213 | "T329_1.png",
214 | "T331_0.png",
215 | "T340_1.png",
216 | "T352_4.png",
217 | "T352_8.png",
218 | "T356_5.png",
219 | "T358_7.png",
220 | "T359_6.png",
221 | "T366_12.png",
222 | "T372_4.png",
223 | "T374_5.png",
224 | "T374_6.png",
225 | "T381_2.png",
226 | "T381_4.png",
227 | "T381_6.png",
228 | "T382_3.png",
229 | "T387_1.png",
230 | "T389_14.png",
231 | "T38_1.png",
232 | "T38_4.png",
233 | "T396_6.png",
234 | "T3_8.png",
235 | "T403_9.png",
236 | "T409_0.png",
237 | "T40_14.png",
238 | "T40_15.png",
239 | "T40_2.png",
240 | "T40_6.png",
241 | "T411_0.png",
242 | "T411_5.png",
243 | "T41_1.png",
244 | "T50_16.png",
245 | "T50_19.png",
246 | "T53_13.png",
247 | "T53_9.png",
248 | "T580_4.png",
249 | "T580_5.png",
250 | "T582_3.png",
251 | "T582_5.png",
252 | "T583_4.png",
253 | "T586_1.png",
254 | "T58_0.png",
255 | "T58_3.png",
256 | "T58_7.png",
257 | "T597_0.png",
258 | "T602_7.png",
259 | "T602_9.png",
260 | "T607_0.png",
261 | "T619_12.png",
262 | "T619_6.png",
263 | "T619_7.png",
264 | "T636_0.png",
265 | "T636_4.png",
266 | "T642_1.png",
267 | "T647_18.png",
268 | "T647_2.png",
269 | "T647_20.png",
270 | "T64_0.png",
271 | "T64_1.png",
272 | "T658_11.png",
273 | "T658_6.png",
274 | "T663_11.png",
275 | "T66_3.png",
276 | "T677_6.png",
277 | "T693_0.png",
278 | "T693_9.png",
279 | "T695_0.png",
280 | "T710_14.png",
281 | "T711_5.png",
282 | "T712_4.png",
283 | "T71_0.png",
284 | "T71_1.png",
285 | "T71_2.png",
286 | "T71_3.png",
287 | "T71_4.png",
288 | "T71_5.png",
289 | "T71_6.png",
290 | "T71_7.png",
291 | "T71_8.png",
292 | "T71_9.png",
293 | "T724_10.png",
294 | "T725_11.png",
295 | "T726_1.png",
296 | "T734_2.png",
297 | "T736_15.png",
298 | "T736_2.png",
299 | "T736_5.png",
300 | "T740_2.png",
301 | "T745_1.png",
302 | "T756_7.png",
303 | "T757_4.png",
304 | "T762_0.png",
305 | "T767_1.png",
306 | "T767_6.png",
307 | "T770_0.png",
308 | "T770_10.png",
309 | "T770_11.png",
310 | "T770_6.png",
311 | "T772_11.png",
312 | "T775_9.png",
313 | "T77_1.png",
314 | "T795_1.png",
315 | "T795_11.png",
316 | "T7_7.png",
317 | "T7_9.png",
318 | "T803_6.png",
319 | "T803_7.png",
320 | "T810_5.png",
321 | "T810_6.png",
322 | "T810_7.png",
323 | "T813_5.png",
324 | "T823_4.png",
325 | "T823_5.png",
326 | "T840_6.png",
327 | "T844_9.png",
328 | "T848_1.png",
329 | "T855_2.png",
330 | "T856_18.png",
331 | "T856_2.png",
332 | "T856_4.png",
333 | "T865_6.png",
334 | "T86_1.png",
335 | "T86_6.png",
336 | "T879_6.png",
337 | "T884_4.png",
338 | "T886_1.png",
339 | "T898_8.png",
340 | "T913_14.png",
341 | "T915_4.png",
342 | "T919_1.png",
343 | "T932_3.png",
344 | "T945_12.png",
345 | "T945_13.png",
346 | "T945_15.png",
347 | "T945_16.png",
348 | "T945_17.png",
349 | "T945_18.png",
350 | "T945_8.png",
351 | "T963_1.png",
352 | "T96_5.png",
353 | "T96_6.png",
354 | "T972_6.png",
355 | "T979_13.png",
356 | "T994_4.png",
357 | "T997_10.png",
358 | "T999_7.png",
359 | "T106_1.png",
360 | "T188_11.png",
361 | "T763_1.png",
362 | "T763_2.png",
363 | "T865_0.png",
364 | "T876_9.png",
365 | "T999_3.png",
366 | "A1007.png",
367 | "A1264.png",
368 | "A14912.png",
369 | "A15901.png",
370 | "A17682.png",
371 | "A20064.png",
372 | "A24631.png",
373 | "A2751.png",
374 | "A4189.png",
375 | "A9707.png",
376 | "B2436_1.png",
377 | "B2861_1.png",
378 | "T1027_2.png",
379 | "T151_8.png",
380 | "T165_6.png",
381 | "T207_12.png",
382 | "T217_1.png",
383 | "T217_3.png",
384 | "T261_1.png",
385 | "T261_2.png",
386 | "T311_7.png",
387 | "T320_5.png",
388 | "T325_1.png",
389 | "T329_8.png",
390 | "T333_6.png",
391 | "T342_5.png",
392 | "T350_10.png",
393 | "T350_2.png",
394 | "T387_2.png",
395 | "T387_4.png",
396 | "T389_10.png",
397 | "T3_7.png",
398 | "T4_7.png",
399 | "T58_8.png",
400 | "T597_9.png",
401 | "T59_6.png",
402 | "T5_0.png",
403 | "T5_1.png",
404 | "T619_13.png",
405 | "T624_2.png",
406 | "T636_1.png",
407 | "T647_21.png",
408 | "T658_13.png",
409 | "T663_3.png",
410 | "T667_2.png",
411 | "T684_5.png",
412 | "T684_8.png",
413 | "T687_2.png",
414 | "T6_11.png",
415 | "T731_5.png",
416 | "T735_1.png",
417 | "T756_12.png",
418 | "T757_3.png",
419 | "T795_13.png",
420 | "T838_9.png",
421 | "T856_19.png",
422 | "T857_0.png",
423 | "T86_7.png",
424 | "T886_0.png",
425 | "T898_9.png",
426 | "T933_5.png",
427 | "T997_7.png",
428 | "T374_0.png",
429 | "A24374.png",
430 | "T799_1.png",
431 | "T398_7.png",
432 | "T949_8.png",
433 | "T789_1.png",
434 | "T1009_1.png",
435 | "T4_5.png",
436 | "T816_0.png",
437 | "T159_6.png",
438 | "A14243.png",
439 | "A1610.png",
440 | "A6442.png",
441 | "A3354.png",
442 | "T690_4.png",
443 | "A4609.png",
444 | "T1077_7.png",
445 | "A5355.png",
446 | "T73_3.png",
447 | "A3692.png",
448 | "A9506.png",
449 | "A14812.png",
450 | "T936_8.png",
451 | "T1071_5.png",
452 | "T216_7.png",
453 | "T1105_2.png",
454 | "T261_3.png",
455 | "A21449.png",
456 | "T124_13.png",
457 | "T583_6.png",
458 | "T942_7.png",
459 | "B1442_9.png",
460 | "B968_3.png",
461 | "T401_6.png",
462 | "T230_10.png",
463 | "A2143.png",
464 | "A2143.png",
465 | "A9643.png",
466 | "T587_1.png",
467 | "A24620.png",
468 | "T934_0.png",
469 | "A2433.png",
470 | "T881_5.png",
471 | "T931_24.png",
472 | "B858_2.png",
473 | "T1009_0.png",
474 | "T270_14.png",
475 | "T181_18.png",
476 | "T1071_6.png",
477 | "A4674.png",
478 | "A16263.png",
479 | "A6368.png",
480 | "T1134_7.png",
481 | "A7325.png",
482 | "T174_5.png",
483 | "B685_0.png",
484 | "T285_2.png",
485 | "A20784.png",
486 | "A19004.png",
487 | "A2612.png",
488 | "T374_8.png",
489 | "B2681_2.png",
490 | "A26479.png",
491 | "B1958_0.png",
492 | "T312_1.png",
493 | "A1268.png",
494 | "A798.png",
495 | "A7143.png",
496 | "B121_0.png",
497 | "A20795.png",
498 | "A21802.png",
499 | "A2295.png",
500 | "A4076.png",
501 | "A3121.png",
502 | "A27044.png",
503 | "T684_6.png",
504 | "A6189.png",
505 | "T723_3.png",
506 | "T218_9.png",
507 | "T279_5.png",
508 | "A4335.png",
509 | "T634_7.png",
510 | "T870_2.png",
511 | "A4889.png"
512 | ],
513 | "black_list": [
514 | "A14430.png",
515 | "A1315.png",
516 | "A1573.png",
517 | "A16342.png",
518 | "A18403.png",
519 | "A18610.png",
520 | "A19289.png",
521 | "A1945.png",
522 | "A19462.png",
523 | "A19233.png",
524 | "A23543.png",
525 | "A22742.png",
526 | "A22689.png",
527 | "A20253.png",
528 | "A19845.png",
529 | "A20654.png",
530 | "A475.png",
531 | "B1339_5.png",
532 | "B1462_3.png",
533 | "B1339_4.png",
534 | "B1014_0.png",
535 | "B1610_7.png",
536 | "B1864_0.png",
537 | "B1864_1.png",
538 | "B1141_13.png",
539 | "B1884_0.png",
540 | "B1141_3.png",
541 | "B1721_0.png",
542 | "B1252_1.png",
543 | "B1877_3.png",
544 | "B1801_3.png",
545 | "B1422_1.png",
546 | "B1387_2.png",
547 | "B1339_3.png",
548 | "B1007_0.png",
549 | "B1131_0.png",
550 | "B1252_0.png",
551 | "B1141_2.png",
552 | "B1141_14.png",
553 | "B1652_1.png",
554 | "B1422_3.png",
555 | "B1141_8.png",
556 | "B1652_2.png",
557 | "B1052_2.png",
558 | "B1141_4.png",
559 | "B1141_6.png",
560 | "B1422_5.png",
561 | "B1339_6.png",
562 | "B1462_2.png",
563 | "B1410_0.png",
564 | "B1422_4.png",
565 | "B1339_7.png",
566 | "B1864_3.png",
567 | "B1387_1.png",
568 | "B1864_4.png",
569 | "B1864_2.png",
570 | "B1339_2.png",
571 | "B1801_2.png",
572 | "B1877_2.png",
573 | "B1052_1.png",
574 | "B1462_1.png",
575 | "B1877_1.png",
576 | "B1387_0.png",
577 | "B1387_3.png",
578 | "B1566_0.png",
579 | "B1141_12.png",
580 | "B2756_5.png",
581 | "B245_0.png",
582 | "B2530_5.png",
583 | "B2999_3.png",
584 | "B227_1.png",
585 | "B2411_0.png",
586 | "B364_0.png",
587 | "B2530_8.png",
588 | "B552_0.png",
589 | "B535_4.png",
590 | "B2756_7.png",
591 | "B2999_6.png",
592 | "B631_0.png",
593 | "B245_3.png",
594 | "B2043_0.png",
595 | "B2513_0.png",
596 | "B2557_4.png",
597 | "B3090_4.png",
598 | "B2982_16.png",
599 | "B2267_0.png",
600 | "B2092_3.png",
601 | "B364_1.png",
602 | "B2999_5.png",
603 | "T103_5.png",
604 | "B759_4.png",
605 | "T1070_1.png",
606 | "T1095_4.png",
607 | "B866_10.png",
608 | "B762_4.png",
609 | "T1135_6.png",
610 | "T1108_9.png",
611 | "T1056_4.png",
612 | "T1061_2.png",
613 | "T105_9.png",
614 | "T1135_7.png",
615 | "T1174_7.png",
616 | "B866_6.png",
617 | "T1070_4.png",
618 | "B866_1.png",
619 | "B866_4.png",
620 | "B2999_2.png",
621 | "T107_3.png",
622 | "B227_0.png",
623 | "B866_0.png",
624 | "B2412_0.png",
625 | "B2982_6.png",
626 | "B2999_11.png",
627 | "B2993_6.png",
628 | "T1070_2.png",
629 | "T1070_12.png",
630 | "B245_1.png",
631 | "B3007_0.png",
632 | "B2267_2.png",
633 | "T1112_10.png",
634 | "B535_3.png",
635 | "B552_1.png",
636 | "B2092_4.png",
637 | "B245_2.png",
638 | "T1070_3.png",
639 | "B421_0.png",
640 | "B2557_6.png",
641 | "T1070_0.png",
642 | "B2530_3.png",
643 | "B2092_2.png",
644 | "B762_5.png",
645 | "B2557_5.png",
646 | "B2511_2.png",
647 | "B759_3.png",
648 | "B2982_14.png",
649 | "T1135_4.png",
650 | "B2530_6.png",
651 | "B227_2.png",
652 | "B2215_0.png",
653 | "B2530_9.png",
654 | "B2982_7.png",
655 | "T1070_8.png",
656 | "B2176_0.png",
657 | "B759_2.png",
658 | "B2999_0.png",
659 | "B762_6.png",
660 | "B2982_5.png",
661 | "T1052_4.png",
662 | "B2412_2.png",
663 | "B634_0.png",
664 | "B552_2.png",
665 | "B2999_10.png",
666 | "B762_0.png",
667 | "B2982_4.png",
668 | "B2999_4.png",
669 | "B2452_0.png",
670 | "B866_3.png",
671 | "B2567_0.png",
672 | "B2703_2.png",
673 | "B364_2.png",
674 | "B2557_7.png",
675 | "T1155_3.png",
676 | "B2252_0.png",
677 | "B2999_1.png",
678 | "T1046_3.png",
679 | "T1135_0.png",
680 | "B2530_14.png",
681 | "B227_3.png",
682 | "B552_3.png",
683 | "B866_2.png",
684 | "B2557_3.png",
685 | "B216_0.png",
686 | "B2412_1.png",
687 | "B2530_13.png",
688 | "B2649_0.png",
689 | "B2748_0.png",
690 | "B2748_1.png",
691 | "B2756_4.png",
692 | "B2982_2.png",
693 | "B2982_3.png",
694 | "B2993_1.png",
695 | "B2993_5.png",
696 | "B3084_0.png",
697 | "B3090_0.png",
698 | "B535_8.png",
699 | "B699_0.png",
700 | "B803_0.png",
701 | "T1071_1.png",
702 | "T1112_9.png",
703 | "T1135_8.png",
704 | "T117_2.png",
705 | "T121_2.png",
706 | "T127_4.png",
707 | "T129_1.png",
708 | "T129_13.png",
709 | "T150_8.png",
710 | "T169_12.png",
711 | "T182_12.png",
712 | "T183_5.png",
713 | "T188_6.png",
714 | "T19_2.png",
715 | "T206_13.png",
716 | "T206_8.png",
717 | "T216_9.png",
718 | "T230_7.png",
719 | "T233_11.png",
720 | "T258_3.png",
721 | "T260_3.png",
722 | "T288_0.png",
723 | "T288_11.png",
724 | "T288_9.png",
725 | "T294_9.png",
726 | "T300_11.png",
727 | "T319_8.png",
728 | "T32_0.png",
729 | "T32_1.png",
730 | "T32_2.png",
731 | "T32_4.png",
732 | "T32_5.png",
733 | "T32_6.png",
734 | "T32_9.png",
735 | "T330_8.png",
736 | "T340_10.png",
737 | "T344_3.png",
738 | "T357_5.png",
739 | "T377_3.png",
740 | "T389_6.png",
741 | "T38_2.png",
742 | "T390_8.png",
743 | "T398_9.png",
744 | "T3_0.png",
745 | "T3_1.png",
746 | "T3_6.png",
747 | "T4_4.png",
748 | "T50_7.png",
749 | "T53_11.png",
750 | "T53_14.png",
751 | "T593_5.png",
752 | "T597_8.png",
753 | "T59_7.png",
754 | "T608_4.png",
755 | "T648_4.png",
756 | "T659_7.png",
757 | "T668_4.png",
758 | "T668_5.png",
759 | "T668_6.png",
760 | "T668_7.png",
761 | "T668_8.png",
762 | "T688_8.png",
763 | "T691_4.png",
764 | "T701_11.png",
765 | "T710_7.png",
766 | "T747_0.png",
767 | "T747_1.png",
768 | "T747_10.png",
769 | "T749_1.png",
770 | "T74_1.png",
771 | "T74_7.png",
772 | "T760_6.png",
773 | "T765_1.png",
774 | "T767_9.png",
775 | "T781_1.png",
776 | "T781_11.png",
777 | "T781_12.png",
778 | "T781_13.png",
779 | "T781_14.png",
780 | "T781_15.png",
781 | "T781_16.png",
782 | "T781_17.png",
783 | "T781_2.png",
784 | "T781_21.png",
785 | "T781_3.png",
786 | "T781_4.png",
787 | "T781_5.png",
788 | "T781_6.png",
789 | "T781_7.png",
790 | "T792_1.png",
791 | "T792_2.png",
792 | "T792_3.png",
793 | "T792_4.png",
794 | "T792_6.png",
795 | "T792_7.png",
796 | "T794_3.png",
797 | "T803_8.png",
798 | "T813_4.png",
799 | "T817_11.png",
800 | "T817_13.png",
801 | "T826_1.png",
802 | "T84_0.png",
803 | "T84_10.png",
804 | "T84_12.png",
805 | "T84_13.png",
806 | "T84_14.png",
807 | "T84_15.png",
808 | "T84_4.png",
809 | "T84_5.png",
810 | "T85_13.png",
811 | "T865_5.png",
812 | "T86_2.png",
813 | "T881_8.png",
814 | "T886_11.png",
815 | "T914_11.png",
816 | "T931_12.png",
817 | "T931_13.png",
818 | "T949_3.png",
819 | "T96_9.png",
820 | "T997_5.png",
821 | "T997_9.png",
822 | "B1052_0.png",
823 | "B1141_15.png",
824 | "B1141_5.png",
825 | "B1141_7.png",
826 | "B1422_2.png",
827 | "B1652_0.png",
828 | "B2166_0.png",
829 | "B2267_1.png",
830 | "B2530_15.png",
831 | "B2530_16.png",
832 | "B2530_17.png",
833 | "B2530_4.png",
834 | "B2530_7.png",
835 | "B2649_1.png",
836 | "B2756_3.png",
837 | "B2756_6.png",
838 | "B2865_0.png",
839 | "B2865_1.png",
840 | "B2982_12.png",
841 | "B2982_13.png",
842 | "B2982_15.png",
843 | "B2982_8.png",
844 | "B2993_0.png",
845 | "B3064_0.png",
846 | "B3126_0.png",
847 | "B421_1.png",
848 | "B535_0.png",
849 | "B535_1.png",
850 | "B535_2.png",
851 | "B866_5.png",
852 | "T1031_2.png",
853 | "T106_5.png",
854 | "T1070_10.png",
855 | "T1070_11.png",
856 | "T1070_9.png",
857 | "T1135_5.png",
858 | "T1164_6.png",
859 | "T143_2.png",
860 | "T15_4.png",
861 | "T221_4.png",
862 | "T279_1.png",
863 | "T288_1.png",
864 | "T288_10.png",
865 | "T288_12.png",
866 | "T288_13.png",
867 | "T288_14.png",
868 | "T288_15.png",
869 | "T288_16.png",
870 | "T288_17.png",
871 | "T288_2.png",
872 | "T288_3.png",
873 | "T288_4.png",
874 | "T288_5.png",
875 | "T288_6.png",
876 | "T288_7.png",
877 | "T288_8.png",
878 | "T321_10.png",
879 | "T321_6.png",
880 | "T32_3.png",
881 | "T32_7.png",
882 | "T32_8.png",
883 | "T352_6.png",
884 | "T354_4.png",
885 | "T41_2.png",
886 | "T53_0.png",
887 | "T668_2.png",
888 | "T668_3.png",
889 | "T66_1.png",
890 | "T747_11.png",
891 | "T747_5.png",
892 | "T747_6.png",
893 | "T747_7.png",
894 | "T747_8.png",
895 | "T747_9.png",
896 | "T773_0.png",
897 | "T792_5.png",
898 | "T83_2.png",
899 | "T84_1.png",
900 | "T84_11.png",
901 | "T84_2.png",
902 | "T84_3.png",
903 | "T84_6.png",
904 | "T84_7.png",
905 | "T84_8.png",
906 | "T84_9.png",
907 | "T95_2.png",
908 | "T302_3.png",
909 | "T404_3.png",
910 | "T714_3.png",
911 | "T723_4.png",
912 | "T931_9.png",
913 | "T825_1.png",
914 | "T285_3.png",
915 | "T835_23.png",
916 | "T295_3.png",
917 | "T302_10.png",
918 | "B2069_1.png",
919 | "T76_5.png",
920 | "T1172_8.png",
921 | "T1134_17.png",
922 | "T295_4.png",
923 | "A9514.png",
924 | "T765_9.png",
925 | "T1020_0.png",
926 | "T933_8.png",
927 | "T329_0.png",
928 | "T346_4.png",
929 | "T820_0.png",
930 | "T761_1.png",
931 | "T783_0.png",
932 | "T917_0.png",
933 | "T238_8.png",
934 | "T216_17.png",
935 | "T931_0.png",
936 | "T374_9.png",
937 | "T1026_13.png",
938 | "T183_7.png",
939 | "T769_2.png",
940 | "T176_1.png",
941 | "T945_22.png",
942 | "T247_0.png",
943 | "T981_3.png",
944 | "T322_4.png",
945 | "T137_0.png",
946 | "T35_3.png",
947 | "T355_4.png",
948 | "T877_0.png",
949 | "T776_0.png",
950 | "T388_8.png",
951 | "T207_16.png",
952 | "T196_8.png",
953 | "T929_10.png",
954 | "T1076_10.png",
955 | "T229_0.png",
956 | "T927_5.png",
957 | "T760_0.png",
958 | "T18_0.png",
959 | "T928_9.png",
960 | "T230_12.png",
961 | "T807_5.png",
962 | "T129_0.png",
963 | "T164_9.png",
964 | "T240_0.png",
965 | "T354_3.png",
966 | "T912_0.png",
967 | "T366_6.png",
968 | "T231_5.png",
969 | "T179_0.png",
970 | "T82_2.png",
971 | "T191_7.png",
972 | "T243_1.png",
973 | "T207_9.png",
974 | "T1007_4.png",
975 | "A25503.png",
976 | "T834_2.png",
977 | "T371_0.png",
978 | "T770_8.png",
979 | "T349_3.png",
980 | "T995_9.png",
981 | "T1016_2.png",
982 | "T227_8.png",
983 | "T771_3.png",
984 | "T1034_5.png",
985 | "T755_2.png",
986 | "A24951.png",
987 | "T207_14.png",
988 | "T935_7.png",
989 | "A8455.png",
990 | "T684_9.png",
991 | "T989_5.png",
992 | "A6750.png",
993 | "A631.png",
994 | "A8794.png",
995 | "A25351.png",
996 | "A7508.png",
997 | "A2718.png",
998 | "A26059.png",
999 | "T1134_4.png",
1000 | "A254.png",
1001 | "A9840.png",
1002 | "A511.png",
1003 | "A20920.png",
1004 | "A15739.png",
1005 | "A874.png",
1006 | "A21434.png",
1007 | "A22940.png",
1008 | "A26491.png",
1009 | "A5271.png",
1010 | "A19883.png",
1011 | "A7578.png",
1012 | "T124_8.png",
1013 | "T919_2.png",
1014 | "A25805.png",
1015 | "A5626.png",
1016 | "A25754.png",
1017 | "A8109.png",
1018 | "A20859.png",
1019 | "A5111.png",
1020 | "A5019.png",
1021 | "A26234.png",
1022 | "A20231.png",
1023 | "A26382.png",
1024 | "A5864.png",
1025 | "A22103.png",
1026 | "A26384.png",
1027 | "A3577.png",
1028 | "T684_10.png",
1029 | "A22322.png",
1030 | "A3374.png",
1031 | "T1037_4.png",
1032 | "A4999.png",
1033 | "A5769.png",
1034 | "A27040.png",
1035 | "T234_7.png",
1036 | "T725_10.png",
1037 | "T302_15.png",
1038 | "T688_9.png",
1039 | "T6_6.png",
1040 | "A296.png",
1041 | "A21322.png",
1042 | "T251_4.png",
1043 | "A8899.png"
1044 | ]
1045 | }
1046 |
--------------------------------------------------------------------------------
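`files/black.json` above holds a `white_list` and a `black_list` of image filenames, and the script at the top of this section copies a sample of the black-listed images for visual inspection. One plausible downstream use is excluding those images from training; the sketch below illustrates that idea only under that assumption (the training-image directory path is an assumption, not taken from the repo):

```python
import json
import os

# Illustrative sketch: drop black-listed images from the set of training files.
# How black.json is actually consumed by the training pipeline is not shown here;
# data/train below is an assumed location for the training images.
with open('files/black.json') as f:
    black_set = set(json.load(f)['black_list'])

train_dir = os.path.join('data', 'train')
train_files = [name for name in os.listdir(train_dir) if name not in black_set]
print('kept %d images after removing black-listed names' % len(train_files))
```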
/files/src/A81.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinchangchang/ocr_densenet/a31f57e006f73b52b3881fd4a771320f02df2147/files/src/A81.png
--------------------------------------------------------------------------------
/files/src/B1000_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinchangchang/ocr_densenet/a31f57e006f73b52b3881fd4a771320f02df2147/files/src/B1000_0.png
--------------------------------------------------------------------------------
/files/ttf/simsun.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yinchangchang/ocr_densenet/a31f57e006f73b52b3881fd4a771320f02df2147/files/ttf/simsun.ttf
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | Pillow
2 | fuzzywuzzy
3 | numpy==1.14.2
4 | tqdm==4.19.4
5 | scikit-image==0.13.0
6 | scikit-learn==0.19.1
7 | torchvision==0.2.0
8 | scipy==0.19.0
9 | matplotlib==2.0.2
10 |
--------------------------------------------------------------------------------