├── .gitignore
├── demo
│   └── re1.jpg
├── location
│   ├── README.md
│   ├── output_crop
│   │   ├── 07010974437.jpg
│   │   ├── 18893039105.jpg
│   │   ├── 38661425763.jpg
│   │   ├── 50758360544.jpg
│   │   ├── 55038743175.jpg
│   │   ├── 070109744371.jpg
│   │   ├── 070109744372.jpg
│   │   ├── 070109744373.jpg
│   │   ├── 188930391052.jpg
│   │   ├── 188930391053.jpg
│   │   ├── 293072937641.jpg
│   │   ├── 309392130741.jpg
│   │   └── 311990100321.jpg
│   ├── test_imgs
│   │   ├── 07010974437.jpg
│   │   ├── 070109744371.jpg
│   │   ├── 070109744372.jpg
│   │   ├── 070109744373.jpg
│   │   ├── 18893039105.jpg
│   │   ├── 188930391051.jpg
│   │   ├── 188930391052.jpg
│   │   ├── 188930391053.jpg
│   │   ├── 293072937641.jpg
│   │   ├── 309392130741.jpg
│   │   ├── 311990100321.jpg
│   │   ├── 38661425763.jpg
│   │   ├── 50758360544.jpg
│   │   └── 55038743175.jpg
│   ├── output
│   │   ├── 070109744371_predict.jpg
│   │   ├── 070109744372_predict.jpg
│   │   ├── 070109744373_predict.jpg
│   │   ├── 07010974437_predict.jpg
│   │   ├── 188930391052_predict.jpg
│   │   ├── 188930391053_predict.jpg
│   │   ├── 18893039105_predict.jpg
│   │   ├── 293072937641_predict.jpg
│   │   ├── 309392130741_predict.jpg
│   │   ├── 311990100321_predict.jpg
│   │   ├── 38661425763_predict.jpg
│   │   ├── 50758360544_predict.jpg
│   │   └── 55038743175_predict.jpg
│   ├── LICENSE
│   ├── data_loader.py
│   ├── cfg.py
│   ├── train.py
│   ├── losses.py
│   ├── nms.py
│   ├── network.py
│   ├── predict.py
│   ├── label.py
│   └── preprocess.py
├── recognition
│   ├── test
│   │   ├── 12233418739.jpg
│   │   ├── 22046298859.jpg
│   │   ├── 28510715459.jpg
│   │   ├── 37774346979.jpg
│   │   ├── 41679405336.jpg
│   │   ├── 84999825604.jpg
│   │   ├── 97785067838.jpg
│   │   └── 99851544924.jpg
│   ├── result.txt
│   ├── README.md
│   ├── cfg.py
│   ├── predict.py
│   ├── train.py
│   ├── data_loader.py
│   └── network.py
├── cfg.py
├── LICENSE
├── README.md
├── loc_and_reg.py
└── util.py
/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | data_util/ 3 | .idea/ 4 | -------------------------------------------------------------------------------- /demo/re1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/demo/re1.jpg -------------------------------------------------------------------------------- /location/README.md: -------------------------------------------------------------------------------- 1 | The number detection network for this project is based on [Advanced_EAST](https://github.com/huoyijie/AdvancedEAST) 2 | -------------------------------------------------------------------------------- /recognition/test/12233418739.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/12233418739.jpg -------------------------------------------------------------------------------- /recognition/test/22046298859.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/22046298859.jpg -------------------------------------------------------------------------------- /recognition/test/28510715459.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/28510715459.jpg -------------------------------------------------------------------------------- /recognition/test/37774346979.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/37774346979.jpg 
-------------------------------------------------------------------------------- /recognition/test/41679405336.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/41679405336.jpg -------------------------------------------------------------------------------- /recognition/test/84999825604.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/84999825604.jpg -------------------------------------------------------------------------------- /recognition/test/97785067838.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/97785067838.jpg -------------------------------------------------------------------------------- /recognition/test/99851544924.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/recognition/test/99851544924.jpg -------------------------------------------------------------------------------- /location/output_crop/07010974437.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/07010974437.jpg -------------------------------------------------------------------------------- /location/output_crop/18893039105.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/18893039105.jpg -------------------------------------------------------------------------------- /location/output_crop/38661425763.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/38661425763.jpg -------------------------------------------------------------------------------- /location/output_crop/50758360544.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/50758360544.jpg -------------------------------------------------------------------------------- /location/output_crop/55038743175.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/55038743175.jpg -------------------------------------------------------------------------------- /location/test_imgs/07010974437.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/07010974437.jpg -------------------------------------------------------------------------------- /location/test_imgs/070109744371.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/070109744371.jpg -------------------------------------------------------------------------------- /location/test_imgs/070109744372.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/070109744372.jpg -------------------------------------------------------------------------------- /location/test_imgs/070109744373.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/070109744373.jpg -------------------------------------------------------------------------------- /location/test_imgs/18893039105.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/18893039105.jpg -------------------------------------------------------------------------------- /location/test_imgs/188930391051.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/188930391051.jpg -------------------------------------------------------------------------------- /location/test_imgs/188930391052.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/188930391052.jpg -------------------------------------------------------------------------------- /location/test_imgs/188930391053.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/188930391053.jpg -------------------------------------------------------------------------------- /location/test_imgs/293072937641.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/293072937641.jpg -------------------------------------------------------------------------------- /location/test_imgs/309392130741.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/309392130741.jpg -------------------------------------------------------------------------------- /location/test_imgs/311990100321.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/311990100321.jpg -------------------------------------------------------------------------------- /location/test_imgs/38661425763.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/38661425763.jpg -------------------------------------------------------------------------------- /location/test_imgs/50758360544.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/50758360544.jpg -------------------------------------------------------------------------------- /location/test_imgs/55038743175.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/test_imgs/55038743175.jpg -------------------------------------------------------------------------------- /location/output_crop/070109744371.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/070109744371.jpg -------------------------------------------------------------------------------- /location/output_crop/070109744372.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/070109744372.jpg -------------------------------------------------------------------------------- /location/output_crop/070109744373.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/070109744373.jpg -------------------------------------------------------------------------------- /location/output_crop/188930391052.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/188930391052.jpg -------------------------------------------------------------------------------- /location/output_crop/188930391053.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/188930391053.jpg -------------------------------------------------------------------------------- /location/output_crop/293072937641.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/293072937641.jpg -------------------------------------------------------------------------------- /location/output_crop/309392130741.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/309392130741.jpg -------------------------------------------------------------------------------- /location/output_crop/311990100321.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output_crop/311990100321.jpg -------------------------------------------------------------------------------- /location/output/070109744371_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/070109744371_predict.jpg -------------------------------------------------------------------------------- /location/output/070109744372_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/070109744372_predict.jpg -------------------------------------------------------------------------------- /location/output/070109744373_predict.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/070109744373_predict.jpg -------------------------------------------------------------------------------- /location/output/07010974437_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/07010974437_predict.jpg -------------------------------------------------------------------------------- /location/output/188930391052_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/188930391052_predict.jpg -------------------------------------------------------------------------------- /location/output/188930391053_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/188930391053_predict.jpg -------------------------------------------------------------------------------- /location/output/18893039105_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/18893039105_predict.jpg -------------------------------------------------------------------------------- /location/output/293072937641_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/293072937641_predict.jpg -------------------------------------------------------------------------------- /location/output/309392130741_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/309392130741_predict.jpg -------------------------------------------------------------------------------- /location/output/311990100321_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/311990100321_predict.jpg -------------------------------------------------------------------------------- /location/output/38661425763_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/38661425763_predict.jpg -------------------------------------------------------------------------------- /location/output/50758360544_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/50758360544_predict.jpg -------------------------------------------------------------------------------- /location/output/55038743175_predict.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bobo-y/number_detection_recognition/HEAD/location/output/55038743175_predict.jpg -------------------------------------------------------------------------------- /recognition/result.txt: -------------------------------------------------------------------------------- 1 | 22046298859.jpg : 22046298859 2 | 
97785067838.jpg : 97785067838 3 | 84999825604.jpg : 84999825604 4 | 99851544924.jpg : 99851544924 5 | 28510715459.jpg : 28510715459 6 | 12233418739.jpg : 12233418739 7 | 41679405336.jpg : 41679405336 8 | 37774346979.jpg : 37774346979 9 | -------------------------------------------------------------------------------- /cfg.py: -------------------------------------------------------------------------------- 1 | image_size = 512 2 | pixel_threshold = 0.9 3 | side_vertex_pixel_threshold = 0.9 4 | trunc_threshold = 0.1 5 | pixel_size = 4 6 | width = 200 7 | height = 31 8 | label_len = 11 9 | characters = '0123456789' + '-' 10 | label_classes = len(characters) 11 | location_model = "loc.h5" 12 | recognition_model = "recog.h5" 13 | epsilon = 1e-4 -------------------------------------------------------------------------------- /recognition/README.md: -------------------------------------------------------------------------------- 1 | Number recognition based on CRNN (CNN + Bi-LSTM) 2 | 3 | train: 4 | * prepare your dataset, in the same format as the [MJSynth data](http://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz) 5 | * modify the config params in cfg.py (see the default values) 6 | * python train.py 7 | 8 | predict: 9 | * specify the test images dir 10 | * python predict.py -------------------------------------------------------------------------------- /recognition/cfg.py: -------------------------------------------------------------------------------- 1 | width = 200 2 | height = 31 3 | label_len = 11 # we recognize phone numbers and assume a maximum length of 11; it can be changed at will 4 | # according to actual needs 5 | characters = '0123456789' + '-' # recognized characters are 0 to 9; '-' serves as the blank for the CTC loss 6 | label_classes = len(characters) # number of character classes to recognize 7 | ocr_dataset_path = "dataset/imgs" 8 | save_model_path = "saved_model/weights.h5" 9 | log_dir = "logs" 10 | load_model = True 11 | load_model_path = "model/weights_base.h5" 12 | checkpoint_path = "save_model/val_model.h5" 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 lulu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /location/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright © 2018 huoyijie, https://github.com/huoyijie/AdvancedEAST 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /location/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from keras.preprocessing import image 4 | from keras.applications.vgg16 import preprocess_input 5 | 6 | import cfg 7 | 8 | 9 | def gen(batch_size=cfg.batch_size, is_val=False): 10 | img_h, img_w = cfg.max_train_img_size, cfg.max_train_img_size 11 | x = np.zeros((batch_size, img_h, img_w, cfg.num_channels), dtype=np.float32) 12 | pixel_num_h = img_h // cfg.pixel_size 13 | pixel_num_w = img_w // cfg.pixel_size 14 | y = np.zeros((batch_size, pixel_num_h, pixel_num_w, 7), dtype=np.float32) 15 | if is_val: 16 | with open(os.path.join(cfg.data_dir, cfg.val_fname), 'r') as f_val: 17 | f_list = f_val.readlines() 18 | else: 19 | with open(os.path.join(cfg.data_dir, cfg.train_fname), 'r') as f_train: 20 | f_list = f_train.readlines() 21 | while True: 22 | for i in range(batch_size): 23 | # random gen an image name 24 | random_img = np.random.choice(f_list) 25 | img_filename = str(random_img).strip().split(',')[0] 26 | # load img and img anno 27 | img_path = os.path.join(cfg.data_dir, 28 | cfg.train_image_dir_name, 29 | img_filename) 30 | img = image.load_img(img_path) 31 | img = image.img_to_array(img) 32 | x[i] = preprocess_input(img, mode='tf') 33 | gt_file = os.path.join(cfg.data_dir, 34 | cfg.train_label_dir_name, 35 | img_filename[:-4] + '_gt.npy') 36 | y[i] = np.load(gt_file) 37 | yield x, y 38 | -------------------------------------------------------------------------------- /location/cfg.py: -------------------------------------------------------------------------------- 1 | epoch_num = 24 2 | lr = 1e-3 3 | decay = 5e-4 4 | patience = 5 5 | load_weights = False 6 | lambda_inside_score_loss = 4.0 7 | lambda_side_vertex_code_loss = 1.0 8 | lambda_side_vertex_coord_loss = 1.0 9 | 10 | total_img = 10000 11 | validation_split_ratio = 0.1 12 | image_size = 512 # (height == width, in [256, 384, 512, 640, 736]) 13 | batch_size = 4 14 | steps_per_epoch = total_img * (1 - validation_split_ratio) // batch_size 15 | 
validation_steps = total_img * validation_split_ratio // batch_size 16 | 17 | data_dir = 'icpr/' 18 | origin_image_dir_name = 'image_10000/' 19 | origin_txt_dir_name = 'txt_10000/' 20 | train_image_dir_name = 'images_%s/' % image_size 21 | train_label_dir_name = 'labels_%s/' % image_size 22 | show_gt_image_dir_name = 'show_gt_images_%s/' % image_size 23 | show_act_image_dir_name = 'show_act_images_%s/' % image_size 24 | gen_origin_img = True 25 | draw_gt_quad = True 26 | draw_act_quad = True 27 | val_fname = 'val_%s.txt' % image_size 28 | train_fname = 'train_%s.txt' % image_size 29 | # in the paper it's 0.3; maybe too large for this problem 30 | shrink_ratio = 0.2 31 | # pixels between 0.2 and 0.6 are side pixels 32 | shrink_side_ratio = 0.6 33 | epsilon = 1e-4 34 | 35 | num_channels = 3 36 | feature_layers_range = range(5, 1, -1) 37 | # feature_layers_range = range(3, 0, -1) 38 | feature_layers_num = len(feature_layers_range) 39 | # pixel_size = 4 40 | pixel_size = 2 ** feature_layers_range[-1] 41 | 42 | model_weights_path = 'saved_model/weights_%s.h5' % image_size 43 | saved_model_file_path = 'saved_model/model_%s.h5' % image_size 44 | saved_model_weights_file_path = 'model/weights_base.h5' 45 | 46 | pixel_threshold = 0.9 47 | side_vertex_pixel_threshold = 0.9 48 | trunc_threshold = 0.1 49 | predict_write2txt = False 50 | detection_box_crop = True 51 | initial_epoch = 0 # train.py passes cfg.initial_epoch to fit_generator; 0 is an assumed default 52 | -------------------------------------------------------------------------------- /recognition/predict.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import warnings 5 | from network import CRNN 6 | import cfg 7 | import sys 8 | import keras.backend as ktf 9 | 10 | 11 | warnings.filterwarnings('ignore') 12 | _, model = CRNN(cfg.width, cfg.height, cfg.label_len, cfg.characters).network() 13 | model.load_weights(cfg.save_model_path) 14 | # model.summary() 15 | 16 | 17 | def predict(infer_model, img_path): 18 | img = cv2.imread(img_path) 19 | img_size = img.shape 20 | if (img_size[1] / img_size[0] * 1.0) < 6: 21 | img_reshape = cv2.resize(img, (int(31.0 / img_size[0] * img_size[1]), cfg.height)) 22 | 23 | mat_ori = np.zeros((cfg.height, cfg.width - int(31.0 / img_size[0] * img_size[1]), 3), dtype=np.uint8) 24 | out_img = np.concatenate([img_reshape, mat_ori], axis=1).transpose([1, 0, 2]) 25 | else: 26 | out_img = cv2.resize(img, (cfg.width, cfg.height), interpolation=cv2.INTER_CUBIC) 27 | out_img = np.asarray(out_img) 28 | out_img = out_img.transpose([1, 0, 2]) 29 | 30 | y_pred = infer_model.predict(np.expand_dims(out_img, axis=0)) 31 | shape = y_pred[:, 2:, :].shape 32 | ctc_decode = ktf.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0] 33 | out = ktf.get_value(ctc_decode)[:, :cfg.label_len] 34 | result = ''.join([cfg.characters[k] for k in out[0]]) 35 | return result 36 | 37 | 38 | def main(): 39 | imgs_list = os.listdir('test') 40 | result_txt = open('result.txt', 'a+') 41 | for img_name in imgs_list: 42 | result = predict(model, os.path.join('test', img_name)) 43 | result = img_name + " : " + result + "\n" 44 | print(result) 45 | result_txt.write(result) 46 | result_txt.close() 47 | 48 | 49 | if __name__ == '__main__': 50 | main() 51 | 52 | -------------------------------------------------------------------------------- /location/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from keras.callbacks import EarlyStopping, ModelCheckpoint 3 | from keras.optimizers 
import Adam 4 | import keras.backend as ktf 5 | import tensorflow as tf 6 | import cfg 7 | from network import East 8 | from losses import quad_loss 9 | from data_loader import gen 10 | 11 | 12 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 13 | tf_config = tf.ConfigProto() 14 | tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5 15 | session = tf.Session(config=tf_config) 16 | ktf.set_session(session) 17 | 18 | east = East() 19 | east_network = east.east_network() 20 | east_network.summary() 21 | east_network.compile(loss=quad_loss, optimizer=Adam(lr=cfg.lr, 22 | decay=cfg.decay)) 23 | if cfg.load_weights and os.path.exists(cfg.saved_model_weights_file_path): 24 | east_network.load_weights(cfg.saved_model_weights_file_path) 25 | 26 | east_network.fit_generator(generator=gen(), 27 | steps_per_epoch=cfg.steps_per_epoch, 28 | epochs=cfg.epoch_num, 29 | validation_data=gen(is_val=True), 30 | validation_steps=cfg.validation_steps, 31 | verbose=1, 32 | initial_epoch=cfg.initial_epoch, 33 | callbacks=[EarlyStopping(patience=cfg.patience, verbose=1), 34 | ModelCheckpoint(filepath=cfg.model_weights_path, 35 | save_best_only=True, 36 | save_weights_only=True, 37 | verbose=1)]) 38 | east_network.save(cfg.saved_model_file_path) 39 | east_network.save_weights(cfg.saved_model_weights_file_path) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | number detection and recognition based on [AdvancedEast](https://github.com/huoyijie/AdvancedEAST) and [CRNN](https://arxiv.org/abs/1507.05717) 2 | ____ 3 | Detection and Crop: 4 | 5 | 6 | ____ 7 | Recognition: 8 | 9 | * 22046298859.jpg : 22046298859 10 | * 97785067838.jpg : 97785067838 11 | * 84999825604.jpg : 84999825604 12 | * 99851544924.jpg : 99851544924 13 | * 28510715459.jpg : 28510715459 14 | * 12233418739.jpg : 12233418739 15 | * 41679405336.jpg : 41679405336 16 | * 37774346979.jpg : 37774346979 17 | 18 | ____ 19 | limitations: 20 | *When the two models are tested on their respective validation sets, each reaches an accuracy of about 0.9. However, the training data I generated for the recognizer is horizontal, while the numbers in the cropped detection results introduce rotation and other distortions, so the results are noticeably worse when the two models are used in combination.* 21 | 22 | 23 | ---- 24 | # Detection 25 | 26 | ## training 27 | * prepare training data; for the data format refer to [ICPR](https://tianchi.aliyun.com/competition/introduction.htm?spm=5176.100066.0.0.3bcad780oQ9Ce4&raceId=231651) 28 | * modify params in cfg.py 29 | * run python preprocess.py to resize images and generate .npy training files 30 | * run python label.py 31 | * run python train.py to train the network 32 | ## testing 33 | * modify your images' dir in predict.py, and run python predict.py; we then get three outputs: bounding boxes on the original 34 | images, the cropped images, and the coordinates (txt file); a minimal usage sketch follows. 
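For reference, here is a minimal usage sketch in Python that mirrors the `__main__` block of location/predict.py (it assumes trained weights already exist at cfg.model_weights_path and that the output/ and output_crop/ directories have been created):

import os
import cfg
from network import East
from predict import predict

# build the AdvancedEAST detector and load the trained weights
east_detect = East().east_network()
east_detect.load_weights(cfg.model_weights_path)
# run detection on every image in test_imgs/; annotated images go to output/, crops to output_crop/
for name in os.listdir('test_imgs'):
    predict(east_detect, os.path.join('test_imgs', name), cfg.pixel_threshold)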
35 | 36 | *for more details please refer to [AdvancedEast](https://github.com/huoyijie/AdvancedEAST)* 37 | 38 | ----- 39 | # Recognition 40 | 41 | ## training 42 | * prepare training data; for the data format refer to [MJSynth data](http://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz) 43 | * modify params in cfg.py 44 | * modify input_shape=(None, 50, 7, 512) in train.py line 55; the input_shape corresponds to bn_shape = bn4.get_shape() in network.py (with width=200 and height=31, the two 2x2 max-poolings give a feature map of shape (None, 200/4, 31/4, 512) = (None, 50, 7, 512)) 45 | * run python train.py 46 | ## testing 47 | * modify your images' dir in predict.py, then run python predict.py 48 | ---- -------------------------------------------------------------------------------- /recognition/train.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import ModelCheckpoint, Callback 2 | from keras.callbacks import TensorBoard 3 | from data_loader import * 4 | from network import * 5 | import keras.backend as ktf 6 | import cfg 7 | import os 8 | import tensorflow as tf 9 | 10 | 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 12 | tf_config = tf.ConfigProto() 13 | tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5 14 | session = tf.Session(config=tf_config) 15 | ktf.set_session(session) 16 | 17 | 18 | class Evaluate(Callback): 19 | 20 | def on_epoch_end(self, epoch, logs=None): 21 | 22 | def evaluate(input_model): 23 | correct_prediction = 0 24 | generator = img_gen_val_lexicon() 25 | x_test, y_test = next(generator) 26 | y_pred = input_model.predict(x_test) 27 | shape = y_pred[:, 2:, :].shape 28 | ctc_decode = ktf.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0] 29 | out = ktf.get_value(ctc_decode)[:, :cfg.label_len] 30 | 31 | for m in range(1000): 32 | result_str = ''.join([cfg.characters[k] for k in out[m]]) 33 | result_str = result_str.replace('-', '') 34 | if result_str == y_test[m]: 35 | correct_prediction += 1 36 | else: 37 | print(result_str, y_test[m]) 38 | 39 | return correct_prediction * 1.0 / 10 # 1000 val samples, so this value is a percentage 40 | acc = evaluate(infer_model) 41 | print('') 42 | print('acc:'+str(acc)+"%") 43 | 44 | 45 | evaluator = Evaluate() 46 | 47 | 48 | checkpoint = ModelCheckpoint(cfg.checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min') 49 | 50 | 51 | train_model, infer_model = CRNN(cfg.width, cfg.height, cfg.label_len, cfg.characters).network() 52 | if cfg.load_model: 53 | train_model.load_weights(cfg.load_model_path, by_name=True, skip_mismatch=True) 54 | train_model.summary() 55 | train_model.fit_generator(img_gen_lexicon(input_shape=(None, 50, 7, 512)), steps_per_epoch=2000, epochs=50, verbose=1, 56 | callbacks=[evaluator, 57 | checkpoint, 58 | TensorBoard(log_dir=cfg.log_dir)] 59 | ) 60 | infer_model.save(cfg.save_model_path) 61 | 62 | -------------------------------------------------------------------------------- /location/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import cfg 4 | 5 | 6 | def quad_loss(y_true, y_pred): 7 | # loss for inside_score 8 | logits = y_pred[:, :, :, :1] 9 | labels = y_true[:, :, :, :1] 10 | # balance positive and negative samples in an image 11 | beta = 1 - tf.reduce_mean(labels) 12 | # first apply sigmoid activation 13 | predicts = tf.nn.sigmoid(logits) 14 | # log + epsilon for numerical stability 15 | inside_score_loss = tf.reduce_mean( 16 | -1 * (beta * labels * tf.log(predicts + cfg.epsilon) + 17 | (1 - beta) * (1 - labels) * tf.log(1 - predicts + cfg.epsilon))) 18 | inside_score_loss *= 
cfg.lambda_inside_score_loss 19 | 20 | # loss for side_vertex_code 21 | vertex_logits = y_pred[:, :, :, 1:3] 22 | vertex_labels = y_true[:, :, :, 1:3] 23 | vertex_beta = 1 - (tf.reduce_mean(y_true[:, :, :, 1:2]) 24 | / (tf.reduce_mean(labels) + cfg.epsilon)) 25 | vertex_predicts = tf.nn.sigmoid(vertex_logits) 26 | pos = -1 * vertex_beta * vertex_labels * tf.log(vertex_predicts + 27 | cfg.epsilon) 28 | neg = -1 * (1 - vertex_beta) * (1 - vertex_labels) * tf.log( 29 | 1 - vertex_predicts + cfg.epsilon) 30 | positive_weights = tf.cast(tf.equal(y_true[:, :, :, 0], 1), tf.float32) 31 | side_vertex_code_loss = \ 32 | tf.reduce_sum(tf.reduce_sum(pos + neg, axis=-1) * positive_weights) / ( 33 | tf.reduce_sum(positive_weights) + cfg.epsilon) 34 | side_vertex_code_loss *= cfg.lambda_side_vertex_code_loss 35 | 36 | # loss for side_vertex_coord delta 37 | g_hat = y_pred[:, :, :, 3:] 38 | g_true = y_true[:, :, :, 3:] 39 | vertex_weights = tf.cast(tf.equal(y_true[:, :, :, 1], 1), tf.float32) 40 | pixel_wise_smooth_l1norm = smooth_l1_loss(g_hat, g_true, vertex_weights) 41 | side_vertex_coord_loss = tf.reduce_sum(pixel_wise_smooth_l1norm) / ( 42 | tf.reduce_sum(vertex_weights) + cfg.epsilon) 43 | side_vertex_coord_loss *= cfg.lambda_side_vertex_coord_loss 44 | return inside_score_loss + side_vertex_code_loss + side_vertex_coord_loss 45 | 46 | 47 | def smooth_l1_loss(prediction_tensor, target_tensor, weights): 48 | n_q = tf.reshape(quad_norm(target_tensor), tf.shape(weights)) 49 | diff = prediction_tensor - target_tensor 50 | abs_diff = tf.abs(diff) 51 | abs_diff_lt_1 = tf.less(abs_diff, 1) 52 | pixel_wise_smooth_l1norm = (tf.reduce_sum( 53 | tf.where(abs_diff_lt_1, 0.5 * tf.square(abs_diff), abs_diff - 0.5), 54 | axis=-1) / n_q) * weights 55 | return pixel_wise_smooth_l1norm 56 | 57 | 58 | def quad_norm(g_true): 59 | shape = tf.shape(g_true) 60 | delta_xy_matrix = tf.reshape(g_true, [-1, 2, 2]) 61 | diff = delta_xy_matrix[:, 0:1, :] - delta_xy_matrix[:, 1:2, :] 62 | square = tf.square(diff) 63 | distance = tf.sqrt(tf.reduce_sum(square, axis=-1)) 64 | distance *= 4.0 65 | distance += cfg.epsilon 66 | return tf.reshape(distance, shape[:-1]) 67 | -------------------------------------------------------------------------------- /location/nms.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import numpy as np 3 | 4 | import cfg 5 | 6 | 7 | def should_merge(region, i, j): 8 | neighbor = {(i, j - 1)} 9 | return not region.isdisjoint(neighbor) 10 | 11 | 12 | def region_neighbor(region_set): 13 | region_pixels = np.array(list(region_set)) 14 | j_min = np.amin(region_pixels, axis=0)[1] - 1 15 | j_max = np.amax(region_pixels, axis=0)[1] + 1 16 | i_m = np.amin(region_pixels, axis=0)[0] + 1 17 | region_pixels[:, 0] += 1 18 | neighbor = {(region_pixels[n, 0], region_pixels[n, 1]) for n in 19 | range(len(region_pixels))} 20 | neighbor.add((i_m, j_min)) 21 | neighbor.add((i_m, j_max)) 22 | return neighbor 23 | 24 | 25 | def region_group(region_list): 26 | S = [i for i in range(len(region_list))] 27 | D = [] 28 | while len(S) > 0: 29 | m = S.pop(0) 30 | if len(S) == 0: 31 | # S has only one element, put it into D 32 | D.append([m]) 33 | else: 34 | D.append(rec_region_merge(region_list, m, S)) 35 | return D 36 | 37 | 38 | def rec_region_merge(region_list, m, S): 39 | rows = [m] 40 | tmp = [] 41 | for n in S: 42 | if not region_neighbor(region_list[m]).isdisjoint(region_list[n]) or \ 43 | not region_neighbor(region_list[n]).isdisjoint(region_list[m]): 44 | # regions m and n intersect 45 
| tmp.append(n) 46 | for d in tmp: 47 | S.remove(d) 48 | for e in tmp: 49 | rows.extend(rec_region_merge(region_list, e, S)) 50 | return rows 51 | 52 | 53 | def nms(predict, activation_pixels, threshold=cfg.side_vertex_pixel_threshold): 54 | region_list = [] 55 | for i, j in zip(activation_pixels[0], activation_pixels[1]): 56 | merge = False 57 | for k in range(len(region_list)): 58 | if should_merge(region_list[k], i, j): 59 | region_list[k].add((i, j)) 60 | merge = True 61 | # Fixme: overlapping text regions; some pixels are adjacent to multiple regions, so merge them all for now 62 | # break 63 | if not merge: 64 | region_list.append({(i, j)}) 65 | D = region_group(region_list) 66 | quad_list = np.zeros((len(D), 4, 2)) 67 | score_list = np.zeros((len(D), 4)) 68 | for group, g_th in zip(D, range(len(D))): 69 | total_score = np.zeros((4, 2)) 70 | for row in group: 71 | for ij in region_list[row]: 72 | score = predict[ij[0], ij[1], 1] 73 | if score >= threshold: 74 | ith_score = predict[ij[0], ij[1], 2:3] 75 | if not (cfg.trunc_threshold <= ith_score < 1 - 76 | cfg.trunc_threshold): 77 | ith = int(np.around(ith_score)) 78 | total_score[ith * 2:(ith + 1) * 2] += score 79 | px = (ij[1] + 0.5) * cfg.pixel_size 80 | py = (ij[0] + 0.5) * cfg.pixel_size 81 | p_v = [px, py] + np.reshape(predict[ij[0], ij[1], 3:7], 82 | (2, 2)) 83 | quad_list[g_th, ith * 2:(ith + 1) * 2] += score * p_v 84 | score_list[g_th] = total_score[:, 0] 85 | quad_list[g_th] /= (total_score + cfg.epsilon) 86 | return score_list, quad_list 87 | -------------------------------------------------------------------------------- /loc_and_reg.py: -------------------------------------------------------------------------------- 1 | from keras.models import load_model 2 | from PIL import Image, ImageDraw 3 | from keras.preprocessing import image 4 | from keras.applications.vgg16 import preprocess_input 5 | from util import * 6 | import numpy as np 7 | import os 8 | import cv2 9 | import keras.backend as ktf 10 | import cfg 11 | 12 | 13 | def location(model, img_path, pixel_threshold): 14 | img = image.load_img(img_path) 15 | d_wight, d_height = resize_image(img, cfg.image_size) 16 | img = img.resize((d_wight, d_height), Image.NEAREST).convert('RGB') 17 | img = image.img_to_array(img) 18 | img = preprocess_input(img, mode='tf') 19 | x = np.expand_dims(img, axis=0) 20 | y = model.predict(x) 21 | y = np.squeeze(y, axis=0) 22 | y[:, :, :3] = sigmoid(y[:, :, :3]) 23 | cond = np.greater_equal(y[:, :, 0], pixel_threshold) 24 | activation_pixels = np.where(cond) 25 | quad_scores, quad_after_nms = nms(y, activation_pixels) 26 | results = [] 27 | with Image.open(img_path) as im: 28 | d_wight, d_height = resize_image(im, cfg.image_size) 29 | scale_ratio_w = d_wight / im.width 30 | scale_ratio_h = d_height / im.height 31 | im = im.resize((d_wight, d_height), Image.NEAREST).convert('RGB') 32 | for score, geo, s in zip(quad_scores, quad_after_nms, 33 | range(len(quad_scores))): 34 | if np.amin(score) > 0: 35 | rescaled_geo = geo / [scale_ratio_w, scale_ratio_h] 36 | im = cv2.cvtColor(np.asarray(im), cv2.COLOR_RGB2BGR) 37 | im = crop_rectangle(im, rescaled_geo) 38 | results.append(im) 39 | return results 40 | 41 | 42 | def recognition(model, img): 43 | img_size = img.shape 44 | if (img_size[1] / img_size[0] * 1.0) < 6: # narrow image: scale to target height, then pad the width with black 45 | img_reshape = cv2.resize(img, (int(31.0 / img_size[0] * img_size[1]), cfg.height)) 46 | 47 | mat_ori = np.zeros((cfg.height, cfg.width - int(31.0 / img_size[0] * img_size[1]), 3), dtype=np.uint8) 48 | out_img = np.concatenate([img_reshape, mat_ori], axis=1).transpose([1, 0, 2]) 49 | else: 50 
| out_img = cv2.resize(img, (cfg.width, cfg.height), interpolation=cv2.INTER_CUBIC) 51 | out_img = np.asarray(out_img) 52 | out_img = out_img.transpose([1, 0, 2]) 53 | 54 | y_pred = model.predict(np.expand_dims(out_img, axis=0)) 55 | shape = y_pred[:, 2:, :].shape 56 | ctc_decode = ktf.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0] 57 | out = ktf.get_value(ctc_decode)[:, :cfg.label_len] 58 | result = ''.join([cfg.characters[k] for k in out[0]]) 59 | return result 60 | 61 | 62 | def main(): 63 | location_model = load_model(cfg.location_model) 64 | recognition_model = load_model(cfg.recognition_model) 65 | imgs = os.listdir('test') 66 | result_txt = open('result.txt', 'a+') 67 | for im in imgs: 68 | img_path = os.path.join('test', im) 69 | ims_re = location(location_model, img_path, cfg.pixel_threshold) 70 | if len(ims_re) > 0: 71 | for i in range(len(ims_re)): 72 | re_text = recognition(recognition_model, ims_re[i]) 73 | result = im + " : " + re_text + "\n" 74 | result_txt.write(result) 75 | result_txt.close() 76 | 77 | 78 | main() -------------------------------------------------------------------------------- /location/network.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from keras import Input, Model 3 | from keras.applications.vgg16 import VGG16 4 | from keras.layers import Concatenate, Conv2D, UpSampling2D, BatchNormalization 5 | import tensorflow as tf 6 | import cfg 7 | from keras.layers import Layer 8 | 9 | 10 | class Att(Layer): 11 | def __init__(self, **kwargs): 12 | super(Att, self).__init__(**kwargs) 13 | 14 | def call(self, layer_input, **kwargs): 15 | x = layer_input 16 | attention = tf.reduce_mean(layer_input, axis=-1, keep_dims=True) 17 | importance_map = tf.sigmoid(attention) 18 | output = tf.multiply(x, importance_map) 19 | return output 20 | 21 | def compute_output_shape(self, input_shape): 22 | return input_shape 23 | 24 | 25 | class East: 26 | 27 | def __init__(self): 28 | self.input_img = Input(name='input_img', 29 | shape=(None, None, cfg.num_channels), 30 | dtype='float32') 31 | vgg16 = VGG16(input_tensor=self.input_img, 32 | weights=None, 33 | include_top=False) 34 | self.vgg_pools = [vgg16.get_layer('block%d_pool' % i).output 35 | for i in range(2, 6)] 36 | 37 | def east_network(self): 38 | 39 | def decoder(layer_input, skip_input, channel): 40 | concat = Concatenate(axis=-1)([layer_input, skip_input]) 41 | bn1 = BatchNormalization()(concat) 42 | conv_1 = Conv2D(channel, 1, 43 | activation='relu', padding='same')(bn1) 44 | bn2 = BatchNormalization()(conv_1) 45 | conv_3 = Conv2D(channel, 3, 46 | activation='relu', padding='same')(bn2) 47 | return conv_3 48 | 49 | # d1 = Att()(self.vgg_pools[3]) 50 | # d1 = decoder(UpSampling2D((2, 2))(d1), self.vgg_pools[2], 128) 51 | d1 = decoder(UpSampling2D((2, 2))(self.vgg_pools[3]), self.vgg_pools[2], 128) 52 | # d1 = Att()(d1) 53 | d2 = decoder(UpSampling2D((2, 2))(d1), self.vgg_pools[1], 64) 54 | # d2 = Att()(d2) 55 | d3 = decoder(UpSampling2D((2, 2))(d2), self.vgg_pools[0], 32) 56 | # d3 = Att()(d3) 57 | bn = BatchNormalization()(d3) 58 | before_output = Conv2D(32, 3, activation='relu', padding='same')(bn) 59 | inside_score = Conv2D(1, 1, padding='same', name='inside_score' 60 | )(before_output) 61 | side_v_code = Conv2D(2, 1, padding='same', name='side_vertex_code' 62 | )(before_output) 63 | side_v_coord = Conv2D(4, 1, padding='same', name='side_vertex_coord' 64 | )(before_output) 65 | east_detect = Concatenate(axis=-1, 66 | 
name='east_detect')([inside_score, 67 | side_v_code, 68 | side_v_coord]) 69 | model = Model(inputs=self.input_img, outputs=[east_detect]) 70 | # model.summary() 71 | return model 72 | 73 | 74 | if __name__ == '__main__': 75 | east = East() 76 | east_network = east.east_network() 77 | east_network.load_weights('saved_model/east_model_weights_origin.h5') 78 | east_network.summary() 79 | -------------------------------------------------------------------------------- /recognition/data_loader.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import linecache 4 | import os 5 | import cfg 6 | 7 | 8 | train_imgs = open(os.path.join(cfg.ocr_dataset_path, "annotation_train.txt"), 'r').readlines() 9 | train_imgs_num = len(train_imgs) 10 | 11 | val_imgs = open(os.path.join(cfg.ocr_dataset_path, "annotation_val.txt"), 'r').readlines() 12 | val_imgs_num = len(val_imgs) 13 | 14 | lexicon_dic_path = os.path.join(cfg.ocr_dataset_path, "lexicon.txt") 15 | 16 | 17 | def img_gen_lexicon(batch_size=50, input_shape=None): 18 | imgs = np.zeros((batch_size, cfg.width, cfg.height, 3), dtype=np.uint8) 19 | labels = np.zeros((batch_size, cfg.label_len), dtype=np.uint8) 20 | 21 | while True: 22 | for i in range(batch_size): 23 | while True: 24 | pick_index = np.random.randint(0, train_imgs_num - 1) 25 | train_imgs_split = [m for m in train_imgs[pick_index].split()] 26 | lexicon = linecache.getline(lexicon_dic_path, int(train_imgs_split[1]) + 1).strip("\n") 27 | img_path = cfg.ocr_dataset_path + train_imgs_split[0][1:] 28 | img = cv2.imread(img_path) 29 | if (img is not None) and len(lexicon) <= cfg.label_len: 30 | img_size = img.shape # (height, width, channels) 31 | if img_size[1] > 2 and img_size[0] > 2: 32 | break 33 | if (img_size[1]/(img_size[0]*1.0)) < 6.4: 34 | img_reshape = cv2.resize(img, (int(31.0/img_size[0]*img_size[1]), cfg.height)) 35 | mat_ori = np.zeros((cfg.height, cfg.width - int(31.0/img_size[0]*img_size[1]), 3), dtype=np.uint8) 36 | out_img = np.concatenate([img_reshape, mat_ori], axis=1).transpose([1, 0, 2]) 37 | else: 38 | out_img = cv2.resize(img, (cfg.width, cfg.height), interpolation=cv2.INTER_CUBIC) 39 | out_img = np.asarray(out_img).transpose([1, 0, 2]) 40 | 41 | # pad the label to label_len with '-'; per the ctc_loss definition it may be better not to use '-' (the blank) for padding 42 | while len(lexicon) < cfg.label_len: 43 | lexicon += "-" 44 | 45 | imgs[i] = out_img 46 | labels[i] = [cfg.characters.find(c) for c in lexicon] 47 | yield [imgs, labels, np.ones(batch_size) * int(input_shape[1] - 2), np.ones(batch_size) * cfg.label_len], labels 48 | 49 | 50 | def img_gen_val_lexicon(batch_size=1000): 51 | imgs = np.zeros((batch_size, cfg.width, cfg.height, 3), dtype=np.uint8) 52 | 53 | while True: 54 | labels = [] # reset every round so labels stay aligned with the current imgs 55 | for i in range(batch_size): 56 | 57 | while True: 58 | pick_index = np.random.randint(0, val_imgs_num - 1) 59 | train_imgs_split = [m for m in val_imgs[pick_index].split()] 60 | lexicon = linecache.getline(lexicon_dic_path, int(train_imgs_split[1]) + 1).strip("\n") 61 | img_path = cfg.ocr_dataset_path + train_imgs_split[0][1:] 62 | img = cv2.imread(img_path) 63 | 64 | if (img is not None) and len(lexicon) <= cfg.label_len: 65 | img_size = img.shape # (height, width, channels) 66 | if img_size[1] > 2 and img_size[0] > 2: 67 | break 68 | if (img_size[1]/(img_size[0]*1.0)) < 6.4: 69 | img_reshape = cv2.resize(img, (int(31.0/img_size[0]*img_size[1]), cfg.height)) 70 | mat_ori = np.zeros((cfg.height, cfg.width - int(31.0/img_size[0]*img_size[1]), 3), 
dtype=np.uint8) 71 | out_img = np.concatenate([img_reshape, mat_ori], axis=1).transpose([1, 0, 2]) 72 | else: 73 | out_img = cv2.resize(img, (cfg.width, cfg.height), interpolation=cv2.INTER_CUBIC) 74 | out_img = np.asarray(out_img).transpose([1, 0, 2]) 75 | 76 | imgs[i] = out_img 77 | labels.append(lexicon) 78 | yield imgs, labels 79 | 80 | -------------------------------------------------------------------------------- /recognition/network.py: -------------------------------------------------------------------------------- 1 | from keras import backend as ktf 2 | from keras.layers import Conv2D, LSTM, Lambda, BatchNormalization, MaxPooling2D, Reshape, Dense, Dropout, add, concatenate, Bidirectional 3 | from keras.models import Model, Input 4 | from keras.optimizers import SGD 5 | 6 | 7 | class CRNN: 8 | def __init__(self, width, height, label_len, characters): 9 | self.height = height 10 | self.width = width 11 | self.label_len = label_len 12 | self.characters = characters 13 | self.label_classes = len(self.characters) 14 | 15 | def ctc_loss(self, args): 16 | iy_pred, ilabels, iinput_length, ilabel_length = args 17 | # the 2 is critical here since the first couple outputs of the RNN 18 | # tend to be garbage: 19 | iy_pred = iy_pred[:, 2:, :] # (in our tests this slicing had no real influence) 20 | return ktf.ctc_batch_cost(ilabels, iy_pred, iinput_length, ilabel_length) 21 | 22 | def network(self): 23 | input_im = Input(shape=(self.width, self.height, 3)) 24 | 25 | conv_1 = Conv2D(64, (3, 3), activation='relu', padding='same')(input_im) 26 | bn1 = BatchNormalization()(conv_1) 27 | 28 | conv_2_1 = Conv2D(128, (3, 3), activation='relu', padding='same')(bn1) 29 | conv_2_2 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv_2_1) 30 | bn2 = BatchNormalization()(conv_2_2) 31 | pool_1 = MaxPooling2D(pool_size=(2, 2))(bn2) 32 | 33 | conv_3_1 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool_1) 34 | conv_3_2 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv_3_1) 35 | bn3 = BatchNormalization()(conv_3_2) 36 | pool_2 = MaxPooling2D(pool_size=(2, 2))(bn3) 37 | 38 | conv_4_1 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool_2) 39 | conv_4_2 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv_4_1) 40 | bn4 = BatchNormalization()(conv_4_2) 41 | 42 | bn_shape = bn4.get_shape() 43 | 44 | x_reshape = Reshape(target_shape=(int(bn_shape[1]), int(bn_shape[2] * bn_shape[3])))(bn4) 45 | 46 | fc_1 = Dense(128, activation='relu')(x_reshape) 47 | 48 | rnn_1 = LSTM(128, kernel_initializer="he_normal", return_sequences=True)(fc_1) 49 | rnn_1b = LSTM(128, kernel_initializer="he_normal", go_backwards=True, return_sequences=True)(fc_1) 50 | rnn1_merged = add([rnn_1, rnn_1b]) 51 | # bi_lstm1 = Bidirectional(LSTM(128, kernel_initializer="he_normal", return_sequences=True), merge_mode='sum')(fc_1) 52 | # 53 | rnn_2 = LSTM(128, kernel_initializer="he_normal", return_sequences=True)(rnn1_merged) 54 | rnn_2b = LSTM(128, kernel_initializer="he_normal", go_backwards=True, return_sequences=True)(rnn1_merged) 55 | rnn2_merged = concatenate([rnn_2, rnn_2b]) 56 | # bi_lstm2 = Bidirectional(LSTM(128, kernel_initializer="he_normal", return_sequences=True), merge_mode='concat')(bi_lstm1) 57 | 58 | drop_1 = Dropout(0.25)(rnn2_merged) 59 | # drop_1 = Dropout(0.25)(bi_lstm2) 60 | 61 | fc_2 = Dense(self.label_classes, kernel_initializer='he_normal', activation='softmax')(drop_1) 62 | 63 | infer_model = Model(inputs=input_im, outputs=fc_2) 64 | 65 | labels = Input(name='the_labels', shape=[self.label_len], 
dtype='float32') 66 | input_length = Input(name='input_length', shape=[1], dtype='int64') 67 | label_length = Input(name='label_length', shape=[1], dtype='int64') 68 | 69 | loss_out = Lambda(self.ctc_loss, output_shape=(1,), name='ctc')([fc_2, labels, input_length, 70 | label_length]) 71 | 72 | train_model = Model(inputs=[input_im, labels, input_length, label_length], outputs=[loss_out]) 73 | sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5) 74 | 75 | train_model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd) 76 | infer_model.summary() 77 | train_model.summary() 78 | 79 | return train_model, infer_model 80 | 81 | 82 | if __name__ == '__main__': 83 | import string 84 | CRNN(200, 31, 11, '0123456789'+'-').network() 85 | -------------------------------------------------------------------------------- /location/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image, ImageDraw 3 | from keras.preprocessing import image 4 | from keras.applications.vgg16 import preprocess_input 5 | import cv2 6 | import cfg 7 | from network import East 8 | from preprocess import resize_image 9 | from nms import nms 10 | import os 11 | 12 | 13 | def sigmoid(x): 14 | """`y = 1 / (1 + exp(-x))`""" 15 | return 1 / (1 + np.exp(-x)) 16 | 17 | 18 | def crop_rectangle(img, geo): 19 | rect = cv2.minAreaRect(geo.astype(int)) 20 | center, size, angle = rect[0], rect[1], rect[2] 21 | if(angle > -45): 22 | center = tuple(map(int, center)) 23 | size = tuple([int(rect[1][0] + 10), int(rect[1][1] + 10)]) 24 | height, width = img.shape[0], img.shape[1] 25 | M = cv2.getRotationMatrix2D(center, angle, 1) 26 | img_rot = cv2.warpAffine(img, M, (width, height)) 27 | img_crop = cv2.getRectSubPix(img_rot, size, center) 28 | else: 29 | center = tuple(map(int, center)) 30 | size = tuple([int(rect[1][1] + 10), int(rect[1][0]) + 10]) 31 | angle -= 270 32 | height, width = img.shape[0], img.shape[1] 33 | M = cv2.getRotationMatrix2D(center, angle, 1) 34 | img_rot = cv2.warpAffine(img, M, (width, height)) 35 | img_crop = cv2.getRectSubPix(img_rot, size, center) 36 | return img_crop 37 | 38 | 39 | def predict(east_detect, img_path, pixel_threshold, quiet=False): 40 | img = image.load_img(img_path) 41 | d_wight, d_height = resize_image(img, cfg.image_size) 42 | img = img.resize((d_wight, d_height), Image.NEAREST).convert('RGB') 43 | img = image.img_to_array(img) 44 | img = preprocess_input(img, mode='tf') 45 | x = np.expand_dims(img, axis=0) 46 | y = east_detect.predict(x) 47 | y = np.squeeze(y, axis=0) 48 | y[:, :, :3] = sigmoid(y[:, :, :3]) 49 | cond = np.greater_equal(y[:, :, 0], pixel_threshold) 50 | activation_pixels = np.where(cond) 51 | quad_scores, quad_after_nms = nms(y, activation_pixels) 52 | with Image.open(img_path) as im: 53 | im_array = image.img_to_array(im.convert('RGB')) 54 | d_wight, d_height = resize_image(im, cfg.image_size) 55 | scale_ratio_w = d_wight / im.width 56 | scale_ratio_h = d_height / im.height 57 | im = im.resize((d_wight, d_height), Image.NEAREST).convert('RGB') 58 | quad_im = im.copy() 59 | quad_draw = ImageDraw.Draw(quad_im) 60 | txt_items = [] 61 | flag = False 62 | for score, geo, s in zip(quad_scores, quad_after_nms, 63 | range(len(quad_scores))): 64 | if np.amin(score) > 0: 65 | flag = True 66 | quad_draw.line([tuple(geo[0]), 67 | tuple(geo[1]), 68 | tuple(geo[2]), 69 | tuple(geo[3]), 70 | tuple(geo[0])], width=2, fill='blue') 71 | rescaled_geo = geo / [scale_ratio_w, 
scale_ratio_h] 72 | rescaled_geo_list = np.reshape(rescaled_geo, (8,)).tolist() 73 | txt_item = ','.join(map(str, rescaled_geo_list)) 74 | txt_items.append(txt_item + '\n') 75 | if cfg.detection_box_crop: 76 | img_crop = crop_rectangle(im_array, rescaled_geo) 77 | cv2.imwrite(os.path.join('output_crop', img_path.split('/')[-1].split('.')[0] + '.jpg'), img_crop) 78 | elif not quiet: 79 | print('quad invalid with vertex num less than 4.') 80 | if flag: 81 | quad_im.save(os.path.join('output', img_path.split('/')[-1].split('.')[0] + '_predict.jpg')) 82 | if cfg.predict_write2txt and len(txt_items) > 0: 83 | with open(os.path.join("output_txt", img_path.split('/')[-1].split('.')[0] + '.txt'), 'w') as f_txt: 84 | f_txt.writelines(txt_items) 85 | 86 | 87 | if __name__ == '__main__': 88 | east = East() 89 | east_detect = east.east_network() 90 | east_detect.summary() 91 | east_detect.load_weights(cfg.model_weights_path) 92 | img_list = os.listdir('test_imgs') 93 | for img_path in img_list: 94 | predict(east_detect, os.path.join('test_imgs', img_path), cfg.pixel_threshold) 95 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cfg as cfg 3 | import cv2 4 | 5 | 6 | def sigmoid(x): 7 | """`y = 1 / (1 + exp(-x))`""" 8 | return 1 / (1 + np.exp(-x)) 9 | 10 | 11 | def resize_image(im, max_img_size=cfg.image_size): 12 | im_width = np.minimum(im.width, max_img_size) 13 | if im_width == max_img_size < im.width: 14 | im_height = int((im_width / im.width) * im.height) 15 | else: 16 | im_height = im.height 17 | o_height = np.minimum(im_height, max_img_size) 18 | if o_height == max_img_size < im_height: 19 | o_width = int((o_height / im_height) * im_width) 20 | else: 21 | o_width = im_width 22 | d_wight = o_width - (o_width % 32) 23 | d_height = o_height - (o_height % 32) 24 | return d_wight, d_height 25 | 26 | 27 | def should_merge(region, i, j): 28 | neighbor = {(i, j - 1)} 29 | return not region.isdisjoint(neighbor) 30 | 31 | 32 | def region_neighbor(region_set): 33 | region_pixels = np.array(list(region_set)) 34 | j_min = np.amin(region_pixels, axis=0)[1] - 1 35 | j_max = np.amax(region_pixels, axis=0)[1] + 1 36 | i_m = np.amin(region_pixels, axis=0)[0] + 1 37 | region_pixels[:, 0] += 1 38 | neighbor = {(region_pixels[n, 0], region_pixels[n, 1]) for n in 39 | range(len(region_pixels))} 40 | neighbor.add((i_m, j_min)) 41 | neighbor.add((i_m, j_max)) 42 | return neighbor 43 | 44 | 45 | def region_group(region_list): 46 | S = [i for i in range(len(region_list))] 47 | D = [] 48 | while len(S) > 0: 49 | m = S.pop(0) 50 | if len(S) == 0: 51 | # S has only one element, put it into D 52 | D.append([m]) 53 | else: 54 | D.append(rec_region_merge(region_list, m, S)) 55 | return D 56 | 57 | 58 | def rec_region_merge(region_list, m, S): 59 | rows = [m] 60 | tmp = [] 61 | for n in S: 62 | if not region_neighbor(region_list[m]).isdisjoint(region_list[n]) or \ 63 | not region_neighbor(region_list[n]).isdisjoint(region_list[m]): 64 | # regions m and n intersect 65 | tmp.append(n) 66 | for d in tmp: 67 | S.remove(d) 68 | for e in tmp: 69 | rows.extend(rec_region_merge(region_list, e, S)) 70 | return rows 71 | 72 | 73 | def nms(predict, activation_pixels, threshold=cfg.side_vertex_pixel_threshold): 74 | region_list = [] 75 | for i, j in zip(activation_pixels[0], activation_pixels[1]): 76 | merge = False 77 | for k in range(len(region_list)): 78 | if should_merge(region_list[k], i, j): 79 
80 |                 merge = True
81 |         if not merge:
82 |             region_list.append({(i, j)})
83 |     D = region_group(region_list)
84 |     quad_list = np.zeros((len(D), 4, 2))
85 |     score_list = np.zeros((len(D), 4))
86 |     for group, g_th in zip(D, range(len(D))):
87 |         total_score = np.zeros((4, 2))
88 |         for row in group:
89 |             for ij in region_list[row]:
90 |                 score = predict[ij[0], ij[1], 1]
91 |                 if score >= threshold:
92 |                     ith_score = predict[ij[0], ij[1], 2:3]
93 |                     if not (cfg.trunc_threshold <= ith_score < 1 -
94 |                             cfg.trunc_threshold):
95 |                         ith = int(np.around(ith_score))
96 |                         total_score[ith * 2:(ith + 1) * 2] += score
97 |                         px = (ij[1] + 0.5) * cfg.pixel_size
98 |                         py = (ij[0] + 0.5) * cfg.pixel_size
99 |                         p_v = [px, py] + np.reshape(predict[ij[0], ij[1], 3:7],
100 |                                                     (2, 2))
101 |                         quad_list[g_th, ith * 2:(ith + 1) * 2] += score * p_v
102 |         score_list[g_th] = total_score[:, 0]
103 |         quad_list[g_th] /= (total_score + cfg.epsilon)
104 |     return score_list, quad_list
105 |
106 |
107 | def crop_rectangle(img, geo):
108 |     rect = cv2.minAreaRect(geo.astype(int))
109 |     center, size, angle = rect[0], rect[1], rect[2]
110 |     if angle > -45:
111 |         center = tuple(map(int, center))
112 |         size = tuple([int(rect[1][0] + 10), int(rect[1][1] + 10)])
113 |         height, width = img.shape[0], img.shape[1]
114 |         M = cv2.getRotationMatrix2D(center, angle, 1)
115 |         img_rot = cv2.warpAffine(img, M, (width, height))
116 |         img_crop = cv2.getRectSubPix(img_rot, size, center)
117 |     else:
118 |         center = tuple(map(int, center))
119 |         size = tuple([int(rect[1][1] + 10), int(rect[1][0] + 10)])
120 |         angle -= 270  # equivalent to rotating by angle + 90; width/height are swapped for near-vertical boxes
121 |         height, width = img.shape[0], img.shape[1]
122 |         M = cv2.getRotationMatrix2D(center, angle, 1)
123 |         img_rot = cv2.warpAffine(img, M, (width, height))
124 |         img_crop = cv2.getRectSubPix(img_rot, size, center)
125 |     return img_crop
--------------------------------------------------------------------------------
/location/label.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | from PIL import Image, ImageDraw
4 | from tqdm import tqdm
5 | import cfg
6 |
7 |
8 | def point_inside_of_quad(px, py, quad_xy_list, p_min, p_max):
9 |     if (p_min[0] <= px <= p_max[0]) and (p_min[1] <= py <= p_max[1]):
10 |         xy_list = np.zeros((4, 2))
11 |         xy_list[:3, :] = quad_xy_list[1:4, :] - quad_xy_list[:3, :]
12 |         xy_list[3] = quad_xy_list[0, :] - quad_xy_list[3, :]
13 |         yx_list = np.zeros((4, 2))
14 |         yx_list[:, :] = quad_xy_list[:, -1:-3:-1]
15 |         a = xy_list * ([py, px] - yx_list)
16 |         b = a[:, 0] - a[:, 1]
17 |         if np.amin(b) >= 0 or np.amax(b) <= 0:
18 |             return True
19 |         else:
20 |             return False
21 |     else:
22 |         return False
23 |
24 |
25 | def point_inside_of_nth_quad(px, py, xy_list, shrink_1, long_edge):
26 |     nth = -1
27 |     vs = [[[0, 0, 3, 3, 0], [1, 1, 2, 2, 1]],
28 |           [[0, 0, 1, 1, 0], [2, 2, 3, 3, 2]]]
29 |     for ith in range(2):
30 |         quad_xy_list = np.concatenate((
31 |             np.reshape(xy_list[vs[long_edge][ith][0]], (1, 2)),
32 |             np.reshape(shrink_1[vs[long_edge][ith][1]], (1, 2)),
33 |             np.reshape(shrink_1[vs[long_edge][ith][2]], (1, 2)),
34 |             np.reshape(xy_list[vs[long_edge][ith][3]], (1, 2))), axis=0)
35 |         p_min = np.amin(quad_xy_list, axis=0)
36 |         p_max = np.amax(quad_xy_list, axis=0)
37 |         if point_inside_of_quad(px, py, quad_xy_list, p_min, p_max):
38 |             if nth == -1:
39 |                 nth = ith
40 |             else:
41 |                 nth = -1
42 |                 break
43 |     return nth
44 |
45 |
46 | def shrink(xy_list, ratio=cfg.shrink_ratio):
47 |     # note: ratio == 0.0 is handled inside shrink_edge (a no-op), so we fall
48 |     # through and still compute long_edge for the three-value return below
49 |     diff_1to3 = xy_list[:3, :] - xy_list[1:4, :]
50 |     diff_4 = xy_list[3:4, :] - xy_list[0:1, :]
51 |     diff = np.concatenate((diff_1to3, diff_4), axis=0)
52 |     dis = np.sqrt(np.sum(np.square(diff), axis=-1))
53 |     # determine which edges are long and which are short
54 |     long_edge = int(np.argmax(np.sum(np.reshape(dis, (2, 2)), axis=0)))
55 |     short_edge = 1 - long_edge
56 |     # compute the r length array
57 |     r = [np.minimum(dis[i], dis[(i + 1) % 4]) for i in range(4)]
58 |     # compute the theta array
59 |     diff_abs = np.abs(diff)
60 |     diff_abs[:, 0] += cfg.epsilon
61 |     theta = np.arctan(diff_abs[:, 1] / diff_abs[:, 0])
62 |     # shrink the two long edges
63 |     temp_new_xy_list = np.copy(xy_list)
64 |     shrink_edge(xy_list, temp_new_xy_list, long_edge, r, theta, ratio)
65 |     shrink_edge(xy_list, temp_new_xy_list, long_edge + 2, r, theta, ratio)
66 |     # shrink the two short edges
67 |     new_xy_list = np.copy(temp_new_xy_list)
68 |     shrink_edge(temp_new_xy_list, new_xy_list, short_edge, r, theta, ratio)
69 |     shrink_edge(temp_new_xy_list, new_xy_list, short_edge + 2, r, theta, ratio)
70 |     return temp_new_xy_list, new_xy_list, long_edge
71 |
72 |
73 | def shrink_edge(xy_list, new_xy_list, edge, r, theta, ratio=cfg.shrink_ratio):
74 |     if ratio == 0.0:
75 |         return
76 |     start_point = edge
77 |     end_point = (edge + 1) % 4
78 |     long_start_sign_x = np.sign(
79 |         xy_list[end_point, 0] - xy_list[start_point, 0])
80 |     new_xy_list[start_point, 0] = \
81 |         xy_list[start_point, 0] + \
82 |         long_start_sign_x * ratio * r[start_point] * np.cos(theta[start_point])
83 |     long_start_sign_y = np.sign(
84 |         xy_list[end_point, 1] - xy_list[start_point, 1])
85 |     new_xy_list[start_point, 1] = \
86 |         xy_list[start_point, 1] + \
87 |         long_start_sign_y * ratio * r[start_point] * np.sin(theta[start_point])
88 |     # long edge one, end point
89 |     long_end_sign_x = -1 * long_start_sign_x
90 |     new_xy_list[end_point, 0] = \
91 |         xy_list[end_point, 0] + \
92 |         long_end_sign_x * ratio * r[end_point] * np.cos(theta[start_point])
93 |     long_end_sign_y = -1 * long_start_sign_y
94 |     new_xy_list[end_point, 1] = \
95 |         xy_list[end_point, 1] + \
96 |         long_end_sign_y * ratio * r[end_point] * np.sin(theta[start_point])
97 |
98 |
99 | def process_label(data_dir=cfg.data_dir):
100 |     with open(os.path.join(data_dir, cfg.val_fname), 'r') as f_val:
101 |         f_list = f_val.readlines()
102 |     with open(os.path.join(data_dir, cfg.train_fname), 'r') as f_train:
103 |         f_list.extend(f_train.readlines())
104 |     for line, _ in zip(f_list, tqdm(range(len(f_list)))):
105 |         line_cols = str(line).strip().split(',')
106 |         img_name, width, height = \
107 |             line_cols[0].strip(), int(line_cols[1].strip()), \
108 |             int(line_cols[2].strip())
109 |         gt = np.zeros((height // cfg.pixel_size, width // cfg.pixel_size, 7))
110 |         train_label_dir = os.path.join(data_dir, cfg.train_label_dir_name)
111 |         xy_list_array = np.load(os.path.join(train_label_dir,
112 |                                              img_name[:-4] + '.npy'))
113 |         train_image_dir = os.path.join(data_dir, cfg.train_image_dir_name)
114 |         with Image.open(os.path.join(train_image_dir, img_name)) as im:
115 |             draw = ImageDraw.Draw(im)
116 |             for xy_list in xy_list_array:
117 |                 _, shrink_xy_list, _ = shrink(xy_list, cfg.shrink_ratio)
118 |                 shrink_1, _, long_edge = shrink(xy_list, cfg.shrink_side_ratio)
119 |                 p_min = np.amin(shrink_xy_list, axis=0)
120 |                 p_max = np.amax(shrink_xy_list, axis=0)
121 |                 # floor of the float
122 |                 ji_min = (p_min / cfg.pixel_size - 0.5).astype(int) - 1
123 |                 # +1 for ceil of the float and +1 to include the end
124 |                 ji_max = (p_max / cfg.pixel_size - 0.5).astype(int) + 3
125 |                 imin = np.maximum(0, ji_min[1])
126 |                 imax = np.minimum(height // cfg.pixel_size, ji_max[1])
127 |                 jmin = np.maximum(0, ji_min[0])
128 |                 jmax = np.minimum(width // cfg.pixel_size, ji_max[0])
129 |                 for i in range(imin, imax):
130 |                     for j in range(jmin, jmax):
131 |                         px = (j + 0.5) * cfg.pixel_size
132 |                         py = (i + 0.5) * cfg.pixel_size
133 |                         if point_inside_of_quad(px, py,
134 |                                                 shrink_xy_list, p_min, p_max):
135 |                             gt[i, j, 0] = 1
136 |                             line_width, line_color = 1, 'red'
137 |                             ith = point_inside_of_nth_quad(px, py,
138 |                                                            xy_list,
139 |                                                            shrink_1,
140 |                                                            long_edge)
141 |                             vs = [[[3, 0], [1, 2]], [[0, 1], [2, 3]]]
142 |                             if ith in range(2):
143 |                                 gt[i, j, 1] = 1
144 |                                 if ith == 0:
145 |                                     line_width, line_color = 2, 'yellow'
146 |                                 else:
147 |                                     line_width, line_color = 2, 'green'
148 |                                 gt[i, j, 2:3] = ith
149 |                                 gt[i, j, 3:5] = \
150 |                                     xy_list[vs[long_edge][ith][0]] - [px, py]
151 |                                 gt[i, j, 5:] = \
152 |                                     xy_list[vs[long_edge][ith][1]] - [px, py]
153 |                             draw.line([(px - 0.5 * cfg.pixel_size,
154 |                                         py - 0.5 * cfg.pixel_size),
155 |                                        (px + 0.5 * cfg.pixel_size,
156 |                                         py - 0.5 * cfg.pixel_size),
157 |                                        (px + 0.5 * cfg.pixel_size,
158 |                                         py + 0.5 * cfg.pixel_size),
159 |                                        (px - 0.5 * cfg.pixel_size,
160 |                                         py + 0.5 * cfg.pixel_size),
161 |                                        (px - 0.5 * cfg.pixel_size,
162 |                                         py - 0.5 * cfg.pixel_size)],
163 |                                       width=line_width, fill=line_color)
164 |             act_image_dir = os.path.join(cfg.data_dir,
165 |                                          cfg.show_act_image_dir_name)
166 |             if cfg.draw_act_quad:
167 |                 im.save(os.path.join(act_image_dir, img_name))
168 |         train_label_dir = os.path.join(data_dir, cfg.train_label_dir_name)
169 |         np.save(os.path.join(train_label_dir,
170 |                              img_name[:-4] + '_gt.npy'), gt)
171 |
172 |
173 | if __name__ == '__main__':
174 |     process_label()
175 |
--------------------------------------------------------------------------------
/location/preprocess.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from PIL import Image, ImageDraw
3 | import os
4 | import random
5 | from tqdm import tqdm
6 | from label import shrink
7 | import cfg
8 |
9 |
10 | def batch_reorder_vertexes(xy_list_array):
11 |     reorder_xy_list_array = np.zeros_like(xy_list_array)
12 |     for xy_list, i in zip(xy_list_array, range(len(xy_list_array))):
13 |         reorder_xy_list_array[i] = reorder_vertexes(xy_list)
14 |     return reorder_xy_list_array
15 |
16 |
17 | def reorder_vertexes(xy_list):
18 |     reorder_xy_list = np.zeros_like(xy_list)
19 |     # determine the first point, the one with the smallest x;
20 |     # if two points share the same x, choose the one with the smaller y
21 |     ordered = np.argsort(xy_list, axis=0)
22 |     xmin1_index = ordered[0, 0]
23 |     xmin2_index = ordered[1, 0]
24 |     if xy_list[xmin1_index, 0] == xy_list[xmin2_index, 0]:
25 |         if xy_list[xmin1_index, 1] <= xy_list[xmin2_index, 1]:
26 |             reorder_xy_list[0] = xy_list[xmin1_index]
27 |             first_v = xmin1_index
28 |         else:
29 |             reorder_xy_list[0] = xy_list[xmin2_index]
30 |             first_v = xmin2_index
31 |     else:
32 |         reorder_xy_list[0] = xy_list[xmin1_index]
33 |         first_v = xmin1_index
34 |     # connect the first point to the others; the third point is the one on
35 |     # the other side of the line with the middle slope
36 |     others = list(range(4))
37 |     others.remove(first_v)
38 |     k = np.zeros((len(others),))
39 |     for index, i in zip(others, range(len(others))):
40 |         k[i] = (xy_list[index, 1] - xy_list[first_v, 1]) \
41 |             / (xy_list[index, 0] - xy_list[first_v, 0] + cfg.epsilon)
42 |     k_mid = np.argsort(k)[1]
43 |     third_v = others[k_mid]
44 |     reorder_xy_list[2] = xy_list[third_v]
45 |     # determine the second point, which is on the bigger side of the middle line
46 |     others.remove(third_v)
47 |     b_mid = xy_list[first_v, 1] - k[k_mid] * xy_list[first_v, 0]
48 |     second_v, fourth_v = 0, 0
49 |     for index, i in zip(others, range(len(others))):
50 |         # delta = y - (k * x + b)
51 |         delta_y = xy_list[index, 1] - (k[k_mid] * xy_list[index, 0] + b_mid)
52 |         if delta_y > 0:
53 |             second_v = index
54 |         else:
55 |             fourth_v = index
56 |     reorder_xy_list[1] = xy_list[second_v]
57 |     reorder_xy_list[3] = xy_list[fourth_v]
58 |     # compare the slopes of diagonals 1-3 and 2-4 to determine the final order
59 |     k13 = k[k_mid]
60 |     k24 = (xy_list[second_v, 1] - xy_list[fourth_v, 1]) / (
61 |         xy_list[second_v, 0] - xy_list[fourth_v, 0] + cfg.epsilon)
62 |     if k13 < k24:
63 |         tmp_x, tmp_y = reorder_xy_list[3, 0], reorder_xy_list[3, 1]
64 |         for i in range(2, -1, -1):
65 |             reorder_xy_list[i + 1] = reorder_xy_list[i]
66 |         reorder_xy_list[0, 0], reorder_xy_list[0, 1] = tmp_x, tmp_y
67 |     return reorder_xy_list
68 |
69 |
70 | def resize_image(im, max_img_size=cfg.image_size):
71 |     im_width = np.minimum(im.width, max_img_size)
72 |     if im_width == max_img_size < im.width:
73 |         im_height = int((im_width / im.width) * im.height)
74 |     else:
75 |         im_height = im.height
76 |     o_height = np.minimum(im_height, max_img_size)
77 |     if o_height == max_img_size < im_height:
78 |         o_width = int((o_height / im_height) * im_width)
79 |     else:
80 |         o_width = im_width
81 |     d_wight = o_width - (o_width % 32)
82 |     d_height = o_height - (o_height % 32)
83 |     return d_wight, d_height
84 |
85 |
86 | def preprocess():
87 |     data_dir = cfg.data_dir
88 |     origin_image_dir = os.path.join(data_dir, cfg.origin_image_dir_name)
89 |     origin_txt_dir = os.path.join(data_dir, cfg.origin_txt_dir_name)
90 |     train_image_dir = os.path.join(data_dir, cfg.train_image_dir_name)
91 |     train_label_dir = os.path.join(data_dir, cfg.train_label_dir_name)
92 |     if not os.path.exists(train_image_dir):
93 |         os.mkdir(train_image_dir)
94 |     if not os.path.exists(train_label_dir):
95 |         os.mkdir(train_label_dir)
96 |     draw_gt_quad = cfg.draw_gt_quad
97 |     show_gt_image_dir = os.path.join(data_dir, cfg.show_gt_image_dir_name)
98 |     if not os.path.exists(show_gt_image_dir):
99 |         os.mkdir(show_gt_image_dir)
100 |     show_act_image_dir = os.path.join(cfg.data_dir, cfg.show_act_image_dir_name)
101 |     if not os.path.exists(show_act_image_dir):
102 |         os.mkdir(show_act_image_dir)
103 |
104 |     o_img_list = os.listdir(origin_image_dir)
105 |     print('found %d origin images.' % len(o_img_list))
106 |     train_val_set = []
107 |     for o_img_fname, _ in zip(o_img_list, tqdm(range(len(o_img_list)))):
108 |         with Image.open(os.path.join(origin_image_dir, o_img_fname)) as im:
109 |             # d_wight, d_height = resize_image(im)
110 |             d_wight, d_height = cfg.image_size, cfg.image_size
111 |             scale_ratio_w = d_wight / im.width
112 |             scale_ratio_h = d_height / im.height
113 |             try:
114 |                 im = im.resize((d_wight, d_height), Image.NEAREST).convert('RGB')
115 |                 show_gt_im = im.copy()
116 |                 # draw on the image
117 |                 draw = ImageDraw.Draw(show_gt_im)
118 |                 with open(os.path.join(origin_txt_dir,
119 |                                        o_img_fname[:-4] + '.txt'), 'r') as f:
120 |                     anno_list = f.readlines()
121 |                 xy_list_array = np.zeros((len(anno_list), 4, 2))
122 |                 for anno, i in zip(anno_list, range(len(anno_list))):
123 |                     anno_colums = anno.strip().split(',')
124 |                     anno_array = np.array(anno_colums)
125 |                     xy_list = np.reshape(anno_array[:8].astype(float), (4, 2))
126 |                     xy_list[:, 0] = xy_list[:, 0] * scale_ratio_w
127 |                     xy_list[:, 1] = xy_list[:, 1] * scale_ratio_h
128 |                     xy_list = reorder_vertexes(xy_list)
129 |                     xy_list_array[i] = xy_list
130 |                     _, shrink_xy_list, _ = shrink(xy_list, cfg.shrink_ratio)
131 |                     shrink_1, _, long_edge = shrink(xy_list, cfg.shrink_side_ratio)
132 |                     if draw_gt_quad:
133 |                         draw.line([tuple(xy_list[0]), tuple(xy_list[1]),
134 |                                    tuple(xy_list[2]), tuple(xy_list[3]),
135 |                                    tuple(xy_list[0])
136 |                                    ],
137 |                                   width=2, fill='green')
138 |                         draw.line([tuple(shrink_xy_list[0]),
139 |                                    tuple(shrink_xy_list[1]),
140 |                                    tuple(shrink_xy_list[2]),
141 |                                    tuple(shrink_xy_list[3]),
142 |                                    tuple(shrink_xy_list[0])
143 |                                    ],
144 |                                   width=2, fill='blue')
145 |                         vs = [[[0, 0, 3, 3, 0], [1, 1, 2, 2, 1]],
146 |                               [[0, 0, 1, 1, 0], [2, 2, 3, 3, 2]]]
147 |                         for q_th in range(2):
148 |                             draw.line([tuple(xy_list[vs[long_edge][q_th][0]]),
149 |                                        tuple(shrink_1[vs[long_edge][q_th][1]]),
150 |                                        tuple(shrink_1[vs[long_edge][q_th][2]]),
151 |                                        tuple(xy_list[vs[long_edge][q_th][3]]),
152 |                                        tuple(xy_list[vs[long_edge][q_th][4]])],
153 |                                       width=3, fill='yellow')
154 |                 if cfg.gen_origin_img:
155 |                     im.save(os.path.join(train_image_dir, o_img_fname))
156 |                 np.save(os.path.join(
157 |                     train_label_dir,
158 |                     o_img_fname[:-4] + '.npy'),
159 |                     xy_list_array)
160 |                 if draw_gt_quad:
161 |                     show_gt_im.save(os.path.join(show_gt_image_dir, o_img_fname))
162 |                 train_val_set.append('{},{},{}\n'.format(o_img_fname,
163 |                                                          d_wight,
164 |                                                          d_height))
165 |             except Exception as e:
166 |                 print('skipped %s: %s' % (o_img_fname, e))  # report instead of silently swallowing errors
167 |
168 |     train_img_list = os.listdir(train_image_dir)
169 |     print('found %d train images.' % len(train_img_list))
170 |     train_label_list = os.listdir(train_label_dir)
171 |     print('found %d train labels.' % len(train_label_list))
172 |
173 |     random.shuffle(train_val_set)
174 |     val_count = int(cfg.validation_split_ratio * len(train_val_set))
175 |     with open(os.path.join(data_dir, cfg.val_fname), 'w') as f_val:
176 |         f_val.writelines(train_val_set[:val_count])
177 |     with open(os.path.join(data_dir, cfg.train_fname), 'w') as f_train:
178 |         f_train.writelines(train_val_set[val_count:])
179 |
180 |
181 | if __name__ == '__main__':
182 |     preprocess()
183 |
--------------------------------------------------------------------------------
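
For reference, the resize rule that appears twice above (util.py resize_image and location/preprocess.py resize_image) can be exercised standalone. A minimal sketch, assuming a max size of 736 (the actual value of cfg.image_size lives in cfg.py and is not shown here):

# Standalone sketch of the resize_image() rule: cap each side at
# max_img_size while preserving the aspect ratio, then round both sides
# down to multiples of 32 so the EAST feature maps (1/32 of the input)
# divide evenly. max_img_size=736 is an assumed example value.
def target_size(width, height, max_img_size=736):
    if width > max_img_size:
        height = int(max_img_size / width * height)
        width = max_img_size
    if height > max_img_size:
        width = int(max_img_size / height * width)
        height = max_img_size
    return width - width % 32, height - height % 32

print(target_size(1920, 1080))  # -> (736, 384)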
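Similarly, the region growing used by nms() in util.py (should_merge / region_group / rec_region_merge) amounts to connected-component grouping of activation pixels: a pixel joins an existing region when its left neighbour is already in it, and region_group then merges regions that touch across rows. A toy illustration of the first, row-wise step, on hypothetical input not taken from the repo:

# Toy illustration of the merge rule in should_merge(): a pixel (i, j)
# joins an existing region iff (i, j - 1) is already in that region.
activation_pixels = [(0, 0), (0, 1), (0, 3), (1, 3), (1, 4)]  # assumed toy input

region_list = []
for i, j in activation_pixels:
    merged = False
    for region in region_list:
        if (i, j - 1) in region:  # same row, immediately to the left
            region.add((i, j))
            merged = True
    if not merged:
        region_list.append({(i, j)})

print(region_list)  # e.g. [{(0, 0), (0, 1)}, {(0, 3)}, {(1, 3), (1, 4)}] (set order may vary)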