├── .gitignore ├── README.md ├── fonts └── Ubuntu-M.ttf ├── lib ├── __init__.py ├── lstm │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── gen.py │ │ ├── tf_records.py │ │ ├── timer.py │ │ └── training.py ├── networks │ ├── LSTM_test.py │ ├── LSTM_train.py │ ├── __init__.py │ ├── factory.py │ └── network.py └── utils │ ├── convert_ckpt2npy.py │ ├── data_util.py │ └── genImg.py ├── lstm ├── __init__.py ├── lstm.yml ├── test_net.py └── train_net.py ├── test.sh └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | log/ 2 | val*/ 3 | test*/ 4 | train*/ 5 | logs/ 6 | output/ 7 | data/ 8 | tmp/ 9 | __pycache__/ 10 | checkpoint* 11 | *.json 12 | *.swp 13 | *.swo 14 | .gdb* 15 | .idea/ 16 | *~ 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | - [old master](https://github.com/ilovin/lstm_ctc_ocr/tree/backup): 2 | - harder to converge compared to the beta version 3 | - supports both standard CTC and warpCTC 4 | - reads all data at once 5 | - [dev](https://github.com/ilovin/lstm_ctc_ocr/tree/dev): 6 | - the pipeline version of lstm_ctc_ocr; resizes images to the same size 7 | - uses tf.records 8 | - [beta](https://github.com/ilovin/lstm_ctc_ocr/tree/beta) (current): 9 | - generates data on the fly 10 | - handles variable-width images, padding each batch to the same width 11 | 12 | ## How to use 13 | 1. ./train.sh 14 | 15 | 16 | ### Dependency 17 | - python 3 18 | - tensorflow 1.0.1 19 | - [captcha](https://pypi.python.org/pypi/captcha) 20 | - [warpCTC tensorflow_binding](https://github.com/baidu-research/warp-ctc/tree/master/tensorflow_binding) 21 | 22 | ### Some details 23 | 24 | The training data: 25 | ![data](https://ooo.0o0.ooo/2017/04/13/58ef08ab6af03.png) 26 | 27 | Notice that 28 | parameters can be found in `./lstm.yml` (higher priority) and `lib/lstm/config.py`; 29 | some parameters need to be fine-tuned: 30 | - learning rate 31 | - decay step & decay rate 32 | - image_height 33 | - optimizer (Adam / Momentum / RMS) 34 | 35 | In `./lib/lstm/utils/gen.py`, the height of the images is the same, and the width is padded 36 | to a common value within each batch, so 37 | if you want to use your own data, the image height must be the same.
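For reference, here is a minimal sketch of that per-batch padding idea (a hedged illustration rather than the project's actual code: `pad_batch` is a hypothetical helper, and the constants mirror the `cfg` defaults in `lib/lstm/config.py`; the real generator lives in `lib/lstm/utils/gen.py` and differs in detail):

```python
import math
import numpy as np

POOL_SCALE, IMG_HEIGHT = 4, 32  # mirrors cfg.POOL_SCALE / cfg.IMG_HEIGHT

def pad_batch(images):
    """images: list of (IMG_HEIGHT, w) float32 arrays with varying widths w."""
    # Round the widest image up to a multiple of POOL_SCALE so the conv
    # stack downsamples the width to a whole number of LSTM time steps.
    max_w = int(math.ceil(max(im.shape[1] for im in images) / POOL_SCALE)) * POOL_SCALE
    batch = np.zeros((len(images), max_w, IMG_HEIGHT), dtype=np.float32)
    for i, im in enumerate(images):
        batch[i, :im.shape[1], :] = im.T  # transpose: width becomes the time axis
    time_steps = np.full(len(images), max_w // POOL_SCALE, dtype=np.int32)
    return batch, time_steps
```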
38 | 39 | ### Result 40 | The accuracy can be above 95% 41 | ![acc](https://i.loli.net/2017/08/28/59a2ee75a2a0a.png) 42 | 43 | Read [this blog](https://ilovin.github.io/2017-04-06/tensorflow-lstm-ctc-ocr/) for more details and [this blog](http://ilovin.github.io/2017-04-23/tensorflow-lstm-ctc-input-output/) for how to 44 | use `tf.nn.ctc_loss` or `warpCTC` 45 | -------------------------------------------------------------------------------- /fonts/Ubuntu-M.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilovin/lstm_ctc_ocr/6c753df22e7c1bab40ce2170e9a11e7b3868cf80/fonts/Ubuntu-M.ttf -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | #import fast_rcnn 2 | -------------------------------------------------------------------------------- /lib/lstm/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from . import config 9 | from . import train 10 | -------------------------------------------------------------------------------- /lib/lstm/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import numpy as np 4 | from time import strftime, localtime 5 | from easydict import EasyDict as edict 6 | 7 | __C = edict() 8 | # Consumers can get config by: 9 | # from lib.lstm.config import cfg 10 | cfg = __C 11 | 12 | # Default GPU device id 13 | __C.GPU_ID = 1 14 | __C.GPU_USAGE = 0.9 15 | __C.OFFSET_TIME_STEP = -1 16 | # overall width (time-axis) downsampling factor of the conv stack 17 | __C.POOL_SCALE = 4 18 | __C.IMG_SHAPE = [32,100] 19 | __C.IMG_HEIGHT = 32 20 | __C.MAX_CHAR_LEN = 6 21 | __C.BLANK_TOKEN=0 22 | __C.CHARSET = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 23 | __C.NCLASSES = len(__C.CHARSET)+2 24 | __C.MIN_LEN = 4 25 | __C.MAX_LEN = 6 26 | __C.FONT = 'fonts/Ubuntu-M.ttf' 27 | __C.NCHANNELS = 1 28 | __C.NUM_FEATURES= __C.IMG_HEIGHT *__C.NCHANNELS 29 | #__C.TIME_STEP = __C.IMG_SHAPE[0]//__C.POOL_SCALE 30 | 31 | __C.NET_NAME = 'lstm' 32 | __C.TRAIN = edict() 33 | # Adam, Momentum, RMS 34 | __C.TRAIN.SOLVER = 'Adam' 35 | #__C.TRAIN.SOLVER = 'Momentum' 36 | # __C.TRAIN.SOLVER = 'RMS' 37 | # learning rate 38 | __C.TRAIN.TXT = 'annotation_train.txt' 39 | __C.TRAIN.WEIGHT_DECAY = 0.0005 40 | __C.TRAIN.LEARNING_RATE = 0.01 41 | __C.TRAIN.MOMENTUM = 0.9 42 | __C.TRAIN.GAMMA = 0.1 43 | __C.TRAIN.STEPSIZE = 50000 44 | __C.TRAIN.DISPLAY = 10 45 | __C.TRAIN.LOG_IMAGE_ITERS = 100 46 | __C.TRAIN.NUM_EPOCHS = 2000 47 | 48 | __C.TRAIN.NUM_HID = 512 49 | __C.TRAIN.NUM_LAYERS = 2 50 | __C.TRAIN.BATCH_SIZE = 64 51 | 52 | # Iterations between snapshots 53 | __C.TRAIN.SNAPSHOT_ITERS = 5000 54 | __C.TRAIN.SNAPSHOT_PREFIX = 'lstm' 55 | __C.TRAIN.SNAPSHOT_INFIX = '' 56 | 57 | __C.VAL = edict() 58 | __C.VAL.TXT = 'annotation_val.txt' 59 | __C.VAL.VAL_STEP = 1000 60 | __C.VAL.NUM_EPOCHS = 1000 61 | __C.VAL.BATCH_SIZE = 128 62 | __C.VAL.PRINT_NUM = 5 63 | 64 | __C.RNG_SEED = 3 65 | 66 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 67 | __C.TEST = edict() 68 | __C.EXP_DIR = 'default' 69 | 
__C.LOG_DIR = 'default' 70 | 71 | __C.SPACE_INDEX = 0 72 | __C.SPACE_TOKEN = '' 73 | def get_encode_decode_dict(): 74 | encode_maps = {} 75 | decode_maps = {} 76 | for i, char in enumerate(__C.CHARSET, 1): 77 | encode_maps[char] = i 78 | decode_maps[i] = char 79 | encode_maps[__C.SPACE_TOKEN] = __C.SPACE_INDEX 80 | decode_maps[__C.SPACE_INDEX] = __C.SPACE_TOKEN 81 | return encode_maps,decode_maps 82 | 83 | 84 | def get_output_dir(imdb, weights_filename): 85 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR)) 86 | if weights_filename is not None: 87 | outdir = osp.join(outdir, weights_filename) 88 | if not os.path.exists(outdir): 89 | os.makedirs(outdir) 90 | return outdir 91 | 92 | def get_log_dir(imdb): 93 | log_dir = osp.abspath(\ 94 | osp.join(__C.ROOT_DIR, 'logs', __C.LOG_DIR, imdb.name, strftime("%Y-%m-%d-%H-%M-%S", localtime()))) 95 | if not os.path.exists(log_dir): 96 | os.makedirs(log_dir) 97 | return log_dir 98 | 99 | def _merge_a_into_b(a, b): 100 | if type(a) is not edict: 101 | return 102 | 103 | for k, v in a.items(): 104 | # a must specify keys that are in b 105 | if k not in b: 106 | raise KeyError('{} is not a valid config key'.format(k)) 107 | 108 | # the types must match, too 109 | old_type = type(b[k]) 110 | if old_type is not type(v): 111 | if isinstance(b[k], np.ndarray): 112 | v = np.array(v, dtype=b[k].dtype) 113 | else: 114 | raise ValueError(('Type mismatch ({} vs. {}) ' 115 | 'for config key: {}').format(type(b[k]), 116 | type(v), k)) 117 | 118 | # recursively merge dicts 119 | if type(v) is edict: 120 | try: 121 | _merge_a_into_b(a[k], b[k]) 122 | except: 123 | print(('Error under config key: {}'.format(k))) 124 | raise 125 | else: 126 | b[k] = v 127 | 128 | def cfg_from_file(filename): 129 | """Load a config file and merge it into the default options.""" 130 | import yaml 131 | with open(filename, 'r') as f: 132 | yaml_cfg = edict(yaml.load(f)) 133 | 134 | _merge_a_into_b(yaml_cfg, __C) 135 | 136 | def cfg_from_list(cfg_list): 137 | """Set config keys via list (e.g., from command line).""" 138 | from ast import literal_eval 139 | assert len(cfg_list) % 2 == 0 140 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 141 | key_list = k.split('.') 142 | d = __C 143 | for subkey in key_list[:-1]: 144 | assert subkey in d 145 | d = d[subkey] 146 | subkey = key_list[-1] 147 | assert subkey in d 148 | try: 149 | value = literal_eval(v) 150 | except: 151 | # handle the case when v is a string literal 152 | value = v 153 | assert type(value) == type(d[subkey]), \ 154 | 'type {} does not match original type {}'.format( 155 | type(value), type(d[subkey])) 156 | d[subkey] = value 157 | -------------------------------------------------------------------------------- /lib/lstm/test.py: -------------------------------------------------------------------------------- 1 | import sys,math 2 | import os,shutil 3 | import collections 4 | import numpy as np 5 | import os 6 | import tensorflow as tf 7 | import cv2 8 | from lib.lstm.utils.timer import Timer 9 | from ..lstm.config import cfg,get_encode_decode_dict 10 | 11 | class SolverWrapper(object): 12 | def __init__(self, sess, network, imgdb, output_dir, logdir, pretrained_model=None): 13 | self.net = network 14 | self.imgdb = imgdb 15 | self.output_dir = output_dir 16 | self.pretrained_model = pretrained_model 17 | print('done') 18 | 19 | # For checkpoint 20 | self.saver = tf.train.Saver(max_to_keep=100) 21 | self.writer = tf.summary.FileWriter(logdir=logdir, 22 | graph=tf.get_default_graph(), 23 | flush_secs=5) 
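# Note on the decode path used below: the network's 'logits' output is
# time-major (time_steps, batch, NCLASSES), so it can be passed straight to
# tf.nn.ctc_beam_search_decoder together with the per-image sequence
# lengths; the sparse result is densified with default_value=0, which is
# why index 0 is skipped when mapping ids back to characters.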
24 | 25 | 26 | 27 | def test_model(self,sess,testDir=None,restore = True): 28 | logits = self.net.get_output('logits') 29 | time_step_batch = self.net.get_output('time_step_len') 30 | decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, time_step_batch, merge_repeated=True) 31 | dense_decoded = tf.cast(tf.sparse_tensor_to_dense(decoded[0], default_value=0), tf.int32) 32 | 33 | img_size = cfg.IMG_SHAPE 34 | global_step = tf.Variable(0, trainable=False) 35 | # intialize variables 36 | local_vars_init_op = tf.local_variables_initializer() 37 | global_vars_init_op = tf.global_variables_initializer() 38 | 39 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 40 | sess.run(combined_op) 41 | # resuming a trainer 42 | if restore: 43 | try: 44 | ckpt = tf.train.get_checkpoint_state(self.output_dir) 45 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 46 | self.saver.restore(sess, tf.train.latest_checkpoint(self.output_dir)) 47 | stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0] 48 | restore_iter = int(stem.split('_')[-1]) 49 | sess.run(global_step.assign(restore_iter)) 50 | print('done') 51 | except: 52 | raise Exception('Check your pretrained {:s}'.format(ckpt.model_checkpoint_path)) 53 | 54 | timer = Timer() 55 | 56 | total = correct = 0 57 | for file in os.listdir(testDir): 58 | timer.tic() 59 | total+=1 60 | 61 | if cfg.NCHANNELS == 1: img = cv2.imread(os.path.join(testDir,file),0) 62 | else : img = cv2.imread(os.path.join(testDir,file),1) 63 | print(file,end=' ') 64 | #img = cv2.resize(img,tuple(img_size)) 65 | w = img.shape[1] 66 | width = math.ceil(img.shape[1] / cfg.POOL_SCALE) * cfg.POOL_SCALE 67 | img = cv2.copyMakeBorder(img, 0, 0, 0, width - w, cv2.BORDER_CONSTANT, value=0).astype(np.float32) / 255. 
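# The padding above rounds the width up to a multiple of cfg.POOL_SCALE so
# the conv stack downsamples it to a whole number of time steps; the
# transpose/reshape below then makes width the time axis, with each column
# of IMG_HEIGHT*NCHANNELS pixels becoming one NUM_FEATURES-long frame.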
68 | 69 | img = img.swapaxes(0,1) 70 | img = np.reshape(img, [1,width,cfg.NUM_FEATURES]) 71 | #img = np.expand_dims(img,axis=0) 72 | feed_dict = { 73 | self.net.data: img, 74 | self.net.time_step_len: [img.shape[1]//cfg.POOL_SCALE], 75 | self.net.keep_prob: 1.0 76 | } 77 | res = sess.run(fetches=dense_decoded[0], feed_dict=feed_dict) 78 | def decodeRes(nums,ignore= 0): 79 | encode_maps,decode_maps = get_encode_decode_dict() 80 | res = [decode_maps[i] for i in nums if i!=ignore] 81 | return res 82 | org = file.split('.')[0].split('_')[1] 83 | res = ''.join(decodeRes(res)) 84 | if org==res:correct+=1 85 | _diff_time = timer.toc(average=False) 86 | print('cost time: {:.3f},\n res: {}'.format(_diff_time,res)) 87 | #visualize_segmentation_adaptive(np.array(output),cls_dict) 88 | print('total acc:{}/{}={:.4f}'.format(correct,total,correct/total)) 89 | 90 | 91 | def test_net(network, imgdb, testDir, output_dir, log_dir, pretrained_model=None,restore=True): 92 | 93 | config = tf.ConfigProto(allow_soft_placement=True) 94 | config.gpu_options.allocator_type = 'BFC' 95 | #config.gpu_options.per_process_gpu_memory_fraction = 0.4 96 | with tf.Session(config=config) as sess: 97 | sw = SolverWrapper(sess, network, imgdb, output_dir, logdir= log_dir, pretrained_model=pretrained_model) 98 | print('Solving...') 99 | sw.test_model(sess, testDir=testDir, restore=restore) 100 | print('done solving') 101 | 102 | -------------------------------------------------------------------------------- /lib/lstm/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os,re 3 | import tensorflow as tf 4 | from ..lstm.config import cfg 5 | from lib.lstm.utils.timer import Timer 6 | from lib.lstm.utils.training import accuracy_calculation 7 | from lib.lstm.utils.tf_records import read_tfrecord_and_decode_into_image_annotation_pair_tensors 8 | from lib.lstm.utils.gen import get_batch 9 | 10 | class SolverWrapper(object): 11 | def __init__(self, sess, network, imgdb, pre_train,output_dir, logdir): 12 | """Initialize the SolverWrapper.""" 13 | self.net = network 14 | self.imgdb = imgdb 15 | self.pre_train=pre_train 16 | self.output_dir = output_dir 17 | print('done') 18 | self.saver = tf.train.Saver(max_to_keep=100) 19 | self.writer = tf.summary.FileWriter(logdir=logdir, 20 | graph=tf.get_default_graph(), 21 | flush_secs=5) 22 | 23 | def snapshot(self, sess, iter): 24 | net = self.net 25 | if not os.path.exists(self.output_dir): 26 | os.makedirs(self.output_dir) 27 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 28 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 29 | 30 | filename = (cfg.TRAIN.SNAPSHOT_PREFIX + '_ctc' + infix + 31 | '_iter_{:d}'.format(iter + 1) + '.ckpt') 32 | 33 | #filename = (cfg.TRAIN.SNAPSHOT_PREFIX + infix + 34 | # '_iter_{:d}'.format(iter+1) + '.ckpt') 35 | filename = os.path.join(self.output_dir, filename) 36 | self.saver.save(sess, filename) 37 | print('Wrote snapshot to: {:s}'.format(filename)) 38 | 39 | def get_data(self,path,batch_size,num_epochs): 40 | filename_queue = tf.train.string_input_producer([path], num_epochs=num_epochs) 41 | image,label,label_len,time_step= read_tfrecord_and_decode_into_image_annotation_pair_tensors(filename_queue) 42 | image_batch, label_batch, label_len_batch,time_step_batch = tf.train.shuffle_batch([image,label,label_len,time_step], 43 | batch_size=batch_size, 44 | capacity=9600, 45 | num_threads=4, 46 | min_after_dequeue=6400) 47 | return image_batch, label_batch, label_len_batch,time_step_batch 48 | 49 | def 
restoreLabel(self,label_vec,label_len): 50 | labels = [] 51 | for l_len in label_len: 52 | labels.append(label_vec[:l_len]) 53 | label_vec = label_vec[l_len:] 54 | return labels 55 | 56 | def mergeLabel(self,labels,ignore = 0): 57 | label_lst = [] 58 | for l in labels: 59 | while l[-1] == ignore: l = l[:-1] 60 | label_lst.extend(l) 61 | return np.array(label_lst) 62 | 63 | def train_model(self, sess, max_iters, restore=False): 64 | train_gen = get_batch(num_workers=12,batch_size=cfg.TRAIN.BATCH_SIZE,vis=False) 65 | val_gen = get_batch(num_workers=1,batch_size=cfg.VAL.BATCH_SIZE,vis=False) 66 | 67 | loss, dense_decoded = self.net.build_loss() 68 | 69 | tf.summary.scalar('loss', loss) 70 | summary_op = tf.summary.merge_all() 71 | 72 | # optimizer 73 | lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False) 74 | if cfg.TRAIN.SOLVER == 'Adam': opt = tf.train.AdamOptimizer(lr) 75 | elif cfg.TRAIN.SOLVER == 'RMS': opt = tf.train.RMSPropOptimizer(lr) 76 | else: opt = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM) 77 | 78 | global_step = tf.Variable(0, trainable=False) 79 | with_clip = True 80 | if with_clip: 81 | tvars = tf.trainable_variables() 82 | grads, norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), 10.0) 83 | train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step) 84 | else: 85 | train_op = opt.minimize(loss, global_step=global_step) 86 | 87 | # intialize variables 88 | local_vars_init_op = tf.local_variables_initializer() 89 | global_vars_init_op = tf.global_variables_initializer() 90 | 91 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 92 | sess.run(combined_op) 93 | restore_iter = 1 94 | 95 | # resuming a trainer 96 | if restore: 97 | try: 98 | ckpt = tf.train.get_checkpoint_state(self.output_dir) 99 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 100 | self.saver.restore(sess, tf.train.latest_checkpoint(self.output_dir)) 101 | stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0] 102 | restore_iter = int(stem.split('_')[-1]) 103 | sess.run(global_step.assign(restore_iter)) 104 | print('done') 105 | except: 106 | raise Exception('Check your pretrained {:s}'.format(ckpt.model_checkpoint_path)) 107 | 108 | timer = Timer() 109 | loss_min = 0.015 110 | first_val = True 111 | for iter in range(restore_iter, max_iters): 112 | timer.tic() 113 | # learning rate 114 | if iter != 0 and iter % cfg.TRAIN.STEPSIZE == 0: 115 | sess.run(tf.assign(lr, lr.eval() * cfg.TRAIN.GAMMA)) 116 | 117 | # get one batch 118 | img_Batch,label_Batch, label_len_Batch,time_step_Batch = next(train_gen) 119 | img_Batch = np.array(img_Batch) 120 | # Subtract the mean pixel value from each pixel 121 | feed_dict = { 122 | self.net.data: np.array(img_Batch), 123 | self.net.labels: np.array(label_Batch), 124 | self.net.time_step_len: np.array(time_step_Batch), 125 | self.net.labels_len: np.array(label_len_Batch), 126 | self.net.keep_prob: 0.5 127 | } 128 | 129 | fetch_list = [loss,summary_op,train_op] 130 | ctc_loss,summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict) 131 | 132 | self.writer.add_summary(summary=summary_str, global_step=global_step.eval()) 133 | _diff_time = timer.toc(average=False) 134 | 135 | if (iter) % (cfg.TRAIN.DISPLAY) == 0: 136 | print('iter: %d / %d, total loss: %.7f, lr: %.7f'%\ 137 | (iter, max_iters, ctc_loss ,lr.eval()),end=' ') 138 | print('speed: {:.3f}s / iter'.format(_diff_time)) 139 | if (iter+1) % cfg.TRAIN.SNAPSHOT_ITERS == 0 or ctc_loss num_fg: 170 | disable_inds = 
np.random.choice(fg_inds, size=(len_fg - num_fg), replace=False) 171 | x[disable_inds] = 255 172 | len_fg= 500 173 | 174 | num_bg = 1000-len_fg 175 | bg_inds = np.where(x == 0)[0] 176 | len_bg = len(bg_inds) 177 | if len_bg > num_bg: 178 | disable_inds = np.random.choice(bg_inds, size=(len_bg - num_bg), replace=False) 179 | x[disable_inds] = 255 180 | x=x.reshape(input_shape) 181 | return x 182 | 183 | def get_valid_logits_and_labels(annotation_batch_tensor, 184 | logits_batch_tensor, 185 | class_labels): 186 | """Returns two tensors of size (num_valid_entries, num_classes). 187 | The function converts annotation batch tensor input of the size 188 | (batch_size, height, width) into label tensor (batch_size, height, 189 | width, num_classes) and then selects only valid entries, resulting 190 | in tensor of the size (num_valid_entries, num_classes). The function 191 | also returns the tensor with corresponding valid entries in the logits 192 | tensor. Overall, two tensors of the same sizes are returned and later on 193 | can be used as an input into tf.nn.softmax_cross_entropy_with_logits() to 194 | get the cross entropy error for each entry. 195 | 196 | Parameters 197 | ---------- 198 | annotation_batch_tensor : Tensor of size (batch_size, height, width) 199 | Tensor with class labels for each batch 200 | logits_batch_tensor : Tensor of size (batch_size, height, width, num_classes) 201 | Tensor with logits. Usually can be achieved after inference of the FCN network. 202 | class_labels : list of ints 203 | List that contains the numbers that represent classes. Last 204 | value in the list should represent the number that was used 205 | for masking out. 206 | 207 | Returns 208 | ------- 209 | (valid_labels_batch_tensor, valid_logits_batch_tensor) : Two Tensors of size (num_valid_entries, num_classes). 210 | Tensors that represent valid labels and logits. 
211 | """ 212 | 213 | annotation_batch_tensor = tf.py_func(sample, [annotation_batch_tensor], tf.int32) 214 | labels_batch_tensor = get_labels_from_annotation_batch(annotation_batch_tensor=annotation_batch_tensor, 215 | class_labels=class_labels) 216 | 217 | valid_batch_indices = get_valid_entries_indices_from_annotation_batch(annotation_batch_tensor=annotation_batch_tensor, 218 | class_labels=class_labels) 219 | 220 | valid_labels_batch_tensor = tf.gather_nd(params=labels_batch_tensor, indices=valid_batch_indices) 221 | 222 | valid_logits_batch_tensor = tf.gather_nd(params=logits_batch_tensor, indices=valid_batch_indices) 223 | 224 | return valid_labels_batch_tensor, valid_logits_batch_tensor -------------------------------------------------------------------------------- /lib/networks/LSTM_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .network import Network 3 | from ..lstm.config import cfg 4 | 5 | 6 | class LSTM_test(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | 10 | self.data = tf.placeholder(tf.float32, shape=[None, None, cfg.NUM_FEATURES], name='data') 11 | self.time_step_len = tf.placeholder(tf.int32,[None], name='time_step_len') 12 | 13 | self.keep_prob = tf.placeholder(tf.float32) 14 | self.layers = dict({'data': self.data, 'time_step_len':self.time_step_len}) 15 | self.trainable = trainable 16 | self.setup() 17 | 18 | def setup(self): 19 | (self.feed('data') 20 | .conv_single(3, 3, 64 ,1, 1, name='conv1',c_i=cfg.NCHANNELS) 21 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool1') 22 | .conv_single(3, 3, 128 ,1, 1, name='conv2') 23 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 24 | .conv_single(3, 3, 256 ,1, 1, name='conv3_1') 25 | .conv_single(3, 3, 256 ,1, 1, name='conv3_2') 26 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool3') 27 | .conv_single(3, 3, 512 ,1, 1, name='conv4_1', bn=True) 28 | .conv_single(3, 3, 512 ,1, 1, name='conv4_2', bn=True) 29 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool4') 30 | .conv_single(2, 2, 512 ,1, 1, padding = 'VALID', name='conv5', relu=False) 31 | #.dropout(keep_prob = self.keep_prob, name = 'dropout_layer') 32 | .reshape_squeeze_layer(d = 512 , name='reshaped_layer')) 33 | (self.feed('reshaped_layer','time_step_len') 34 | .bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits')) 35 | 36 | -------------------------------------------------------------------------------- /lib/networks/LSTM_train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .network import Network 3 | from ..lstm.config import cfg 4 | 5 | 6 | class LSTM_train(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | 10 | self.data = tf.placeholder(tf.float32, shape=[None, None, cfg.NUM_FEATURES ], name='data') #N*t_s*features*channels 11 | self.labels = tf.placeholder(tf.int32,[None],name='labels') 12 | self.time_step_len = tf.placeholder(tf.int32,[None], name='time_step_len') 13 | self.labels_len = tf.placeholder(tf.int32,[None],name='labels_len') 14 | 15 | self.keep_prob = tf.placeholder(tf.float32) 16 | self.layers = dict({'data': self.data,'labels':self.labels, 17 | 'time_step_len':self.time_step_len, 18 | 'labels_len':self.labels_len}) 19 | self.trainable = trainable 20 | self.setup() 21 | 22 | def setup(self): 23 | (self.feed('data') 24 | .conv_single(3, 3, 64 ,1, 1, name='conv1',c_i=cfg.NCHANNELS) 25 | .max_pool(2, 2, 2, 2, padding='VALID', 
name='pool1') 26 | .conv_single(3, 3, 128 ,1, 1, name='conv2') 27 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 28 | .conv_single(3, 3, 256 ,1, 1, name='conv3_1') 29 | .conv_single(3, 3, 256 ,1, 1, name='conv3_2') 30 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool3') 31 | .conv_single(3, 3, 512 ,1, 1, name='conv4_1', bn=True) 32 | .conv_single(3, 3, 512 ,1, 1, name='conv4_2', bn=True) 33 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool4') 34 | .conv_single(2, 2, 512 ,1, 1, padding = 'VALID', name='conv5', relu=False) 35 | #.dropout(keep_prob = self.keep_prob, name = 'dropout_layer') 36 | .reshape_squeeze_layer(d = 512 , name='reshaped_layer')) 37 | (self.feed('reshaped_layer','time_step_len') 38 | .bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits')) 39 | # .lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits',img_shape=[-1,cfg.IMG_SHAPE[0]//cfg.POOL_SCALE,cfg.NUM_FEATURES//cfg.POOL_SCALE])) 40 | #.bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits',img_shape=[-1,cfg.IMG_SHAPE[0]//cfg.POOL_SCALE,cfg.NUM_FEATURES//cfg.POOL_SCALE])) 41 | -------------------------------------------------------------------------------- /lib/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from . import factory 2 | -------------------------------------------------------------------------------- /lib/networks/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SubCNN_TF 3 | # Copyright (c) 2016 CVGL Stanford 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yu Xiang 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting networks by name.""" 9 | 10 | __sets = {} 11 | from .LSTM_train import LSTM_train 12 | from .LSTM_test import LSTM_test 13 | def get_network(name): 14 | """Get a network by name.""" 15 | if name.split('_')[0] == 'LSTM': 16 | if name.split('_')[1] == 'train': 17 | return LSTM_train() 18 | elif name.split('_')[1] == 'test': 19 | return LSTM_test() 20 | else: 21 | raise KeyError('Unknown network: {}'.format(name)) 22 | 23 | def list_networks(): 24 | """List all registered networks.""" 25 | return list(__sets.keys()) 26 | 27 | -------------------------------------------------------------------------------- /lib/networks/network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from lib.lstm.config import cfg 5 | from lib.lstm.utils.training import * 6 | import warpctc_tensorflow 7 | 8 | DEFAULT_PADDING = 'SAME' 9 | 10 | def include_original(dec): 11 | """ Meta decorator, which makes the original function callable (via f._original() )""" 12 | def meta_decorator(f): 13 | decorated = dec(f) 14 | decorated._original = f 15 | return decorated 16 | return meta_decorator 17 | 18 | #@include_original 19 | def layer(op): 20 | def layer_decorated(self, *args, **kwargs): 21 | # Automatically set a name if not provided. 22 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 23 | # Figure out the layer inputs. 24 | if len(self.inputs)==0: 25 | raise RuntimeError('No input variables found for layer %s.'%name) 26 | elif len(self.inputs)==1: 27 | layer_input = self.inputs[0] 28 | else: 29 | layer_input = list(self.inputs) 30 | # Perform the operation and get the output. 
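# op() receives the accumulated inputs together with this layer's kwargs;
# the result is registered in the self.layers LUT and re-fed as the next
# layer's input, which is what makes chained definitions like
#   self.feed('data').conv_single(3, 3, 64, 1, 1, name='conv1')
# in LSTM_train/LSTM_test work.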
31 | layer_output = op(self, layer_input, *args, **kwargs) 32 | # Add to layer LUT. 33 | self.layers[name] = layer_output 34 | # This output is now the input for the next layer. 35 | self.feed(layer_output) 36 | # Return self for chained calls. 37 | return self 38 | return layer_decorated 39 | 40 | class Network(object): 41 | def __init__(self, inputs, trainable=True): 42 | self.inputs = [] 43 | self.layers = dict(inputs) 44 | self.trainable = trainable 45 | self.setup() 46 | 47 | def setup(self): 48 | raise NotImplementedError('Must be subclassed.') 49 | 50 | def load(self, data_path, session, ignore_missing=False): 51 | data_dict = np.load(data_path,encoding='latin1').item() 52 | for key in data_dict: 53 | with tf.variable_scope(key, reuse=True): 54 | for subkey in data_dict[key]: 55 | try: 56 | var = tf.get_variable(subkey) 57 | session.run(var.assign(data_dict[key][subkey])) 58 | print("assign pretrain model "+subkey+ " to "+key) 59 | except ValueError: 60 | print("ignore "+key) 61 | if not ignore_missing: 62 | 63 | raise 64 | 65 | def feed(self, *args): 66 | assert len(args)!=0 67 | self.inputs = [] 68 | for layer in args: 69 | if isinstance(layer, str): 70 | try: 71 | layer = self.layers[layer] 72 | print(layer) 73 | except KeyError: 74 | print(list(self.layers.keys())) 75 | raise KeyError('Unknown layer name fed: %s'%layer) 76 | self.inputs.append(layer) 77 | return self 78 | 79 | def get_output(self, layer): 80 | try: 81 | layer = self.layers[layer] 82 | except KeyError: 83 | print(list(self.layers.keys())) 84 | raise KeyError('Unknown layer name fed: %s'%layer) 85 | return layer 86 | 87 | def get_unique_name(self, prefix): 88 | id = sum(t.startswith(prefix) for t,_ in list(self.layers.items()))+1 89 | return '%s_%d'%(prefix, id) 90 | 91 | def make_var(self, name, shape, initializer=None, trainable=True, regularizer=None): 92 | return tf.get_variable(name, shape, initializer=initializer, trainable=trainable, regularizer=regularizer) 93 | 94 | def validate_padding(self, padding): 95 | assert padding in ('SAME', 'VALID') 96 | 97 | @layer 98 | def bi_lstm(self, input, num_hids, num_layers, name,img_shape = None ,trainable=True): 99 | img,img_len = input[0],input[1] 100 | #img = tf.squeeze(img,axis=3) 101 | if img_shape:img =tf.reshape(img,shape = img_shape ) 102 | with tf.variable_scope(name) as scope: 103 | #stack = tf.contrib.rnn.MultiRNNCell([cell,cell1] , state_is_tuple=True) 104 | lstm_fw_cell = tf.contrib.rnn.LSTMCell(num_hids//2,state_is_tuple=True) 105 | lstm_bw_cell = tf.contrib.rnn.LSTMCell(num_hids//2,state_is_tuple=True) 106 | 107 | output,_ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell,img,img_len,dtype=tf.float32) 108 | # output_bw_reverse = tf.reverse_sequence(output[1],img_len,seq_axis=1) 109 | output = tf.concat(output,axis=2) 110 | 111 | #stack_cell = tf.contrib.rnn.MultiRNNCell( 112 | # [tf.contrib.rnn.LSTMCell(num_hids, state_is_tuple=True) for _ in range(num_layers)], 113 | # state_is_tuple=True) 114 | #lstm_out,last_state = tf.nn.dynamic_rnn(stack_cell,output,img_len,dtype=tf.float32) 115 | lstm_out = output 116 | shape = tf.shape(img) 117 | batch_size, time_step = shape[0],shape[1] 118 | lstm_out = tf.reshape(lstm_out,[-1,num_hids]) 119 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False) 120 | # init_weights = tf.contrib.layers.xavier_initializer() 121 | # init_weights = tf.truncated_normal_initializer(stddev=0.1) 122 | init_biases = tf.constant_initializer(0.0) 123 | W = 
self.make_var('weights', [num_hids, cfg.NCLASSES], init_weights, trainable, \ 124 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 125 | b = self.make_var('biases', [cfg.NCLASSES], init_biases, trainable) 126 | logits = tf.matmul(lstm_out,W)+b 127 | logits = tf.reshape(logits,[batch_size,-1,cfg.NCLASSES]) 128 | logits = tf.transpose(logits,(1,0,2)) 129 | return logits 130 | @layer 131 | def lstm(self, input, num_hids, num_layers, name,img_shape = None ,trainable=True): 132 | img,img_len = input[0],input[1] 133 | if img_shape:img =tf.reshape(img,shape = img_shape ) 134 | with tf.variable_scope(name) as scope: 135 | stack_cell = tf.contrib.rnn.MultiRNNCell( 136 | [tf.contrib.rnn.LSTMCell(num_hids, state_is_tuple=True) for _ in range(num_layers)], 137 | state_is_tuple=True) 138 | lstm_out,last_state = tf.nn.dynamic_rnn(stack_cell,img,img_len,dtype=tf.float32) 139 | shape = tf.shape(img) 140 | batch_size, time_step = shape[0],shape[1] 141 | lstm_out = tf.reshape(lstm_out,[-1,num_hids]) 142 | # init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 143 | # init_weights = tf.contrib.layers.xavier_initializer() 144 | init_weights = tf.truncated_normal_initializer(stddev=0.1) 145 | init_biases = tf.constant_initializer(0.0) 146 | W = self.make_var('weights', [num_hids, cfg.NCLASSES], init_weights, trainable, \ 147 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 148 | b = self.make_var('biases', [cfg.NCLASSES], init_biases, trainable) 149 | logits = tf.matmul(lstm_out,W)+b 150 | logits = tf.reshape(logits,[batch_size,-1,cfg.NCLASSES]) 151 | logits = tf.transpose(logits,(1,0,2)) 152 | return logits 153 | 154 | @layer 155 | def concat(self, input, axis, name): 156 | with tf.variable_scope(name) as scope: 157 | concat = tf.concat(values=input,axis=axis) 158 | return concat 159 | 160 | @layer 161 | def conv_single(self, input, k_h, k_w, c_o, s_h, s_w, name, c_i=None, bn=False, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 162 | """ contribution by miraclebiu, and biased option""" 163 | self.validate_padding(padding) 164 | if not c_i: c_i = input.get_shape()[-1] 165 | if c_i==1: input = tf.expand_dims(input=input,axis=3) 166 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1,s_h, s_w, 1], padding=padding) 167 | with tf.variable_scope(name) as scope: 168 | init_weights = tf.contrib.layers.xavier_initializer() 169 | init_biases = tf.constant_initializer(0.0) 170 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 171 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 172 | if biased: 173 | biases = self.make_var('biases', [c_o], init_biases, trainable) 174 | conv = convolve(input, kernel) 175 | bias = tf.nn.bias_add(conv, biases) 176 | if bn: 177 | bn_layer = tf.contrib.layers.batch_norm(bias, scale=True, 178 | center=True, is_training=True, scope=name) 179 | else:bn_layer = bias 180 | if relu: 181 | return tf.nn.relu(bn_layer) 182 | else: return bn_layer 183 | else: 184 | conv = convolve(input, kernel) 185 | if bn: 186 | bn_layer = tf.contrib.layers.batch_norm(conv, scale=True, 187 | center=True, is_training=True, scope=name) 188 | else:bn_layer = conv 189 | if relu: 190 | return tf.nn.relu(bn_layer) 191 | return bn_layer 192 | 193 | @layer 194 | def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, c_i=None, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 195 | """ contribution by miraclebiu, and biased option""" 196 | self.validate_padding(padding) 197 | if 
not c_i: c_i = input.get_shape()[-1] 198 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 199 | with tf.variable_scope(name) as scope: 200 | init_weights = tf.contrib.layers.xavier_initializer() 201 | init_biases = tf.constant_initializer(0.0) 202 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 203 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 204 | if biased: 205 | biases = self.make_var('biases', [c_o], init_biases, trainable) 206 | conv = convolve(input, kernel) 207 | if relu: 208 | bias = tf.nn.bias_add(conv, biases) 209 | 210 | return tf.nn.relu(bias) 211 | return tf.nn.bias_add(conv, biases) 212 | else: 213 | conv = convolve(input, kernel) 214 | if relu: 215 | return tf.nn.relu(conv) 216 | return conv 217 | 218 | @layer 219 | def conv_zero(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True, padding=DEFAULT_PADDING, 220 | trainable=True): 221 | """ contribution by miraclebiu, and biased option""" 222 | self.validate_padding(padding) 223 | c_i = input.get_shape()[-1] 224 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 225 | with tf.variable_scope(name) as scope: 226 | init_weights = tf.constant_initializer(0.0) 227 | init_biases = tf.constant_initializer(0.0) 228 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 229 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 230 | if biased: 231 | biases = self.make_var('biases', [c_o], init_biases, trainable) 232 | conv = convolve(input, kernel) 233 | if relu: 234 | bias = tf.nn.bias_add(conv, biases) 235 | 236 | return tf.nn.relu(bias) 237 | return tf.nn.bias_add(conv, biases) 238 | else: 239 | conv = convolve(input, kernel) 240 | if relu: 241 | return tf.nn.relu(conv) 242 | return conv 243 | 244 | @layer 245 | def conv_norm(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 246 | """ contribution by miraclebiu, and biased option""" 247 | self.validate_padding(padding) 248 | c_i = input.get_shape()[-1] 249 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 250 | with tf.variable_scope(name) as scope: 251 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 252 | # init_weights = tf.contrib.layers.xavier_initializer() 253 | init_biases = tf.constant_initializer(0.0) 254 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 255 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 256 | if biased: 257 | biases = self.make_var('biases', [c_o], init_biases, trainable) 258 | conv = convolve(input, kernel) 259 | if relu: 260 | bias = tf.nn.bias_add(conv, biases) 261 | temp_layer = tf.contrib.layers.batch_norm(bias, scale=True, center=True, is_training=True, 262 | scope=name) 263 | return tf.nn.relu(temp_layer) 264 | return tf.nn.bias_add(conv, biases) 265 | else: 266 | conv = convolve(input, kernel) 267 | if relu: 268 | return tf.nn.crelu(conv) 269 | return conv 270 | 271 | @layer 272 | def conv_final(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True, padding=DEFAULT_PADDING, 273 | trainable=True): 274 | """ contribution by miraclebiu, and biased option""" 275 | self.validate_padding(padding) 276 | c_i = 128 277 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 278 | with tf.variable_scope(name) as scope: 279 | init_weights = 
tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 280 | # init_weights = tf.contrib.layers.xavier_initializer() 281 | init_biases = tf.constant_initializer(0.0) 282 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 283 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 284 | if biased: 285 | biases = self.make_var('biases', [c_o], init_biases, trainable) 286 | conv = convolve(input, kernel) 287 | if relu: 288 | bias = tf.nn.bias_add(conv, biases) 289 | temp_layer = tf.contrib.layers.batch_norm(bias, scale=True, center=True, is_training=True, 290 | scope=name) 291 | return tf.nn.relu(temp_layer) 292 | return tf.nn.bias_add(conv, biases) 293 | else: 294 | conv = convolve(input, kernel) 295 | if relu: 296 | return tf.nn.crelu(conv) 297 | return conv 298 | 299 | @layer 300 | def upconv(self, input, shape, c_o, ksize=4, stride = 2, name = 'upconv', biased=False, relu=True, padding=DEFAULT_PADDING, 301 | trainable=True): 302 | """ up-conv""" 303 | self.validate_padding(padding) 304 | 305 | c_in = input.get_shape()[3].value 306 | in_shape = tf.shape(input) 307 | if shape is None: 308 | h = ((in_shape[1] ) * stride) 309 | w = ((in_shape[2] ) * stride) 310 | new_shape = [in_shape[0], h, w, c_o] 311 | else: 312 | new_shape = [in_shape[0], shape[1], shape[2], c_o] 313 | output_shape = tf.stack(new_shape) 314 | 315 | filter_shape = [ksize, ksize, c_o, c_in] 316 | 317 | with tf.variable_scope(name) as scope: 318 | # init_weights = tf.contrib.layers.xavier_initializer() 319 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 320 | filters = self.make_var('weights', filter_shape, init_weights, trainable, \ 321 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 322 | deconv = tf.nn.conv2d_transpose(input, filters, output_shape, 323 | strides=[1, stride, stride, 1], padding=DEFAULT_PADDING, name=scope.name) 324 | # coz de-conv losses shape info, use reshape to re-gain shape 325 | deconv = tf.reshape(deconv, new_shape) 326 | 327 | if biased: 328 | init_biases = tf.constant_initializer(0.0) 329 | biases = self.make_var('biases', [c_o], init_biases, trainable) 330 | if relu: 331 | bias = tf.nn.bias_add(deconv, biases) 332 | return tf.nn.relu(bias) 333 | return tf.nn.bias_add(deconv, biases) 334 | else: 335 | if relu: 336 | return tf.nn.relu(deconv) 337 | return deconv 338 | 339 | @layer 340 | def relu(self, input, name): 341 | return tf.nn.relu(input, name=name) 342 | 343 | @layer 344 | def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 345 | self.validate_padding(padding) 346 | return tf.nn.max_pool(input, 347 | ksize=[1, k_h, k_w, 1], 348 | strides=[1, s_h, s_w, 1], 349 | padding=padding, 350 | name=name) 351 | 352 | @layer 353 | def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 354 | self.validate_padding(padding) 355 | return tf.nn.avg_pool(input, 356 | ksize=[1, k_h, k_w, 1], 357 | strides=[1, s_h, s_w, 1], 358 | padding=padding, 359 | name=name) 360 | 361 | @layer 362 | def reshape_squeeze_layer(self, input, d, name): 363 | #N,H,W,C-> N,H*W,C 364 | input_shape = tf.shape(input) 365 | return tf.reshape(input, \ 366 | [input_shape[0], \ 367 | input_shape[1]*input_shape[2], \ 368 | int(d)]) 369 | 370 | @layer 371 | def reshape_layer(self, input, d, name): 372 | input_shape = tf.shape(input) 373 | if name == 'rpn_cls_prob_reshape': 374 | # 375 | # transpose: (1, AxH, W, 2) -> (1, 2, AxH, W) 376 | # 
reshape: (1, 2xA, H, W) 377 | # transpose: -> (1, H, W, 2xA) 378 | return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]), 379 | [ input_shape[0], 380 | int(d), 381 | tf.cast(tf.cast(input_shape[1],tf.float32)/tf.cast(d,tf.float32)*tf.cast(input_shape[3],tf.float32),tf.int32), 382 | input_shape[2] 383 | ]), 384 | [0,2,3,1],name=name) 385 | else: 386 | return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]), 387 | [ input_shape[0], 388 | int(d), 389 | tf.cast(tf.cast(input_shape[1],tf.float32)*(tf.cast(input_shape[3],tf.float32)/tf.cast(d,tf.float32)),tf.int32), 390 | input_shape[2] 391 | ]), 392 | [0,2,3,1],name=name) 393 | 394 | @layer 395 | def spatial_reshape_layer(self, input, d, name): 396 | input_shape = tf.shape(input) 397 | # transpose: (1, H, W, A x d) -> (1, H, WxA, d) 398 | return tf.reshape(input,\ 399 | [input_shape[0],\ 400 | input_shape[1], \ 401 | -1,\ 402 | int(d)]) 403 | 404 | 405 | @layer 406 | def lrn(self, input, radius, alpha, beta, name, bias=1.0): 407 | return tf.nn.local_response_normalization(input, 408 | depth_radius=radius, 409 | alpha=alpha, 410 | beta=beta, 411 | bias=bias, 412 | name=name) 413 | 414 | 415 | @layer 416 | def fc(self, input, num_out, name, relu=True, trainable=True): 417 | with tf.variable_scope(name) as scope: 418 | # only use the first input 419 | if isinstance(input, tuple): 420 | input = input[0] 421 | 422 | input_shape = input.get_shape() 423 | if input_shape.ndims == 4: 424 | dim = 1 425 | for d in input_shape[1:].as_list(): 426 | dim *= d 427 | feed_in = tf.reshape(tf.transpose(input,[0,3,1,2]), [-1, dim]) 428 | else: 429 | feed_in, dim = (input, int(input_shape[-1])) 430 | 431 | if name == 'bbox_pred': 432 | init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001) 433 | init_biases = tf.constant_initializer(0.0) 434 | else: 435 | init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01) 436 | init_biases = tf.constant_initializer(0.0) 437 | 438 | weights = self.make_var('weights', [dim, num_out], init_weights, trainable, \ 439 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 440 | biases = self.make_var('biases', [num_out], init_biases, trainable) 441 | 442 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 443 | fc = op(feed_in, weights, biases, name=scope.name) 444 | return fc 445 | 446 | @layer 447 | def softmax(self, input, name): 448 | input_shape = tf.shape(input) 449 | if name == 'rpn_cls_prob': 450 | return tf.reshape(tf.nn.softmax(tf.reshape(input,[-1,input_shape[3]])),[-1,input_shape[1],input_shape[2],input_shape[3]],name=name) 451 | else: 452 | return tf.nn.softmax(input,name=name) 453 | 454 | @layer 455 | def spatial_softmax(self, input, name): 456 | input_shape = tf.shape(input) 457 | # d = input.get_shape()[-1] 458 | return tf.reshape(tf.nn.softmax(tf.reshape(input, [-1, input_shape[3]])), 459 | [-1, input_shape[1], input_shape[2], input_shape[3]], name=name) 460 | 461 | @layer 462 | def add(self,input,name): 463 | """contribution by miraclebiu""" 464 | return tf.add(input[0],input[1], name=name) 465 | 466 | @layer 467 | def batch_normalization(self,input,name,relu=True, is_training=False): 468 | """contribution by miraclebiu""" 469 | if relu: 470 | temp_layer=tf.contrib.layers.batch_norm(input,scale=True,center=True,is_training=is_training,scope=name) 471 | return tf.nn.relu(temp_layer) 472 | else: 473 | return tf.contrib.layers.batch_norm(input,scale=True,center=True,is_training=is_training,scope=name) 474 | 475 | @layer 476 | def negation(self, input, name): 477 | """ simply 
multiplies -1 to the tensor""" 478 | return tf.multiply(input, -1.0, name=name) 479 | 480 | @layer 481 | def bn_scale_combo(self, input, c_in, name, relu=True): 482 | """ PVA net BN -> Scale -> Relu""" 483 | with tf.variable_scope(name) as scope: 484 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 485 | # alpha = tf.get_variable('bn_scale/alpha', shape=[c_in, ], dtype=tf.float32, 486 | # initializer=tf.constant_initializer(1.0), trainable=True, 487 | # regularizer=self.l2_regularizer(0.00001)) 488 | # beta = tf.get_variable('bn_scale/beta', shape=[c_in, ], dtype=tf.float32, 489 | # initializer=tf.constant_initializer(0.0), trainable=True, 490 | # regularizer=self.l2_regularizer(0.00001)) 491 | # bn = tf.add(tf.mul(bn, alpha), beta) 492 | if relu: 493 | bn = tf.nn.relu(bn, name='relu') 494 | return bn 495 | 496 | @layer 497 | def pva_negation_block(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, padding=DEFAULT_PADDING, trainable=True, 498 | scale = True, negation = True): 499 | """ for PVA net, Conv -> BN -> Neg -> Concat -> Scale -> Relu""" 500 | with tf.variable_scope(name) as scope: 501 | conv = self.conv._original(self, input, k_h, k_w, c_o, s_h, s_w, biased=biased, relu=False, name='conv', padding=padding, trainable=trainable) 502 | conv = self.batch_normalization._original(self, conv, name='bn', relu=False, is_training=False) 503 | c_in = c_o 504 | if negation: 505 | conv_neg = self.negation._original(self, conv, name='neg') 506 | conv = tf.concat(axis=3, values=[conv, conv_neg], name='concat') 507 | c_in += c_in 508 | if scale: 509 | # y = \alpha * x + \beta 510 | alpha = tf.get_variable('scale/alpha', shape=[c_in,], dtype=tf.float32, 511 | initializer=tf.constant_initializer(1.0), trainable=True, regularizer=self.l2_regularizer(0.00001)) 512 | beta = tf.get_variable('scale/beta', shape=[c_in, ], dtype=tf.float32, 513 | initializer=tf.constant_initializer(0.0), trainable=True, regularizer=self.l2_regularizer(0.00001)) 514 | # conv = conv * alpha + beta 515 | conv = tf.add(tf.multiply(conv, alpha), beta) 516 | return tf.nn.relu(conv, name='relu') 517 | 518 | @layer 519 | def pva_negation_block_v2(self, input, k_h, k_w, c_o, s_h, s_w, c_in, name, biased=True, padding=DEFAULT_PADDING, trainable=True, 520 | scale = True, negation = True): 521 | """ for PVA net, BN -> [Neg -> Concat ->] Scale -> Relu -> Conv""" 522 | with tf.variable_scope(name) as scope: 523 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 524 | if negation: 525 | bn_neg = self.negation._original(self, bn, name='neg') 526 | bn = tf.concat(axis=3, values=[bn, bn_neg], name='concat') 527 | c_in += c_in 528 | # y = \alpha * x + \beta 529 | alpha = tf.get_variable('scale/alpha', shape=[c_in,], dtype=tf.float32, 530 | initializer=tf.constant_initializer(1.0), trainable=True, regularizer=self.l2_regularizer(0.00004)) 531 | beta = tf.get_variable('scale/beta', shape=[c_in, ], dtype=tf.float32, 532 | initializer=tf.constant_initializer(0.0), trainable=True, regularizer=self.l2_regularizer(0.00004)) 533 | bn = tf.add(tf.multiply(bn, alpha), beta) 534 | bn = tf.nn.relu(bn, name='relu') 535 | if name == 'conv3_1/1': self.layers['conv3_1/1/relu'] = bn 536 | 537 | conv = self.conv._original(self, bn, k_h, k_w, c_o, s_h, s_w, biased=biased, relu=False, name='conv', padding=padding, 538 | trainable=trainable) 539 | return conv 540 | 541 | @layer 542 | def pva_inception_res_stack(self, input, c_in, name, block_start = False, type = 
'a'): 543 | 544 | if type == 'a': 545 | (c_0, c_1, c_2, c_pool, c_out) = (64, 64, 24, 128, 256) 546 | elif type == 'b': 547 | (c_0, c_1, c_2, c_pool, c_out) = (64, 96, 32, 128, 384) 548 | else: 549 | raise ValueError('Unexpected inception-res type') 550 | if block_start: 551 | stride = 2 552 | else: 553 | stride = 1 554 | with tf.variable_scope(name+'/incep') as scope: 555 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 556 | bn_scale = self.scale._original(self, bn, c_in, name='bn_scale') 557 | ## 1 x 1 558 | 559 | conv = self.conv._original(self, bn_scale, 1, 1, c_0, stride, stride, name='0/conv', biased = False, relu=False) 560 | conv_0 = self.bn_scale_combo._original(self, conv, c_in=c_0, name ='0', relu=True) 561 | 562 | ## 3 x 3 563 | bn_relu = tf.nn.relu(bn_scale, name='relu') 564 | if name == 'conv4_1': tmp_c = c_1; c_1 = 48 565 | conv = self.conv._original(self, bn_relu, 1, 1, c_1, stride, stride, name='1_reduce/conv', biased = False, relu=False) 566 | conv = self.bn_scale_combo._original(self, conv, c_in=c_1, name='1_reduce', relu=True) 567 | if name == 'conv4_1': c_1 = tmp_c 568 | conv = self.conv._original(self, conv, 3, 3, c_1 * 2, 1, 1, name='1_0/conv', biased = False, relu=False) 569 | conv_1 = self.bn_scale_combo._original(self, conv, c_in=c_1 * 2, name='1_0', relu=True) 570 | 571 | ## 5 x 5 572 | conv = self.conv._original(self, bn_scale, 1, 1, c_2, stride, stride, name='2_reduce/conv', biased = False, relu=False) 573 | conv = self.bn_scale_combo._original(self, conv, c_in=c_2, name='2_reduce', relu=True) 574 | conv = self.conv._original(self, conv, 3, 3, c_2 * 2, 1, 1, name='2_0/conv', biased = False, relu=False) 575 | conv = self.bn_scale_combo._original(self, conv, c_in=c_2 * 2, name='2_0', relu=True) 576 | conv = self.conv._original(self, conv, 3, 3, c_2 * 2, 1, 1, name='2_1/conv', biased = False, relu=False) 577 | conv_2 = self.bn_scale_combo._original(self, conv, c_in=c_2 * 2, name='2_1', relu=True) 578 | 579 | ## pool 580 | if block_start: 581 | pool = self.max_pool._original(self, bn_scale, 3, 3, 2, 2, padding=DEFAULT_PADDING, name='pool') 582 | pool = self.conv._original(self, pool, 1, 1, c_pool, 1, 1, name='poolproj/conv', biased = False, relu=False) 583 | pool = self.bn_scale_combo._original(self, pool, c_in=c_pool, name='poolproj', relu=True) 584 | 585 | with tf.variable_scope(name) as scope: 586 | if block_start: 587 | concat = tf.concat(axis=3, values=[conv_0, conv_1, conv_2, pool], name='concat') 588 | proj = self.conv._original(self, input, 1, 1, c_out, 2, 2, name='proj', biased=True, 589 | relu=False) 590 | else: 591 | concat = tf.concat(axis=3, values=[conv_0, conv_1, conv_2], name='concat') 592 | proj = input 593 | 594 | conv = self.conv._original(self, concat, 1, 1, c_out, 1, 1, name='out/conv', relu=False) 595 | if name == 'conv5_4': 596 | conv = self.bn_scale_combo._original(self, conv, c_in=c_out, name='out', relu=False) 597 | conv = self.add._original(self, [conv, proj], name='sum') 598 | return conv 599 | 600 | @layer 601 | def pva_inception_res_block(self, input, name, name_prefix = 'conv4_', type = 'a'): 602 | """build inception block""" 603 | node = input 604 | if type == 'a': 605 | c_ins = (128, 256, 256, 256, 256, ) 606 | else: 607 | c_ins = (256, 384, 384, 384, 384, ) 608 | for i in range(1, 5): 609 | node = self.pva_inception_res_stack._original(self, node, c_in = c_ins[i-1], 610 | name = name_prefix + str(i), block_start=(i==1), type=type) 611 | return node 612 | 613 | @layer 614 | def scale(self, 
input, c_in, name): 615 | with tf.variable_scope(name) as scope: 616 | 617 | alpha = tf.get_variable('alpha', shape=[c_in, ], dtype=tf.float32, 618 | initializer=tf.constant_initializer(1.0), trainable=True, 619 | regularizer=self.l2_regularizer(0.00001)) 620 | beta = tf.get_variable('beta', shape=[c_in, ], dtype=tf.float32, 621 | initializer=tf.constant_initializer(0.0), trainable=True, 622 | regularizer=self.l2_regularizer(0.00001)) 623 | return tf.add(tf.multiply(input, alpha), beta) 624 | 625 | 626 | @layer 627 | def dropout(self, input, keep_prob, name): 628 | return tf.nn.dropout(input, keep_prob, name=name) 629 | 630 | def l2_regularizer(self, weight_decay=0.0005, scope=None): 631 | def regularizer(tensor): 632 | with tf.name_scope(scope, default_name='l2_regularizer', values=[tensor]): 633 | l2_weight = tf.convert_to_tensor(weight_decay, 634 | dtype=tensor.dtype.base_dtype, 635 | name='weight_decay') 636 | return tf.multiply(l2_weight, tf.nn.l2_loss(tensor), name='value') 637 | return regularizer 638 | 639 | def smooth_l1_dist(self, deltas, sigma2=9.0, name='smooth_l1_dist'): 640 | with tf.name_scope(name=name) as scope: 641 | deltas_abs = tf.abs(deltas) 642 | smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0/sigma2), tf.float32) 643 | return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \ 644 | (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1) 645 | 646 | 647 | def build_loss(self): 648 | time_step_batch = self.get_output('time_step_len') 649 | logits_batch = self.get_output('logits') 650 | labels = self.get_output('labels') 651 | label_len = self.get_output('labels_len') 652 | 653 | ctc_loss = warpctc_tensorflow.ctc(activations=logits_batch,flat_labels=labels, 654 | label_lengths=label_len,input_lengths=time_step_batch) 655 | loss = tf.reduce_mean(ctc_loss) 656 | decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits_batch, time_step_batch, merge_repeated=True) 657 | dense_decoded = tf.cast(tf.sparse_tensor_to_dense(decoded[0], default_value=0), tf.int32) 658 | 659 | # add regularizer 660 | if cfg.TRAIN.WEIGHT_DECAY > 0: 661 | regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 662 | loss = tf.add_n(regularization_losses) + loss 663 | 664 | return loss,dense_decoded 665 | -------------------------------------------------------------------------------- /lib/utils/convert_ckpt2npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | from easydict import EasyDict as edict 5 | from lib.networks.factory import get_network 6 | from lib.fcn.config import get_output_dir,cfg_from_file 7 | 8 | class Convert(object): 9 | def __init__(self, sess, network, model_dir,out_path,model): 10 | self.net = network 11 | self.model_dir = model_dir 12 | self.out_path=out_path 13 | self.model=model 14 | self.saver = tf.train.Saver(max_to_keep=100) 15 | 16 | def conver2npy(self,sess): 17 | global_step = tf.Variable(0, trainable=False) 18 | local_vars_init_op = tf.local_variables_initializer() 19 | global_vars_init_op = tf.global_variables_initializer() 20 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 21 | sess.run(combined_op) 22 | 23 | try: 24 | self.saver.restore(sess, tf.train.latest_checkpoint(self.model_dir)) 25 | sess.run(global_step.assign(0)) 26 | dic=dict() 27 | pri_keys=['conv1_1','conv1_2','conv2_1','conv2_2', 28 | 'conv3_1','conv3_2','conv3_3', 29 | 'conv4_1','conv4_2','conv4_3', 30 | 'conv5_1','conv5_2','conv5_3'] 31 | if self.model==32: 
32 | keys=pri_keys+['fc6','fc7','fc8'] 33 | elif self.model==16: 34 | keys=pri_keys+['fc6','fc7','fc8','pool4_fc'] 35 | elif self.model==8: 36 | keys=pri_keys+['fc6','fc7','fc8','pool4_fc','pool3_fc'] 37 | for key in keys: 38 | with tf.variable_scope(key, reuse=True): 39 | dic[key] = dict() 40 | for subkey in ['weights','biases']: 41 | try: 42 | var = tf.get_variable(subkey) 43 | data=sess.run(var) 44 | dic[key][subkey]=data 45 | 46 | print("save model " + subkey + " to " + key) 47 | except ValueError: 48 | print("failed to convert") 49 | np.save(self.out_path, dic) 50 | except: 51 | raise Exception('Check your model') 52 | 53 | 54 | def convert_ckpt2npy(network, model_dir,out_path,model): 55 | config = tf.ConfigProto(allow_soft_placement=True) 56 | config.gpu_options.allocator_type = 'BFC' 57 | with tf.Session(config=config) as sess: 58 | ct = Convert(sess, network,model_dir,out_path,model) 59 | ct.conver2npy(sess) 60 | print('done converting') 61 | 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 67 | 68 | # modify here 69 | output_network_name = '32s' 70 | 71 | cfg_from_file('./fcn/fcn_nlpr.yml') 72 | imgdb = edict({'path': './data/train.tfrecords', 'name': 'FCN_' + output_network_name}) 73 | model_dir = get_output_dir(imgdb, None) 74 | network = get_network('VGGnet_'+output_network_name) 75 | out_path='./data/'+output_network_name 76 | convert_ckpt2npy(network,model_dir=model_dir,out_path=out_path,model=int(output_network_name[:-1])) 77 | -------------------------------------------------------------------------------- /lib/utils/data_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this file is modified from the keras implementation of multi-threaded data processing, 3 | see https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py 4 | ''' 5 | import time 6 | import numpy as np 7 | import threading 8 | import multiprocessing 9 | try: 10 | import queue 11 | except ImportError: 12 | import Queue as queue 13 | 14 | 15 | class GeneratorEnqueuer(): 16 | """Builds a queue out of a data generator. 17 | 18 | Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 19 | 20 | # Arguments 21 | generator: a generator function which endlessly yields data 22 | use_multiprocessing: use multiprocessing if True, otherwise threading 23 | wait_time: time to sleep in-between calls to `put()` 24 | random_seed: Initial seed for workers, 25 | will be incremented by one for each worker. 26 | """ 27 | 28 | def __init__(self, generator, 29 | use_multiprocessing=False, 30 | wait_time=0.05, 31 | random_seed=None): 32 | self.wait_time = wait_time 33 | self._generator = generator 34 | self._use_multiprocessing = use_multiprocessing 35 | self._threads = [] 36 | self._stop_event = None 37 | self.queue = None 38 | self.random_seed = random_seed 39 | 40 | def start(self, workers=1, max_queue_size=10): 41 | """Kicks off threads which add data from the generator into the queue. 
42 | 
43 |         # Arguments
44 |             workers: number of worker threads
45 |             max_queue_size: queue size
46 |                 (when full, threads could block on `put()`)
47 |         """
48 | 
49 |         def data_generator_task():
50 |             while not self._stop_event.is_set():
51 |                 try:
52 |                     if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
53 |                         generator_output = next(self._generator)
54 |                         self.queue.put(generator_output)
55 |                     else:
56 |                         time.sleep(self.wait_time)
57 |                 except Exception:
58 |                     self._stop_event.set()
59 |                     raise
60 | 
61 |         try:
62 |             if self._use_multiprocessing:
63 |                 self.queue = multiprocessing.Queue(maxsize=max_queue_size)
64 |                 self._stop_event = multiprocessing.Event()
65 |             else:
66 |                 self.queue = queue.Queue()
67 |                 self._stop_event = threading.Event()
68 | 
69 |             for _ in range(workers):
70 |                 if self._use_multiprocessing:
71 |                     # Reset random seed else all child processes
72 |                     # share the same seed
73 |                     np.random.seed(self.random_seed)
74 |                     thread = multiprocessing.Process(target=data_generator_task)
75 |                     thread.daemon = True
76 |                     if self.random_seed is not None:
77 |                         self.random_seed += 1
78 |                 else:
79 |                     thread = threading.Thread(target=data_generator_task)
80 |                 self._threads.append(thread)
81 |                 thread.start()
82 |         except:
83 |             self.stop()
84 |             raise
85 | 
86 |     def is_running(self):
87 |         return self._stop_event is not None and not self._stop_event.is_set()
88 | 
89 |     def stop(self, timeout=None):
90 |         """Stops running threads and waits for them to exit, if necessary.
91 | 
92 |         Should be called by the same thread which called `start()`.
93 | 
94 |         # Arguments
95 |             timeout: maximum time to wait on `thread.join()`.
96 |         """
97 |         if self.is_running():
98 |             self._stop_event.set()
99 | 
100 |         for thread in self._threads:
101 |             if thread.is_alive():
102 |                 if self._use_multiprocessing:
103 |                     thread.terminate()
104 |                 else:
105 |                     thread.join(timeout)
106 | 
107 |         if self._use_multiprocessing:
108 |             if self.queue is not None:
109 |                 self.queue.close()
110 | 
111 |         self._threads = []
112 |         self._stop_event = None
113 |         self.queue = None
114 | 
115 |     def get(self):
116 |         """Creates a generator to extract data from the queue.
117 | 
118 |         Skips the data if it is `None`.
119 | 
120 |         # Returns
121 |             A generator
122 |         """
123 |         while self.is_running():
124 |             if not self.queue.empty():
125 |                 inputs = self.queue.get()
126 |                 if inputs is not None:
127 |                     yield inputs
128 |             else:
129 |                 time.sleep(self.wait_time)
--------------------------------------------------------------------------------
/lib/utils/genImg.py:
--------------------------------------------------------------------------------
1 | import random, os
2 | from captcha.image import ImageCaptcha
3 | from multiprocessing import Pool
4 | 
5 | def randRGB():
6 |     # random color helper (currently unused; ImageCaptcha picks its own colors)
7 |     return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
8 | 
9 | # 10 digits + 26 lowercase + 26 uppercase
10 | char_set = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
11 | imgDir = None
12 | numProcess = 12
13 | 
14 | def gen_rand():
15 |     # random label of 4~6 characters drawn from char_set
16 |     buf = ""
17 |     max_len = random.randint(4, 6)
18 |     for i in range(max_len):
19 |         buf += random.choice(char_set)
20 |     return buf
21 | 
22 | def generateImg(ind):
23 |     global imgDir
24 |     captcha = ImageCaptcha(fonts=['./fonts/Ubuntu-M.ttf'])
25 |     theChars = gen_rand()
26 |     img_name = '{:08d}'.format(ind) + '_' + theChars + '.png'
27 |     img_path = imgDir + '/' + img_name
28 |     captcha.write(theChars, img_path)
29 |     print(img_path)
30 | 
31 | def run(num, path):
32 |     global imgDir
33 |     imgDir = path
34 |     if not os.path.exists(path):
35 |         os.makedirs(path)
36 |     with Pool(processes=numProcess) as pool:
37 |         pool.map(generateImg, range(num))
38 | 
39 | if __name__ == '__main__':
40 |     #run(64*2000, './data/train')
41 |     run(500, './data/val')
42 | 
--------------------------------------------------------------------------------
/lstm/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '..')
--------------------------------------------------------------------------------
/lstm/lstm.yml:
--------------------------------------------------------------------------------
1 | EXP_DIR: lstm_ctc
2 | LOG_DIR: lstm_ctc
3 | NET_NAME: LSTM
4 | GPU_ID: 0
5 | TRAIN:
6 |   SOLVER: Adam
7 |   DISPLAY: 100
8 |   SNAPSHOT_ITERS: 2000
9 |   LEARNING_RATE: 0.0001
10 |   MOMENTUM: 0.9
11 |   GAMMA: 1.0
12 |   STEPSIZE: 2000
13 |   WEIGHT_DECAY: 0.00001
14 | 
--------------------------------------------------------------------------------
/lstm/test_net.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import argparse
4 | import pprint
5 | 
6 | this_dir = os.path.dirname(__file__)
7 | sys.path.insert(0, this_dir + '/..')
8 | 
9 | from lib.lstm.test import test_net
10 | from lib.lstm.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_log_dir
11 | from lib.networks.factory import get_network
12 | from easydict import EasyDict as edict
13 | 
14 | def parse_args():
15 |     parser = argparse.ArgumentParser(description='Test a lstm network')
16 |     parser.add_argument('--gpu', dest='gpu_id',
17 |                         help='GPU device id to use [0]',
18 |                         default=0, type=int)
19 |     parser.add_argument('--network', dest='network_name',
20 |                         help='name of the network',
21 |                         default=None, type=str)
22 |     parser.add_argument('--cfg', dest='cfg_file',
23 |                         help='optional config file',
24 |                         default=None, type=str)
25 |     parser.add_argument('--restore', dest='restore',
26 |                         help='restore or not',
27 |                         default=1, type=int)
28 | 
29 |     if len(sys.argv) == 1:
30 |         parser.print_help()
31 | 
32 |     args = parser.parse_args()
33 |     return args
34 | 
35 | if __name__ == '__main__':
36 |     args = parse_args()
37 | 
38 |     print('Called with args:')
39 |     print(args)
40 | 
41 |     if args.cfg_file is not None:
42 |         cfg_from_file(args.cfg_file)
43 | 
44 |     os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.GPU_ID)
45 | 
46 |     print('Using config:')
47 |     pprint.pprint(cfg)
48 | 
49 |     output_network_name = args.network_name.split('_')[-1]
50 |     imgdb = edict({'path': './data/train.tfrecords', 'name': 'lstm_' + output_network_name,
51 |                    'val_path': './data/val.tfrecords'})
52 | 
53 |     output_dir = get_output_dir(imgdb, None)
54 |     log_dir = get_log_dir(imgdb)
55 |     print(('Output will be saved to `{:s}`'.format(output_dir)))
56 |     print(('Logs will be saved to `{:s}`'.format(log_dir)))
57 | 
58 |     device_name = '/gpu:{:d}'.format(args.gpu_id)
59 |     print(device_name)
60 | 
61 |     network = get_network(args.network_name)
62 |     print(('Use network `{:s}` in testing'.format(args.network_name)))
63 | 
64 |     test_net(network, imgdb,
65 |              testDir='./data/val/',  # 'data/demo'
66 |              output_dir=output_dir,
67 |              log_dir=log_dir,
68 |              restore=bool(int(args.restore)))
69 | 
--------------------------------------------------------------------------------
/lstm/train_net.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pprint
3 | import numpy as np
4 | import sys
5 | import os.path
6 | 
7 | this_dir = os.path.dirname(__file__)
8 | sys.path.insert(0, this_dir + '/..')
9 | 
10 | from lib.lstm.train import train_net
11 | from lib.lstm.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_log_dir
12 | from lib.networks.factory import get_network
13 | from easydict import EasyDict as edict
14 | 
15 | def parse_args():
16 |     parser = argparse.ArgumentParser(description='Train a lstm network')
17 |     parser.add_argument('--gpu', dest='gpu_id',
18 |                         help='GPU device id to use [0]',
19 |                         default=0, type=int)
20 |     parser.add_argument('--iters', dest='max_iters',
21 |                         help='number of iterations to train',
22 |                         default=1000000, type=int)
23 |     parser.add_argument('--cfg', dest='cfg_file',
24 |                         help='optional config file',
25 |                         default=None, type=str)
26 |     parser.add_argument('--pre_train', dest='pre_train',
27 |                         help='pre trained model',
28 |                         default=None, type=str)
29 |     parser.add_argument('--rand', dest='randomize',
30 |                         help='randomize (do not use a fixed seed)',
31 |                         action='store_true')
32 |     parser.add_argument('--network', dest='network_name',
33 |                         help='name of the network',
34 |                         default=None, type=str)
35 |     parser.add_argument('--set', dest='set_cfgs',
36 |                         help='set config keys', default=None,
37 |                         nargs=argparse.REMAINDER)
38 |     parser.add_argument('--restore', dest='restore',
39 |                         help='restore or not',
40 |                         default=0, type=int)
41 | 
42 |     if len(sys.argv) == 1:
43 |         parser.print_help()
44 | 
45 |     args = parser.parse_args()
46 |     return args
47 | 
48 | # os.environ["CUDA_VISIBLE_DEVICES"] = '0'
49 | if __name__ == '__main__':
50 |     args = parse_args()
51 |     print('Called with args:')
52 |     print(args)
53 |     if args.cfg_file is not None:
54 |         cfg_from_file(args.cfg_file)
55 |     if args.set_cfgs is not None:
56 |         cfg_from_list(args.set_cfgs)
57 |     os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.GPU_ID)
58 | 
59 |     print('Using config:')
60 |     pprint.pprint(cfg)
61 | 
62 |     if not args.randomize:
63 |         # fix the numpy random seed for reproducibility
64 |         np.random.seed(cfg.RNG_SEED)
65 | 
66 |     # imgdb = edict({'path': 'data/lstm_voc/pascal_augmented_train.tfrecords', 'name': 'pascal_augmented'})
67 |     output_network_name = args.network_name.split('_')[-1]
68 |     imgdb = edict({'path': './data/train_4_6.tfrecords', 'name': 'lstm_' + output_network_name,
69 |                    'val_path': './data/val.tfrecords'})
70 | 
71 |     output_dir = get_output_dir(imgdb, None)
72 |     log_dir = get_log_dir(imgdb)
73 |     print(('Output will be saved to `{:s}`'.format(output_dir)))
74 |     print(('Logs will be saved to `{:s}`'.format(log_dir)))
75 | 
76 |     device_name = '/gpu:{:d}'.format(args.gpu_id)
77 |     print(device_name)
78 | 
79 |     network = get_network(args.network_name)
80 |     print(('Use network `{:s}` in training'.format(args.network_name)))
81 | 
82 |     train_net(network, imgdb,
83 |               pre_train=args.pre_train,
84 |               output_dir=output_dir,
85 |               log_dir=log_dir,
86 |               max_iters=args.max_iters,
87 |               restore=bool(int(args.restore)))
88 | 
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python ./lstm/test_net.py --network=LSTM_test --cfg=./lstm/lstm.yml --restore=1
3 | 
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python ./lstm/train_net.py --network=LSTM_train --cfg=./lstm/lstm.yml --restore=0
3 | 
--------------------------------------------------------------------------------
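As a quick sanity check of the config plumbing above, a minimal sketch run from the repo root — it assumes `cfg_from_file` merges the YAML values over the defaults in `lib/lstm/config.py` (the usual Faster R-CNN-style config behaviour), which is how `train_net.py` and `test_net.py` use it:

```python
from lib.lstm.config import cfg, cfg_from_file

cfg_from_file('./lstm/lstm.yml')
print(cfg.TRAIN.LEARNING_RATE)  # expected: 0.0001 from lstm.yml, overriding the config.py default
print(cfg.TRAIN.SOLVER)         # expected: Adam
```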
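`GeneratorEnqueuer` in `lib/utils/data_util.py` is driven in three steps: wrap an endless generator, `start()` the workers, then iterate over `get()`. A minimal sketch with a toy generator — `dummy_batches` and the worker/queue sizes here are illustrative, not values taken from this repo:

```python
import numpy as np
from lib.utils.data_util import GeneratorEnqueuer

def dummy_batches(batch_size=64, height=32, width=100):
    # endlessly yield (images, labels) pairs, as the enqueuer expects
    while True:
        images = np.random.rand(batch_size, height, width, 1).astype(np.float32)
        labels = np.zeros(batch_size, dtype=np.int32)
        yield images, labels

enqueuer = GeneratorEnqueuer(dummy_batches(), use_multiprocessing=False)
enqueuer.start(workers=2, max_queue_size=10)
try:
    batches = enqueuer.get()  # generator that pulls finished batches off the queue
    for _ in range(5):
        images, labels = next(batches)
        print(images.shape, labels.shape)
finally:
    enqueuer.stop()
```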
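Likewise, `build_loss` in `lib/networks/network.py` hands `warpctc_tensorflow.ctc` time-major activations and a flattened label vector. A shape sketch using the same keyword arguments as `build_loss` — all the concrete sizes and label values below are illustrative placeholders:

```python
import tensorflow as tf
import warpctc_tensorflow

max_time, batch_size, num_classes = 25, 2, 64  # illustrative sizes
# time-major logits: [max_time, batch_size, num_classes]
activations = tf.placeholder(tf.float32, [max_time, batch_size, num_classes])
# labels of both samples concatenated into one flat vector (lengths 2 and 3)
flat_labels = tf.constant([3, 7, 2, 5, 9], dtype=tf.int32)
label_lengths = tf.constant([2, 3], dtype=tf.int32)    # label length per sample
input_lengths = tf.constant([25, 25], dtype=tf.int32)  # valid time steps per sample

ctc = warpctc_tensorflow.ctc(activations=activations, flat_labels=flat_labels,
                             label_lengths=label_lengths, input_lengths=input_lengths)
loss = tf.reduce_mean(ctc)  # mirrors build_loss before the regularization term is added
```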