├── .gitignore ├── README.md ├── fonts └── Ubuntu-M.ttf ├── lib ├── __init__.py ├── lstm │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── utils │ │ ├── __init__.py │ │ ├── gen.py │ │ ├── tf_records.py │ │ ├── timer.py │ │ └── training.py ├── networks │ ├── LSTM_test.py │ ├── LSTM_train.py │ ├── __init__.py │ ├── factory.py │ └── network.py └── utils │ ├── convert_ckpt2npy.py │ ├── data_util.py │ └── genImg.py ├── lstm ├── __init__.py ├── lstm.yml ├── test_net.py └── train_net.py ├── test.sh └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | log/ 2 | val*/ 3 | test*/ 4 | train*/ 5 | logs/ 6 | output/ 7 | data/ 8 | tmp/ 9 | __pycache__/ 10 | checkpoint* 11 | *.json 12 | *.swp 13 | *.swo 14 | .gdb* 15 | .idea/ 16 | *~ 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | - [old master](https://github.com/ilovin/lstm_ctc_ocr/tree/backup): 2 | - harder to converge compared to the beta version 3 | - supports both standard CTC and warpCTC 4 | - reads all data at once 5 | - [dev](https://github.com/ilovin/lstm_ctc_ocr/tree/dev): 6 | - the pipeline version of lstm_ctc_ocr; resizes images to the same size 7 | - uses tf.records 8 | - [beta](https://github.com/ilovin/lstm_ctc_ocr/tree/beta) (current): 9 | - generates data on the fly 10 | - handles variable-width images, padding each batch to the same width 11 | 12 | ## How to use 13 | 1. ./train.sh 14 | 15 | 16 | ### Dependency 17 | - python 3 18 | - tensorflow 1.0.1 19 | - [captcha](https://pypi.python.org/pypi/captcha) 20 | - [warpCTC tensorflow_binding](https://github.com/baidu-research/warp-ctc/tree/master/tensorflow_binding) 21 | 22 | ### Some details 23 | 24 | The training data: 25 | ![data](https://ooo.0o0.ooo/2017/04/13/58ef08ab6af03.png) 26 | 27 | Notice that 28 | parameters can be found in `./lstm.yml` (higher priority) and `lib/lstm/config.py`; 29 | some parameters need to be fine-tuned: 30 | - learning rate 31 | - decay step & decay rate 32 | - image_height 33 | - optimizer (Adam / Momentum / RMS) 34 | 35 | In `./lib/lstm/utils/gen.py`, the height of the images is the same, and the width is padded 36 | to a common value within each batch, so 37 | if you want to use your own data, the image height must be the same.
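For reference, here is a minimal sketch of that per-batch padding idea (a hedged illustration rather than the project's actual code: `pad_batch` is a hypothetical helper, and the constants mirror the `cfg` defaults in `lib/lstm/config.py`; the real generator lives in `lib/lstm/utils/gen.py` and differs in detail):

```python
import math
import numpy as np

POOL_SCALE, IMG_HEIGHT = 4, 32  # mirrors cfg.POOL_SCALE / cfg.IMG_HEIGHT

def pad_batch(images):
    """images: list of (IMG_HEIGHT, w) float32 arrays with varying widths w."""
    # Round the widest image up to a multiple of POOL_SCALE so the conv
    # stack downsamples the width to a whole number of LSTM time steps.
    max_w = int(math.ceil(max(im.shape[1] for im in images) / POOL_SCALE)) * POOL_SCALE
    batch = np.zeros((len(images), max_w, IMG_HEIGHT), dtype=np.float32)
    for i, im in enumerate(images):
        batch[i, :im.shape[1], :] = im.T  # transpose: width becomes the time axis
    time_steps = np.full(len(images), max_w // POOL_SCALE, dtype=np.int32)
    return batch, time_steps
```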
38 | 39 | ### Result 40 | The accuracy can be above 95% 41 | ![acc](https://i.loli.net/2017/08/28/59a2ee75a2a0a.png) 42 | 43 | Read [this blog](https://ilovin.github.io/2017-04-06/tensorflow-lstm-ctc-ocr/) for more details and [this blog](http://ilovin.github.io/2017-04-23/tensorflow-lstm-ctc-input-output/) for how to 44 | use `tf.nn.ctc_loss` or `warpCTC` 45 | -------------------------------------------------------------------------------- /fonts/Ubuntu-M.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ilovin/lstm_ctc_ocr/6c753df22e7c1bab40ce2170e9a11e7b3868cf80/fonts/Ubuntu-M.ttf -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- 1 | #import fast_rcnn 2 | -------------------------------------------------------------------------------- /lib/lstm/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from . import config 9 | from . import train 10 | -------------------------------------------------------------------------------- /lib/lstm/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import numpy as np 4 | from time import strftime, localtime 5 | from easydict import EasyDict as edict 6 | 7 | __C = edict() 8 | # Consumers can get config by: 9 | # from lib.lstm.config import cfg 10 | cfg = __C 11 | 12 | # Default GPU device id 13 | __C.GPU_ID = 1 14 | __C.GPU_USAGE = 0.9 15 | __C.OFFSET_TIME_STEP = -1 16 | # overall width (time-axis) downsampling factor of the conv stack 17 | __C.POOL_SCALE = 4 18 | __C.IMG_SHAPE = [32,100] 19 | __C.IMG_HEIGHT = 32 20 | __C.MAX_CHAR_LEN = 6 21 | __C.BLANK_TOKEN=0 22 | __C.CHARSET = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' 23 | __C.NCLASSES = len(__C.CHARSET)+2 24 | __C.MIN_LEN = 4 25 | __C.MAX_LEN = 6 26 | __C.FONT = 'fonts/Ubuntu-M.ttf' 27 | __C.NCHANNELS = 1 28 | __C.NUM_FEATURES= __C.IMG_HEIGHT *__C.NCHANNELS 29 | #__C.TIME_STEP = __C.IMG_SHAPE[0]//__C.POOL_SCALE 30 | 31 | __C.NET_NAME = 'lstm' 32 | __C.TRAIN = edict() 33 | # Adam, Momentum, RMS 34 | __C.TRAIN.SOLVER = 'Adam' 35 | #__C.TRAIN.SOLVER = 'Momentum' 36 | # __C.TRAIN.SOLVER = 'RMS' 37 | # learning rate 38 | __C.TRAIN.TXT = 'annotation_train.txt' 39 | __C.TRAIN.WEIGHT_DECAY = 0.0005 40 | __C.TRAIN.LEARNING_RATE = 0.01 41 | __C.TRAIN.MOMENTUM = 0.9 42 | __C.TRAIN.GAMMA = 0.1 43 | __C.TRAIN.STEPSIZE = 50000 44 | __C.TRAIN.DISPLAY = 10 45 | __C.TRAIN.LOG_IMAGE_ITERS = 100 46 | __C.TRAIN.NUM_EPOCHS = 2000 47 | 48 | __C.TRAIN.NUM_HID = 512 49 | __C.TRAIN.NUM_LAYERS = 2 50 | __C.TRAIN.BATCH_SIZE = 64 51 | 52 | # Iterations between snapshots 53 | __C.TRAIN.SNAPSHOT_ITERS = 5000 54 | __C.TRAIN.SNAPSHOT_PREFIX = 'lstm' 55 | __C.TRAIN.SNAPSHOT_INFIX = '' 56 | 57 | __C.VAL = edict() 58 | __C.VAL.TXT = 'annotation_val.txt' 59 | __C.VAL.VAL_STEP = 1000 60 | __C.VAL.NUM_EPOCHS = 1000 61 | __C.VAL.BATCH_SIZE = 128 62 | __C.VAL.PRINT_NUM = 5 63 | 64 | __C.RNG_SEED = 3 65 | 66 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 67 | __C.TEST = edict() 68 | __C.EXP_DIR = 'default' 69 | 
__C.LOG_DIR = 'default' 70 | 71 | __C.SPACE_INDEX = 0 72 | __C.SPACE_TOKEN = '' 73 | def get_encode_decode_dict(): 74 | encode_maps = {} 75 | decode_maps = {} 76 | for i, char in enumerate(__C.CHARSET, 1): 77 | encode_maps[char] = i 78 | decode_maps[i] = char 79 | encode_maps[__C.SPACE_TOKEN] = __C.SPACE_INDEX 80 | decode_maps[__C.SPACE_INDEX] = __C.SPACE_TOKEN 81 | return encode_maps,decode_maps 82 | 83 | 84 | def get_output_dir(imdb, weights_filename): 85 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR)) 86 | if weights_filename is not None: 87 | outdir = osp.join(outdir, weights_filename) 88 | if not os.path.exists(outdir): 89 | os.makedirs(outdir) 90 | return outdir 91 | 92 | def get_log_dir(imdb): 93 | log_dir = osp.abspath(\ 94 | osp.join(__C.ROOT_DIR, 'logs', __C.LOG_DIR, imdb.name, strftime("%Y-%m-%d-%H-%M-%S", localtime()))) 95 | if not os.path.exists(log_dir): 96 | os.makedirs(log_dir) 97 | return log_dir 98 | 99 | def _merge_a_into_b(a, b): 100 | if type(a) is not edict: 101 | return 102 | 103 | for k, v in a.items(): 104 | # a must specify keys that are in b 105 | if k not in b: 106 | raise KeyError('{} is not a valid config key'.format(k)) 107 | 108 | # the types must match, too 109 | old_type = type(b[k]) 110 | if old_type is not type(v): 111 | if isinstance(b[k], np.ndarray): 112 | v = np.array(v, dtype=b[k].dtype) 113 | else: 114 | raise ValueError(('Type mismatch ({} vs. {}) ' 115 | 'for config key: {}').format(type(b[k]), 116 | type(v), k)) 117 | 118 | # recursively merge dicts 119 | if type(v) is edict: 120 | try: 121 | _merge_a_into_b(a[k], b[k]) 122 | except: 123 | print(('Error under config key: {}'.format(k))) 124 | raise 125 | else: 126 | b[k] = v 127 | 128 | def cfg_from_file(filename): 129 | """Load a config file and merge it into the default options.""" 130 | import yaml 131 | with open(filename, 'r') as f: 132 | yaml_cfg = edict(yaml.load(f)) 133 | 134 | _merge_a_into_b(yaml_cfg, __C) 135 | 136 | def cfg_from_list(cfg_list): 137 | """Set config keys via list (e.g., from command line).""" 138 | from ast import literal_eval 139 | assert len(cfg_list) % 2 == 0 140 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 141 | key_list = k.split('.') 142 | d = __C 143 | for subkey in key_list[:-1]: 144 | assert subkey in d 145 | d = d[subkey] 146 | subkey = key_list[-1] 147 | assert subkey in d 148 | try: 149 | value = literal_eval(v) 150 | except: 151 | # handle the case when v is a string literal 152 | value = v 153 | assert type(value) == type(d[subkey]), \ 154 | 'type {} does not match original type {}'.format( 155 | type(value), type(d[subkey])) 156 | d[subkey] = value 157 | -------------------------------------------------------------------------------- /lib/lstm/test.py: -------------------------------------------------------------------------------- 1 | import sys,math 2 | import os,shutil 3 | import collections 4 | import numpy as np 5 | import os 6 | import tensorflow as tf 7 | import cv2 8 | from lib.lstm.utils.timer import Timer 9 | from ..lstm.config import cfg,get_encode_decode_dict 10 | 11 | class SolverWrapper(object): 12 | def __init__(self, sess, network, imgdb, output_dir, logdir, pretrained_model=None): 13 | self.net = network 14 | self.imgdb = imgdb 15 | self.output_dir = output_dir 16 | self.pretrained_model = pretrained_model 17 | print('done') 18 | 19 | # For checkpoint 20 | self.saver = tf.train.Saver(max_to_keep=100) 21 | self.writer = tf.summary.FileWriter(logdir=logdir, 22 | graph=tf.get_default_graph(), 23 | flush_secs=5) 
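# Note on the decode path used below: the network's 'logits' output is
# time-major (time_steps, batch, NCLASSES), so it can be passed straight to
# tf.nn.ctc_beam_search_decoder together with the per-image sequence
# lengths; the sparse result is densified with default_value=0, which is
# why index 0 is skipped when mapping ids back to characters.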
24 | 25 | 26 | 27 | def test_model(self,sess,testDir=None,restore = True): 28 | logits = self.net.get_output('logits') 29 | time_step_batch = self.net.get_output('time_step_len') 30 | decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, time_step_batch, merge_repeated=True) 31 | dense_decoded = tf.cast(tf.sparse_tensor_to_dense(decoded[0], default_value=0), tf.int32) 32 | 33 | img_size = cfg.IMG_SHAPE 34 | global_step = tf.Variable(0, trainable=False) 35 | # intialize variables 36 | local_vars_init_op = tf.local_variables_initializer() 37 | global_vars_init_op = tf.global_variables_initializer() 38 | 39 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 40 | sess.run(combined_op) 41 | # resuming a trainer 42 | if restore: 43 | try: 44 | ckpt = tf.train.get_checkpoint_state(self.output_dir) 45 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 46 | self.saver.restore(sess, tf.train.latest_checkpoint(self.output_dir)) 47 | stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0] 48 | restore_iter = int(stem.split('_')[-1]) 49 | sess.run(global_step.assign(restore_iter)) 50 | print('done') 51 | except: 52 | raise Exception('Check your pretrained {:s}'.format(ckpt.model_checkpoint_path)) 53 | 54 | timer = Timer() 55 | 56 | total = correct = 0 57 | for file in os.listdir(testDir): 58 | timer.tic() 59 | total+=1 60 | 61 | if cfg.NCHANNELS == 1: img = cv2.imread(os.path.join(testDir,file),0) 62 | else : img = cv2.imread(os.path.join(testDir,file),1) 63 | print(file,end=' ') 64 | #img = cv2.resize(img,tuple(img_size)) 65 | w = img.shape[1] 66 | width = math.ceil(img.shape[1] / cfg.POOL_SCALE) * cfg.POOL_SCALE 67 | img = cv2.copyMakeBorder(img, 0, 0, 0, width - w, cv2.BORDER_CONSTANT, value=0).astype(np.float32) / 255. 
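# The padding above rounds the width up to a multiple of cfg.POOL_SCALE so
# the conv stack downsamples it to a whole number of time steps; the
# transpose/reshape below then makes width the time axis, with each column
# of IMG_HEIGHT*NCHANNELS pixels becoming one NUM_FEATURES-long frame.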
68 | 69 | img = img.swapaxes(0,1) 70 | img = np.reshape(img, [1,width,cfg.NUM_FEATURES]) 71 | #img = np.expand_dims(img,axis=0) 72 | feed_dict = { 73 | self.net.data: img, 74 | self.net.time_step_len: [img.shape[1]//cfg.POOL_SCALE], 75 | self.net.keep_prob: 1.0 76 | } 77 | res = sess.run(fetches=dense_decoded[0], feed_dict=feed_dict) 78 | def decodeRes(nums,ignore= 0): 79 | encode_maps,decode_maps = get_encode_decode_dict() 80 | res = [decode_maps[i] for i in nums if i!=ignore] 81 | return res 82 | org = file.split('.')[0].split('_')[1] 83 | res = ''.join(decodeRes(res)) 84 | if org==res:correct+=1 85 | _diff_time = timer.toc(average=False) 86 | print('cost time: {:.3f},\n res: {}'.format(_diff_time,res)) 87 | #visualize_segmentation_adaptive(np.array(output),cls_dict) 88 | print('total acc:{}/{}={:.4f}'.format(correct,total,correct/total)) 89 | 90 | 91 | def test_net(network, imgdb, testDir, output_dir, log_dir, pretrained_model=None,restore=True): 92 | 93 | config = tf.ConfigProto(allow_soft_placement=True) 94 | config.gpu_options.allocator_type = 'BFC' 95 | #config.gpu_options.per_process_gpu_memory_fraction = 0.4 96 | with tf.Session(config=config) as sess: 97 | sw = SolverWrapper(sess, network, imgdb, output_dir, logdir= log_dir, pretrained_model=pretrained_model) 98 | print('Solving...') 99 | sw.test_model(sess, testDir=testDir, restore=restore) 100 | print('done solving') 101 | 102 | -------------------------------------------------------------------------------- /lib/lstm/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os,re 3 | import tensorflow as tf 4 | from ..lstm.config import cfg 5 | from lib.lstm.utils.timer import Timer 6 | from lib.lstm.utils.training import accuracy_calculation 7 | from lib.lstm.utils.tf_records import read_tfrecord_and_decode_into_image_annotation_pair_tensors 8 | from lib.lstm.utils.gen import get_batch 9 | 10 | class SolverWrapper(object): 11 | def __init__(self, sess, network, imgdb, pre_train,output_dir, logdir): 12 | """Initialize the SolverWrapper.""" 13 | self.net = network 14 | self.imgdb = imgdb 15 | self.pre_train=pre_train 16 | self.output_dir = output_dir 17 | print('done') 18 | self.saver = tf.train.Saver(max_to_keep=100) 19 | self.writer = tf.summary.FileWriter(logdir=logdir, 20 | graph=tf.get_default_graph(), 21 | flush_secs=5) 22 | 23 | def snapshot(self, sess, iter): 24 | net = self.net 25 | if not os.path.exists(self.output_dir): 26 | os.makedirs(self.output_dir) 27 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 28 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 29 | 30 | filename = (cfg.TRAIN.SNAPSHOT_PREFIX + '_ctc' + infix + 31 | '_iter_{:d}'.format(iter + 1) + '.ckpt') 32 | 33 | #filename = (cfg.TRAIN.SNAPSHOT_PREFIX + infix + 34 | # '_iter_{:d}'.format(iter+1) + '.ckpt') 35 | filename = os.path.join(self.output_dir, filename) 36 | self.saver.save(sess, filename) 37 | print('Wrote snapshot to: {:s}'.format(filename)) 38 | 39 | def get_data(self,path,batch_size,num_epochs): 40 | filename_queue = tf.train.string_input_producer([path], num_epochs=num_epochs) 41 | image,label,label_len,time_step= read_tfrecord_and_decode_into_image_annotation_pair_tensors(filename_queue) 42 | image_batch, label_batch, label_len_batch,time_step_batch = tf.train.shuffle_batch([image,label,label_len,time_step], 43 | batch_size=batch_size, 44 | capacity=9600, 45 | num_threads=4, 46 | min_after_dequeue=6400) 47 | return image_batch, label_batch, label_len_batch,time_step_batch 48 | 49 | def 
restoreLabel(self,label_vec,label_len): 50 | labels = [] 51 | for l_len in label_len: 52 | labels.append(label_vec[:l_len]) 53 | label_vec = label_vec[l_len:] 54 | return labels 55 | 56 | def mergeLabel(self,labels,ignore = 0): 57 | label_lst = [] 58 | for l in labels: 59 | while l[-1] == ignore: l = l[:-1] 60 | label_lst.extend(l) 61 | return np.array(label_lst) 62 | 63 | def train_model(self, sess, max_iters, restore=False): 64 | train_gen = get_batch(num_workers=12,batch_size=cfg.TRAIN.BATCH_SIZE,vis=False) 65 | val_gen = get_batch(num_workers=1,batch_size=cfg.VAL.BATCH_SIZE,vis=False) 66 | 67 | loss, dense_decoded = self.net.build_loss() 68 | 69 | tf.summary.scalar('loss', loss) 70 | summary_op = tf.summary.merge_all() 71 | 72 | # optimizer 73 | lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False) 74 | if cfg.TRAIN.SOLVER == 'Adam': opt = tf.train.AdamOptimizer(lr) 75 | elif cfg.TRAIN.SOLVER == 'RMS': opt = tf.train.RMSPropOptimizer(lr) 76 | else: opt = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM) 77 | 78 | global_step = tf.Variable(0, trainable=False) 79 | with_clip = True 80 | if with_clip: 81 | tvars = tf.trainable_variables() 82 | grads, norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), 10.0) 83 | train_op = opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step) 84 | else: 85 | train_op = opt.minimize(loss, global_step=global_step) 86 | 87 | # intialize variables 88 | local_vars_init_op = tf.local_variables_initializer() 89 | global_vars_init_op = tf.global_variables_initializer() 90 | 91 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 92 | sess.run(combined_op) 93 | restore_iter = 1 94 | 95 | # resuming a trainer 96 | if restore: 97 | try: 98 | ckpt = tf.train.get_checkpoint_state(self.output_dir) 99 | print('Restoring from {}...'.format(ckpt.model_checkpoint_path), end=' ') 100 | self.saver.restore(sess, tf.train.latest_checkpoint(self.output_dir)) 101 | stem = os.path.splitext(os.path.basename(ckpt.model_checkpoint_path))[0] 102 | restore_iter = int(stem.split('_')[-1]) 103 | sess.run(global_step.assign(restore_iter)) 104 | print('done') 105 | except: 106 | raise Exception('Check your pretrained {:s}'.format(ckpt.model_checkpoint_path)) 107 | 108 | timer = Timer() 109 | loss_min = 0.015 110 | first_val = True 111 | for iter in range(restore_iter, max_iters): 112 | timer.tic() 113 | # learning rate 114 | if iter != 0 and iter % cfg.TRAIN.STEPSIZE == 0: 115 | sess.run(tf.assign(lr, lr.eval() * cfg.TRAIN.GAMMA)) 116 | 117 | # get one batch 118 | img_Batch,label_Batch, label_len_Batch,time_step_Batch = next(train_gen) 119 | img_Batch = np.array(img_Batch) 120 | # Subtract the mean pixel value from each pixel 121 | feed_dict = { 122 | self.net.data: np.array(img_Batch), 123 | self.net.labels: np.array(label_Batch), 124 | self.net.time_step_len: np.array(time_step_Batch), 125 | self.net.labels_len: np.array(label_len_Batch), 126 | self.net.keep_prob: 0.5 127 | } 128 | 129 | fetch_list = [loss,summary_op,train_op] 130 | ctc_loss,summary_str, _ = sess.run(fetches=fetch_list, feed_dict=feed_dict) 131 | 132 | self.writer.add_summary(summary=summary_str, global_step=global_step.eval()) 133 | _diff_time = timer.toc(average=False) 134 | 135 | if (iter) % (cfg.TRAIN.DISPLAY) == 0: 136 | print('iter: %d / %d, total loss: %.7f, lr: %.7f'%\ 137 | (iter, max_iters, ctc_loss ,lr.eval()),end=' ') 138 | print('speed: {:.3f}s / iter'.format(_diff_time)) 139 | if (iter+1) % cfg.TRAIN.SNAPSHOT_ITERS == 0 or ctc_loss num_fg: 170 | disable_inds = 
np.random.choice(fg_inds, size=(len_fg - num_fg), replace=False) 171 | x[disable_inds] = 255 172 | len_fg= 500 173 | 174 | num_bg = 1000-len_fg 175 | bg_inds = np.where(x == 0)[0] 176 | len_bg = len(bg_inds) 177 | if len_bg > num_bg: 178 | disable_inds = np.random.choice(bg_inds, size=(len_bg - num_bg), replace=False) 179 | x[disable_inds] = 255 180 | x=x.reshape(input_shape) 181 | return x 182 | 183 | def get_valid_logits_and_labels(annotation_batch_tensor, 184 | logits_batch_tensor, 185 | class_labels): 186 | """Returns two tensors of size (num_valid_entries, num_classes). 187 | The function converts annotation batch tensor input of the size 188 | (batch_size, height, width) into label tensor (batch_size, height, 189 | width, num_classes) and then selects only valid entries, resulting 190 | in tensor of the size (num_valid_entries, num_classes). The function 191 | also returns the tensor with corresponding valid entries in the logits 192 | tensor. Overall, two tensors of the same sizes are returned and later on 193 | can be used as an input into tf.nn.softmax_cross_entropy_with_logits() to 194 | get the cross entropy error for each entry. 195 | 196 | Parameters 197 | ---------- 198 | annotation_batch_tensor : Tensor of size (batch_size, height, width) 199 | Tensor with class labels for each batch 200 | logits_batch_tensor : Tensor of size (batch_size, height, width, num_classes) 201 | Tensor with logits. Usually can be achieved after inference of the FCN network. 202 | class_labels : list of ints 203 | List that contains the numbers that represent classes. Last 204 | value in the list should represent the number that was used 205 | for masking out. 206 | 207 | Returns 208 | ------- 209 | (valid_labels_batch_tensor, valid_logits_batch_tensor) : Two Tensors of size (num_valid_entries, num_classes). 210 | Tensors that represent valid labels and logits. 
211 | """ 212 | 213 | annotation_batch_tensor = tf.py_func(sample, [annotation_batch_tensor], tf.int32) 214 | labels_batch_tensor = get_labels_from_annotation_batch(annotation_batch_tensor=annotation_batch_tensor, 215 | class_labels=class_labels) 216 | 217 | valid_batch_indices = get_valid_entries_indices_from_annotation_batch(annotation_batch_tensor=annotation_batch_tensor, 218 | class_labels=class_labels) 219 | 220 | valid_labels_batch_tensor = tf.gather_nd(params=labels_batch_tensor, indices=valid_batch_indices) 221 | 222 | valid_logits_batch_tensor = tf.gather_nd(params=logits_batch_tensor, indices=valid_batch_indices) 223 | 224 | return valid_labels_batch_tensor, valid_logits_batch_tensor -------------------------------------------------------------------------------- /lib/networks/LSTM_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .network import Network 3 | from ..lstm.config import cfg 4 | 5 | 6 | class LSTM_test(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | 10 | self.data = tf.placeholder(tf.float32, shape=[None, None, cfg.NUM_FEATURES], name='data') 11 | self.time_step_len = tf.placeholder(tf.int32,[None], name='time_step_len') 12 | 13 | self.keep_prob = tf.placeholder(tf.float32) 14 | self.layers = dict({'data': self.data, 'time_step_len':self.time_step_len}) 15 | self.trainable = trainable 16 | self.setup() 17 | 18 | def setup(self): 19 | (self.feed('data') 20 | .conv_single(3, 3, 64 ,1, 1, name='conv1',c_i=cfg.NCHANNELS) 21 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool1') 22 | .conv_single(3, 3, 128 ,1, 1, name='conv2') 23 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 24 | .conv_single(3, 3, 256 ,1, 1, name='conv3_1') 25 | .conv_single(3, 3, 256 ,1, 1, name='conv3_2') 26 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool3') 27 | .conv_single(3, 3, 512 ,1, 1, name='conv4_1', bn=True) 28 | .conv_single(3, 3, 512 ,1, 1, name='conv4_2', bn=True) 29 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool4') 30 | .conv_single(2, 2, 512 ,1, 1, padding = 'VALID', name='conv5', relu=False) 31 | #.dropout(keep_prob = self.keep_prob, name = 'dropout_layer') 32 | .reshape_squeeze_layer(d = 512 , name='reshaped_layer')) 33 | (self.feed('reshaped_layer','time_step_len') 34 | .bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits')) 35 | 36 | -------------------------------------------------------------------------------- /lib/networks/LSTM_train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .network import Network 3 | from ..lstm.config import cfg 4 | 5 | 6 | class LSTM_train(Network): 7 | def __init__(self, trainable=True): 8 | self.inputs = [] 9 | 10 | self.data = tf.placeholder(tf.float32, shape=[None, None, cfg.NUM_FEATURES ], name='data') #N*t_s*features*channels 11 | self.labels = tf.placeholder(tf.int32,[None],name='labels') 12 | self.time_step_len = tf.placeholder(tf.int32,[None], name='time_step_len') 13 | self.labels_len = tf.placeholder(tf.int32,[None],name='labels_len') 14 | 15 | self.keep_prob = tf.placeholder(tf.float32) 16 | self.layers = dict({'data': self.data,'labels':self.labels, 17 | 'time_step_len':self.time_step_len, 18 | 'labels_len':self.labels_len}) 19 | self.trainable = trainable 20 | self.setup() 21 | 22 | def setup(self): 23 | (self.feed('data') 24 | .conv_single(3, 3, 64 ,1, 1, name='conv1',c_i=cfg.NCHANNELS) 25 | .max_pool(2, 2, 2, 2, padding='VALID', 
name='pool1') 26 | .conv_single(3, 3, 128 ,1, 1, name='conv2') 27 | .max_pool(2, 2, 2, 2, padding='VALID', name='pool2') 28 | .conv_single(3, 3, 256 ,1, 1, name='conv3_1') 29 | .conv_single(3, 3, 256 ,1, 1, name='conv3_2') 30 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool3') 31 | .conv_single(3, 3, 512 ,1, 1, name='conv4_1', bn=True) 32 | .conv_single(3, 3, 512 ,1, 1, name='conv4_2', bn=True) 33 | .max_pool(1, 2, 1, 2, padding='VALID', name='pool4') 34 | .conv_single(2, 2, 512 ,1, 1, padding = 'VALID', name='conv5', relu=False) 35 | #.dropout(keep_prob = self.keep_prob, name = 'dropout_layer') 36 | .reshape_squeeze_layer(d = 512 , name='reshaped_layer')) 37 | (self.feed('reshaped_layer','time_step_len') 38 | .bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits')) 39 | # .lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits',img_shape=[-1,cfg.IMG_SHAPE[0]//cfg.POOL_SCALE,cfg.NUM_FEATURES//cfg.POOL_SCALE])) 40 | #.bi_lstm(cfg.TRAIN.NUM_HID,cfg.TRAIN.NUM_LAYERS,name='logits',img_shape=[-1,cfg.IMG_SHAPE[0]//cfg.POOL_SCALE,cfg.NUM_FEATURES//cfg.POOL_SCALE])) 41 | -------------------------------------------------------------------------------- /lib/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from . import factory 2 | -------------------------------------------------------------------------------- /lib/networks/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # SubCNN_TF 3 | # Copyright (c) 2016 CVGL Stanford 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yu Xiang 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting networks by name.""" 9 | 10 | __sets = {} 11 | from .LSTM_train import LSTM_train 12 | from .LSTM_test import LSTM_test 13 | def get_network(name): 14 | """Get a network by name.""" 15 | if name.split('_')[0] == 'LSTM': 16 | if name.split('_')[1] == 'train': 17 | return LSTM_train() 18 | elif name.split('_')[1] == 'test': 19 | return LSTM_test() 20 | else: 21 | raise KeyError('Unknown network: {}'.format(name)) 22 | 23 | def list_networks(): 24 | """List all registered networks.""" 25 | return list(__sets.keys()) 26 | 27 | -------------------------------------------------------------------------------- /lib/networks/network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from lib.lstm.config import cfg 5 | from lib.lstm.utils.training import * 6 | import warpctc_tensorflow 7 | 8 | DEFAULT_PADDING = 'SAME' 9 | 10 | def include_original(dec): 11 | """ Meta decorator, which makes the original function callable (via f._original() )""" 12 | def meta_decorator(f): 13 | decorated = dec(f) 14 | decorated._original = f 15 | return decorated 16 | return meta_decorator 17 | 18 | #@include_original 19 | def layer(op): 20 | def layer_decorated(self, *args, **kwargs): 21 | # Automatically set a name if not provided. 22 | name = kwargs.setdefault('name', self.get_unique_name(op.__name__)) 23 | # Figure out the layer inputs. 24 | if len(self.inputs)==0: 25 | raise RuntimeError('No input variables found for layer %s.'%name) 26 | elif len(self.inputs)==1: 27 | layer_input = self.inputs[0] 28 | else: 29 | layer_input = list(self.inputs) 30 | # Perform the operation and get the output. 
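# op() receives the accumulated inputs together with this layer's kwargs;
# the result is registered in the self.layers LUT and re-fed as the next
# layer's input, which is what makes chained definitions like
#   self.feed('data').conv_single(3, 3, 64, 1, 1, name='conv1')
# in LSTM_train/LSTM_test work.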
31 | layer_output = op(self, layer_input, *args, **kwargs) 32 | # Add to layer LUT. 33 | self.layers[name] = layer_output 34 | # This output is now the input for the next layer. 35 | self.feed(layer_output) 36 | # Return self for chained calls. 37 | return self 38 | return layer_decorated 39 | 40 | class Network(object): 41 | def __init__(self, inputs, trainable=True): 42 | self.inputs = [] 43 | self.layers = dict(inputs) 44 | self.trainable = trainable 45 | self.setup() 46 | 47 | def setup(self): 48 | raise NotImplementedError('Must be subclassed.') 49 | 50 | def load(self, data_path, session, ignore_missing=False): 51 | data_dict = np.load(data_path,encoding='latin1').item() 52 | for key in data_dict: 53 | with tf.variable_scope(key, reuse=True): 54 | for subkey in data_dict[key]: 55 | try: 56 | var = tf.get_variable(subkey) 57 | session.run(var.assign(data_dict[key][subkey])) 58 | print("assign pretrain model "+subkey+ " to "+key) 59 | except ValueError: 60 | print("ignore "+key) 61 | if not ignore_missing: 62 | 63 | raise 64 | 65 | def feed(self, *args): 66 | assert len(args)!=0 67 | self.inputs = [] 68 | for layer in args: 69 | if isinstance(layer, str): 70 | try: 71 | layer = self.layers[layer] 72 | print(layer) 73 | except KeyError: 74 | print(list(self.layers.keys())) 75 | raise KeyError('Unknown layer name fed: %s'%layer) 76 | self.inputs.append(layer) 77 | return self 78 | 79 | def get_output(self, layer): 80 | try: 81 | layer = self.layers[layer] 82 | except KeyError: 83 | print(list(self.layers.keys())) 84 | raise KeyError('Unknown layer name fed: %s'%layer) 85 | return layer 86 | 87 | def get_unique_name(self, prefix): 88 | id = sum(t.startswith(prefix) for t,_ in list(self.layers.items()))+1 89 | return '%s_%d'%(prefix, id) 90 | 91 | def make_var(self, name, shape, initializer=None, trainable=True, regularizer=None): 92 | return tf.get_variable(name, shape, initializer=initializer, trainable=trainable, regularizer=regularizer) 93 | 94 | def validate_padding(self, padding): 95 | assert padding in ('SAME', 'VALID') 96 | 97 | @layer 98 | def bi_lstm(self, input, num_hids, num_layers, name,img_shape = None ,trainable=True): 99 | img,img_len = input[0],input[1] 100 | #img = tf.squeeze(img,axis=3) 101 | if img_shape:img =tf.reshape(img,shape = img_shape ) 102 | with tf.variable_scope(name) as scope: 103 | #stack = tf.contrib.rnn.MultiRNNCell([cell,cell1] , state_is_tuple=True) 104 | lstm_fw_cell = tf.contrib.rnn.LSTMCell(num_hids//2,state_is_tuple=True) 105 | lstm_bw_cell = tf.contrib.rnn.LSTMCell(num_hids//2,state_is_tuple=True) 106 | 107 | output,_ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell,img,img_len,dtype=tf.float32) 108 | # output_bw_reverse = tf.reverse_sequence(output[1],img_len,seq_axis=1) 109 | output = tf.concat(output,axis=2) 110 | 111 | #stack_cell = tf.contrib.rnn.MultiRNNCell( 112 | # [tf.contrib.rnn.LSTMCell(num_hids, state_is_tuple=True) for _ in range(num_layers)], 113 | # state_is_tuple=True) 114 | #lstm_out,last_state = tf.nn.dynamic_rnn(stack_cell,output,img_len,dtype=tf.float32) 115 | lstm_out = output 116 | shape = tf.shape(img) 117 | batch_size, time_step = shape[0],shape[1] 118 | lstm_out = tf.reshape(lstm_out,[-1,num_hids]) 119 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG', uniform=False) 120 | # init_weights = tf.contrib.layers.xavier_initializer() 121 | # init_weights = tf.truncated_normal_initializer(stddev=0.1) 122 | init_biases = tf.constant_initializer(0.0) 123 | W = 
self.make_var('weights', [num_hids, cfg.NCLASSES], init_weights, trainable, \ 124 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 125 | b = self.make_var('biases', [cfg.NCLASSES], init_biases, trainable) 126 | logits = tf.matmul(lstm_out,W)+b 127 | logits = tf.reshape(logits,[batch_size,-1,cfg.NCLASSES]) 128 | logits = tf.transpose(logits,(1,0,2)) 129 | return logits 130 | @layer 131 | def lstm(self, input, num_hids, num_layers, name,img_shape = None ,trainable=True): 132 | img,img_len = input[0],input[1] 133 | if img_shape:img =tf.reshape(img,shape = img_shape ) 134 | with tf.variable_scope(name) as scope: 135 | stack_cell = tf.contrib.rnn.MultiRNNCell( 136 | [tf.contrib.rnn.LSTMCell(num_hids, state_is_tuple=True) for _ in range(num_layers)], 137 | state_is_tuple=True) 138 | lstm_out,last_state = tf.nn.dynamic_rnn(stack_cell,img,img_len,dtype=tf.float32) 139 | shape = tf.shape(img) 140 | batch_size, time_step = shape[0],shape[1] 141 | lstm_out = tf.reshape(lstm_out,[-1,num_hids]) 142 | # init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 143 | # init_weights = tf.contrib.layers.xavier_initializer() 144 | init_weights = tf.truncated_normal_initializer(stddev=0.1) 145 | init_biases = tf.constant_initializer(0.0) 146 | W = self.make_var('weights', [num_hids, cfg.NCLASSES], init_weights, trainable, \ 147 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 148 | b = self.make_var('biases', [cfg.NCLASSES], init_biases, trainable) 149 | logits = tf.matmul(lstm_out,W)+b 150 | logits = tf.reshape(logits,[batch_size,-1,cfg.NCLASSES]) 151 | logits = tf.transpose(logits,(1,0,2)) 152 | return logits 153 | 154 | @layer 155 | def concat(self, input, axis, name): 156 | with tf.variable_scope(name) as scope: 157 | concat = tf.concat(values=input,axis=axis) 158 | return concat 159 | 160 | @layer 161 | def conv_single(self, input, k_h, k_w, c_o, s_h, s_w, name, c_i=None, bn=False, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 162 | """ contribution by miraclebiu, and biased option""" 163 | self.validate_padding(padding) 164 | if not c_i: c_i = input.get_shape()[-1] 165 | if c_i==1: input = tf.expand_dims(input=input,axis=3) 166 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1,s_h, s_w, 1], padding=padding) 167 | with tf.variable_scope(name) as scope: 168 | init_weights = tf.contrib.layers.xavier_initializer() 169 | init_biases = tf.constant_initializer(0.0) 170 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 171 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 172 | if biased: 173 | biases = self.make_var('biases', [c_o], init_biases, trainable) 174 | conv = convolve(input, kernel) 175 | bias = tf.nn.bias_add(conv, biases) 176 | if bn: 177 | bn_layer = tf.contrib.layers.batch_norm(bias, scale=True, 178 | center=True, is_training=True, scope=name) 179 | else:bn_layer = bias 180 | if relu: 181 | return tf.nn.relu(bn_layer) 182 | else: return bn_layer 183 | else: 184 | conv = convolve(input, kernel) 185 | if bn: 186 | bn_layer = tf.contrib.layers.batch_norm(conv, scale=True, 187 | center=True, is_training=True, scope=name) 188 | else:bn_layer = conv 189 | if relu: 190 | return tf.nn.relu(bn_layer) 191 | return bn_layer 192 | 193 | @layer 194 | def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, c_i=None, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 195 | """ contribution by miraclebiu, and biased option""" 196 | self.validate_padding(padding) 197 | if 
not c_i: c_i = input.get_shape()[-1] 198 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 199 | with tf.variable_scope(name) as scope: 200 | init_weights = tf.contrib.layers.xavier_initializer() 201 | init_biases = tf.constant_initializer(0.0) 202 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 203 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 204 | if biased: 205 | biases = self.make_var('biases', [c_o], init_biases, trainable) 206 | conv = convolve(input, kernel) 207 | if relu: 208 | bias = tf.nn.bias_add(conv, biases) 209 | 210 | return tf.nn.relu(bias) 211 | return tf.nn.bias_add(conv, biases) 212 | else: 213 | conv = convolve(input, kernel) 214 | if relu: 215 | return tf.nn.relu(conv) 216 | return conv 217 | 218 | @layer 219 | def conv_zero(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True, padding=DEFAULT_PADDING, 220 | trainable=True): 221 | """ contribution by miraclebiu, and biased option""" 222 | self.validate_padding(padding) 223 | c_i = input.get_shape()[-1] 224 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 225 | with tf.variable_scope(name) as scope: 226 | init_weights = tf.constant_initializer(0.0) 227 | init_biases = tf.constant_initializer(0.0) 228 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 229 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 230 | if biased: 231 | biases = self.make_var('biases', [c_o], init_biases, trainable) 232 | conv = convolve(input, kernel) 233 | if relu: 234 | bias = tf.nn.bias_add(conv, biases) 235 | 236 | return tf.nn.relu(bias) 237 | return tf.nn.bias_add(conv, biases) 238 | else: 239 | conv = convolve(input, kernel) 240 | if relu: 241 | return tf.nn.relu(conv) 242 | return conv 243 | 244 | @layer 245 | def conv_norm(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True,relu=True, padding=DEFAULT_PADDING, trainable=True): 246 | """ contribution by miraclebiu, and biased option""" 247 | self.validate_padding(padding) 248 | c_i = input.get_shape()[-1] 249 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 250 | with tf.variable_scope(name) as scope: 251 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 252 | # init_weights = tf.contrib.layers.xavier_initializer() 253 | init_biases = tf.constant_initializer(0.0) 254 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 255 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 256 | if biased: 257 | biases = self.make_var('biases', [c_o], init_biases, trainable) 258 | conv = convolve(input, kernel) 259 | if relu: 260 | bias = tf.nn.bias_add(conv, biases) 261 | temp_layer = tf.contrib.layers.batch_norm(bias, scale=True, center=True, is_training=True, 262 | scope=name) 263 | return tf.nn.relu(temp_layer) 264 | return tf.nn.bias_add(conv, biases) 265 | else: 266 | conv = convolve(input, kernel) 267 | if relu: 268 | return tf.nn.crelu(conv) 269 | return conv 270 | 271 | @layer 272 | def conv_final(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True, padding=DEFAULT_PADDING, 273 | trainable=True): 274 | """ contribution by miraclebiu, and biased option""" 275 | self.validate_padding(padding) 276 | c_i = 128 277 | convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding) 278 | with tf.variable_scope(name) as scope: 279 | init_weights = 
tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 280 | # init_weights = tf.contrib.layers.xavier_initializer() 281 | init_biases = tf.constant_initializer(0.0) 282 | kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \ 283 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 284 | if biased: 285 | biases = self.make_var('biases', [c_o], init_biases, trainable) 286 | conv = convolve(input, kernel) 287 | if relu: 288 | bias = tf.nn.bias_add(conv, biases) 289 | temp_layer = tf.contrib.layers.batch_norm(bias, scale=True, center=True, is_training=True, 290 | scope=name) 291 | return tf.nn.relu(temp_layer) 292 | return tf.nn.bias_add(conv, biases) 293 | else: 294 | conv = convolve(input, kernel) 295 | if relu: 296 | return tf.nn.crelu(conv) 297 | return conv 298 | 299 | @layer 300 | def upconv(self, input, shape, c_o, ksize=4, stride = 2, name = 'upconv', biased=False, relu=True, padding=DEFAULT_PADDING, 301 | trainable=True): 302 | """ up-conv""" 303 | self.validate_padding(padding) 304 | 305 | c_in = input.get_shape()[3].value 306 | in_shape = tf.shape(input) 307 | if shape is None: 308 | h = ((in_shape[1] ) * stride) 309 | w = ((in_shape[2] ) * stride) 310 | new_shape = [in_shape[0], h, w, c_o] 311 | else: 312 | new_shape = [in_shape[0], shape[1], shape[2], c_o] 313 | output_shape = tf.stack(new_shape) 314 | 315 | filter_shape = [ksize, ksize, c_o, c_in] 316 | 317 | with tf.variable_scope(name) as scope: 318 | # init_weights = tf.contrib.layers.xavier_initializer() 319 | init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.001, mode='FAN_AVG', uniform=False) 320 | filters = self.make_var('weights', filter_shape, init_weights, trainable, \ 321 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 322 | deconv = tf.nn.conv2d_transpose(input, filters, output_shape, 323 | strides=[1, stride, stride, 1], padding=DEFAULT_PADDING, name=scope.name) 324 | # coz de-conv losses shape info, use reshape to re-gain shape 325 | deconv = tf.reshape(deconv, new_shape) 326 | 327 | if biased: 328 | init_biases = tf.constant_initializer(0.0) 329 | biases = self.make_var('biases', [c_o], init_biases, trainable) 330 | if relu: 331 | bias = tf.nn.bias_add(deconv, biases) 332 | return tf.nn.relu(bias) 333 | return tf.nn.bias_add(deconv, biases) 334 | else: 335 | if relu: 336 | return tf.nn.relu(deconv) 337 | return deconv 338 | 339 | @layer 340 | def relu(self, input, name): 341 | return tf.nn.relu(input, name=name) 342 | 343 | @layer 344 | def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 345 | self.validate_padding(padding) 346 | return tf.nn.max_pool(input, 347 | ksize=[1, k_h, k_w, 1], 348 | strides=[1, s_h, s_w, 1], 349 | padding=padding, 350 | name=name) 351 | 352 | @layer 353 | def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING): 354 | self.validate_padding(padding) 355 | return tf.nn.avg_pool(input, 356 | ksize=[1, k_h, k_w, 1], 357 | strides=[1, s_h, s_w, 1], 358 | padding=padding, 359 | name=name) 360 | 361 | @layer 362 | def reshape_squeeze_layer(self, input, d, name): 363 | #N,H,W,C-> N,H*W,C 364 | input_shape = tf.shape(input) 365 | return tf.reshape(input, \ 366 | [input_shape[0], \ 367 | input_shape[1]*input_shape[2], \ 368 | int(d)]) 369 | 370 | @layer 371 | def reshape_layer(self, input, d, name): 372 | input_shape = tf.shape(input) 373 | if name == 'rpn_cls_prob_reshape': 374 | # 375 | # transpose: (1, AxH, W, 2) -> (1, 2, AxH, W) 376 | # 
reshape: (1, 2xA, H, W) 377 | # transpose: -> (1, H, W, 2xA) 378 | return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]), 379 | [ input_shape[0], 380 | int(d), 381 | tf.cast(tf.cast(input_shape[1],tf.float32)/tf.cast(d,tf.float32)*tf.cast(input_shape[3],tf.float32),tf.int32), 382 | input_shape[2] 383 | ]), 384 | [0,2,3,1],name=name) 385 | else: 386 | return tf.transpose(tf.reshape(tf.transpose(input,[0,3,1,2]), 387 | [ input_shape[0], 388 | int(d), 389 | tf.cast(tf.cast(input_shape[1],tf.float32)*(tf.cast(input_shape[3],tf.float32)/tf.cast(d,tf.float32)),tf.int32), 390 | input_shape[2] 391 | ]), 392 | [0,2,3,1],name=name) 393 | 394 | @layer 395 | def spatial_reshape_layer(self, input, d, name): 396 | input_shape = tf.shape(input) 397 | # transpose: (1, H, W, A x d) -> (1, H, WxA, d) 398 | return tf.reshape(input,\ 399 | [input_shape[0],\ 400 | input_shape[1], \ 401 | -1,\ 402 | int(d)]) 403 | 404 | 405 | @layer 406 | def lrn(self, input, radius, alpha, beta, name, bias=1.0): 407 | return tf.nn.local_response_normalization(input, 408 | depth_radius=radius, 409 | alpha=alpha, 410 | beta=beta, 411 | bias=bias, 412 | name=name) 413 | 414 | 415 | @layer 416 | def fc(self, input, num_out, name, relu=True, trainable=True): 417 | with tf.variable_scope(name) as scope: 418 | # only use the first input 419 | if isinstance(input, tuple): 420 | input = input[0] 421 | 422 | input_shape = input.get_shape() 423 | if input_shape.ndims == 4: 424 | dim = 1 425 | for d in input_shape[1:].as_list(): 426 | dim *= d 427 | feed_in = tf.reshape(tf.transpose(input,[0,3,1,2]), [-1, dim]) 428 | else: 429 | feed_in, dim = (input, int(input_shape[-1])) 430 | 431 | if name == 'bbox_pred': 432 | init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001) 433 | init_biases = tf.constant_initializer(0.0) 434 | else: 435 | init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01) 436 | init_biases = tf.constant_initializer(0.0) 437 | 438 | weights = self.make_var('weights', [dim, num_out], init_weights, trainable, \ 439 | regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)) 440 | biases = self.make_var('biases', [num_out], init_biases, trainable) 441 | 442 | op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b 443 | fc = op(feed_in, weights, biases, name=scope.name) 444 | return fc 445 | 446 | @layer 447 | def softmax(self, input, name): 448 | input_shape = tf.shape(input) 449 | if name == 'rpn_cls_prob': 450 | return tf.reshape(tf.nn.softmax(tf.reshape(input,[-1,input_shape[3]])),[-1,input_shape[1],input_shape[2],input_shape[3]],name=name) 451 | else: 452 | return tf.nn.softmax(input,name=name) 453 | 454 | @layer 455 | def spatial_softmax(self, input, name): 456 | input_shape = tf.shape(input) 457 | # d = input.get_shape()[-1] 458 | return tf.reshape(tf.nn.softmax(tf.reshape(input, [-1, input_shape[3]])), 459 | [-1, input_shape[1], input_shape[2], input_shape[3]], name=name) 460 | 461 | @layer 462 | def add(self,input,name): 463 | """contribution by miraclebiu""" 464 | return tf.add(input[0],input[1], name=name) 465 | 466 | @layer 467 | def batch_normalization(self,input,name,relu=True, is_training=False): 468 | """contribution by miraclebiu""" 469 | if relu: 470 | temp_layer=tf.contrib.layers.batch_norm(input,scale=True,center=True,is_training=is_training,scope=name) 471 | return tf.nn.relu(temp_layer) 472 | else: 473 | return tf.contrib.layers.batch_norm(input,scale=True,center=True,is_training=is_training,scope=name) 474 | 475 | @layer 476 | def negation(self, input, name): 477 | """ simply 
multiplies -1 to the tensor""" 478 | return tf.multiply(input, -1.0, name=name) 479 | 480 | @layer 481 | def bn_scale_combo(self, input, c_in, name, relu=True): 482 | """ PVA net BN -> Scale -> Relu""" 483 | with tf.variable_scope(name) as scope: 484 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 485 | # alpha = tf.get_variable('bn_scale/alpha', shape=[c_in, ], dtype=tf.float32, 486 | # initializer=tf.constant_initializer(1.0), trainable=True, 487 | # regularizer=self.l2_regularizer(0.00001)) 488 | # beta = tf.get_variable('bn_scale/beta', shape=[c_in, ], dtype=tf.float32, 489 | # initializer=tf.constant_initializer(0.0), trainable=True, 490 | # regularizer=self.l2_regularizer(0.00001)) 491 | # bn = tf.add(tf.mul(bn, alpha), beta) 492 | if relu: 493 | bn = tf.nn.relu(bn, name='relu') 494 | return bn 495 | 496 | @layer 497 | def pva_negation_block(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, padding=DEFAULT_PADDING, trainable=True, 498 | scale = True, negation = True): 499 | """ for PVA net, Conv -> BN -> Neg -> Concat -> Scale -> Relu""" 500 | with tf.variable_scope(name) as scope: 501 | conv = self.conv._original(self, input, k_h, k_w, c_o, s_h, s_w, biased=biased, relu=False, name='conv', padding=padding, trainable=trainable) 502 | conv = self.batch_normalization._original(self, conv, name='bn', relu=False, is_training=False) 503 | c_in = c_o 504 | if negation: 505 | conv_neg = self.negation._original(self, conv, name='neg') 506 | conv = tf.concat(axis=3, values=[conv, conv_neg], name='concat') 507 | c_in += c_in 508 | if scale: 509 | # y = \alpha * x + \beta 510 | alpha = tf.get_variable('scale/alpha', shape=[c_in,], dtype=tf.float32, 511 | initializer=tf.constant_initializer(1.0), trainable=True, regularizer=self.l2_regularizer(0.00001)) 512 | beta = tf.get_variable('scale/beta', shape=[c_in, ], dtype=tf.float32, 513 | initializer=tf.constant_initializer(0.0), trainable=True, regularizer=self.l2_regularizer(0.00001)) 514 | # conv = conv * alpha + beta 515 | conv = tf.add(tf.multiply(conv, alpha), beta) 516 | return tf.nn.relu(conv, name='relu') 517 | 518 | @layer 519 | def pva_negation_block_v2(self, input, k_h, k_w, c_o, s_h, s_w, c_in, name, biased=True, padding=DEFAULT_PADDING, trainable=True, 520 | scale = True, negation = True): 521 | """ for PVA net, BN -> [Neg -> Concat ->] Scale -> Relu -> Conv""" 522 | with tf.variable_scope(name) as scope: 523 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 524 | if negation: 525 | bn_neg = self.negation._original(self, bn, name='neg') 526 | bn = tf.concat(axis=3, values=[bn, bn_neg], name='concat') 527 | c_in += c_in 528 | # y = \alpha * x + \beta 529 | alpha = tf.get_variable('scale/alpha', shape=[c_in,], dtype=tf.float32, 530 | initializer=tf.constant_initializer(1.0), trainable=True, regularizer=self.l2_regularizer(0.00004)) 531 | beta = tf.get_variable('scale/beta', shape=[c_in, ], dtype=tf.float32, 532 | initializer=tf.constant_initializer(0.0), trainable=True, regularizer=self.l2_regularizer(0.00004)) 533 | bn = tf.add(tf.multiply(bn, alpha), beta) 534 | bn = tf.nn.relu(bn, name='relu') 535 | if name == 'conv3_1/1': self.layers['conv3_1/1/relu'] = bn 536 | 537 | conv = self.conv._original(self, bn, k_h, k_w, c_o, s_h, s_w, biased=biased, relu=False, name='conv', padding=padding, 538 | trainable=trainable) 539 | return conv 540 | 541 | @layer 542 | def pva_inception_res_stack(self, input, c_in, name, block_start = False, type = 
'a'): 543 | 544 | if type == 'a': 545 | (c_0, c_1, c_2, c_pool, c_out) = (64, 64, 24, 128, 256) 546 | elif type == 'b': 547 | (c_0, c_1, c_2, c_pool, c_out) = (64, 96, 32, 128, 384) 548 | else: 549 | raise ValueError('Unexpected inception-res type') 550 | if block_start: 551 | stride = 2 552 | else: 553 | stride = 1 554 | with tf.variable_scope(name+'/incep') as scope: 555 | bn = self.batch_normalization._original(self, input, name='bn', relu=False, is_training=False) 556 | bn_scale = self.scale._original(self, bn, c_in, name='bn_scale') 557 | ## 1 x 1 558 | 559 | conv = self.conv._original(self, bn_scale, 1, 1, c_0, stride, stride, name='0/conv', biased = False, relu=False) 560 | conv_0 = self.bn_scale_combo._original(self, conv, c_in=c_0, name ='0', relu=True) 561 | 562 | ## 3 x 3 563 | bn_relu = tf.nn.relu(bn_scale, name='relu') 564 | if name == 'conv4_1': tmp_c = c_1; c_1 = 48 565 | conv = self.conv._original(self, bn_relu, 1, 1, c_1, stride, stride, name='1_reduce/conv', biased = False, relu=False) 566 | conv = self.bn_scale_combo._original(self, conv, c_in=c_1, name='1_reduce', relu=True) 567 | if name == 'conv4_1': c_1 = tmp_c 568 | conv = self.conv._original(self, conv, 3, 3, c_1 * 2, 1, 1, name='1_0/conv', biased = False, relu=False) 569 | conv_1 = self.bn_scale_combo._original(self, conv, c_in=c_1 * 2, name='1_0', relu=True) 570 | 571 | ## 5 x 5 572 | conv = self.conv._original(self, bn_scale, 1, 1, c_2, stride, stride, name='2_reduce/conv', biased = False, relu=False) 573 | conv = self.bn_scale_combo._original(self, conv, c_in=c_2, name='2_reduce', relu=True) 574 | conv = self.conv._original(self, conv, 3, 3, c_2 * 2, 1, 1, name='2_0/conv', biased = False, relu=False) 575 | conv = self.bn_scale_combo._original(self, conv, c_in=c_2 * 2, name='2_0', relu=True) 576 | conv = self.conv._original(self, conv, 3, 3, c_2 * 2, 1, 1, name='2_1/conv', biased = False, relu=False) 577 | conv_2 = self.bn_scale_combo._original(self, conv, c_in=c_2 * 2, name='2_1', relu=True) 578 | 579 | ## pool 580 | if block_start: 581 | pool = self.max_pool._original(self, bn_scale, 3, 3, 2, 2, padding=DEFAULT_PADDING, name='pool') 582 | pool = self.conv._original(self, pool, 1, 1, c_pool, 1, 1, name='poolproj/conv', biased = False, relu=False) 583 | pool = self.bn_scale_combo._original(self, pool, c_in=c_pool, name='poolproj', relu=True) 584 | 585 | with tf.variable_scope(name) as scope: 586 | if block_start: 587 | concat = tf.concat(axis=3, values=[conv_0, conv_1, conv_2, pool], name='concat') 588 | proj = self.conv._original(self, input, 1, 1, c_out, 2, 2, name='proj', biased=True, 589 | relu=False) 590 | else: 591 | concat = tf.concat(axis=3, values=[conv_0, conv_1, conv_2], name='concat') 592 | proj = input 593 | 594 | conv = self.conv._original(self, concat, 1, 1, c_out, 1, 1, name='out/conv', relu=False) 595 | if name == 'conv5_4': 596 | conv = self.bn_scale_combo._original(self, conv, c_in=c_out, name='out', relu=False) 597 | conv = self.add._original(self, [conv, proj], name='sum') 598 | return conv 599 | 600 | @layer 601 | def pva_inception_res_block(self, input, name, name_prefix = 'conv4_', type = 'a'): 602 | """build inception block""" 603 | node = input 604 | if type == 'a': 605 | c_ins = (128, 256, 256, 256, 256, ) 606 | else: 607 | c_ins = (256, 384, 384, 384, 384, ) 608 | for i in range(1, 5): 609 | node = self.pva_inception_res_stack._original(self, node, c_in = c_ins[i-1], 610 | name = name_prefix + str(i), block_start=(i==1), type=type) 611 | return node 612 | 613 | @layer 614 | def scale(self, 
input, c_in, name): 615 | with tf.variable_scope(name) as scope: 616 | 617 | alpha = tf.get_variable('alpha', shape=[c_in, ], dtype=tf.float32, 618 | initializer=tf.constant_initializer(1.0), trainable=True, 619 | regularizer=self.l2_regularizer(0.00001)) 620 | beta = tf.get_variable('beta', shape=[c_in, ], dtype=tf.float32, 621 | initializer=tf.constant_initializer(0.0), trainable=True, 622 | regularizer=self.l2_regularizer(0.00001)) 623 | return tf.add(tf.multiply(input, alpha), beta) 624 | 625 | 626 | @layer 627 | def dropout(self, input, keep_prob, name): 628 | return tf.nn.dropout(input, keep_prob, name=name) 629 | 630 | def l2_regularizer(self, weight_decay=0.0005, scope=None): 631 | def regularizer(tensor): 632 | with tf.name_scope(scope, default_name='l2_regularizer', values=[tensor]): 633 | l2_weight = tf.convert_to_tensor(weight_decay, 634 | dtype=tensor.dtype.base_dtype, 635 | name='weight_decay') 636 | return tf.multiply(l2_weight, tf.nn.l2_loss(tensor), name='value') 637 | return regularizer 638 | 639 | def smooth_l1_dist(self, deltas, sigma2=9.0, name='smooth_l1_dist'): 640 | with tf.name_scope(name=name) as scope: 641 | deltas_abs = tf.abs(deltas) 642 | smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0/sigma2), tf.float32) 643 | return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \ 644 | (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1) 645 | 646 | 647 | def build_loss(self): 648 | time_step_batch = self.get_output('time_step_len') 649 | logits_batch = self.get_output('logits') 650 | labels = self.get_output('labels') 651 | label_len = self.get_output('labels_len') 652 | 653 | ctc_loss = warpctc_tensorflow.ctc(activations=logits_batch,flat_labels=labels, 654 | label_lengths=label_len,input_lengths=time_step_batch) 655 | loss = tf.reduce_mean(ctc_loss) 656 | decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits_batch, time_step_batch, merge_repeated=True) 657 | dense_decoded = tf.cast(tf.sparse_tensor_to_dense(decoded[0], default_value=0), tf.int32) 658 | 659 | # add regularizer 660 | if cfg.TRAIN.WEIGHT_DECAY > 0: 661 | regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 662 | loss = tf.add_n(regularization_losses) + loss 663 | 664 | return loss,dense_decoded 665 | -------------------------------------------------------------------------------- /lib/utils/convert_ckpt2npy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | from easydict import EasyDict as edict 5 | from lib.networks.factory import get_network 6 | from lib.fcn.config import get_output_dir,cfg_from_file 7 | 8 | class Convert(object): 9 | def __init__(self, sess, network, model_dir,out_path,model): 10 | self.net = network 11 | self.model_dir = model_dir 12 | self.out_path=out_path 13 | self.model=model 14 | self.saver = tf.train.Saver(max_to_keep=100) 15 | 16 | def conver2npy(self,sess): 17 | global_step = tf.Variable(0, trainable=False) 18 | local_vars_init_op = tf.local_variables_initializer() 19 | global_vars_init_op = tf.global_variables_initializer() 20 | combined_op = tf.group(local_vars_init_op, global_vars_init_op) 21 | sess.run(combined_op) 22 | 23 | try: 24 | self.saver.restore(sess, tf.train.latest_checkpoint(self.model_dir)) 25 | sess.run(global_step.assign(0)) 26 | dic=dict() 27 | pri_keys=['conv1_1','conv1_2','conv2_1','conv2_2', 28 | 'conv3_1','conv3_2','conv3_3', 29 | 'conv4_1','conv4_2','conv4_3', 30 | 'conv5_1','conv5_2','conv5_3'] 31 | if self.model==32: 
32 | keys=pri_keys+['fc6','fc7','fc8'] 33 | elif self.model==16: 34 | keys=pri_keys+['fc6','fc7','fc8','pool4_fc'] 35 | elif self.model==8: 36 | keys=pri_keys+['fc6','fc7','fc8','pool4_fc','pool3_fc'] 37 | for key in keys: 38 | with tf.variable_scope(key, reuse=True): 39 | dic[key] = dict() 40 | for subkey in ['weights','biases']: 41 | try: 42 | var = tf.get_variable(subkey) 43 | data=sess.run(var) 44 | dic[key][subkey]=data 45 | 46 | print("save model " + subkey + " to " + key) 47 | except ValueError: 48 | print("failed to convert") 49 | np.save(self.out_path, dic) 50 | except: 51 | raise Exception('Check your model') 52 | 53 | 54 | def convert_ckpt2npy(network, model_dir,out_path,model): 55 | config = tf.ConfigProto(allow_soft_placement=True) 56 | config.gpu_options.allocator_type = 'BFC' 57 | with tf.Session(config=config) as sess: 58 | ct = Convert(sess, network,model_dir,out_path,model) 59 | ct.conver2npy(sess) 60 | print('done converting') 61 | 62 | 63 | 64 | 65 | if __name__ == '__main__': 66 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 67 | 68 | # modify here 69 | output_network_name = '32s' 70 | 71 | cfg_from_file('./fcn/fcn_nlpr.yml') 72 | imgdb = edict({'path': './data/train.tfrecords', 'name': 'FCN_' + output_network_name}) 73 | model_dir = get_output_dir(imgdb, None) 74 | network = get_network('VGGnet_'+output_network_name) 75 | out_path='./data/'+output_network_name 76 | convert_ckpt2npy(network,model_dir=model_dir,out_path=out_path,model=int(output_network_name[:-1])) 77 | -------------------------------------------------------------------------------- /lib/utils/data_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | this file is modified from the keras implementation of multi-threaded data processing, 3 | see https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py 4 | ''' 5 | import time 6 | import numpy as np 7 | import threading 8 | import multiprocessing 9 | try: 10 | import queue 11 | except ImportError: 12 | import Queue as queue 13 | 14 | 15 | class GeneratorEnqueuer(): 16 | """Builds a queue out of a data generator. 17 | 18 | Used in `fit_generator`, `evaluate_generator`, `predict_generator`. 19 | 20 | # Arguments 21 | generator: a generator function which endlessly yields data 22 | use_multiprocessing: use multiprocessing if True, otherwise threading 23 | wait_time: time to sleep in-between calls to `put()` 24 | random_seed: Initial seed for workers, 25 | will be incremented by one for each worker. 26 | """ 27 | 28 | def __init__(self, generator, 29 | use_multiprocessing=False, 30 | wait_time=0.05, 31 | random_seed=None): 32 | self.wait_time = wait_time 33 | self._generator = generator 34 | self._use_multiprocessing = use_multiprocessing 35 | self._threads = [] 36 | self._stop_event = None 37 | self.queue = None 38 | self.random_seed = random_seed 39 | 40 | def start(self, workers=1, max_queue_size=10): 41 | """Kicks off threads which add data from the generator into the queue. 
42 | 
43 |         # Arguments
44 |             workers: number of worker threads
45 |             max_queue_size: queue size
46 |                 (when full, threads could block on `put()`)
47 |         """
48 | 
49 |         def data_generator_task():
50 |             while not self._stop_event.is_set():
51 |                 try:
52 |                     if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
53 |                         generator_output = next(self._generator)
54 |                         self.queue.put(generator_output)
55 |                     else:
56 |                         time.sleep(self.wait_time)
57 |                 except Exception:
58 |                     self._stop_event.set()
59 |                     raise
60 | 
61 |         try:
62 |             if self._use_multiprocessing:
63 |                 self.queue = multiprocessing.Queue(maxsize=max_queue_size)
64 |                 self._stop_event = multiprocessing.Event()
65 |             else:
66 |                 self.queue = queue.Queue()
67 |                 self._stop_event = threading.Event()
68 | 
69 |             for _ in range(workers):
70 |                 if self._use_multiprocessing:
71 |                     # Reset random seed else all child processes
72 |                     # share the same seed
73 |                     np.random.seed(self.random_seed)
74 |                     thread = multiprocessing.Process(target=data_generator_task)
75 |                     thread.daemon = True
76 |                     if self.random_seed is not None:
77 |                         self.random_seed += 1
78 |                 else:
79 |                     thread = threading.Thread(target=data_generator_task)
80 |                 self._threads.append(thread)
81 |                 thread.start()
82 |         except:
83 |             self.stop()
84 |             raise
85 | 
86 |     def is_running(self):
87 |         return self._stop_event is not None and not self._stop_event.is_set()
88 | 
89 |     def stop(self, timeout=None):
90 |         """Stops running threads and waits for them to exit, if necessary.
91 | 
92 |         Should be called by the same thread which called `start()`.
93 | 
94 |         # Arguments
95 |             timeout: maximum time to wait on `thread.join()`.
96 |         """
97 |         if self.is_running():
98 |             self._stop_event.set()
99 | 
100 |         for thread in self._threads:
101 |             if thread.is_alive():
102 |                 if self._use_multiprocessing:
103 |                     thread.terminate()
104 |                 else:
105 |                     thread.join(timeout)
106 | 
107 |         if self._use_multiprocessing:
108 |             if self.queue is not None:
109 |                 self.queue.close()
110 | 
111 |         self._threads = []
112 |         self._stop_event = None
113 |         self.queue = None
114 | 
115 |     def get(self):
116 |         """Creates a generator to extract data from the queue.
117 | 
118 |         Skips the data if it is `None`.
119 | 
120 |         # Returns
121 |             A generator
122 |         """
123 |         while self.is_running():
124 |             if not self.queue.empty():
125 |                 inputs = self.queue.get()
126 |                 if inputs is not None:
127 |                     yield inputs
128 |             else:
129 |                 time.sleep(self.wait_time)
--------------------------------------------------------------------------------
/lib/utils/genImg.py:
--------------------------------------------------------------------------------
1 | import random, os
2 | from captcha.image import ImageCaptcha
3 | from multiprocessing import Pool
4 | 
5 | def randRGB():
6 |     # random color helper (currently unused; ImageCaptcha picks its own colors)
7 |     return (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
8 | 
9 | # 10 digits + 26 lowercase + 26 uppercase
10 | char_set = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
11 | imgDir = None
12 | numProcess = 12
13 | 
14 | def gen_rand():
15 |     # random label of 4~6 characters drawn from char_set
16 |     buf = ""
17 |     max_len = random.randint(4, 6)
18 |     for i in range(max_len):
19 |         buf += random.choice(char_set)
20 |     return buf
21 | 
22 | def generateImg(ind):
23 |     global imgDir
24 |     captcha = ImageCaptcha(fonts=['./fonts/Ubuntu-M.ttf'])
25 |     theChars = gen_rand()
26 |     img_name = '{:08d}'.format(ind) + '_' + theChars + '.png'
27 |     img_path = imgDir + '/' + img_name
28 |     captcha.write(theChars, img_path)
29 |     print(img_path)
30 | 
31 | def run(num, path):
32 |     global imgDir
33 |     imgDir = path
34 |     if not os.path.exists(path):
35 |         os.makedirs(path)
36 |     with Pool(processes=numProcess) as pool:
37 |         pool.map(generateImg, range(num))
38 | 
39 | if __name__ == '__main__':
40 |     #run(64*2000, './data/train')
41 |     run(500, './data/val')
42 | 
--------------------------------------------------------------------------------
/lstm/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '..')
--------------------------------------------------------------------------------
/lstm/lstm.yml:
--------------------------------------------------------------------------------
1 | EXP_DIR: lstm_ctc
2 | LOG_DIR: lstm_ctc
3 | NET_NAME: LSTM
4 | GPU_ID: 0
5 | TRAIN:
6 |   SOLVER: Adam
7 |   DISPLAY: 100
8 |   SNAPSHOT_ITERS: 2000
9 |   LEARNING_RATE: 0.0001
10 |   MOMENTUM: 0.9
11 |   GAMMA: 1.0
12 |   STEPSIZE: 2000
13 |   WEIGHT_DECAY: 0.00001
14 | 
--------------------------------------------------------------------------------
/lstm/test_net.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import argparse
4 | import pprint
5 | 
6 | this_dir = os.path.dirname(__file__)
7 | sys.path.insert(0, this_dir + '/..')
8 | 
9 | from lib.lstm.test import test_net
10 | from lib.lstm.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_log_dir
11 | from lib.networks.factory import get_network
12 | from easydict import EasyDict as edict
13 | 
14 | def parse_args():
15 |     parser = argparse.ArgumentParser(description='Test a lstm network')
16 |     parser.add_argument('--gpu', dest='gpu_id',
17 |                         help='GPU device id to use [0]',
18 |                         default=0, type=int)
19 |     parser.add_argument('--network', dest='network_name',
20 |                         help='name of the network',
21 |                         default=None, type=str)
22 |     parser.add_argument('--cfg', dest='cfg_file',
23 |                         help='optional config file',
24 |                         default=None, type=str)
25 |     parser.add_argument('--restore', dest='restore',
26 |                         help='restore or not',
27 |                         default=1, type=int)
28 | 
29 |     if len(sys.argv) == 1:
30 |         parser.print_help()
31 | 
32 |     args = parser.parse_args()
33 |     return args
34 | 
35 | if __name__ == '__main__':
36 |     args = parse_args()
37 | 
38 |     print('Called with args:')
39 |     print(args)
40 | 
41 |     if args.cfg_file is not None:
42 |         cfg_from_file(args.cfg_file)
43 | 
44 |     os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.GPU_ID)
45 | 
46 |     print('Using config:')
47 |     pprint.pprint(cfg)
48 | 
49 |     output_network_name = args.network_name.split('_')[-1]
50 |     imgdb = edict({'path': './data/train.tfrecords', 'name': 'lstm_' + output_network_name,
51 |                    'val_path': './data/val.tfrecords'})
52 | 
53 |     output_dir = get_output_dir(imgdb, None)
54 |     log_dir = get_log_dir(imgdb)
55 |     print(('Output will be saved to `{:s}`'.format(output_dir)))
56 |     print(('Logs will be saved to `{:s}`'.format(log_dir)))
57 | 
58 |     device_name = '/gpu:{:d}'.format(args.gpu_id)
59 |     print(device_name)
60 | 
61 |     network = get_network(args.network_name)
62 |     print(('Use network `{:s}` in testing'.format(args.network_name)))
63 | 
64 |     test_net(network, imgdb,
65 |              testDir='./data/val/',  # 'data/demo'
66 |              output_dir=output_dir,
67 |              log_dir=log_dir,
68 |              restore=bool(int(args.restore)))
69 | 
--------------------------------------------------------------------------------
/lstm/train_net.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pprint
3 | import numpy as np
4 | import sys
5 | import os.path
6 | 
7 | this_dir = os.path.dirname(__file__)
8 | sys.path.insert(0, this_dir + '/..')
9 | 
10 | from lib.lstm.train import train_net
11 | from lib.lstm.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_log_dir
12 | from lib.networks.factory import get_network
13 | from easydict import EasyDict as edict
14 | 
15 | def parse_args():
16 |     parser = argparse.ArgumentParser(description='Train a lstm network')
17 |     parser.add_argument('--gpu', dest='gpu_id',
18 |                         help='GPU device id to use [0]',
19 |                         default=0, type=int)
20 |     parser.add_argument('--iters', dest='max_iters',
21 |                         help='number of iterations to train',
22 |                         default=1000000, type=int)
23 |     parser.add_argument('--cfg', dest='cfg_file',
24 |                         help='optional config file',
25 |                         default=None, type=str)
26 |     parser.add_argument('--pre_train', dest='pre_train',
27 |                         help='pre trained model',
28 |                         default=None, type=str)
29 |     parser.add_argument('--rand', dest='randomize',
30 |                         help='randomize (do not use a fixed seed)',
31 |                         action='store_true')
32 |     parser.add_argument('--network', dest='network_name',
33 |                         help='name of the network',
34 |                         default=None, type=str)
35 |     parser.add_argument('--set', dest='set_cfgs',
36 |                         help='set config keys', default=None,
37 |                         nargs=argparse.REMAINDER)
38 |     parser.add_argument('--restore', dest='restore',
39 |                         help='restore or not',
40 |                         default=0, type=int)
41 | 
42 |     if len(sys.argv) == 1:
43 |         parser.print_help()
44 | 
45 |     args = parser.parse_args()
46 |     return args
47 | 
48 | # os.environ["CUDA_VISIBLE_DEVICES"] = '0'
49 | if __name__ == '__main__':
50 |     args = parse_args()
51 |     print('Called with args:')
52 |     print(args)
53 |     if args.cfg_file is not None:
54 |         cfg_from_file(args.cfg_file)
55 |     if args.set_cfgs is not None:
56 |         cfg_from_list(args.set_cfgs)
57 |     os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg.GPU_ID)
58 | 
59 |     print('Using config:')
60 |     pprint.pprint(cfg)
61 | 
62 |     if not args.randomize:
63 |         # fix the numpy random seed for reproducibility
64 |         np.random.seed(cfg.RNG_SEED)
65 | 
66 |     # imgdb = edict({'path': 'data/lstm_voc/pascal_augmented_train.tfrecords', 'name': 'pascal_augmented'})
67 |     output_network_name = args.network_name.split('_')[-1]
68 |     imgdb = edict({'path': './data/train_4_6.tfrecords', 'name': 'lstm_' + output_network_name,
69 |                    'val_path': './data/val.tfrecords'})
70 | 
71 |     output_dir = get_output_dir(imgdb, None)
72 |     log_dir = get_log_dir(imgdb)
73 |     print(('Output will be saved to `{:s}`'.format(output_dir)))
74 |     print(('Logs will be saved to `{:s}`'.format(log_dir)))
75 | 
76 |     device_name = '/gpu:{:d}'.format(args.gpu_id)
77 |     print(device_name)
78 | 
79 |     network = get_network(args.network_name)
80 |     print(('Use network `{:s}` in training'.format(args.network_name)))
81 | 
82 |     train_net(network, imgdb,
83 |               pre_train=args.pre_train,
84 |               output_dir=output_dir,
85 |               log_dir=log_dir,
86 |               max_iters=args.max_iters,
87 |               restore=bool(int(args.restore)))
88 | 
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python ./lstm/test_net.py --network=LSTM_test --cfg=./lstm/lstm.yml --restore=1
3 | 
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python ./lstm/train_net.py --network=LSTM_train --cfg=./lstm/lstm.yml --restore=0
3 | 
--------------------------------------------------------------------------------
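As a quick sanity check of the config plumbing above, a minimal sketch run from the repo root — it assumes `cfg_from_file` merges the YAML values over the defaults in `lib/lstm/config.py` (the usual Faster R-CNN-style config behaviour), which is how `train_net.py` and `test_net.py` use it:

```python
from lib.lstm.config import cfg, cfg_from_file

cfg_from_file('./lstm/lstm.yml')
print(cfg.TRAIN.LEARNING_RATE)  # expected: 0.0001 from lstm.yml, overriding the config.py default
print(cfg.TRAIN.SOLVER)         # expected: Adam
```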
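`GeneratorEnqueuer` in `lib/utils/data_util.py` is driven in three steps: wrap an endless generator, `start()` the workers, then iterate over `get()`. A minimal sketch with a toy generator — `dummy_batches` and the worker/queue sizes here are illustrative, not values taken from this repo:

```python
import numpy as np
from lib.utils.data_util import GeneratorEnqueuer

def dummy_batches(batch_size=64, height=32, width=100):
    # endlessly yield (images, labels) pairs, as the enqueuer expects
    while True:
        images = np.random.rand(batch_size, height, width, 1).astype(np.float32)
        labels = np.zeros(batch_size, dtype=np.int32)
        yield images, labels

enqueuer = GeneratorEnqueuer(dummy_batches(), use_multiprocessing=False)
enqueuer.start(workers=2, max_queue_size=10)
try:
    batches = enqueuer.get()  # generator that pulls finished batches off the queue
    for _ in range(5):
        images, labels = next(batches)
        print(images.shape, labels.shape)
finally:
    enqueuer.stop()
```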
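Likewise, `build_loss` in `lib/networks/network.py` hands `warpctc_tensorflow.ctc` time-major activations and a flattened label vector. A shape sketch using the same keyword arguments as `build_loss` — all the concrete sizes and label values below are illustrative placeholders:

```python
import tensorflow as tf
import warpctc_tensorflow

max_time, batch_size, num_classes = 25, 2, 64  # illustrative sizes
# time-major logits: [max_time, batch_size, num_classes]
activations = tf.placeholder(tf.float32, [max_time, batch_size, num_classes])
# labels of both samples concatenated into one flat vector (lengths 2 and 3)
flat_labels = tf.constant([3, 7, 2, 5, 9], dtype=tf.int32)
label_lengths = tf.constant([2, 3], dtype=tf.int32)    # label length per sample
input_lengths = tf.constant([25, 25], dtype=tf.int32)  # valid time steps per sample

ctc = warpctc_tensorflow.ctc(activations=activations, flat_labels=flat_labels,
                             label_lengths=label_lengths, input_lengths=input_lengths)
loss = tf.reduce_mean(ctc)  # mirrors build_loss before the regularization term is added
```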