├── hwr
│   ├── __init__.py
│   ├── lm
│   │   ├── __init__.py
│   │   ├── trie.py
│   │   ├── lm.py
│   │   └── generate_lm.py
│   ├── app
│   │   ├── __init__.py
│   │   ├── event.py
│   │   ├── pubsub.py
│   │   ├── app.py
│   │   ├── model.py
│   │   └── views.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── createnpz.py
│   │   ├── generator.py
│   │   ├── reader.py
│   │   └── datarep.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── model.py
│   │   └── ONNET.py
│   ├── decoding
│   │   ├── __init__.py
│   │   ├── mlf.py
│   │   ├── trie_beam_search.py
│   │   ├── trie_beam_search-backup.py
│   │   └── ctc_decoder.py
│   └── constants.py
├── data
│   ├── iamon
│   │   ├── lineStrokes(on)
│   │   │   ├── readme.md
│   │   │   └── data
│   │   │       ├── b04
│   │   │       │   └── b04-334
│   │   │       │       └── b04-334z-07.xml
│   │   │       └── d06
│   │   │           └── d06-414
│   │   │               └── d06-414z-07.xml
│   │   └── split-config
│   │       ├── test_example.txt
│   │       ├── t2_letters
│   │       ├── testset_v.txt
│   │       ├── testset_t.txt
│   │       ├── testset_f.txt
│   │       └── trainset.txt
│   └── lm
│       └── 5gram-p100.pkl
├── writingpad.py
├── pics
│   └── architecture-overview.png
├── models
│   └── iamon
│       └── ONNET
│           └── pretrained-deep-lstm
│               ├── weights.h5
│               └── ONNET_test.py
├── setup.py
├── README.md
├── requirements.txt
├── .gitignore
├── conda-requirements.txt
└── demo
    ├── generate_lm.ipynb
    ├── languagemodel.ipynb
    ├── data_reader.ipynb
    └── training.ipynb
/hwr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hwr/lm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hwr/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hwr/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hwr/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hwr/decoding/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/iamon/lineStrokes(on)/readme.md: -------------------------------------------------------------------------------- 1 | t2_labels.mlf goes here -------------------------------------------------------------------------------- /data/iamon/split-config/test_example.txt: -------------------------------------------------------------------------------- 1 | a02-000 2 | a03-017 3 | b01-044 -------------------------------------------------------------------------------- /writingpad.py: -------------------------------------------------------------------------------- 1 | from hwr.app.app import run_app 2 | 3 | if __name__ == "__main__": 4 | run_app() -------------------------------------------------------------------------------- /data/lm/5gram-p100.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunkyjasper/IAMhwr/HEAD/data/lm/5gram-p100.pkl -------------------------------------------------------------------------------- /pics/architecture-overview.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/chunkyjasper/IAMhwr/HEAD/pics/architecture-overview.png -------------------------------------------------------------------------------- /models/iamon/ONNET/pretrained-deep-lstm/weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chunkyjasper/IAMhwr/HEAD/models/iamon/ONNET/pretrained-deep-lstm/weights.h5 -------------------------------------------------------------------------------- /hwr/app/event.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Event(Enum): 5 | PRED_SELECTED = 1 6 | START_DRAWING = 2 7 | END_DRAWING = 3 8 | PRED_SETTED = 4 9 | POINT_SETTED = 5 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='hwr', 4 | version='0.1', 5 | description='hwr', 6 | url='', 7 | author='', 8 | author_email='', 9 | license='MIT', 10 | packages=['hwr'], 11 | zip_safe=False) 12 | 13 | -------------------------------------------------------------------------------- /data/iamon/split-config/t2_letters: -------------------------------------------------------------------------------- 1 | ex 2 | qu 3 | ga 4 | do 5 | am 6 | ti 7 | sl 8 | lb 9 | rb 10 | ls 11 | rs 12 | sr 13 | cm 14 | mi 15 | pl 16 | pt 17 | sp 18 | cl 19 | sc 20 | qm 21 | n0 22 | n1 23 | n2 24 | n3 25 | n4 26 | n5 27 | n6 28 | n7 29 | n8 30 | n9 31 | A 32 | B 33 | C 34 | D 35 | E 36 | F 37 | G 38 | H 39 | I 40 | J 41 | K 42 | L 43 | M 44 | N 45 | O 46 | P 47 | Q 48 | R 49 | S 50 | T 51 | U 52 | V 53 | W 54 | X 55 | Y 56 | Z 57 | a 58 | b 59 | c 60 | d 61 | e 62 | f 63 | g 64 | h 65 | i 66 | j 67 | k 68 | l 69 | m 70 | n 71 | o 72 | p 73 | q 74 | r 75 | s 76 | t 77 | u 78 | v 79 | w 80 | x 81 | y 82 | z 83 | -------------------------------------------------------------------------------- /hwr/app/pubsub.py: -------------------------------------------------------------------------------- 1 | # Basic publisher/subscriber pattern 2 | subscribers = {} 3 | 4 | 5 | def unsub(event, callback): 6 | if event is not None and event != "" \ 7 | and event in subscribers.keys(): 8 | subscribers[event] = list( 9 | filter( 10 | lambda x: x is not callback, 11 | subscribers[event] 12 | ) 13 | ) 14 | 15 | 16 | def unsub_all(): 17 | global subscribers 18 | subscribers = {} 19 | 20 | 21 | def sub(event, callback): 22 | if not callable(callback): 23 | raise ValueError("callback must be callable") 24 | 25 | if event is None or event == "": 26 | raise ValueError("Event can't be empty") 27 | 28 | if event not in subscribers.keys(): 29 | subscribers[event] = [callback] 30 | else: 31 | subscribers[event].append(callback) 32 | 33 | 34 | def pub(event, args): 35 | if event in subscribers.keys(): 36 | for callback in subscribers[event]: 37 | callback(args) 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # English on-line handwriting recognition with TDNN and RNN 2 | 3 | [IAM database](http://www.fki.inf.unibe.ch/databases/iam-handwriting-database) 4 | 5 | 6.2% character error rate (CER) on IAM-OnDB independent writer handwriting task 2. 6 | 7 | Alongside the code, we provide: 8 | 1. Pretrained weights for the current iteration of the model 9 | 2.
A 100k lexicon from Wiktionary or a 10k lexicon from the Google 1B corpus 10 | 3. A heavily pruned 5-gram character-level language model trained on a subset of the Google 1B corpus. 11 | 4. Example data from the IAM On-Line database. 12 | 13 | Predictions will be slightly worse than the result above because of the pruned language model, but should still be serviceable. Download the full data directly from the official IAM website linked above if necessary. 14 | 15 | ## Network 16 | ![overview](pics/architecture-overview.png) 17 | 18 | ## Getting Started 19 | 20 | ### Installing 21 | Create a virtual environment. 22 | 23 | Install dependencies: 24 | ``` 25 | pip install -r requirements.txt 26 | 27 | pip install -e . 28 | ``` 29 | ### Run 30 | The demo folder contains Jupyter notebook examples. `writingpad.py` runs the writing pad application. 31 | 32 | ``` 33 | python writingpad.py 34 | ``` 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.0 2 | astor==0.8.0 3 | attrs==19.1.0 4 | backcall==0.1.0 5 | bleach==3.1.0 6 | cycler==0.10.0 7 | decorator==4.4.0 8 | defusedxml==0.6.0 9 | dill==0.3.0 10 | editdistance==0.5.3 11 | entrypoints==0.3 12 | gast==0.3.2 13 | google-pasta==0.1.7 14 | grpcio==1.23.0 15 | h5py==2.10.0 16 | ipykernel==5.1.2 17 | ipython==7.8.0 18 | ipython-genutils==0.2.0 19 | ipywidgets==7.5.1 20 | jedi==0.15.1 21 | Jinja2==2.10.1 22 | jsonschema==3.0.2 23 | jupyter==1.0.0 24 | jupyter-client==5.3.2 25 | jupyter-console==6.0.0 26 | jupyter-core==4.5.0 27 | Keras-Applications==1.0.8 28 | Keras-Preprocessing==1.1.0 29 | kiwisolver==1.1.0 30 | lxml==4.4.1 31 | Markdown==3.1.1 32 | MarkupSafe==1.1.1 33 | matplotlib==3.1.1 34 | mistune==0.8.4 35 | nbconvert==5.6.0 36 | nbformat==4.4.0 37 | nltk==3.4.5 38 | notebook==6.0.1 39 | numpy==1.16.4 40 | pandocfilters==1.4.2 41 | parso==0.5.1 42 | pexpect==4.7.0 43 | pickleshare==0.7.5 44 | prometheus-client==0.7.1 45 | prompt-toolkit==2.0.9 46 | protobuf==3.9.1 47 | ptyprocess==0.6.0 48 | Pygments==2.4.2 49 | pyparsing==2.4.2 50 | pyrsistent==0.15.4 51 | python-dateutil==2.8.0 52 | pyzmq==18.1.0 53 | qtconsole==4.5.5 54 | Send2Trash==1.5.0 55 | six==1.12.0 56 | tensorboard==1.14.0 57 | tensorflow==1.14.0 58 | tensorflow-estimator==1.14.0 59 | termcolor==1.1.0 60 | terminado==0.8.2 61 | testpath==0.4.2 62 | tornado==6.0.3 63 | tqdm==4.35.0 64 | traitlets==4.3.2 65 | wcwidth==0.1.7 66 | webencodings==0.5.1 67 | Werkzeug==0.15.6 68 | widgetsnbextension==3.5.1 69 | wrapt==1.11.2 -------------------------------------------------------------------------------- /hwr/models/metrics.py: -------------------------------------------------------------------------------- 1 | import editdistance 2 | import re 3 | 4 | 5 | # edit distance / length of ground truth, averaged over samples 6 | def character_error_rate(y_true, y_pred): 7 | assert len(y_true) == len(y_pred) 8 | total_cer = 0 9 | for i in range(len(y_true)): 10 | ed = editdistance.eval(y_true[i], y_pred[i]) 11 | char = len(y_true[i]) 12 | total_cer += ed / char 13 | avg_cer = total_cer / len(y_true) 14 | return avg_cer 15 | 16 | 17 | # Edit distance over word sequences (words are mapped to integer ids first) 18 | def word_error_rate(y_true, y_pred): 19 | assert len(y_true) == len(y_pred) 20 | pattern = r'[\w]+' 21 | total_wer = 0 22 | for i in range(len(y_true)): 23 | gt, pred = y_true[i], y_pred[i] 24 | words_gt = re.findall(pattern, gt) 25 | words_pred = re.findall(pattern, pred) 26 | words = list(set(words_gt + words_pred)) 27 | idx_gt = [] 28 | for
w in words_gt: 29 | idx_gt.append(words.index(w)) 30 | idx_pred = [] 31 | for w in words_pred: 32 | idx_pred.append(words.index(w)) 33 | ed = editdistance.eval(idx_gt, idx_pred) 34 | wer = ed / len(idx_gt) 35 | total_wer += wer 36 | avg_wer = total_wer / len(y_true) 37 | return avg_wer 38 | 39 | 40 | # Out of vocab rate given dict in trie 41 | def oov(trie, lines): 42 | pattern = r'[\w]+' 43 | words = [re.findall(pattern, l) for l in lines] 44 | words = [item for sublist in words for item in sublist] 45 | is_word = [trie.is_word(w) for w in words] 46 | return 1 - (sum(is_word) / len(is_word)) 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /hwr/lm/trie.py: -------------------------------------------------------------------------------- 1 | # Basic Trie implementation 2 | 3 | 4 | class TrieNode(object): 5 | def __init__(self): 6 | self.children = {} 7 | self.is_word = False 8 | 9 | def get_children_nodes(self): 10 | return list(self.children.values()) 11 | 12 | def get_children_chars(self): 13 | return list(self.children.keys()) 14 | 15 | 16 | class Trie(object): 17 | def __init__(self): 18 | self.root = TrieNode() 19 | 20 | def get_root(self): 21 | return self.root 22 | 23 | def insert(self, word): 24 | curr = self.root 25 | last = len(word) - 1 26 | for i, c in enumerate(word): 27 | # If not exist, create 28 | if c not in curr.children: 29 | curr.children[c] = TrieNode() 30 | # Move down the node 31 | curr = curr.children[c] 32 | if i == last: 33 | curr.is_word = True 34 | return self 35 | 36 | def mass_insert(self, words): 37 | for w in words: 38 | self.insert(w) 39 | 40 | # Return Node if exist, None if not 41 | def search(self, prefix): 42 | curr = self.root 43 | for c in prefix: 44 | if c in curr.children: 45 | curr = curr.children[c] 46 | else: 47 | return None 48 | return curr 49 | 50 | def is_word(self, txt): 51 | node = self.search(txt) 52 | if node: 53 | return node.is_word 54 | else: 55 | return False 56 | 57 | def get_char_candidates(self, txt): 58 | node = self.search(txt) 59 | ret = [*node.children] if node else [] 60 | return ret -------------------------------------------------------------------------------- /hwr/data/createnpz.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | from hwr.constants import SPLIT, PREPROCESS 8 | from hwr.data.reader import IAMReader 9 | 10 | 11 | # Pre-create appropriate preprocessed data and features as npz file format to 12 | # save training time 13 | # E.g. 
to create npz for preprocess scheme 6, do 14 | # >> python createnpz.py 6 15 | 16 | 17 | def save_npz(preprocess_no): 18 | preprocess = getattr(PREPROCESS, "SCHEME" + str(preprocess_no)) 19 | reader = IAMReader(SPLIT.ALL) 20 | samples = reader.get_samples() 21 | bad_samples = [] 22 | for i in tqdm(range(len(samples))): 23 | sample = samples[i] 24 | xml_path = sample.xml_path 25 | y = sample.ground_truth 26 | f_split = xml_path.split('/') 27 | f_split[-4] = 'npz-' + preprocess_no 28 | f_split[-1] = f_split[-1][:-3] + 'npz' 29 | f = '/'.join(f_split) 30 | try: 31 | features = sample.generate_features(preprocess=preprocess) 32 | labels = y 33 | d = '/'.join(f_split[:-1]) 34 | if not os.path.exists(d): 35 | os.makedirs(d) 36 | np.savez(f, x=features, y=labels) 37 | except ValueError: 38 | print("Bad sample: {}".format(f_split[-1])) 39 | bad_samples.append(f_split[-1]) 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = ArgumentParser(description='Pre preprocess data into npz.') 44 | parser.add_argument("scheme", 45 | help="number of preprocess scheme") 46 | args = parser.parse_args() 47 | save_npz(args.scheme) 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /hwr/app/app.py: -------------------------------------------------------------------------------- 1 | from hwr.app.model import ONNETpred, Model 2 | from hwr.app.views import * 3 | import tkinter as tk 4 | from hwr.app.pubsub import sub 5 | from hwr.app.event import Event 6 | 7 | 8 | # Controller 9 | class App(tk.Tk): 10 | def __init__(self, *args, **kwargs): 11 | super(App, self).__init__(*args, **kwargs) 12 | 13 | self.title('On-line handwriting recognition') 14 | self.geometry('{}x{}'.format(1024, 768)) 15 | 16 | model = Model(ONNETpred) 17 | text_area = PredictedTextView(self, text="Text", width=50, height=40, padx=3, pady=3) 18 | pred_area = CorrectionsView(self, text="Correction", width=450, height=50, pady=3) 19 | draw_area = WritingPadView(self, text="Writing area", width=450, height=200, padx=3, pady=3) 20 | 21 | # Events 22 | # Text area 23 | sub(Event.PRED_SELECTED, lambda x: text_area.insert_text(x)) 24 | sub(Event.START_DRAWING, lambda x: text_area.set_word_start()) 25 | sub(Event.PRED_SETTED, lambda x: text_area.on_predictions_setted(x)) 26 | # Correction area 27 | sub(Event.PRED_SETTED, lambda x: pred_area.update_buttons(x)) 28 | # Model 29 | sub(Event.END_DRAWING, lambda x: model.compute_predictions(x)) 30 | 31 | # Relative layout of the child views 32 | self.grid_rowconfigure(0, weight=1) 33 | self.grid_columnconfigure(0, weight=1) 34 | text_area.grid(row=0, sticky="nsew") 35 | pred_area.grid(row=1, sticky="ew") 36 | draw_area.grid(row=2, sticky="ew") 37 | text_area.grid_rowconfigure(0, weight=1) 38 | text_area.grid_columnconfigure(0, weight=1) 39 | pred_area.grid_rowconfigure(0, weight=1) 40 | draw_area.grid_columnconfigure(0, weight=1) 41 | draw_area.grid_rowconfigure(0, weight=1) 42 | 43 | def run(self): 44 | self.mainloop() 45 | 46 | 47 | def run_app(): 48 | App().run() 49 | 50 | 51 | if __name__ == "__main__": 52 | run_app() 53 | 54 | 55 | -------------------------------------------------------------------------------- /hwr/decoding/mlf.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import numpy as np 4 | 5 | from hwr.constants import DATA 6 | 7 | """ 8 | Conversion between text, label and MLF 9 | """ 10 | 11 | 12 | # map operation for ndarray 13 | def ndarray_map(f, a, as_list=False): 14 | 
if isinstance(a, np.ndarray): 15 | a = a.tolist() 16 | ret = list(map(f, a)) 17 | return ret if as_list else np.array(ret) 18 | 19 | 20 | def txt2label(txt, multiple=False): 21 | if multiple: 22 | return ndarray_map(lambda x: txt2label(x), txt) 23 | label = np.ones(len(txt)) * -1 24 | txt = list(map(lambda x: DATA.CHARS.index(x), txt)) 25 | for i in range(len(txt)): 26 | label[i] = txt[i] 27 | return label 28 | 29 | 30 | def label2txt(labels, remove_dup=False, multiple=False): 31 | if multiple: 32 | return ndarray_map(lambda x: label2txt(x, remove_dup=remove_dup), labels) 33 | if remove_dup: 34 | labels = list(ch for ch, _ in itertools.groupby(labels)) 35 | txt = list(map(lambda x: '' if x == -1 or x == DATA.BLANK_IDX else DATA.CHARS[int(x)], labels)) 36 | txt = ''.join(txt) 37 | return txt 38 | 39 | 40 | def label2mlf(labels, remove_dup=False, multiple=False): 41 | if multiple: 42 | return ndarray_map(lambda x: label2mlf(x, remove_dup=remove_dup), labels) 43 | if remove_dup: 44 | labels = list(ch for ch, _ in itertools.groupby(labels)) 45 | mlf = list(map(lambda x: '' if x == -1 or x == DATA.BLANK_IDX else DATA.CHARS_MLF[int(x)], labels)) 46 | mlf = [x for x in mlf if x != ''] 47 | return mlf 48 | 49 | 50 | def mlf2label(mlf, multiple=False): 51 | if multiple: 52 | return ndarray_map(lambda x: mlf2label(x), mlf) 53 | label = np.zeros(len(mlf)) 54 | mlf_idx = list(map(lambda x: DATA.CHARS_MLF.index(x), mlf)) 55 | for i in range(len(mlf_idx)): 56 | label[i] = mlf_idx[i] 57 | return label 58 | 59 | 60 | def mlf2txt(mlf, multiple=False): 61 | if multiple: 62 | return ndarray_map(lambda x: mlf2txt(x), mlf) 63 | return label2txt(mlf2label(mlf)) 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /hwr/app/model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import numpy as np 4 | 5 | from hwr.app.event import Event 6 | from hwr.app.pubsub import pub, sub 7 | from hwr.constants import PREPROCESS 8 | from hwr.data.datarep import Point, PointSet 9 | from hwr.models.ONNET import ONNET 10 | from hwr.decoding.ctc_decoder import BestPathDecoder, TrieBeamSearchDecoder 11 | 12 | # Data for application 13 | class Model: 14 | def __init__(self, pred): 15 | self.pred = pred() 16 | self._predictions = [] 17 | self._points = [] 18 | 19 | def set_predictions(self, predictions): 20 | self._predictions = predictions 21 | pub(Event.PRED_SETTED, predictions) 22 | 23 | def set_points(self, points): 24 | self._points = points 25 | pub(Event.POINT_SETTED, points) 26 | 27 | def compute_predictions(self, points): 28 | self.set_points(points) 29 | features = self.pred.get_features(self._points) 30 | predictions = self.pred.predict(features, 5) 31 | self.set_predictions(predictions) 32 | return predictions 33 | 34 | 35 | # Interface for prediction algorithm. 
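# A minimal sketch of a concrete implementation of the IPred interface
# defined just below. "EchoPred" is hypothetical and only illustrates the
# contract that Model expects: get_features() maps raw strokes to network
# input, and predict() returns the top-n decoded strings.
#
#   class EchoPred(IPred):
#       def get_features(self, coordinates):
#           # flatten the list of strokes into one array of (x, y) points
#           return np.asarray([p for stroke in coordinates for p in stroke])
#
#       def predict(self, features, n):
#           # a real implementation would run the network and a CTC decoder
#           return ["prediction"] * n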
36 | class IPred(object): 37 | def __init__(self): 38 | __metaclass__ = abc.ABCMeta 39 | 40 | # Take a list of (list of (x,y)) and return features 41 | @abc.abstractmethod 42 | def get_features(self, coordinates): 43 | return 44 | 45 | # predict top n output with ^ return value 46 | @abc.abstractmethod 47 | def predict(self, features, n): 48 | return 49 | 50 | 51 | class ONNETpred(IPred): 52 | 53 | def __init__(self): 54 | super().__init__() 55 | decoder = TrieBeamSearchDecoder(25, lm="sbo", 56 | ngram=5, prune=100, 57 | trie="100k", gamma=1) 58 | self.model = ONNET(preload=True, gru=False, decoder=decoder) 59 | 60 | def get_features(self, strokes): 61 | points = [] 62 | for i in range(len(strokes)): 63 | stroke = strokes[i] 64 | for x, y in stroke: 65 | points.append(Point(i, 0, x, y)) 66 | pointset = PointSet(points=points) 67 | #pointset.plot_both() 68 | scheme = PREPROCESS.SCHEME6 69 | pointset.preprocess(**scheme) 70 | #pointset.plot_both() 71 | #print(pointset) 72 | return pointset.generate_features(add_pad=10) 73 | 74 | def predict(self, features, n): 75 | x = np.expand_dims(features, axis=0) 76 | results = self.model.predict(x, top=n)[0] 77 | print(results) 78 | return results 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | #Idea 119 | .idea/ 120 | 121 | # notebooks 122 | notebooks/ 123 | 124 | data/* 125 | !data/1blm/* 126 | !data/iamon/lineStrokes(on)/* 127 | #Keep corpus example 128 | data/lblm/* 129 | !data/1blm/corpus_example.txt 130 | # 131 | 132 | # Datas 133 | data/iamoff/ 134 | data/iamon/lineStrokes(on)/npz-6/* 135 | # Keep some examples 136 | data/iamon/lineStrokes(on)/data/* 137 | !data/iamon/lineStrokes(on)/data/m05/m05-507/* 138 | !data/iamon/lineStrokes(on)/data/b04/b04-334/* 139 | !data/iamon/lineStrokes(on)/data/d06/d06-414/* 140 | 141 | # corpus 142 | data/1blm 143 | 144 | # language model counter pickle 145 | data/lm/* 146 | !data/lm/lm_5gramchar_counter_pruned-100.pkl 147 | 148 | #Models 149 | models/*/checkpoint/* 150 | models/iamon/* 151 | models/iamoff/* 152 | !models/iamon/ONNET/* 153 | 154 | 155 | # keep readme.md 156 | !readme.md 157 | 158 | # old model 159 | hwr/on 160 | -------------------------------------------------------------------------------- /data/iamon/split-config/testset_v.txt: -------------------------------------------------------------------------------- 1 | a02-050 2 | a03-023 3 | c02-035 4 | c04-061 5 | d06-037 6 | f07-076a 7 | g01-022 8 | g06-000j 9 | c02-078 10 | e02-025 11 | e07-007 12 | f04-068 13 | f07-069 14 | g03-026 15 | g06-000f 16 | h04-057 17 | a01-003 18 | a06-020 19 | a06-110 20 | a06-119 21 | c02-000 22 | d04-117 23 | g06-000o 24 | j01-059 25 | a04-015 26 | b05-032 27 | d04-125 28 | e04-043 29 | g02-073 30 | h02-004 31 | h02-040 32 | h07-040 33 | a01-023z 34 | c01-041z 35 | e03-137z 36 | g08-249z 37 | k04-256z 38 | l06-690z 39 | n04-282z 40 | p05-590z 41 | a01-048z 42 | c02-059z 43 | e03-156z 44 | g09-267z 45 | k04-274z 46 | l06-714z 47 | n04-311z 48 | p05-612z 49 | b05-422z 50 | d07-510z 51 | f09-499z 52 | j07-370z 53 | l03-272z 54 | m06-637z 55 | p02-169z 56 | r04-347z 57 | b05-444z 58 | d08-528z 59 | f09-518z 60 | j08-391z 61 | l03-295z 62 | m06-667z 63 | p02-192z 64 | r05-367z 65 | b05-465z 66 | d08-544z 67 | f10-536z 68 | j08-408z 69 | l03-323z 70 | m06-693z 71 | p02-214z 72 | r06-388z 73 | b06-489z 74 | d08-561z 75 | f10-558z 76 | j08-425z 77 | l03-340z 78 | m06-712z 79 | p02-236z 80 | r06-412z 81 | b06-508z 82 | d08-579z 83 | f10-582z 84 | j09-443z 85 | l04-363z 
86 | m06-736z 87 | p03-254z 88 | r06-429z 89 | b06-527z 90 | d09-598z 91 | g01-016z 92 | j10-457z 93 | l04-388z 94 | m06-759z 95 | p03-279z 96 | r06-453z 97 | b07-548z 98 | d09-618z 99 | g02-032z 100 | j10-480z 101 | l04-408z 102 | n01-015z 103 | p03-304z 104 | r07-475z 105 | b07-568z 106 | d09-642z 107 | g03-048z 108 | j10-499z 109 | l04-425z 110 | n01-035z 111 | p03-331z 112 | r07-503z 113 | b07-587z 114 | d09-664z 115 | g03-067z 116 | k01-021z 117 | l04-446z 118 | n01-055z 119 | p03-354z 120 | r07-532z 121 | b08-611z 122 | d10-684z 123 | g04-085z 124 | k02-043z 125 | l04-468z 126 | n01-074z 127 | p04-379z 128 | r07-555z 129 | b08-636z 130 | d10-709z 131 | g04-101z 132 | k02-067z 133 | l05-489z 134 | n02-096z 135 | p04-401z 136 | r08-580z 137 | b08-658z 138 | e01-009z 139 | g04-116z 140 | k02-091z 141 | l05-517z 142 | n02-117z 143 | p04-422z 144 | r08-599z 145 | b08-681z 146 | e01-026z 147 | g05-133z 148 | k02-111z 149 | l05-536z 150 | n02-140z 151 | p04-445z 152 | r08-617z 153 | b09-710z 154 | e01-049z 155 | g06-150z 156 | k03-130z 157 | l05-554z 158 | n02-163z 159 | p04-468z 160 | r09-638z 161 | b10-739z 162 | e02-077z 163 | g06-172z 164 | k03-154z 165 | l05-577z 166 | n03-188z 167 | p04-492z 168 | r09-664z 169 | b10-764z 170 | e02-095z 171 | g06-186z 172 | k03-181z 173 | l05-601z 174 | n03-209z 175 | p05-519z 176 | r09-689z 177 | c01-009z 178 | e02-117z 179 | g07-205z 180 | k03-205z 181 | l06-628z 182 | n03-232z 183 | p05-543z 184 | r09-711z 185 | a01-020z 186 | c01-038z 187 | e02-134z 188 | g07-222z 189 | k03-228z 190 | l06-655z 191 | n04-255z 192 | p05-565z 193 | -------------------------------------------------------------------------------- /hwr/lm/lm.py: -------------------------------------------------------------------------------- 1 | from nltk.lm.api import LanguageModel, Smoothing 2 | 3 | 4 | class MLE(LanguageModel): 5 | def __init__(self, order, **kwargs): 6 | super().__init__(order, **kwargs) 7 | self.order = order 8 | 9 | def unmasked_score(self, word, context=None): 10 | if context: 11 | context = context[-(self.order - 1):] 12 | return self.context_counts(context).freq(word) 13 | 14 | 15 | class StupidBackoff(LanguageModel): 16 | def __init__(self, order, backoff=0.4, **kwargs): 17 | super().__init__(order, **kwargs) 18 | self.backoff = backoff 19 | 20 | def unmasked_score(self, word, context=None): 21 | if not context: 22 | return self.counts[1].freq(word) 23 | context = context[-(self.order - 1):] 24 | context_freq_d = self.context_counts(context) 25 | ngram_count = context_freq_d[word] 26 | if not ngram_count: 27 | return self.backoff * self.unmasked_score(word, context[1:]) 28 | else: 29 | return ngram_count / context_freq_d.N() 30 | 31 | 32 | class KneserNeyInterpolated(LanguageModel): 33 | def __init__(self, order, **kwargs): 34 | super().__init__(order, **kwargs) 35 | self.estimator = KneserNey(self.vocab, self.counts) 36 | 37 | def unmasked_score(self, word, context=None): 38 | if not context: 39 | return self.estimator.unigram_score(word) 40 | context = context[-(self.order - 1):] 41 | alpha, gamma = self.estimator.alpha_gamma(word, context) 42 | return alpha + gamma * self.unmasked_score(word, context[1:]) 43 | 44 | 45 | # KneserNey of NLTK, but implemented with backoff 46 | class KneserNeyBackoff(LanguageModel): 47 | def __init__(self, order, backoff=0.4, **kwargs): 48 | super().__init__(order, **kwargs) 49 | self.estimator = KneserNey(self.vocab, self.counts, backoff=backoff) 50 | 51 | def unmasked_score(self, word, context=None): 52 | if not context: 53 | 
return self.estimator.unigram_score(word) 54 | context = context[-(self.order - 1):] 55 | alpha, gamma = self.estimator.alpha_gamma(word, context) 56 | return alpha + gamma * self.unmasked_score(word, context[1:]) 57 | 58 | 59 | def count_non_zero_vals(dictionary): 60 | return sum(1.0 for c in dictionary.values() if c > 0) 61 | 62 | 63 | class KneserNey(Smoothing): 64 | 65 | def __init__(self, vocabulary, counter, discount=0.1, backoff=0.0, **kwargs): 66 | super(KneserNey, self).__init__(vocabulary, counter, *kwargs) 67 | self.discount = discount 68 | self.backoff = backoff 69 | 70 | def unigram_score(self, word): 71 | return 1.0 / len(self.vocab) 72 | 73 | def alpha_gamma(self, word, context): 74 | prefix_counts = self.counts[context] 75 | prefix_total_ngrams = prefix_counts.N() 76 | word_count_given_prefix = prefix_counts[word] 77 | if word_count_given_prefix: 78 | alpha = max(word_count_given_prefix - self.discount, 0.0) / prefix_total_ngrams 79 | gamma = (self.discount * count_non_zero_vals(prefix_counts) / prefix_total_ngrams) 80 | else: 81 | alpha = 0.0 82 | gamma = self.backoff if self.backoff else 1 83 | 84 | return alpha, gamma 85 | -------------------------------------------------------------------------------- /data/iamon/split-config/testset_t.txt: -------------------------------------------------------------------------------- 1 | a01-000u 2 | a01-053x 3 | a02-004 4 | c02-007 5 | d05-013 6 | g07-038 7 | g07-065 8 | h04-028 9 | b01-127 10 | c04-000 11 | c04-008 12 | e02-000 13 | f07-046b 14 | g04-055 15 | g06-000k 16 | j04-015 17 | a01-132 18 | a02-017 19 | a04-047 20 | c02-056 21 | f02-044 22 | f04-004 23 | g04-036 24 | h06-079 25 | a02-037 26 | a02-102 27 | b01-014 28 | d05-008 29 | e01-014 30 | f04-020 31 | f04-083 32 | h02-019 33 | b06-004 34 | b06-087 35 | d04-066 36 | e04-091 37 | f04-032 38 | f07-028b 39 | g01-039 40 | h04-035 41 | a02-057 42 | a03-011 43 | a03-054 44 | a03-066 45 | a06-064 46 | c02-082 47 | f02-030 48 | h01-030 49 | a01-087 50 | a03-059 51 | a06-070 52 | b05-083 53 | c04-075 54 | e04-026 55 | f02-003 56 | f04-046 57 | a01-001z 58 | c01-013z 59 | e02-120z 60 | g07-225z 61 | k03-232z 62 | l06-661z 63 | n04-259z 64 | p05-567z 65 | a10-697z 66 | d02-062z 67 | f02-094z 68 | h10-437z 69 | k09-890z 70 | m02-132z 71 | n08-954z 72 | p10-258z 73 | a10-719z 74 | d02-081z 75 | f02-121z 76 | j01-017z 77 | k09-918z 78 | m02-149z 79 | n08-990z 80 | p10-288z 81 | b01-015z 82 | d03-109z 83 | f03-148z 84 | j01-038z 85 | k09-946z 86 | m03-167z 87 | n09-090z 88 | p10-313z 89 | b01-039z 90 | d03-137z 91 | f03-177z 92 | j01-060z 93 | k09-980z 94 | m03-200z 95 | n09-110z 96 | p10-341z 97 | b01-065z 98 | d03-157z 99 | f03-205z 100 | j02-078z 101 | k10-051z 102 | m03-225z 103 | n09-137z 104 | p10-364z 105 | b01-087z 106 | d03-184z 107 | f03-231z 108 | j02-095z 109 | k10-075z 110 | m04-246z 111 | n09-160z 112 | r01-016z 113 | b02-108z 114 | d03-211z 115 | f03-255z 116 | j02-110z 117 | k10-103z 118 | m04-277z 119 | n09-183z 120 | r01-043z 121 | b02-127z 122 | d04-243z 123 | f03-284z 124 | j03-138z 125 | l01-011z 126 | m04-301z 127 | n09-210z 128 | r01-072z 129 | b03-148z 130 | d04-263z 131 | f04-304z 132 | j03-158z 133 | l01-037z 134 | m04-325z 135 | n10-239z 136 | r02-096z 137 | b03-163z 138 | d04-290z 139 | f04-318z 140 | j04-183z 141 | l01-063z 142 | m04-359z 143 | n10-260z 144 | r02-119z 145 | b03-184z 146 | d05-315z 147 | f05-335z 148 | j04-200z 149 | l01-091z 150 | m04-392z 151 | n10-284z 152 | r02-140z 153 | b03-205z 154 | d05-335z 155 | f05-352z 156 | j04-218z 157 | 
l01-114z 158 | m04-416z 159 | n10-305z 160 | r02-166z 161 | b04-236z 162 | d05-360z 163 | f05-370z 164 | j04-238z 165 | l01-141z 166 | m05-439z 167 | n10-330z 168 | r02-190z 169 | b04-261z 170 | d05-380z 171 | f06-384z 172 | j05-261z 173 | l02-158z 174 | m05-472z 175 | p01-010z 176 | r03-208z 177 | b04-289z 178 | d05-396z 179 | f06-401z 180 | j06-278z 181 | l02-176z 182 | m05-492z 183 | p01-032z 184 | r03-235z 185 | b04-315z 186 | d06-419z 187 | f07-417z 188 | j06-301z 189 | l02-194z 190 | m05-516z 191 | p01-063z 192 | r04-260z 193 | b04-344z 194 | d06-440z 195 | f07-436z 196 | j06-316z 197 | l02-216z 198 | m05-543z 199 | p01-095z 200 | r04-283z 201 | b04-369z 202 | d07-460z 203 | f07-453z 204 | j06-333z 205 | l02-237z 206 | m05-576z 207 | p01-119z 208 | r04-308z 209 | b05-400z 210 | d07-485z 211 | f08-473z 212 | j07-353z 213 | l03-252z 214 | m06-608z 215 | p01-147z 216 | r04-330z 217 | -------------------------------------------------------------------------------- /hwr/lm/generate_lm.py: -------------------------------------------------------------------------------- 1 | import io 2 | import re 3 | from collections import defaultdict 4 | 5 | import dill as pickle 6 | from nltk.lm import NgramCounter 7 | from nltk.probability import FreqDist, ConditionalFreqDist 8 | from nltk.util import ngrams 9 | from tqdm import tqdm 10 | 11 | from hwr.constants import PATH 12 | 13 | 14 | def clean_space(txt): 15 | return re.sub(r"\s+", " ", txt) 16 | 17 | 18 | def clean_newline(txt): 19 | return re.sub(r"\n", " ", txt) 20 | 21 | 22 | def clean_chars(txt): 23 | return re.sub(r"[^!\"&'/\(\)\[\]*,\-+.\s:;?0-9a-zA-Z]", "", txt) 24 | 25 | 26 | def clean_text(txt): 27 | return clean_space(clean_newline(clean_chars(txt))).lower() 28 | 29 | 30 | # count everygram up to ngram 31 | def update_counter(counter, ngram, fname, batch=10000): 32 | print("Updating counter with file:") 33 | print(fname) 34 | with io.open(fname, encoding='utf8') as fin: 35 | txt = fin.read() 36 | txt = clean_text(txt) 37 | # n of batches 38 | k = int(len(txt) / batch) 39 | 40 | # for each ngram 41 | print("Updating ngrams:") 42 | for n in tqdm(range(1, ngram + 1)): 43 | # for each batch 44 | for i in range(k + 1): 45 | # update ngram 46 | start = 0 if i == 0 else i * batch - n + 1 47 | last = -1 if i == k else (i + 1) * batch + n - 1 48 | counter.update([ngrams(txt[start:last], n)]) 49 | return counter 50 | 51 | 52 | # Overriding ConditionalFreqDist constructor 53 | def init_override(self, cond_samples=None, with_freq=False): 54 | defaultdict.__init__(self, FreqDist) 55 | if cond_samples: 56 | if not with_freq: 57 | for (cond, sample) in cond_samples: 58 | self[cond][sample] += 1 59 | else: 60 | for (cond, sample, freq) in cond_samples: 61 | self[cond][sample] += freq 62 | 63 | 64 | def prune_cond_dist(cond_dist, threshold=10): 65 | setattr(ConditionalFreqDist, '__init__', init_override) 66 | tuple_list = list(cond_dist.__reduce__()[4]) 67 | cond_samples = [] 68 | for cond, freq_dist in tuple_list: 69 | for c in freq_dist: 70 | n = freq_dist[c] 71 | if n > threshold: 72 | cond_samples.append((cond, c, n)) 73 | return ConditionalFreqDist(cond_samples=cond_samples, with_freq=True) 74 | 75 | 76 | def prune_counter(counter, order, threshold=10): 77 | new_counter = NgramCounter() 78 | new_counter._counts[1] = counter[1] 79 | for i in range(2, order+1): 80 | new_counter._counts[i] = prune_cond_dist(counter[i], threshold=threshold) 81 | return new_counter 82 | 83 | 84 | def get_unique_ngram_count(counter, order): 85 | total = 0 86 | for i in 
range(2, order + 1): 87 | cond = counter[i] 88 | for freqdist in cond.values(): 89 | total += sum(1.0 for c in freqdist.values() if c > 0) 90 | return total 91 | 92 | 93 | def get_cond_count(counter, order): 94 | s = 0 95 | for i in range(order): 96 | s += len(counter[i]) 97 | return s 98 | 99 | 100 | # Return a subset of ngram of lower order 101 | def get_subset_from_counter(counter, order): 102 | new_counter = NgramCounter() 103 | for i in range(order): 104 | new_counter._counts[i + 1] = counter[i + 1] 105 | 106 | return new_counter 107 | 108 | 109 | 110 | 111 | ngram = 7 112 | fname = "lm_7gram_counter.pkl" 113 | 114 | if __name__ == "__main__": 115 | counter = NgramCounter() 116 | for p in range(1, 100): 117 | print("file {}".format(p)) 118 | fnum = "0000" + str(p) if p < 10 else "000" + str(p) 119 | fn = PATH.BASE_DIR + '../data/1blm/training-monolingual.tokenized.shuffled/news.en-' + fnum + '-of-00100' 120 | counter = update_counter(counter, ngram, fn) 121 | 122 | with open(fname, 'wb') as fout: 123 | pickle.dump(counter, fout) 124 | 125 | print("Completed.") 126 | -------------------------------------------------------------------------------- /conda-requirements.txt: -------------------------------------------------------------------------------- 1 | # For tensorflow gpu. 2 | # This file may be used to create an environment using: 3 | # $ conda create --name --file 4 | # platform: linux-64 5 | _libgcc_mutex=0.1=main 6 | _tflow_select=2.1.0=gpu 7 | absl-py=0.7.1=py37_0 8 | astor=0.8.0=py37_0 9 | attrs=19.1.0=py37_1 10 | backcall=0.1.0=py37_0 11 | blas=1.0=mkl 12 | bleach=3.1.0=pypi_0 13 | c-ares=1.15.0=h7b6447c_1001 14 | ca-certificates=2019.5.15=1 15 | certifi=2019.6.16=py37_1 16 | cudatoolkit=10.1.168=0 17 | cudnn=7.6.0=cuda10.1_0 18 | cupti=10.1.168=0 19 | cycler=0.10.0=py37_0 20 | dbus=1.13.6=h746ee38_0 21 | decorator=4.4.0=py37_1 22 | defusedxml=0.6.0=py_0 23 | dill=0.3.0=py37_0 24 | editdistance=0.5.3=py37hf484d3e_0 25 | entrypoints=0.3=py37_0 26 | expat=2.2.6=he6710b0_0 27 | fontconfig=2.13.0=h9420a91_0 28 | freetype=2.9.1=h8a8886c_1 29 | gast=0.2.2=py37_0 30 | glib=2.56.2=hd408876_0 31 | gmp=6.1.2=h6c8ec71_1 32 | google-pasta=0.1.7=py_0 33 | grpcio=1.16.1=py37hf8bcb03_1 34 | gst-plugins-base=1.14.0=hbbd80ab_1 35 | gstreamer=1.14.0=hb453b48_1 36 | h5py=2.9.0=py37h7918eee_0 37 | hdf5=1.10.4=hb1b8bf9_0 38 | icu=58.2=h9c2bf20_1 39 | intel-openmp=2019.4=243 40 | ipykernel=5.1.2=py37h39e3cac_0 41 | ipython=7.8.0=py37h39e3cac_0 42 | ipython_genutils=0.2.0=py37_0 43 | ipywidgets=7.5.1=py_0 44 | jedi=0.15.1=py37_0 45 | jinja2=2.10.1=py37_0 46 | jpeg=9b=h024ee3a_2 47 | jsonschema=3.0.2=py37_0 48 | jupyter=1.0.0=py37_7 49 | jupyter_client=5.3.1=py_0 50 | jupyter_console=6.0.0=py37_0 51 | jupyter_core=4.5.0=py_0 52 | keras-applications=1.0.8=py_0 53 | keras-preprocessing=1.1.0=py_1 54 | kiwisolver=1.1.0=py37he6710b0_0 55 | libedit=3.1.20181209=hc058e9b_0 56 | libffi=3.2.1=hd88cf55_4 57 | libgcc-ng=9.1.0=hdf63c60_0 58 | libgfortran-ng=7.3.0=hdf63c60_0 59 | libpng=1.6.37=hbc83047_0 60 | libprotobuf=3.8.0=hd408876_0 61 | libsodium=1.0.16=h1bed415_0 62 | libstdcxx-ng=9.1.0=hdf63c60_0 63 | libuuid=1.0.3=h1bed415_2 64 | libxcb=1.13=h1bed415_1 65 | libxml2=2.9.9=hea5a465_1 66 | libxslt=1.1.33=h7d1a2b0_0 67 | lxml=4.4.1=py37hefd8a0e_0 68 | markdown=3.1.1=py37_0 69 | markupsafe=1.1.1=py37h7b6447c_0 70 | matplotlib=3.1.1=py37h5429711_0 71 | mistune=0.8.4=py37h7b6447c_0 72 | mkl=2019.4=243 73 | mkl-service=2.3.0=py37he904b0f_0 74 | mkl_fft=1.0.14=py37ha843d7b_0 75 | mkl_random=1.0.2=py37hd81dba3_0 
76 | nbconvert=5.5.0=py_0 77 | nbformat=4.4.0=py37_0 78 | ncurses=6.1=he6710b0_1 79 | nltk=3.4.5=py37_0 80 | notebook=6.0.1=py37_0 81 | numpy=1.16.5=py37h7e9f1db_0 82 | numpy-base=1.16.5=py37hde5b4d6_0 83 | openssl=1.1.1d=h7b6447c_1 84 | pandoc=2.2.3.2=0 85 | pandocfilters=1.4.2=py37_1 86 | parso=0.5.1=py_0 87 | pcre=8.43=he6710b0_0 88 | pexpect=4.7.0=py37_0 89 | pickleshare=0.7.5=py37_0 90 | pip=19.2.2=py37_0 91 | prometheus_client=0.7.1=py_0 92 | prompt_toolkit=2.0.9=py37_0 93 | protobuf=3.8.0=py37he6710b0_0 94 | ptyprocess=0.6.0=py37_0 95 | pygments=2.4.2=py_0 96 | pyparsing=2.4.2=py_0 97 | pyqt=5.9.2=py37h05f1152_2 98 | pyrsistent=0.14.11=py37h7b6447c_0 99 | python=3.7.4=h265db76_1 100 | python-dateutil=2.8.0=py37_0 101 | pytz=2019.2=py_0 102 | pyzmq=18.1.0=py37he6710b0_0 103 | qt=5.9.7=h5867ecd_1 104 | qtconsole=4.5.5=py_0 105 | readline=7.0=h7b6447c_5 106 | scipy=1.3.1=py37h7c811a0_0 107 | send2trash=1.5.0=py37_0 108 | setuptools=41.0.1=py37_0 109 | sip=4.19.8=py37hf484d3e_0 110 | six=1.12.0=py37_0 111 | sqlite=3.29.0=h7b6447c_0 112 | tensorboard=1.14.0=py37hf484d3e_0 113 | tensorflow=1.14.0=gpu_py37h74c33d7_0 114 | tensorflow-base=1.14.0=gpu_py37he45bfe2_0 115 | tensorflow-estimator=1.14.0=py_0 116 | tensorflow-gpu=1.14.0=h0d30ee6_0 117 | termcolor=1.1.0=py37_1 118 | terminado=0.8.2=py37_0 119 | testpath=0.4.2=py37_0 120 | tk=8.6.8=hbc83047_0 121 | tornado=6.0.3=py37h7b6447c_0 122 | tqdm=4.32.1=py_0 123 | traitlets=4.3.2=py37_0 124 | wcwidth=0.1.7=py37_0 125 | webencodings=0.5.1=py37_1 126 | werkzeug=0.15.5=py_0 127 | wheel=0.33.4=py37_0 128 | widgetsnbextension=3.5.1=py37_0 129 | wrapt=1.11.2=py37h7b6447c_0 130 | xz=5.2.4=h14c3975_4 131 | zeromq=4.3.1=he6710b0_3 132 | zlib=1.2.11=h7b6447c_3 133 | -------------------------------------------------------------------------------- /hwr/decoding/trie_beam_search.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from hwr.constants import DATA 7 | 8 | 9 | # get the ending alphabets given a word beam 10 | def get_ending_alphas(text): 11 | end_alphas = "" 12 | for i in reversed(range(len(text))): 13 | if text[i].isalpha(): 14 | end_alphas = text[i] + end_alphas 15 | else: 16 | break 17 | return end_alphas 18 | 19 | 20 | # sm has dimension [sample, timestep, num_of_chars] 21 | def trie_beam_search(rnn_out, bw, top_paths, trie=None, lm=None, lm_order=0, candidate_cap=5, gamma=0.5): 22 | return [__trie_beam_search(x, lm, trie, bw, top_paths, lm_order, candidate_cap, gamma) for x in tqdm(rnn_out)] 23 | 24 | 25 | def __trie_beam_search(mat, lm, trie, bw, top_paths, lm_order, candidate_cap, gamma): 26 | # pb[t][beam]: P of {beam} at time {t} ending with blank '%' 27 | # pnb[t][beam]: P of {beam} at time {t} ending with any non blank chars 28 | # Ptxt[beam] : P of {beam} given a language model. 
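    # The loop below implements the standard CTC prefix beam search
    # recurrence, extended with a trie and a character-level LM:
    #   Pb(l, t)    += mat(blank, t) * (Pb(l, t-1) + Pnb(l, t-1))
    #   Pnb(l+c, t) += mat(c, t) * Pb(l, t-1)                    if c == l[-1]
    #   Pnb(l+c, t) += mat(c, t) * (Pb(l, t-1) + Pnb(l, t-1))    otherwise
    # Beams are ranked by Ptotal(l, t) * Ptxt(l)^gamma, so gamma weights the
    # character LM against the optical model.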
29 | pb, pnb, ptxt = defaultdict(Counter), defaultdict(Counter), {} 30 | timestep, chars_size = mat.shape 31 | # add a time step 0 for P(t-1) at t=1 32 | mat = np.vstack((np.zeros(chars_size), mat)) 33 | pb[0][''] = 1 34 | pnb[0][''] = 0 35 | ptxt[''] = 1 36 | beams_prev = [''] 37 | 38 | for t in range(1, timestep + 1): 39 | for beam in beams_prev: 40 | # Get ending alphabets, try to form a word in the trie 41 | if trie: 42 | ending_alphas = get_ending_alphas(beam).lower() 43 | candidates = trie.get_char_candidates(ending_alphas) 44 | # Allow uppercase and non-alphabet characters only when a word is formed or no word is being formed 45 | if trie.is_word(ending_alphas) or ending_alphas == "": 46 | candidates += [c.upper() for c in candidates] 47 | candidates += DATA.NON_ALPHAS 48 | candidates += "%" 49 | else: 50 | candidates = DATA.CHARS 51 | 52 | # Check only top n candidates for performance 53 | if len(candidates) > candidate_cap: 54 | candidates = sorted(candidates, key=lambda c: mat[t][DATA.CHARS.index(c)], reverse=True)[:candidate_cap] 55 | 56 | for char in candidates: 57 | # if candidate is blank 58 | if char == '%': 59 | # Pb(beam,t) += mat(blank,t) * Ptotal(beam,t-1) 60 | pb[t][beam] += mat[t][-1] * (pb[t - 1][beam] + pnb[t - 1][beam]) 61 | 62 | # if candidate is non-blank 63 | else: 64 | new_beam = beam + char 65 | letter_idx = DATA.CHARS.index(char) 66 | 67 | # Apply character-level language model and calculate Ptxt(beam) 68 | if lm: 69 | if new_beam not in ptxt.keys(): 70 | # Ptxt(beam+c) = P(c|last n char in beam) 71 | prefix = beam[-(lm_order - 1):] 72 | ptxt[new_beam] = lm.score(char.lower(), [p for p in prefix.lower()]) 73 | else: 74 | ptxt[new_beam] = 1 75 | 76 | # if the new candidate and the last char in the beam are the same 77 | if len(beam) > 0 and char == beam[-1]: 78 | # Pnb(beam+c,t) += mat(c,t) * Pb(beam,t-1) 79 | pnb[t][new_beam] += mat[t][letter_idx] * pb[t - 1][beam] 80 | # Pnb(beam,t) = mat(c,t) * Pnb(beam,t-1) 81 | pnb[t][beam] += mat[t][letter_idx] * pnb[t - 1][beam] 82 | else: 83 | # Pnb(beam+c,t) = mat(c,t) * Ptotal(beam,t-1) 84 | pnb[t][new_beam] += mat[t][letter_idx] * (pb[t - 1][beam] + pnb[t - 1][beam]) 85 | Ptotal_t = pb[t] + pnb[t] 86 | # sort by Ptotal * weighted Ptxt 87 | sort = lambda k: Ptotal_t[k] * (ptxt[k] ** gamma) 88 | # Top (bw) beams for next iteration 89 | beams_prev = sorted(Ptotal_t, key=sort, reverse=True)[:bw] 90 | 91 | return beams_prev[:top_paths] 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /hwr/decoding/trie_beam_search-backup.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from hwr.constants import DATA 7 | 8 | # Issue: minor reduction of CER compared to the normal implementation. 9 | # Therefore use the old version.
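# A Beam bundles the decoded text with its current trie node and its LM
# score (ptxt), so character candidates can be read directly from curr_node
# instead of re-walking the trie from the root at every timestep.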
10 | class Beam: 11 | 12 | def __init__(self, root): 13 | self.root = root 14 | self.curr_node = root 15 | self.text = "" 16 | self.ptxt = 1 17 | 18 | def is_word(self): 19 | return self.curr_node.is_word 20 | 21 | def end_in_non_alpha(self): 22 | return self.text == "" or self.text[-1] in DATA.NON_ALPHAS 23 | 24 | # Find character candidates that would eventually form a word, unless a word is already fored 25 | def get_candidates(self): 26 | candidates = self.curr_node.get_children_chars() 27 | if self.is_word() or self.end_in_non_alpha(): 28 | candidates += [c.upper() for c in candidates] 29 | candidates += DATA.NON_ALPHAS 30 | candidates += "%" 31 | return candidates 32 | 33 | def extend(self, c, score_f): 34 | new_beam = Beam(self.root) 35 | if c in DATA.NON_ALPHAS: 36 | new_beam.curr_node = self.root 37 | else: 38 | new_beam.curr_node = self.curr_node.children[c.lower()] 39 | new_beam.text = self.text + c 40 | new_beam.ptxt = score_f(self.text + c) 41 | return new_beam 42 | 43 | 44 | # rnn_out has dimension [batch_size, timestep, num_of_chars] 45 | def trie_beam_search(rnn_out, lm, trie, bw, top_paths, lm_order, candidate_cap=5): 46 | return [__trie_beam_search(x, lm, trie, bw, top_paths, lm_order, candidate_cap) for x in tqdm(rnn_out)] 47 | 48 | 49 | def __trie_beam_search(mat, lm, trie, bw, top_paths, lm_order, candidate_cap): 50 | # pb[t][beam]: P of {beam} at time {t} ending with blank '%' 51 | # pnb[t][beam]: P of {beam} at time {t} ending with any non blank chars 52 | # Ptxt[beam] : P of {beam} given a language model. 53 | pb, pnb = defaultdict(Counter), defaultdict(Counter) 54 | timestep, chars_size = mat.shape 55 | # add a time step 0 for P(t-1) at t=1 56 | mat = np.vstack((np.zeros(chars_size), mat)) 57 | empty_beam = Beam(trie.root) 58 | pb[0][empty_beam] = 1 59 | pnb[0][empty_beam] = 0 60 | beams_prev = [empty_beam] 61 | 62 | for t in range(1, timestep + 1): 63 | for beam in beams_prev: 64 | # Get candidates by looking at trie 65 | candidates = beam.get_candidates() 66 | # Check only top n candidates for performance 67 | if len(candidates) > candidate_cap: 68 | candidates = sorted(candidates, key=lambda c: mat[t][DATA.CHARS.index(c)], reverse=True)[:candidate_cap] 69 | 70 | for char in candidates: 71 | # if candidate is blank 72 | if char == '%': 73 | # Pb(beam,t) += mat(blank,t) * Ptotal(beam,t-1) 74 | pb[t][beam] += mat[t][-1] * (pb[t - 1][beam] + pnb[t - 1][beam]) 75 | 76 | # if candidate is non-blank 77 | else: 78 | l_plus = beam.text + char 79 | letter_idx = DATA.CHARS.index(char) 80 | 81 | new_beam = next((b for b in beams_prev if b.text == l_plus), None) 82 | if not new_beam: 83 | if lm: 84 | score_f = lambda txt: lm.score(txt[-1].lower(), list(txt[-lm_order:-1].lower())) 85 | else: 86 | score_f = lambda txt: 1 87 | new_beam = beam.extend(char, score_f) 88 | 89 | # Apply character level language model and calculate Ptxt(beam) 90 | 91 | # Ptxt(beam+c) = P(c|last n char in beam) 92 | # ptxt[new_beam] = lm.score(char.lower(), [p for p in prefix.lower()]) 93 | 94 | # if new candidate and last char in the beam is same 95 | if len(beam.text) > 0 and char == beam.text[-1]: 96 | # Pnb(beam+c,t) += mat(c,t) * Pb(beam,t-1) 97 | pnb[t][new_beam] += mat[t][letter_idx] * pb[t - 1][beam] 98 | # Pnb(beam,t) = mat(c,t) * Pnb(beam,t-1) 99 | pnb[t][beam] += mat[t][letter_idx] * pnb[t - 1][beam] 100 | else: 101 | # Pnb(beam+c,t) = mat(c,t) * Ptotal(beam,t-1) 102 | pnb[t][new_beam] += mat[t][letter_idx] * (pb[t - 1][beam] + pnb[t - 1][beam]) 103 | # TODO: fix 104 | # bandaid fix for 
issue: When sequence is padded too long, sometimes Pb and Pnb reaches zero 105 | if (pb[t] + pnb[t]) == Counter(): 106 | pb[t], pnb[t] = pb[t-1], pnb[t-1] 107 | 108 | Ptotal_t = pb[t] + pnb[t] 109 | # sort by Ptotal * Ptxt 110 | sort = lambda k: Ptotal_t[k] * k.ptxt 111 | # Top (bw) beams for next iteration 112 | beams_prev = sorted(Ptotal_t, key=sort, reverse=True)[:bw] 113 | 114 | return [b.text for b in beams_prev[:top_paths]] 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /hwr/app/views.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from hwr.app.event import Event 3 | from hwr.app.pubsub import pub 4 | 5 | 6 | # the drawing pad 7 | class WritingPadView(tk.LabelFrame): 8 | def __init__(self, parent, **kwargs): 9 | super().__init__(parent, **kwargs) 10 | # state variables 11 | self.btn1pressed = False 12 | self.newline = True 13 | self.xorig = None 14 | self.yorig = None 15 | self.drawing = False 16 | self.after_list = [] 17 | self.curr_stroke = 0 18 | self.points = [] 19 | 20 | # View 21 | self.setup_canvas() 22 | 23 | def mouse1press(self, event): 24 | if not self.drawing: 25 | pub(Event.START_DRAWING, None) 26 | 27 | self.drawing = True 28 | self.btn1pressed = True 29 | self.xorig = event.x 30 | self.yorig = event.y 31 | self.curr_stroke += 1 32 | self.points.append([]) 33 | if self.after_list: 34 | self.after_cancel(self.after_list.pop(0)) 35 | 36 | def mouse1release(self, event): 37 | self.btn1pressed = False 38 | self.xorig = None 39 | self.yorig = None 40 | # Wait 1s, if mouse1 was not pressed, clear canvas 41 | after_id = self.after(1000, self.on_end_drawing) 42 | self.after_list.append(after_id) 43 | 44 | def mousemove(self, event): 45 | # left click held down 46 | if self.btn1pressed: 47 | if self.xorig: 48 | event.widget.create_line(self.xorig, self.yorig, event.x, event.y, 49 | smooth=tk.TRUE, width=3) 50 | self.xorig = event.x 51 | self.yorig = event.y 52 | # append point to last stroke 53 | self.points[-1].append((event.x, event.y)) 54 | 55 | def setup_canvas(self): 56 | self.canvas = tk.Canvas(self, bg="white") 57 | self.canvas.bind("", self.mousemove) 58 | self.canvas.bind("", self.mouse1press) 59 | self.canvas.bind("", self.mouse1release) 60 | self.canvas.grid(column=0, row=0, sticky="nsew") 61 | 62 | def on_end_drawing(self): 63 | self.clear_canvas() 64 | pub(Event.END_DRAWING, self.points) 65 | self.points = [] 66 | 67 | def clear_canvas(self): 68 | self.canvas.delete("all") 69 | self.drawing = False 70 | 71 | 72 | class PredictedTextView(tk.LabelFrame): 73 | 74 | def __init__(self, parent, **kwargs): 75 | super().__init__(parent, **kwargs) 76 | self.setup_textbox() 77 | 78 | def on_predictions_setted(self, preds): 79 | self.insert_text(preds[0]) 80 | self.set_word_end() 81 | 82 | def setup_textbox(self): 83 | self.textbox = ScrolledText(self) 84 | self.textbox.grid(row=0, column=0, sticky="nsew") 85 | self.textbox.tag_configure("TAG", background="#e9e9e9") 86 | self.set_word_start() 87 | self.set_word_end() 88 | 89 | def get_input(self): 90 | print(self.textbox.get(1.0, tk.END)) 91 | 92 | def insert_text(self, text): 93 | self.textbox.tag_remove(tk.SEL, 1.0, tk.END) 94 | self.textbox.delete("WORDSTART", "WORDEND") 95 | self.textbox.insert("WORDSTART", text, (tk.SEL,)) 96 | 97 | # Set the current ending mark as start mark 98 | def set_word_start(self): 
99 | self.textbox.mark_set("WORDSTART", tk.INSERT) 100 | self.textbox.mark_gravity("WORDSTART", tk.LEFT) 101 | 102 | def set_word_end(self): 103 | self.textbox.mark_set("WORDEND", tk.INSERT) 104 | 105 | 106 | # the top-n predictions for correction 107 | class CorrectionsView(tk.LabelFrame): 108 | def __init__(self, parent, **kwargs): 109 | super().__init__(parent, **kwargs) 110 | self.buttons = [] 111 | self.setup_predictions(5) 112 | 113 | def setup_predictions(self, n): 114 | for i in range(n): 115 | b = tk.Button(self) 116 | b.grid(row=0, column=i, sticky="nsew") 117 | self.buttons.append(b) 118 | self.grid_columnconfigure(i, weight=1) 119 | 120 | def update_buttons(self, preds): 121 | assert (len(preds) == len(self.buttons)) 122 | for i in range(len(self.buttons)): 123 | self.buttons[i].config(text=preds[i], command=lambda t=preds[i]: pub(Event.PRED_SELECTED, t)) 124 | 125 | 126 | # Source code from tkinter 127 | class ScrolledText(tk.Text): 128 | def __init__(self, master=None, **kw): 129 | super(ScrolledText, self).__init__(master=master) 130 | self.frame = tk.Frame(master) 131 | self.vbar = tk.Scrollbar(self.frame) 132 | self.vbar.pack(side=tk.RIGHT, fill=tk.Y) 133 | 134 | kw.update({'yscrollcommand': self.vbar.set}) 135 | tk.Text.__init__(self, self.frame, **kw) 136 | self.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) 137 | self.vbar['command'] = self.yview 138 | 139 | # Copy geometry methods of self.frame without overriding Text 140 | # methods -- hack! 141 | text_meths = vars(tk.Text).keys() 142 | methods = vars(tk.Pack).keys() | vars(tk.Grid).keys() | vars(tk.Place).keys() 143 | methods = methods.difference(text_meths) 144 | 145 | for m in methods: 146 | if m[0] != '_' and m != 'config' and m != 'configure': 147 | setattr(self, m, getattr(self.frame, m)) 148 | 149 | def __str__(self): 150 | return str(self.frame) 151 | 152 | -------------------------------------------------------------------------------- /hwr/decoding/ctc_decoder.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import dill as pickle 4 | import numpy as np 5 | from nltk.lm import Vocabulary 6 | from tensorflow.keras import backend as K 7 | from tensorflow.keras.backend import ctc_decode 8 | from tensorflow.python.ops import ctc_ops as ctc 9 | from tensorflow.python.ops import sparse_ops, math_ops, array_ops 10 | 11 | from hwr.constants import DATA, PATH 12 | from hwr.decoding.mlf import label2txt 13 | from hwr.lm.trie import Trie 14 | from hwr.decoding.trie_beam_search import trie_beam_search 15 | from hwr.lm.lm import StupidBackoff, KneserNeyInterpolated, KneserNeyBackoff, MLE 16 | 17 | 18 | # Interface for a CTC decoding algorithm 19 | class ICTCDecoder: 20 | def __init__(self): 21 | __metaclass__ = abc.ABCMeta 22 | 23 | """ 24 | Given the softmax output of the RNN, of size (batch_size, time_step, labels), 25 | return the top n predictions for each item in the batch. 26 | The output has type list(predictions) of length = batch_size, 27 | where each predictions entry is a list(string) of length = top_n. 28 | """ 29 | @abc.abstractmethod 30 | def decode(self, rnn_out, top_n): 31 | return 32 | 33 | 34 | # See trie_beam_search.py 35 | class TrieBeamSearchDecoder(ICTCDecoder): 36 | 37 | def __init__(self, beam_width, lm=None, ngram=0, prune=0, trie=None, gamma=1): 38 | super().__init__() 39 | self.beam_width = beam_width 40 | self.gamma = gamma 41 | if lm: 42 | assert ngram 43 | file_path = PATH.LM_DATA_DIR + str(ngram) + "gram-p" + str(prune) + ".pkl" 44 | with open(file_path, 'rb') as fin: 45 | counter = pickle.load(fin) 46 | vocab = Vocabulary(DATA.CHARS) 47 | lm_switcher = { 48 | 'mle': MLE(ngram, counter=counter, vocabulary=vocab), 49 | 'sbo': StupidBackoff(ngram, backoff=0.4, counter=counter, vocabulary=vocab), 50 | 'kn': KneserNeyInterpolated(ngram, counter=counter, vocabulary=vocab), 51 | 'knbo': KneserNeyBackoff(ngram, backoff=0.4, counter=counter, vocabulary=vocab), 52 | } 53 | lm = lm_switcher[lm] 54 | self.lm = lm 55 | self.ngram = ngram 56 | if trie: 57 | trie_switcher = { 58 | '100k': "wiki-100k.txt", 59 | '10k': 'google-10000-english.txt', 60 | } 61 | trie = load_trie(PATH.LM_DATA_DIR + trie_switcher[trie]) 62 | self.trie = trie 63 | 64 | def decode(self, rnn_out, top_n): 65 | return trie_beam_search(rnn_out, self.beam_width, top_n, gamma=self.gamma, 66 | lm=self.lm, lm_order=self.ngram, trie=self.trie) 67 | 68 | 69 | def load_trie(file_path): 70 | with open(file_path, encoding='utf8') as fin: 71 | vocab_txt = fin.read().splitlines() 72 | # vocabs to lowercase and remove ones with illegal chars 73 | vocab_txt = [v.lower() for v in vocab_txt if v[:2] != "#!"] 74 | vocab_txt = [v for v in vocab_txt if all([c in DATA.CHARS for c in v])] 75 | t = Trie() 76 | t.mass_insert(vocab_txt) 77 | return t 78 | 79 | 80 | class BestPathDecoder(ICTCDecoder): 81 | def __init__(self): 82 | super().__init__() 83 | 84 | def decode(self, rnn_out, top_n): 85 | # pred = best_path_tensor(rnn_out) 86 | pred = best_path(rnn_out) 87 | return list(map(lambda p: [p for _ in range(top_n)], pred)) 88 | 89 | 90 | class BeamSearchDecoder(ICTCDecoder): 91 | def __init__(self, beam_width): 92 | super().__init__() 93 | self.beam_width = beam_width 94 | 95 | def decode(self, rnn_out, top_n): 96 | return beam_search_tensor(rnn_out, self.beam_width, top_paths=top_n) 97 | 98 | 99 | # Get max p across all labels at each timestep 100 | def best_path(rnn_out, remove_dup=True): 101 | ret = [] 102 | for i in range(len(rnn_out)): 103 | ret.append([np.argmax(row) for row in rnn_out[i]]) 104 | return label2txt(ret, remove_dup=remove_dup, multiple=True) 105 | 106 | 107 | # Greedy search. Just pick the most probable candidate at each time step. 108 | def best_path_tensor(rnn_out): 109 | result_list, _ = ctc_decode(rnn_out, np.ones(rnn_out.shape[0]) * rnn_out.shape[1]) 110 | result_list = K.eval(result_list[0]) 111 | pred = label2txt(result_list, multiple=True) 112 | return list(pred) 113 | 114 | 115 | # Beam search. Keep track of the best n beams per timestep 116 | # and calculate their probabilities to find the most probable sequence.
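# A usage sketch of the decoder classes above, under assumed shapes
# (rnn_out: (batch, timesteps, labels), as in decode()); `model` here is a
# hypothetical Keras model producing the softmax output:
#
#   rnn_out = model.predict(xs)                          # softmax per timestep
#   decoder = BeamSearchDecoder(beam_width=25)
#   top3 = decoder.decode(rnn_out, top_n=3)              # 3 strings per sample
#   greedy = BestPathDecoder().decode(rnn_out, top_n=1)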
117 | def beam_search_tensor(rnn_out, beam_width, top_paths=1): 118 | _EPSILON = 1e-7 119 | num_of_samples = rnn_out.shape[0] 120 | input_length = np.ones(num_of_samples) * rnn_out.shape[1] 121 | input_length = math_ops.to_int32(input_length) 122 | rnn_out = math_ops.log(array_ops.transpose(rnn_out, perm=[1, 0, 2]) + _EPSILON) 123 | 124 | decoded, log_prob = ctc.ctc_beam_search_decoder( 125 | inputs=rnn_out, 126 | sequence_length=input_length, 127 | beam_width=beam_width, 128 | top_paths=top_paths, 129 | merge_repeated=False) 130 | 131 | decoded_dense = [ 132 | sparse_ops.sparse_to_dense( 133 | st.indices, st.dense_shape, st.values, default_value=-1) 134 | for st in decoded 135 | ] 136 | candidates = [K.eval(i) for i in decoded_dense] 137 | 138 | pred = [[] for _ in range(num_of_samples)] 139 | for k in range(num_of_samples): 140 | for c in candidates: 141 | pred[k].append(c[k]) 142 | pred = [list(label2txt(p, multiple=True)) for p in pred] 143 | return pred 144 | 145 | 146 | -------------------------------------------------------------------------------- /hwr/data/generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from tensorflow.keras.utils import Sequence 5 | from tqdm import tqdm 6 | 7 | from hwr.constants import SPLIT, PREPROCESS 8 | from hwr.data.reader import IAMReader, xmlpath2npypath 9 | from hwr.decoding.mlf import mlf2label, mlf2txt 10 | 11 | 12 | # Generator of IAM-ON data for Keras model 13 | class IAMSequence(Sequence): 14 | def __init__(self, split=SPLIT.ALL, batch_size=1, pred=False, 15 | npz=False, preprocess=None, pad_to=None, inout_ratio=4): 16 | reader = IAMReader(split) 17 | self.samples = np.asarray(reader.get_samples()) 18 | # pre-preprocessed data in npz 19 | self.npz = npz 20 | self.npz_dir = "npz-" + str(preprocess) 21 | # xs: features 22 | # ys: ground truths 23 | self.xs = [] 24 | self.ys = [] 25 | 26 | # Load features from npz or preprocess from scratch 27 | if not self.npz: 28 | print("Not using npz. 
Data preprocessing may take some time.") 29 | preprocess_scheme = getattr(PREPROCESS, "SCHEME" + str(preprocess)) 30 | self.xs = np.asarray([s.generate_features(preprocess_scheme) for s in tqdm(self.samples)]) 31 | self.ys = np.asarray([s.ground_truth for s in self.samples]) 32 | 33 | else: 34 | for s in self.samples: 35 | data = np.load(xmlpath2npypath(s.xml_path, self.npz_dir)) 36 | self.xs.append(data['x']) 37 | self.ys.append(data['y']) 38 | self.xs = np.asarray(self.xs) 39 | self.ys = np.asarray(self.ys) 40 | 41 | self.n = len(self.samples) 42 | # Indices for shuffling 43 | self.indices = np.arange(self.n) 44 | np.random.shuffle(self.indices) 45 | self.batch_size = batch_size 46 | # If pred, generate only xs 47 | self.pred = pred 48 | 49 | # Manually define pad value 50 | if pad_to: 51 | self.x_pad, self.y_pad = pad_to 52 | self.adaptive_pad = False 53 | 54 | # Else pad to match the longest sample in batch 55 | else: 56 | self.adaptive_pad = True 57 | # How much the TDNN scale down the input 58 | self.inout_ratio = inout_ratio 59 | 60 | 61 | def __len__(self): 62 | return int(np.ceil(self.n / float(self.batch_size))) 63 | 64 | # Get a batch 65 | def __getitem__(self, idx): 66 | # batch indices 67 | inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size] 68 | batch_sample = self.samples[inds] 69 | batch_xs = self.xs[inds] 70 | batch_ys = self.ys[inds] 71 | 72 | # pad depending on the longest sample of each batch 73 | if self.adaptive_pad: 74 | max_len_x = max([len(i) for i in batch_xs]) 75 | y_pad = int(np.ceil(max_len_x / self.inout_ratio)) 76 | x_pad = y_pad * self.inout_ratio 77 | 78 | # Pad with given pad_x and pad_y value 79 | else: 80 | x_pad = self.x_pad 81 | y_pad = self.y_pad 82 | 83 | # features 84 | inputs = np.array([pad_2d(x, pad_to=x_pad, pad_value=0) 85 | for x in batch_xs]) 86 | # truth labels 87 | labels = np.array([pad_1d(y, pad_to=y_pad, pad_value=-1) 88 | for y in mlf2label(batch_ys, multiple=True)]) 89 | # Length of network output 90 | ypred_length = np.array([y_pad 91 | for _ in batch_sample])[:, np.newaxis] 92 | # Number of chars in ground truth 93 | ytrue_length = np.array([len(s.ground_truth) 94 | for s in batch_sample])[:, np.newaxis] 95 | # Prediction sequence, return only xs 96 | if self.pred: 97 | return inputs 98 | 99 | # Training/evaluation sequence 100 | return {'xs': inputs, 101 | 'ys': labels, 102 | 'ypred_length': ypred_length, 103 | 'ytrue_length': ytrue_length}, labels 104 | 105 | # Get a random sample for demonstration/testing 106 | def random_sample(self, pad=10): 107 | idx = random.randint(0, self.n - 1) 108 | return self.sample_at_idx(idx, pad=pad) 109 | 110 | def sample_at_idx(self, idx, pad=10): 111 | idx = self.indices[idx] 112 | return self.sample_at_absolute_idx(idx, pad=pad) 113 | 114 | # regardless of shuffling 115 | def sample_at_absolute_idx(self, idx, pad=10): 116 | pointset = self.samples[idx].pointset 117 | ground_truth = mlf2txt(self.samples[idx].ground_truth) 118 | network_input = self.xs[idx] 119 | network_input = pad_2d(self.xs[idx], 120 | pad_to=network_input.shape[0] + pad, 121 | pad_value=0) 122 | network_input = np.asarray([network_input]) 123 | return network_input, ground_truth, pointset 124 | 125 | 126 | def on_epoch_end(self): 127 | np.random.shuffle(self.indices) 128 | 129 | # Get xs and ys which match the current permutation defined by indices 130 | def get_xy(self): 131 | xs = [self.xs[idx] for idx in self.indices] 132 | ys = mlf2txt([self.ys[idx] for idx in self.indices], multiple=True) 133 | return xs, 
ys 134 | 135 | def gen_iter(self): 136 | for i in range(len(self)): 137 | yield self[i] 138 | 139 | 140 | def pad_2d(x, pad_to, pad_value): 141 | result = np.ones((pad_to, x.shape[1])) * pad_value 142 | result[:x.shape[0], :] = x 143 | return result 144 | 145 | 146 | def pad_1d(x, pad_to, pad_value): 147 | result = np.ones(pad_to) * pad_value 148 | result[:x.shape[0]] = x 149 | return result 150 | -------------------------------------------------------------------------------- /hwr/models/model.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import datetime 3 | import os 4 | 5 | import tensorflow as tf 6 | from tensorflow.keras.models import Model 7 | from tensorflow.keras.utils import Sequence 8 | 9 | from hwr.constants import PRETRAINED, DATA, PATH 10 | from hwr.decoding.ctc_decoder import TrieBeamSearchDecoder 11 | from hwr.models.metrics import character_error_rate, word_error_rate 12 | from tqdm import tqdm 13 | 14 | # Interface for prediction model 15 | class HWRModel(object): 16 | def __init__(self, chars=DATA.CHARS, preload_key=None, 17 | decoder=None): 18 | __metaclass__ = abc.ABCMeta 19 | self.decoder = decoder 20 | if decoder is None: 21 | self.decoder = TrieBeamSearchDecoder(beam_width=25, lm="sbo", ngram=7, 22 | prune=10, trie='100k', gamma=1) 23 | self.chars = chars 24 | self.class_name = type(self).__name__ 25 | self.ckptdir = PATH.CKPT_DIR + self.class_name + "/" 26 | self.char_size = len(chars) + 1 27 | self.model = self.get_model_conf() 28 | self.pred_model = self.get_intermediate_model(self.get_prediction_layer()) 29 | self.compile() 30 | if preload_key: 31 | self.pretrained = PRETRAINED[preload_key] 32 | print("preloading model weights from {}".format(self.pretrained)) 33 | self.load_weights(self.pretrained, full_path=True) 34 | 35 | @abc.abstractmethod 36 | def get_model_conf(self): 37 | return 38 | 39 | @abc.abstractmethod 40 | def get_prediction_layer(self): 41 | return 42 | 43 | @abc.abstractmethod 44 | def get_input_layer(self): 45 | return 46 | 47 | @abc.abstractmethod 48 | def get_optimizer(self): 49 | return 50 | 51 | @abc.abstractmethod 52 | def get_loss(self): 53 | return 54 | 55 | def get_intermediate_model(self, layer_name): 56 | in_model = Model(inputs=self.model.get_layer(self.get_input_layer()).output, 57 | outputs=self.model.get_layer(layer_name).output) 58 | # dummy loss and optimizer, predict with Sequence class requires compiled 59 | in_model.compile(loss={layer_name: lambda y_true, y_pred: y_pred}, optimizer='adam') 60 | return in_model 61 | 62 | 63 | def train(self, train_seq, test_seq, epochs=100, earlystop=5): 64 | ckptdir = self.ckptdir + get_time() + '/' 65 | if not os.path.exists(ckptdir): 66 | os.makedirs(ckptdir) 67 | cp_callback = tf.keras.callbacks.ModelCheckpoint(ckptdir + 'weights.h5', 68 | save_weights_only=True, 69 | save_best_only=True, 70 | verbose=1) 71 | es_callback = tf.keras.callbacks.EarlyStopping(patience=earlystop) 72 | self.model.fit_generator( 73 | generator=train_seq, 74 | validation_data=test_seq, 75 | shuffle=True, 76 | verbose=1, 77 | epochs=epochs, 78 | callbacks=[cp_callback, es_callback] 79 | ) 80 | 81 | def predict_softmax(self, x): 82 | # for variable length sequence 83 | if isinstance(x, Sequence) and x.batch_size == 1: 84 | print("predicting softmax for sequence with batch size: 1, will return list of ndarray.") 85 | sm = [] 86 | gen = x.gen_iter() 87 | for b in tqdm(gen, total=len(x)): 88 | sm.append(self.pred_model.predict(b, verbose=0)[0]) 89 | elif 
isinstance(x, Sequence):
90 |             sm = self.pred_model.predict_generator(x, verbose=1)
91 |         else:
92 |             sm = self.pred_model.predict(x, verbose=1)
93 |         return sm
94 | 
95 |     # return the top n predicted texts
96 |     def predict(self, x, decoder=None, top=1):
97 |         if decoder is None:
98 |             decoder = self.decoder
99 | 
100 |         softmaxs = self.predict_softmax(x)
101 |         pred = decoder.decode(rnn_out=softmaxs, top_n=top)
102 |         if top == 1:
103 |             try:
104 |                 pred = [p[0] for p in pred]
105 |             except IndexError:
106 |                 print("Index Error: {}".format(pred))
107 |         return pred
108 | 
109 |     def evaluate(self, eval_seq, metrics=None, decoder=None):
110 |         if metrics is None:
111 |             metrics = [character_error_rate, word_error_rate]
112 |         if decoder is None:
113 |             decoder = self.decoder
114 |         _, y_true = eval_seq.get_xy()
115 |         y_pred = self.predict(eval_seq, decoder=decoder)
116 |         ret = {}
117 |         for m in metrics:
118 |             ret[m.__name__] = m(y_true, y_pred)
119 |         return ret
120 | 
121 |     # Keras cannot serialize the custom CTC loss or the optimizer, so recompile after loading
122 |     def compile(self):
123 |         self.model.compile(loss=self.get_loss(),
124 |                            optimizer=self.get_optimizer())
125 | 
126 |     def save_weights(self, file_name="", full_path=False):
127 |         if not file_name:
128 |             file_name = get_time() + '.h5'
129 |         if not full_path:
130 |             file_name = self.ckptdir + file_name  # prepend the checkpoint directory
131 |         self.model.save_weights(file_name)
132 | 
133 |     def load_weights(self, file_name, full_path=False):
134 |         if not full_path:
135 |             file_name = self.ckptdir + file_name  # prepend the checkpoint directory
136 |         self.model.load_weights(file_name)
137 |         self.compile()
138 | 
139 |     def get_model_summary(self):
140 |         return self.model.summary()
141 | 
142 | 
143 | 
144 | 
145 | # get a timestamp (note: ':' in the result is not portable to Windows file names)
146 | def get_time():
147 |     return datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
148 | 
149 | 
150 | 
151 | 
-------------------------------------------------------------------------------- /hwr/models/ONNET.py: --------------------------------------------------------------------------------
1 | # suppress tf warnings
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
4 | 
5 | from tensorflow.keras import backend as K
6 | from tensorflow.keras.layers import Conv1D, AveragePooling1D, Input, Dense, Activation, \
7 |     LSTM, GRU, \
8 |     Lambda, BatchNormalization
9 | from tensorflow.keras.layers import concatenate, add
10 | from tensorflow.keras.models import Model
11 | from tensorflow.keras.optimizers import SGD
12 | 
13 | from hwr.constants import DATA
14 | from hwr.models.model import HWRModel
15 | 
16 | 
17 | 
18 | 
19 | 
20 | # Implementation of model
21 | class ONNET(HWRModel):
22 |     def __init__(self, preload=False, gru=False, decoder=None, gpu=False):
23 |         if gpu:
24 |             # Importing the CuDNN layers at module level breaks init on CPU-only installs
25 |             from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM
26 |         if gru:
27 |             self.rnn = CuDNNGRU if gpu else GRU
28 |             preload_key = "ONNET-GRU" if preload else None
29 |         else:
30 |             self.rnn = CuDNNLSTM if gpu else LSTM
31 |             preload_key = "ONNET-LSTM" if preload else None
32 |         super().__init__(preload_key=preload_key, decoder=decoder)
33 | 
34 |     def get_prediction_layer(self):
35 |         return "softmax"
36 | 
37 |     def get_input_layer(self):
38 |         return "xs"
39 | 
40 |     def get_optimizer(self):
41 |         return SGD(lr=1e-4, momentum=0.9, nesterov=True, clipnorm=5)
42 |         #return 'adam'
43 | 
44 |     def get_loss(self):
45 |         return {'ctc': lambda y_true, y_pred: y_pred}
46 | 
47 |     def get_model_conf(self):
48 |         input_shape = (None, 6)
49 |         inputs = Input(shape=input_shape, dtype='float32', name='xs')
50 |         inner = inputs
51 | 
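        # NOTE: the two AveragePooling1D(pool_size=2) layers below shrink the
        # time axis by a factor of 4 in total; this has to stay in sync with
        # IAMSequence's inout_ratio=4, which sizes the padded CTC output length.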
52 |         inner = tdnn_bn_relu(inner, 60, 7)
53 |         inner = tdnn_bn_relu(inner, 90, 5)
54 |         inner = tdnn_bn_relu(inner, 120, 5)
55 |         inner = AveragePooling1D(pool_size=2)(inner)
56 | 
57 |         inner = tdnn_bn_relu(inner, 120, 3)
58 |         inner = tdnn_bn_relu(inner, 160, 3)
59 |         inner = tdnn_bn_relu(inner, 200, 3)
60 | 
61 |         inner = AveragePooling1D(pool_size=2)(inner)
62 | 
63 |         # No significant difference between gru and lstm
64 |         inner = self.bi_rnn(inner, 60)
65 |         inner = self.bi_rnn(inner, 60)
66 |         inner = self.bi_rnn(inner, 60)
67 |         inner = self.bi_rnn(inner, 60)
68 | 
69 | 
70 |         inner = BatchNormalization()(inner)
71 | 
72 |         inner = Dense(DATA.CHARS_SIZE, kernel_initializer='he_normal')(inner)
73 |         y_pred = Activation('softmax', name='softmax')(inner)
74 | 
75 |         # parameters for CTC loss, fed as network input
76 |         labels = Input(name='ys',
77 |                        shape=[None], dtype='float32')
78 |         input_length = Input(name='ypred_length', shape=[1], dtype='int64')
79 |         label_length = Input(name='ytrue_length', shape=[1], dtype='int64')
80 | 
81 |         loss_out = Lambda(self.__ctc_lambda_func, output_shape=(1,),
82 |                           name='ctc')([y_pred, labels, input_length, label_length])
83 | 
84 |         model = Model(inputs=[inputs, labels, input_length, label_length],
85 |                       outputs=loss_out)
86 |         return model
87 | 
88 |     def __ctc_lambda_func(self, args):
89 |         y_pred, labels, input_length, label_length = args
90 |         return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
91 | 
92 |     def bi_rnn(self, inner, hidden_unit):
93 |         rnn_a = self.rnn(hidden_unit, return_sequences=True,
94 |                          kernel_initializer='he_normal')(inner)
95 |         rnn_b = self.rnn(hidden_unit, return_sequences=True,
96 |                          go_backwards=True, kernel_initializer='he_normal')(inner)
97 |         rnn_merged = concatenate([rnn_a, rnn_b])
98 |         return rnn_merged
99 | 
100 | 
101 | def tdnn_bn_relu(inner, filters, kernel_size):
102 |     inner = Conv1D(filters, kernel_size, padding="same", kernel_initializer='he_normal')(inner)
103 |     inner = BatchNormalization()(inner)
104 |     inner = Activation('relu')(inner)
105 |     return inner
106 | 
107 | 
108 | def inception(filters, filter_size, pool_size, strides, inner):
109 |     n = int(filters / (len(filter_size) + 1))
110 |     inc = []
111 |     for i in range(len(filter_size)):
112 |         inc_this = Conv1D(n, filter_size[i], strides, padding="same", kernel_initializer='he_normal')(inner)
113 |         inc.append(inc_this)
114 |     inc_avg = AveragePooling1D(pool_size=pool_size, strides=strides, padding="same")(inner)
115 |     inc_avg = Conv1D(n, 1, padding="same", kernel_initializer='he_normal')(inc_avg)
116 |     inc.append(inc_avg)
117 |     inner = concatenate(inc)
118 |     inner = BatchNormalization()(inner)
119 |     inner = Activation('relu')(inner)
120 |     return inner
121 | 
122 | 
123 | def residual_inception(inner):
124 |     inc1 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
125 |     inc2 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
126 |     inc2 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc2)
127 |     inc3 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
128 |     inc3 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc3)
129 |     inc3 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc3)
130 |     inc_cc = concatenate([inc1, inc2, inc3])
131 |     inc_cc = Conv1D(256, 1, padding="same", kernel_initializer='he_normal')(inc_cc)
132 |     inner = add([inner, inc_cc])
133 |     return inner
134 | 
135 | 
136 | # Override the default LSTM activation function for CuDNN compatibility
137 | class LSTM(LSTM):
138 |     def
__init__(self, *args, **kwargs):
139 |         super().__init__(recurrent_activation='sigmoid', *args, **kwargs)
140 | 
141 | 
-------------------------------------------------------------------------------- /hwr/data/reader.py: --------------------------------------------------------------------------------
1 | import os
2 | from decimal import Decimal
3 | 
4 | import numpy as np
5 | from lxml import etree
6 | 
7 | from hwr.constants import SPLIT, PATH, DATA
8 | from hwr.data.datarep import PointSet, Point
9 | from hwr.decoding.mlf import mlf2txt
10 | 
11 | 
12 | # A sample from the IAM online database
13 | class Sample(object):
14 |     def __init__(self, xml_path, ground_truth):
15 |         self.xml_path = xml_path
16 |         self.ground_truth = ground_truth
17 |         self.__pointset = None
18 | 
19 |     def generate_features(self, preprocess):
20 |         return self.pointset.generate_features(preprocess=preprocess)
21 | 
22 |     # plot the sample
23 |     def visualize(self):
24 |         self.pointset.plot_points()
25 | 
26 |     # Ground truth in readable form
27 |     def get_ground_truth_text(self):
28 |         return mlf2txt(self.ground_truth)
29 | 
30 |     @property
31 |     def name(self):
32 |         return self.xml_path.split("/")[-1][:-4]
33 | 
34 |     # Read the data in the xml file into a PointSet object
35 |     @property
36 |     def pointset(self):
37 |         if self.__pointset is not None:
38 |             return self.__pointset
39 |         else:
40 |             xml = open(self.xml_path, 'rb').read()
41 |             root = etree.XML(xml)
42 |             wbd, strokeset = root.getchildren()
43 | 
44 |             # Unpack whiteboard description
45 |             sl = wbd[0].attrib['corner']
46 |             do = wbd[1].attrib['x'], wbd[1].attrib['y']
47 |             vo = wbd[2].attrib['x'], wbd[2].attrib['y']
48 |             ho = wbd[3].attrib['x'], wbd[3].attrib['y']
49 | 
50 |             # Unpack strokes into a list of (stroke_id, time, x, y)
51 |             strokes = []
52 |             stroke_id = 1
53 |             min_time = Decimal(strokeset.getchildren()[0].getchildren()[0].attrib['time'])
54 |             for stroke in strokeset:
55 |                 for point in stroke:
56 |                     t = (Decimal(point.attrib['time']) - min_time) * 1000
57 |                     x = point.attrib['x']
58 |                     y = point.attrib['y']
59 |                     strokes.append([stroke_id, t, x, y])
60 |                 stroke_id += 1
61 |             strokes = np.asarray(strokes, dtype=np.int)
62 | 
63 |             # Find the four edges of the whiteboard in coordinate space
64 |             r, b = do  # right, bottom edge
65 |             l, _ = vo  # left edge
66 |             _, u = ho  # upper edge
67 |             r, b, l, u = int(r), int(b), int(l), int(u)
68 | 
69 |             # Move the top left corner to the origin, then flip along y
70 |             strokes[:, 2] = np.subtract(strokes[:, 2], l)
71 |             strokes[:, 3] = np.subtract(strokes[:, 3], u)
72 |             points = []
73 |             for s in strokes:
74 |                 points.append(Point(*s))
75 |             return PointSet(points=points, w=r - l, h=b - u, file_name=self.xml_path)
76 | 
77 |     def __repr__(self):
78 |         return "<Sample {}>".format(self.name)
79 | 
80 | 
81 | # Load samples given the data directory and a split (e.g.
train, test)
82 | class IAMReader(object):
83 | 
84 |     def __init__(self, split, data_path=PATH.DATA_DIR):
85 |         self.data_path = data_path
86 |         self.line_data_path = data_path + "lineStrokes(on)/"
87 |         self.split = split
88 |         self.samples = None
89 | 
90 |     # Given a data split, return the samples
91 |     def get_samples(self):
92 |         if self.samples is not None:
93 |             return self.samples
94 |         sample_names = []
95 |         if self.split == SPLIT.ALL:
96 |             all_split = [SPLIT.TRAIN, SPLIT.VAL1, SPLIT.VAL2, SPLIT.TEST]
97 |             for split in all_split:
98 |                 sample_names += self.__get_sample_names_from_split(split)
99 |         else:
100 |             sample_names = self.__get_sample_names_from_split(self.split)
101 |         self.samples = self.__get_samples_from_name(sample_names)
102 |         return self.samples
103 | 
104 |     # Given a data split, return the list of sample names
105 |     def __get_sample_names_from_split(self, split):
106 |         f = open(self.data_path + "split-config/" + split)
107 |         return [line.strip(' \n') for line in f]
108 | 
109 |     # Given sample names e.g. ['a02-050', ...], return samples with the path to the data and the ground truth.
110 |     def __get_samples_from_name(self, names, blacklist=DATA.BLACKLIST):
111 |         # File holding the ground truth of each sample
112 |         f = open(self.line_data_path + "t2_labels.mlf")
113 | 
114 |         samples = []
115 |         curr_path = ""
116 |         curr_sample_name = ""
117 |         curr_gt = []
118 | 
119 |         for line in f:
120 |             # comments
121 |             if line[0] == '#':
122 |                 continue
123 |             # .lab file name
124 |             # e.g. "/scratch/global/liwicki/wb/data/new-lang-model/transcriptions/a01-000u-05.lab"
125 |             elif line[0] == '"':
126 |                 # Add the sample
127 |                 if curr_path and curr_sample_name in names:
128 |                     curr_gt = curr_gt[:-1]
129 |                     samples.append(Sample(curr_path, curr_gt))
130 |                 # Clear cached result
131 |                 curr_path = ""
132 |                 curr_sample_name = ""
133 |                 curr_gt = []
134 |                 # Read the next file name
135 |                 stripped_line = line.strip(' "\n')
136 |                 line_split = stripped_line.split('/')
137 |                 file_name = line_split[8].split('.')[0]
138 |                 if file_name in blacklist:
139 |                     continue
140 |                 fn_split = file_name.split('-')
141 |                 path = fn_split[0] + "/" + fn_split[0] + "-" + fn_split[1][:3] + \
142 |                        "/" + file_name + ".xml"
143 |                 path = self.line_data_path + "data/" + path
144 | 
145 |                 # if the file is corrupted or not found, skip it
146 |                 try:
147 |                     if not os.path.getsize(path):
148 |                         continue
149 |                 except FileNotFoundError:
150 |                     # print("Missing file: {}".format(path))
151 |                     continue
152 | 
153 |                 curr_path = path
154 |                 curr_sample_name = fn_split[0] + "-" + fn_split[1]
155 |             # Read the ground truth
156 |             else:
157 |                 line_split = line.strip('\n')
158 |                 curr_gt.append(line_split)
159 |         return samples
160 | 
161 | 
162 |     def __repr__(self):
163 |         return "<IAMReader split={}>".format(self.split)
164 | 
165 | def xmlpath2npypath(path, npz_dir):
166 |     f_split = path.split('/')
167 |     f_split[-4] = npz_dir
168 |     f_split[-1] = f_split[-1][:-3] + 'npz'
169 |     f = '/'.join(f_split)
170 |     return f
-------------------------------------------------------------------------------- /demo/generate_lm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from hwr.lm.generate_lm import *\n", 10 | "from hwr.lm.lm import MLE, StupidBackoff, KneserNeyInterpolated, KneserNeyBackoff\n", 11 | "from nltk.lm import NgramCounter, Vocabulary\n", 12 | "from nltk.util import everygrams\n", 13 | "from hwr.constants import PATH, DATA, SPLIT\n", 14 | "import dill as
pickle" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# These should be command line argument\n", 24 | "ngram = 7\n", 25 | "to_file = 10\n", 26 | "fname = 'lm_7gram_counter.pkl'" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "scrolled": true 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "file 1\n", 41 | "Updating counter with file:\n", 42 | "/home/jasper/Desktop/fyp/HWR/hwr/../data/1blm/training-monolingual.tokenized.shuffled/news.en-00001-of-00100\n" 43 | ] 44 | }, 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "\r", 50 | " 0%| | 0/9 [00:00 2 pooling with attempt 6 config 31 | # Attempt12: 50 100 150 200 4 3 2 2 200 200. 19.06 loss 11.92cer(fast) 32 | # single 2 pooling with attempt 6 config 33 | # Attempt13: 50 100 150 200 4 3 2 2 200 200. /// 34 | # Convolution pooling(pool size 2 stride 2) to replace average pool 35 | # Attempt14: 50 100 150 200 4 3 2 2 200 200. 17.58 loss 10.81cer 36 | # Attempt15: 50 100 150 200 4 2 3 4(stride 2) 200 200. 16.41 loss 10.4cer 37 | # Attempt16: 50 100 150 200 4 3 2 7(stride 2) 200 200. 20.2 loss 20.4cer 38 | # Return to average pooling 39 | # Attempt17: 64 128 256 256 4 3 2 2 128 128 16.3 loss 9.72cer new best 40 | # Attempt18: 128 256 512 512 4 3 2 2 128 128 16.6 loss 10.9 cer 41 | # Attempt19: 64 128 128a2 256 256a2 4 3 2 2 2 128 128 15.5 loss 9.82cer 42 | # Attempt20: 64 128a2 128 256 256a2 3 3 2 2 2 128 128 15.6 loss 9.20 cer new best 43 | # Attempt21: 64 64 128a2 128 256 256a2 2 2 2 2 2 2 128 128 16.8 loss 44 | # Attempt22: 64 128a2 128 256 256a2 3 3 2 2 2 128 256 16.38 loss 10.2cer 45 | # Attempt23: 64 128a2 128 256 256a2 3 3 2 2 2 64 128 does not converge 46 | # Attempt24: 64 128a2 128 256 256a2 3 3 2 2 2 128 does not look good so stop 47 | # Attempt25: LSTM of 20. 15.6 loss 9.35cer 9.91 cer for early stopped 48 | # Attempt26: Maxpool of 20. Does not converge 49 | # Attempt27: Average pool of 3 for first of 20. 10.7 50 | # Attempt28: 20, but 128 1d conv at the end. 9.95 51 | # Attempt29: A20 with preprocess5. 16.0loss 9.45cer 52 | # Attempt30: A30 but 128 5 for first layer. 15.9loss 9.91cer 53 | # Attempt31: 64 128a2 128 256 256a2 5 3 3 2 2 100 100 9.70cer 15.8 loss 54 | # Attempt32: a20 but 100 100. Very slow to converge 55 | # Attempt33: a20 but add a 32 3 layer at front. Fast training, 15.8 loss 9.47 cer 56 | # Attempt34: 32 64 96a2 128 160 192a2, 96 96. Scheme 6. 15.5 loss 9.82cer 57 | # Attempt35: 32 64 96a2 128 160 200a2, 100 100. Scheme 6. 10.4cer batch 30 58 | # Attempt20h: a20 half size. 16.5 loss 10.0 cer 59 | # Attempt20h2: a20, 50 100 100 200 200 100 100 10.1cer 16.2loss 60 | # Attempt20h3: a20, 75 150 150 300 300 150 150 15.3loss 9.22cer 61 | # ATTEMPT20h4: 20h3 with extra 37 at first 9.43cer 62 | # Attempet20h5: a20 but 128 1 at last and 64 64. 9.70cer 63 | # Attempt20h6: a20 but 1 layer gru. 12.0cer 64 | # Attempt20h7 170. 15.8loss 9.86cer 65 | # Attempt36: 32 64 64 128 128 256 256 5 3 3 3 2 2, 32pool, 128 128 gru. 9.71 cer 66 | # Attempt37: Pre6 32 64 96 96 128 128 5 3 3 2 2 1 22pool, 64 64. 15.7 loss, 9.60cer Effcient, 250k param 67 | # Attempt38: Pre6 48 72 96 120 144 168 5 3 3 2 2 1, 84 84. 9.19cer new best! 9.34 try2 68 | # Attempt38h1: 64 96 128 128 160 192 5 3 3 2 2 1, 96 96. 9.32cer try2 loss 9.18 69 | # Attempt38h2: 128 128. 9.46 cer 70 | # Switching optimzer. 
Adam -> sgd 1e-5 -> 1e-6
71 | # Attempt39: 60 80 100 100 130 160 80 80 5 3 3 2 2 1. 14.68 loss 8.82cer (396) adam sgd 1e-4 e-5 e-6
72 | # Attempt40: 60 80 100 100 130 160 80 80 7 5 3 3 2 2. 14.46 loss 8.64cer (402)
73 | # Attempt41: 60 80 100 100 130 160 80 80 7 5 5 3 3 3. 14.29 loss 8.50cer (412) beam search 8.37
74 | # Attempt42: 60 80 100 100 130 160 80 80 9 7 7 5 5 3. 14.70 loss 8.75cer
75 | # Attempt43: 80 100 120 120 140 160 80 80 9 7 5 3 3 3. 14.39 loss 9.09cer
76 | # Attempt44: 80 100 120 140 160 200 100 100 7 5 5 3 3 3. 14.13 loss 8.43cer new best
77 | # Attempt45: 60 80 100 100 130 160 80 80 9 7 7 5 3 3, 3 2 pool. 16.5 loss, 9.74cer
78 | # Attempt46: 60 90 120 120 160 200 100 100 7 5 5 3 3 3. 13.98 loss 8.45cer
79 | # Attempt47: Same as a46, retry. 13.88 loss, 8.431cer, 32wer
80 | # a48: LSTM of a47
81 | # a49: 60 90 120 120 160 200 60 60 60 60 7 7 5 3 3. 7.93cer 30.2wer new best
82 | # a50
83 | # Implementation of model
84 | class ONNET_test(HWRModel):
85 |     def __init__(self, preload=False, gru=True):
86 |         if gru:
87 |             self.rnn = CuDNNGRU
88 |         else:
89 |             self.rnn = CuDNNLSTM
90 |         # HWRModel takes no `preload` kwarg, so load the pretrained weights explicitly
91 |         super().__init__()
92 |         if preload:
93 |             self.load_weights(PRETRAINED['ONNET-LSTM'], full_path=True)
94 | 
95 |     def get_prediction_layer(self):
96 |         return "softmax"
97 | 
98 |     def get_input_layer(self):
99 |         return "xs"
100 | 
101 |     def get_optimizer(self):
102 |         #return SGD(lr=1e-4, momentum=0.9, nesterov=True, clipnorm=5)
103 |         return 'adam'
104 | 
105 |     def get_loss(self):
106 |         return {'ctc': lambda y_true, y_pred: y_pred}
107 | 
108 |     def get_model_conf(self):
109 |         input_shape = (None, 6)
110 |         inputs = Input(shape=input_shape, dtype='float32', name='xs')
111 |         inner = inputs
112 | 
113 |         inner = tdnn_bn_relu(inner, 60, 7)
114 |         inner = tdnn_bn_relu(inner, 90, 5)
115 |         inner = tdnn_bn_relu(inner, 120, 5)
116 |         inner = AveragePooling1D(pool_size=2)(inner)
117 | 
118 |         inner = tdnn_bn_relu(inner, 120, 3)
119 |         inner = tdnn_bn_relu(inner, 160, 3)
120 |         inner = tdnn_bn_relu(inner, 200, 3)
121 | 
122 |         inner = AveragePooling1D(pool_size=2)(inner)
123 | 
124 |         # No significant difference between gru and lstm
125 |         inner = self.bi_rnn(inner, 60)
126 |         inner = self.bi_rnn(inner, 60)
127 |         inner = self.bi_rnn(inner, 60)
128 |         inner = self.bi_rnn(inner, 60)
129 | 
130 |         inner = BatchNormalization()(inner)
131 | 
132 |         inner = Dense(DATA.CHARS_SIZE, kernel_initializer='he_normal')(inner)
133 |         y_pred = Activation('softmax', name='softmax')(inner)
134 | 
135 |         # parameters for CTC loss, fed as network input
136 |         labels = Input(name='ys',
137 |                        shape=[None], dtype='float32')
138 |         input_length = Input(name='ypred_length', shape=[1], dtype='int64')
139 |         label_length = Input(name='ytrue_length', shape=[1], dtype='int64')
140 | 
141 |         loss_out = Lambda(self.__ctc_lambda_func, output_shape=(1,),
142 |                           name='ctc')([y_pred, labels, input_length, label_length])
143 | 
144 |         model = Model(inputs=[inputs, labels, input_length, label_length],
145 |                       outputs=loss_out)
146 |         return model
147 | 
148 |     def __ctc_lambda_func(self, args):
149 |         y_pred, labels, input_length, label_length = args
150 |         return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
151 | 
152 |     def bi_rnn(self, inner, hidden_unit):
153 |         rnn_a = self.rnn(hidden_unit, return_sequences=True,
154 |                          kernel_initializer='he_normal')(inner)
155 |         rnn_b = self.rnn(hidden_unit, return_sequences=True,
156 |                          go_backwards=True, kernel_initializer='he_normal')(inner)
157 |         rnn_merged = concatenate([rnn_a, rnn_b])
158 |         return
rnn_merged
159 | 
160 | 
161 | def tdnn_bn_relu(inner, filters, kernel_size):
162 |     inner = Conv1D(filters, kernel_size, padding="same", kernel_initializer='he_normal')(inner)
163 |     inner = BatchNormalization()(inner)
164 |     inner = Activation('relu')(inner)
165 |     return inner
166 | 
167 | 
168 | def inception(filters, filter_size, pool_size, strides, inner):
169 |     n = int(filters / (len(filter_size) + 1))
170 |     inc = []
171 |     for i in range(len(filter_size)):
172 |         inc_this = Conv1D(n, filter_size[i], strides, padding="same", kernel_initializer='he_normal')(inner)
173 |         inc.append(inc_this)
174 |     inc_avg = AveragePooling1D(pool_size=pool_size, strides=strides, padding="same")(inner)
175 |     inc_avg = Conv1D(n, 1, padding="same", kernel_initializer='he_normal')(inc_avg)
176 |     inc.append(inc_avg)
177 |     inner = concatenate(inc)
178 |     inner = BatchNormalization()(inner)
179 |     inner = Activation('relu')(inner)
180 |     return inner
181 | 
182 | 
183 | def residual_inception(inner):
184 |     inc1 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
185 |     inc2 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
186 |     inc2 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc2)
187 |     inc3 = Conv1D(32, 1, padding="same", kernel_initializer='he_normal')(inner)
188 |     inc3 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc3)
189 |     inc3 = Conv1D(32, 3, padding="same", kernel_initializer='he_normal')(inc3)
190 |     inc_cc = concatenate([inc1, inc2, inc3])
191 |     inc_cc = Conv1D(256, 1, padding="same", kernel_initializer='he_normal')(inc_cc)
192 |     inner = add([inner, inc_cc])
193 |     return inner
194 | 
-------------------------------------------------------------------------------- /data/iamon/lineStrokes(on)/data/b04/b04-334/b04-334z-07.xml: --------------------------------------------------------------------------------
[XML markup lost during extraction; this file holds the whiteboard description and stroke point data for b04-334z-07 (181 lines in the original).]
-------------------------------------------------------------------------------- /data/iamon/split-config/trainset.txt: --------------------------------------------------------------------------------
1 | a01-020x 2 | d04-081 3 | f02-000 4 | f04-061 5 | f07-088b 6 | h02-024 7 | h02-037 8 | h07-013 9 | b01-113 10 | c02-049 11 | d04-005 12 | d06-011 13 | e01-050 14 | f01-075 15 | g04-043 16 | h07-028a 17 | a01-038x 18 | a06-044 19 | a06-114 20 | b03-087 21 | b05-088 22 | c01-066 23 | f07-101a 24 | h02-017 25 | a04-077 26 | a06-014 27 | b06-019 28 | b06-032 29 | g02-069 30 | g03-000 31 | j01-049 32 | a01-001w 33 | a01-004w 34 | a01-007w 35 | a01-009w 36 | a01-013w 37 | a01-017w 38 | 
a01-020w 39 | a01-023w 40 | a01-090z 41 | c03-095z 42 | e04-186z 43 | g09-310z 44 | k04-318z 45 | l07-770z 46 | n04-376z 47 | p06-658z 48 | a01-067z 49 | c03-075z 50 | e03-171z 51 | g09-287z 52 | k04-296z 53 | l06-745z 54 | n04-345z 55 | p06-629z 56 | a02-125z 57 | c04-136z 58 | e04-220z 59 | g10-353z 60 | k05-369z 61 | l07-824z 62 | n05-425z 63 | p06-716z 64 | a02-145z 65 | c04-161z 66 | e04-240z 67 | g10-373z 68 | k05-395z 69 | l07-851z 70 | n05-450z 71 | p06-748z 72 | a02-166z 73 | c04-188z 74 | e05-260z 75 | g10-392z 76 | k05-426z 77 | l07-876z 78 | n05-475z 79 | p06-775z 80 | a03-193z 81 | c04-215z 82 | e05-279z 83 | h01-004z 84 | k05-450z 85 | l08-892z 86 | n05-503z 87 | p06-804z 88 | a03-218z 89 | c04-238z 90 | e05-297z 91 | h01-026z 92 | k05-474z 93 | l08-917z 94 | n05-528z 95 | p07-830z 96 | a04-239z 97 | c05-261z 98 | e06-317z 99 | h02-050z 100 | k05-494z 101 | l08-941z 102 | n05-553z 103 | p07-852z 104 | a04-262z 105 | c06-286z 106 | e06-333z 107 | h02-082z 108 | k06-513z 109 | l08-961z 110 | n06-576z 111 | p07-878z 112 | a05-284z 113 | c06-310z 114 | e06-352z 115 | h03-106z 116 | k06-536z 117 | l08-980z 118 | n06-603z 119 | p07-899z 120 | a05-309z 121 | c06-337z 122 | e06-368z 123 | h03-123z 124 | k06-558z 125 | l09-015z 126 | n06-627z 127 | p07-917z 128 | a05-340z 129 | c06-363z 130 | e07-389z 131 | h04-141z 132 | k06-579z 133 | l09-036z 134 | n06-650z 135 | p08-012z 136 | a06-364z 137 | c07-388z 138 | e07-405z 139 | h04-162z 140 | k06-597z 141 | l10-040z 142 | n06-674z 143 | p08-037z 144 | a06-385z 145 | c07-404z 146 | e07-421z 147 | h05-181z 148 | k07-628z 149 | l10-058z 150 | n06-695z 151 | p08-061z 152 | a06-405z 153 | c07-423z 154 | e08-438z 155 | h06-206z 156 | k07-650z 157 | l10-084z 158 | n07-721z 159 | p08-944z 160 | a07-431z 161 | c08-441z 162 | e08-457z 163 | h06-228z 164 | k07-682z 165 | l10-112z 166 | n07-754z 167 | p08-967z 168 | a07-457z 169 | c08-457z 170 | e09-478z 171 | h06-251z 172 | k07-712z 173 | l10-134z 174 | n07-783z 175 | p08-988z 176 | a07-480z 177 | c08-479z 178 | e09-502z 179 | h07-265z 180 | k08-736z 181 | l10-159z 182 | n07-813z 183 | p09-076z 184 | a08-506z 185 | c09-502z 186 | e09-523z 187 | h08-285z 188 | k08-759z 189 | l10-188z 190 | n07-837z 191 | p09-101z 192 | a08-531z 193 | c09-521z 194 | e10-546z 195 | h08-304z 196 | k08-779z 197 | m01-019z 198 | n07-860z 199 | p09-124z 200 | a08-551z 201 | c09-545z 202 | e10-563z 203 | h08-329z 204 | k08-803z 205 | m01-037z 206 | n07-889z 207 | p09-151z 208 | a09-583z 209 | c10-563z 210 | f01-017z 211 | h09-351z 212 | k08-828z 213 | m01-059z 214 | n08-000z 215 | p09-172z 216 | a09-607z 217 | c10-585z 218 | f01-036z 219 | h09-370z 220 | k09-011z 221 | m01-082z 222 | n08-031z 223 | p10-195z 224 | a09-635z 225 | d01-013z 226 | f01-054z 227 | h10-408z 228 | k09-848z 229 | m01-100z 230 | n08-066z 231 | p10-220z 232 | a09-657z 233 | d01-035z 234 | f01-084z 235 | h10-429z 236 | k09-874z 237 | m02-120z 238 | n08-934z 239 | p10-242z 240 | a10-683z 241 | d01-052z 242 | f02-111z 243 | j01-007z 244 | k09-899z 245 | m02-141z 246 | n08-968z 247 | p10-269z 248 | a10-707z 249 | d02-070z 250 | f02-133z 251 | j01-026z 252 | k09-931z 253 | m02-156z 254 | n09-077z 255 | p10-298z 256 | b01-001z 257 | d03-091z 258 | f03-159z 259 | j01-048z 260 | k09-960z 261 | m03-180z 262 | n09-100z 263 | p10-325z 264 | b01-024z 265 | d03-121z 266 | f03-188z 267 | j02-068z 268 | k09-994z 269 | m03-209z 270 | n09-120z 271 | p10-350z 272 | b01-050z 273 | d03-144z 274 | f03-214z 275 | j02-087z 276 | k10-058z 277 | m04-233z 278 | n09-149z 279 | 
r01-004z 280 | b01-074z 281 | d03-168z 282 | f03-242z 283 | j02-101z 284 | k10-086z 285 | m04-257z 286 | n09-168z 287 | r01-027z 288 | b01-097z 289 | d03-195z 290 | f03-267z 291 | j03-116z 292 | k10-114z 293 | m04-286z 294 | n09-194z 295 | r01-053z 296 | b02-116z 297 | d03-229z 298 | f03-295z 299 | j03-147z 300 | l01-023z 301 | m04-310z 302 | n10-225z 303 | r01-085z 304 | b02-136z 305 | d04-251z 306 | f04-310z 307 | j03-174z 308 | l01-048z 309 | m04-337z 310 | n10-249z 311 | r02-103z 312 | b03-154z 313 | d04-274z 314 | f04-325z 315 | j04-192z 316 | l01-076z 317 | m04-374z 318 | n10-271z 319 | r02-127z 320 | b03-171z 321 | d04-298z 322 | f05-342z 323 | j04-208z 324 | l01-100z 325 | m04-404z 326 | n10-293z 327 | r02-150z 328 | b03-195z 329 | d05-325z 330 | f05-361z 331 | j04-225z 332 | l01-125z 333 | m05-426z 334 | n10-316z 335 | r02-175z 336 | b04-221z 337 | d05-346z 338 | f05-376z 339 | j05-247z 340 | l02-148z 341 | m05-453z 342 | n10-340z 343 | r02-196z 344 | b04-246z 345 | d05-369z 346 | f06-391z 347 | j05-267z 348 | l02-166z 349 | m05-480z 350 | p01-021z 351 | r03-221z 352 | b04-275z 353 | d05-386z 354 | f07-408z 355 | j06-287z 356 | l02-183z 357 | m05-501z 358 | p01-048z 359 | r03-249z 360 | b04-301z 361 | d06-404z 362 | f07-425z 363 | j06-308z 364 | l02-203z 365 | m05-530z 366 | p01-077z 367 | r04-269z 368 | b04-327z 369 | d06-428z 370 | f07-443z 371 | j06-323z 372 | l02-227z 373 | m05-559z 374 | p01-107z 375 | r04-293z 376 | b04-357z 377 | d06-447z 378 | f08-460z 379 | j07-341z 380 | l03-243z 381 | m05-589z 382 | p01-130z 383 | r04-319z 384 | b04-384z 385 | d07-470z 386 | f09-482z 387 | j07-360z 388 | l03-260z 389 | m06-618z 390 | p02-158z 391 | r04-336z 392 | b05-411z 393 | d07-503z 394 | f09-505z 395 | j08-378z 396 | l03-282z 397 | m06-647z 398 | p02-178z 399 | r05-357z 400 | b05-431z 401 | d07-516z 402 | f09-524z 403 | j08-397z 404 | l03-309z 405 | m06-679z 406 | p02-203z 407 | r05-373z 408 | b05-452z 409 | d08-534z 410 | f10-545z 411 | j08-417z 412 | l03-329z 413 | m06-700z 414 | p02-221z 415 | r06-397z 416 | b05-476z 417 | d08-552z 418 | f10-567z 419 | j09-434z 420 | l04-349z 421 | m06-724z 422 | p02-243z 423 | r06-418z 424 | b06-496z 425 | d08-568z 426 | g01-006z 427 | j09-448z 428 | l04-373z 429 | m06-748z 430 | p03-263z 431 | r06-441z 432 | b06-516z 433 | d08-586z 434 | g01-023z 435 | j10-464z 436 | l04-397z 437 | n01-003z 438 | p03-292z 439 | r06-463z 440 | b06-537z 441 | d09-606z 442 | g03-040z 443 | j10-486z 444 | l04-416z 445 | n01-023z 446 | p03-315z 447 | r07-487z 448 | b07-556z 449 | d09-629z 450 | g03-056z 451 | k01-008z 452 | l04-436z 453 | n01-044z 454 | p03-341z 455 | r07-520z 456 | b07-576z 457 | d09-651z 458 | g03-075z 459 | k01-028z 460 | l04-457z 461 | n01-062z 462 | p03-367z 463 | r07-546z 464 | b08-597z 465 | d09-674z 466 | g04-090z 467 | k02-053z 468 | l05-477z 469 | n02-084z 470 | p04-387z 471 | r07-568z 472 | b08-620z 473 | d10-697z 474 | g04-107z 475 | k02-075z 476 | l05-499z 477 | n02-102z 478 | p04-411z 479 | r08-587z 480 | b08-648z 481 | d10-718z 482 | g05-124z 483 | k02-098z 484 | l05-526z 485 | n02-127z 486 | p04-431z 487 | r08-606z 488 | b08-667z 489 | e01-017z 490 | g05-140z 491 | k02-118z 492 | l05-543z 493 | n02-149z 494 | p04-455z 495 | r08-625z 496 | b09-696z 497 | e01-038z 498 | g06-159z 499 | k03-139z 500 | l05-564z 501 | n02-173z 502 | p04-478z 503 | r09-648z 504 | b09-724z 505 | e01-060z 506 | g06-179z 507 | k03-167z 508 | l05-588z 509 | n03-196z 510 | p04-504z 511 | r09-676z 512 | b10-748z 513 | e02-086z 514 | g06-194z 515 | k03-192z 516 | 
l05-613z 517 | n03-216z 518 | p05-529z 519 | r09-699z 520 | a01-007z 521 | b10-776z 522 | e02-103z 523 | g07-213z 524 | k03-215z 525 | l06-637z 526 | n03-242z 527 | p05-556z 528 | a01-030z 529 | c01-024z 530 | e02-125z 531 | g08-229z 532 | k03-239z 533 | l06-669z 534 | n04-266z 535 | p05-574z 536 | a01-053z 537 | c02-047z 538 | e03-143z 539 | g08-256z 540 | k04-259z 541 | l06-695z 542 | n04-290z 543 | p05-596z 544 | a01-073z 545 | c02-063z 546 | e03-160z 547 | g09-271z 548 | k04-282z 549 | l06-725z 550 | n04-320z 551 | p05-616z 552 | a01-094z 553 | c03-081z 554 | e03-175z 555 | g09-294z 556 | k04-301z 557 | l06-754z 558 | n04-351z 559 | p06-638z 560 | a02-113z 561 | c03-101z 562 | e04-192z 563 | g09-317z 564 | k04-324z 565 | l07-782z 566 | n04-383z 567 | p06-664z 568 | a02-130z 569 | c04-122z 570 | e04-207z 571 | g10-338z 572 | k04-353z 573 | l07-807z 574 | n04-405z 575 | p06-697z 576 | a02-152z 577 | c04-144z 578 | e04-226z 579 | g10-358z 580 | k05-374z 581 | l07-833z 582 | n05-434z 583 | p06-724z 584 | a02-171z 585 | c04-165z 586 | e04-246z 587 | g10-378z 588 | k05-406z 589 | l07-859z 590 | n05-457z 591 | p06-756z 592 | a03-200z 593 | c04-198z 594 | e05-265z 595 | g10-399z 596 | k05-431z 597 | l07-880z 598 | n05-483z 599 | p06-784z 600 | a04-224z 601 | c04-221z 602 | e05-286z 603 | h01-009z 604 | k05-458z 605 | l08-900z 606 | n05-510z 607 | p06-814z 608 | a04-246z 609 | c04-244z 610 | e05-302z 611 | h01-030z 612 | k05-480z 613 | l08-922z 614 | n05-534z 615 | p07-834z 616 | a04-269z 617 | c05-267z 618 | e06-321z 619 | h02-056z 620 | k06-499z 621 | l08-945z 622 | n05-559z 623 | p07-860z 624 | a05-291z 625 | c06-292z 626 | e06-339z 627 | h02-088z 628 | k06-520z 629 | l08-966z 630 | n06-584z 631 | p07-882z 632 | a05-345z 633 | c06-343z 634 | e06-375z 635 | h03-127z 636 | k06-563z 637 | l09-021z 638 | n06-633z 639 | p07-923z 640 | a05-318z 641 | c06-316z 642 | e06-357z 643 | h03-112z 644 | k06-543z 645 | l09-002z 646 | n06-608z 647 | p07-904z 648 | a06-368z 649 | c07-372z 650 | e07-393z 651 | h04-146z 652 | k06-584z 653 | l09-987z 654 | n06-655z 655 | p08-020z 656 | a06-392z 657 | c07-393z 658 | e07-409z 659 | h04-168z 660 | k07-604z 661 | l10-044z 662 | n06-680z 663 | p08-045z 664 | a07-413z 665 | c07-410z 666 | e07-425z 667 | h05-188z 668 | k07-635z 669 | l10-065z 670 | n06-701z 671 | p08-929z 672 | a07-438z 673 | c08-429z 674 | e08-445z 675 | h06-211z 676 | k07-662z 677 | l10-089z 678 | n07-732z 679 | p08-952z 680 | a07-466z 681 | c08-445z 682 | e08-461z 683 | h06-235z 684 | k07-689z 685 | l10-118z 686 | n07-759z 687 | p08-973z 688 | a08-491z 689 | c08-465z 690 | e09-488z 691 | h07-256z 692 | k07-721z 693 | l10-141z 694 | n07-793z 695 | p08-996z 696 | a08-512z 697 | c09-487z 698 | e09-509z 699 | h07-269z 700 | k08-744z 701 | l10-166z 702 | n07-818z 703 | p09-081z 704 | a08-535z 705 | c09-507z 706 | e10-532z 707 | h08-289z 708 | k08-765z 709 | m01-005z 710 | n07-844z 711 | p09-110z 712 | a08-562z 713 | c09-528z 714 | e10-550z 715 | h08-311z 716 | k08-788z 717 | m01-024z 718 | n07-868z 719 | p09-130z 720 | a09-589z 721 | c10-552z 722 | f01-005z 723 | h09-334z 724 | k08-813z 725 | m01-042z 726 | n07-899z 727 | p09-156z 728 | a09-615z 729 | c10-567z 730 | f01-022z 731 | h09-356z 732 | k08-835z 733 | m01-066z 734 | n08-011z 735 | p09-178z 736 | a09-639z 737 | d01-001z 738 | f01-042z 739 | h09-380z 740 | k09-019z 741 | m01-088z 742 | n08-040z 743 | p10-200z 744 | a09-666z 745 | d01-019z 746 | f01-065z 747 | h10-415z 748 | k09-854z 749 | m01-106z 750 | n08-073z 751 | p10-226z 752 | a10-688z 753 | 
d01-039z 754 | f01-089z 755 | h10-432z 756 | k09-882z 757 | m02-125z 758 | n08-944z 759 | p10-249z 760 | a10-713z 761 | d01-057z 762 | f02-116z 763 | j01-013z 764 | k09-909z 765 | m02-145z 766 | n08-978z 767 | p10-277z 768 | b01-008z 769 | d02-076z 770 | f03-139z 771 | j01-032z 772 | k09-938z 773 | m02-161z 774 | n09-083z 775 | p10-305z 776 | 
-------------------------------------------------------------------------------- /data/iamon/lineStrokes(on)/data/d06/d06-414/d06-414z-07.xml: --------------------------------------------------------------------------------
[XML markup lost during extraction; this file holds the whiteboard description and stroke point data for d06-414z-07 (244 lines in the original).]
-------------------------------------------------------------------------------- /hwr/data/datarep.py: --------------------------------------------------------------------------------
1 | from itertools import cycle
2 | 
3 | import matplotlib.collections as mcoll
4 | import matplotlib.path as mpath
5 | import numpy as np
6 | from matplotlib import pyplot as plt
7 | from pylab import rcParams
8 | from copy import deepcopy
9 | 
10 | # Data representation and preprocessing
11 | 
12 | 
13 | # Coordinate Classes
14 | class PointSet:
15 |     def __init__(self, points=None, w=None, h=None, file_name=None, gt=None):
16 |         self.points = points if points else []
17 |         self.w = w
18 |         self.h = h
19 |         self.file_name = file_name
20 |         self.gt = gt
21 | 
22 |     def get_copy(self):
23 |         return deepcopy(self)
24 | 
25 |     def add_point(self, point):
26 |         self.points.append(point)
27 | 
28 |     def sample_size(self):
29 |         return len(self.points)
30 | 
31 |     def get_stroke_group(self):
32 |         strokes_n = set([p.stroke for p in self.points])
33 |         return [[p for p in self.points if p.stroke == n] for n in strokes_n]
34 | 
35 |     def get_all_points(self):
36 |         groups = self.get_stroke_group()
37 |         x, y = [], []
38 |         for g in groups:
39 |             x += [p.x for p in g]  # accumulate across all strokes
40 |             y += [p.y for p in g]
41 |         return x, y
42 | 
43 | 
44 |     def get_lines(self):
45 |         lines = []
46 |         strokes = self.get_stroke_group()
47 |         for s in strokes:
48 |             for i in range(len(s) - 1):
49 |                 if i == len(s) - 2:
50 |                     lines.append(Line(s[-2], s[-1], eos=True))
51 |                 else:
52 |                     lines.append(Line(s[i], s[i + 1]))
53 |         return lines
54 | 
55 |     def total_length(self):
56 |         return sum([l.length() for
l in self.get_lines()]) 57 | 58 | def mean(self): 59 | lines = self.get_lines() 60 | sum_px, sum_py, sum_l = (0, 0, 0) 61 | for l in lines: 62 | sum_px += l.proj_x() 63 | sum_py += l.proj_y() 64 | sum_l += l.length() 65 | 66 | return np.array([sum_px / sum_l, sum_py / sum_l]) 67 | 68 | def sd_x(self, mean_x=None): 69 | if mean_x is None: 70 | mean_x = self.mean()[0] 71 | lines = self.get_lines() 72 | sum_vx = sum([l.var_x(mean_x) for l in lines]) 73 | sum_l = sum([l.length() for l in lines]) 74 | return (sum_vx / sum_l) ** .5 75 | 76 | def sd_y(self, mean_y=None): 77 | if mean_y is None: 78 | mean_y = self.mean()[1] 79 | lines = self.get_lines() 80 | sum_vy = sum([l.var_y(mean_y) for l in lines]) 81 | sum_l = sum([l.length() for l in lines]) 82 | 83 | return (sum_vy / sum_l) ** .5 84 | 85 | def normalize_points(self): 86 | mean_x, mean_y = self.mean() 87 | sd_y = self.sd_y(mean_y=mean_y) 88 | for p in self.points: 89 | p.normalize(mean_x, mean_y, sd_y) 90 | return self 91 | 92 | def range_x(self): 93 | xs = [p.x for p in self.points] 94 | return min(xs), max(xs) 95 | 96 | def range_y(self): 97 | ys = [p.y for p in self.points] 98 | return min(ys), max(ys) 99 | 100 | def down_sample_distance(self, d_th): 101 | strokes = self.get_stroke_group() 102 | ret = [] 103 | removed = 0 104 | for s in strokes: 105 | for i in range(len(s)): 106 | if i == 0 or i == len(s) - 1 or Line(s[i], s[i - 1 - removed]).length() > d_th: 107 | removed = 0 108 | ret.append(s[i]) 109 | else: 110 | removed += 1 111 | self.points = ret 112 | return ret 113 | 114 | def down_sample_angle(self, cos_th): 115 | strokes = self.get_stroke_group() 116 | ret = [] 117 | removed = 0 118 | for s in strokes: 119 | for i in range(len(s)): 120 | if i == 0 or i == len(s) - 1: 121 | removed = 0 122 | ret.append(s[i]) 123 | else: 124 | cs = Line(s[i - 1 - removed], s[i]).cosine_similarity(Line(s[i], s[i + 1])) 125 | if cs < cos_th: 126 | removed = 0 127 | ret.append(s[i]) 128 | else: 129 | removed += 1 130 | self.points = ret 131 | return ret 132 | 133 | def resample_distance(self, d): 134 | strokes = self.get_stroke_group() 135 | ret = [] 136 | for s in strokes: 137 | if len(s) > 0: 138 | ret.append(s[0]) 139 | for i in range(1, len(s)): 140 | line = Line(ret[-1], s[i]) 141 | l = line.length() 142 | if l > d: 143 | # interpolate 144 | f = d / l 145 | iteration = int(l / d) 146 | for j in range(iteration): 147 | ret.append(line.interpolate(f * (j + 1))) 148 | elif l == d: 149 | ret.append(s[i]) 150 | elif i < d: 151 | continue 152 | ret.append(s[-1]) 153 | self.points = ret 154 | return ret 155 | 156 | def slope_correction(self): 157 | def homogeneous_co(x, y): 158 | mat = np.ones((len(x), 3)) 159 | mat[:, 0] = x 160 | mat[:, 1] = y 161 | return mat 162 | 163 | def translation_matrix(x, y): 164 | mat = np.identity(3) 165 | mat[2][0] = x 166 | mat[2][1] = y 167 | return mat 168 | 169 | def rotation_matrix(rad): 170 | mat = np.identity(3) 171 | mat[:2, :2] = np.array([[np.cos(rad), -np.sin(rad)], [np.sin(rad), np.cos(rad)]]) 172 | return mat 173 | 174 | x = np.array([p.x for p in self.points]) 175 | y = np.array([p.y for p in self.points]) 176 | best_fit = np.poly1d(np.polyfit(x, y, 1)) 177 | x_max, x_min = np.max(x), np.min(x) 178 | rad = np.arctan2(best_fit(x_max) - best_fit(x_min), x_max - x_min) 179 | x_mid = (x_max + x_min) / 2 180 | pivot_x, pivot_y = x_mid, best_fit(x_mid) 181 | co = homogeneous_co(x, y) 182 | #co = np.matmul(co, translation_matrix(-pivot_x, -pivot_y)) 183 | co = np.matmul(co, rotation_matrix(rad)) 184 | #co = 
np.matmul(co, translation_matrix(pivot_x, pivot_y)) 185 | for i in range(len(co)): 186 | self.points[i].x, self.points[i].y = co[i][0], co[i][1] 187 | return self.points 188 | 189 | def up_sample_short_stroke(self, n): 190 | strokes = self.get_stroke_group() 191 | new_strokes = [] 192 | for i in range(len(strokes)): 193 | s = strokes[i] 194 | l = len(s) 195 | if l < n: 196 | if l == 1: 197 | new_strokes.append([s[0] for _ in range(n)]) 198 | else: 199 | pts = PointSet(s) 200 | resample_d = pts.total_length()/n 201 | new_strokes.append(pts.resample_distance(resample_d)) 202 | else: 203 | new_strokes.append(s) 204 | self.points = [inner for outer in new_strokes for inner in outer] 205 | return self.points 206 | 207 | def generate_features(self, preprocess=None, pad=0, add_pad=0, fset=1): 208 | if preprocess: 209 | self.preprocess(**preprocess) 210 | lines = self.get_lines() 211 | features = [] 212 | for l in lines: 213 | if fset == 1: 214 | features.append(l.get_features()) 215 | elif fset == 2: 216 | features.append(l.get_features_2()) 217 | features = np.array(features) 218 | dim = pad if pad else features.shape[0] 219 | dim += add_pad 220 | result = np.zeros((dim, features.shape[1])) 221 | result[:features.shape[0], :] = features 222 | return result 223 | 224 | def preprocess(self, down_d=0, down_cos=1, slope_correction=False, 225 | normalize=False, resample_distance=0, up_sample=0): 226 | if slope_correction: 227 | self.slope_correction() 228 | if normalize: 229 | self.normalize_points() 230 | if down_cos < 1: 231 | self.down_sample_angle(down_cos) 232 | if resample_distance > 0: 233 | self.resample_distance(d=resample_distance) 234 | if up_sample > 0: 235 | self.up_sample_short_stroke(up_sample) 236 | if down_d > 0: 237 | self.down_sample_distance(down_d) 238 | 239 | return self.sample_size() 240 | 241 | def plot_points(self): 242 | rcParams['figure.figsize'] = 10, 10 243 | groups = self.get_stroke_group() 244 | for g in groups: 245 | x = [p.x for p in g] 246 | y = [-p.y for p in g] 247 | plt.plot(x, y, '.', linewidth=2, color=(0, 0, 0)) 248 | plt.gca().set_aspect('equal', adjustable='box') 249 | plt.show() 250 | 251 | def plot_points_with_lines(self): 252 | rcParams['figure.figsize'] = 10, 10 253 | groups = self.get_stroke_group() 254 | for g in groups: 255 | x = [p.x for p in g] 256 | y = [-p.y for p in g] 257 | plt.plot(x, y, '.', linewidth=2, color=(0, 0, 0)) 258 | plt.gca().set_aspect('equal', adjustable='box') 259 | plt.show() 260 | 261 | def plot_strokes(self): 262 | def colorline(x, y, color, linewidth=20): 263 | n_pts = len(x) 264 | colors = [(color + (1 - 0.9 * j / n_pts,)) for j in range(n_pts)] 265 | segments = make_segments(x, y) 266 | lc = mcoll.LineCollection(segments, colors=colors, linewidth=linewidth) 267 | ax = plt.gca() 268 | ax.add_collection(lc) 269 | return lc 270 | 271 | def make_segments(x, y): 272 | points = np.array([x, y]).T.reshape(-1, 1, 2) 273 | segments = np.concatenate([points[:-1], points[1:]], axis=1) 274 | return segments 275 | 276 | rcParams['figure.figsize'] = 10, 10 277 | c = cycle([(1, 0, 0), (0, 0.6, 0), (0, 0, 0.7)]) 278 | fig, ax = plt.subplots() 279 | xmin, xmax = self.range_x() 280 | xpad = (xmax - xmin) * 0.05 281 | xmin, xmax = xmin - xpad, xmax + xpad 282 | ymin, ymax = self.range_y() 283 | ypad = (ymax - ymin) * 0.05 284 | ymin, ymax = -ymax - ypad, -ymin + ypad 285 | ax.set_xlim(xmin, xmax) 286 | ax.set_ylim(ymin, ymax) 287 | groups = self.get_stroke_group() 288 | for g in groups: 289 | color = next(c) 290 | x = [p.x for p in g] 291 | y 
= [-p.y for p in g]
292 |             path = mpath.Path(np.column_stack([x, y]))
293 |             verts = path.interpolated(steps=3).vertices
294 |             x, y = verts[:, 0], verts[:, 1]
295 |             colorline(x, y, color, linewidth=2)
296 | 
297 |         plt.gca().set_aspect('equal', adjustable='box')
298 |         plt.show()
299 | 
300 |     def plot_both(self):
301 |         self.plot_points()
302 |         self.plot_strokes()
303 | 
304 |     def __repr__(self):
305 |         return "<PointSet w={} h={} points={}>".format(self.w, self.h, len(self.points))
306 | 
307 | 
308 | class Point:
309 | 
310 |     def __init__(self, stroke, time, x, y):
311 |         self.stroke = stroke
312 |         self.time = time
313 |         self.x = x
314 |         self.y = y
315 | 
316 |     def coordinates(self):
317 |         return np.array([self.x, self.y])
318 | 
319 |     def normalize(self, mean_x, mean_y, sd):
320 |         self.x = (self.x - mean_x) / sd
321 |         self.y = (self.y - mean_y) / sd
322 |         return self
323 | 
324 |     def displace(self, co_diff, t_diff):
325 |         new_x, new_y = self.coordinates() + co_diff
326 |         return Point(self.stroke, self.time + t_diff, new_x, new_y)
327 | 
328 |     def __repr__(self):
329 |         return "<Point stroke={} time={} x={} y={}>".format(self.stroke,
330 |                                                             self.time, self.x, self.y)
331 | 
332 | 
333 | class Line:
334 |     def __init__(self, p1, p2, eos=False):
335 |         self.p1 = p1
336 |         self.p2 = p2
337 |         self.eos = eos
338 | 
339 |     def vec(self):
340 |         return self.p2.coordinates() - self.p1.coordinates()
341 | 
342 |     def normalized_vec(self):
343 |         return self.vec() / self.length()
344 | 
345 |     def length(self):
346 |         return np.linalg.norm(self.vec())
347 | 
348 |     def time_diff(self):
349 |         return self.p2.time - self.p1.time
350 | 
351 |     def cosine_similarity(self, l):
352 |         if self.length() * l.length() < 1e-5:
353 |             return np.inf  # degenerate line: inf is never below a cosine threshold
354 |         return np.dot(self.vec(), l.vec()) / (self.length() * l.length())
355 | 
356 |     def proj_x(self):
357 |         return self.length() * (self.p1.x + self.p2.x) / 2
358 | 
359 |     def proj_y(self):
360 |         return self.length() * (self.p1.y + self.p2.y) / 2
361 | 
362 |     def var_x(self, mean_x):
363 |         return self.length() / 3 * ((self.p2.x - mean_x) ** 2 + (self.p1.x - mean_x) ** 2 +
364 |                                     (self.p1.x - mean_x) * (self.p2.x - mean_x))
365 | 
366 |     def var_y(self, mean_y):
367 |         return self.length() / 3 * ((self.p2.y - mean_y) ** 2 + (self.p1.y - mean_y) ** 2 +
368 |                                     (self.p1.y - mean_y) * (self.p2.y - mean_y))
369 | 
370 |     def interpolate(self, pc):
371 |         return self.p1.displace(self.vec() * pc, self.time_diff() * pc)
372 | 
373 |     # x, y, delta_x, delta_y, down, up
374 |     def get_features(self):
375 |         x_start = self.p1.x
376 |         y_start = self.p1.y
377 |         delta_x, delta_y = self.vec()
378 |         down = not self.eos
379 |         up = self.eos
380 |         return np.array([x_start, y_start, delta_x, delta_y, down, up])
381 | 
382 |     # x, y, normalized direction x, normalized direction y, down, up
383 |     def get_features_2(self):
384 |         x_start = self.p1.x
385 |         y_start = self.p1.y
386 |         length = self.length()
387 |         direction_x, direction_y = self.vec() / length
388 |         down = not self.eos
389 |         up = self.eos
390 |         return np.array([x_start, y_start, direction_x, direction_y, down, up])
391 | 
392 |     def __repr__(self):
393 |         return "<Line {} {}>".format(self.p1, self.p2)
394 | 
-------------------------------------------------------------------------------- /demo/data_reader.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from hwr.data.reader import IAMReader\n", 10 | "from hwr.constants import SPLIT, PREPROCESS" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | 
"metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "" 22 | ] 23 | }, 24 | "execution_count": 12, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "# Initialize reader with a given split\n", 31 | "reader_train = IAMReader(SPLIT.TRAIN)\n", 32 | "reader_all = IAMReader(SPLIT.ALL)\n", 33 | "reader_train" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "(5361, 12171)" 45 | ] 46 | }, 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "# Get samples from reader\n", 54 | "train_samples = reader_train.get_samples()\n", 55 | "all_samples = reader_all.get_samples()\n", 56 | "len(train_samples), len(all_samples)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 8, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "execution_count": 8, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "# sample is wrapped in a class\n", 77 | "sample = train_samples[0]\n", 78 | "sample" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 13, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmIAAABjCAYAAAA4lWxqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJztnX2MHGed57+/6rf4buGCB98OeXEcFK/ORj7Z4Ay0bj3prIMXwyWeKCuUxbnJxcbjSWyOESdNMJDTnMxONoY9JqsYpzvxWG6J2+xLSC5BRAR73U5EV16cHSAvDsFhSUiED5iFvVvpPD3T/dwfXU/56aef6q6e7q6qnvl9pFJ3VVdXPfVU1fP8nt/bQ0IIMAzDMAzDMMFjhV0AhmEYhmGY5QoLYgzDMAzDMCHBghjDMAzDMExIsCDGMAzDMAwTEiyIMQzDMAzDhAQLYgzDMAzDMCHBghjDMAzDMExIsCDGMAzDMAwTEiyIMQzDMAzDhEQ8rBMT0ScA3A8gBuBhIcSfN9r//e9/v1izZk0QRWMYhmEYhmmLl1566TdCiFXN9gtFECOiGIDDAD4O4B0ALxLRE0KI17z+s2bNGpw5cyaoIjIMwzABYts2CoUCMpkM0ul02MVhmLYhorf87BeWaXIAwDkhxM+EECUAjwDYEVJZGIZhmBDJ5XLYsmULvvSlL2HLli3I5XJhF4kJgbvvvhtXXnklrrvuOti2HXZxAiMsQexyAL9Q1t9xttVARCNEdIaIzvz6178OrHAMwzBMMNi2jTvvvBPlchkAUC6Xcddddy2rjpipCmGHDh3CO++8g2eeeQZbtmxZNs9ApJ31hRA5IcRmIcTmVauamlkZhmGYHuPQoUOoVCo128rlMvL5fEglYsLg29/+ds16uVxGoVAIpzABE5Yg9i6AK5X1K5xtDMMwzDLBtm088cQTxt+OHTu2bDQiDPDRj360Zt2yLGQymXAKEzBhRU2+CGAtEV2NqgB2K4DPhFQWpsOoTrcA6r739fVhdnaWnXIZZpmTz+frtGGShYUFFAoFbiOWCR/60Idq1kdGRpbNvQ9FEBNCLBDRfgDfQzV9xbQQ4tUwysJ0Bil89fX1YWxsDKVSCbFYDESEhYUF9/v8/DwqlQosy0IqlcLJkyeXzcvGMEwtr71WGyi/du1avPXWW5ifn0csFls2GhEG6Ovrq1nftGlTSCUJntDyiAkhvgvgu2Gdn2kfk/BFRKhUKu4CAEKImu8AUKlUUCqVeMS7TLFt2/UBGh4e5mdgmXLhwoWa9Xg8DiICAPeTWR7Mzs7Csix3oD47Oxt2kQIjNEFsKaEKJMvF5GbbNrZu3VonfFmW5Wq/mmnEkskkj3iXIbZt4/rrr8fc3BwA4OjRozh9+nTdO5PL5fDoo4/illtuwcjISM3/WYhbGqxduxYvvPCCu75q1Sq88cYbEEKwaXKZkclkEI/HMT8/j3g8vqz6BhbEFomuDZqbm0OlUnEFkMOHD9d0HksJ27YxMTHhXrMqfCWTSUxNTbkCKcA+YkwthUIBpVLJXZ+fn0c+n695FnK5HPbu3QsAePrppwFUfUZ0IW56epo76x7Ftm088sgjNdvWr1+PF198EaVSiQdqy5Dlqg1lQawFmpniALgjuf3792PDhg1LqoOQmohjx47V+Xqpwpd+zer6UqoPZnFkMhkQkWumBup9hY4ePVq3PjIygkKh4AphQFWIUwWxXC6HqakpEBE+//nPL9nB0FIgn8+7ucMAIBaLYXh4GMPDw5y6YhlSKBQwPz8PIUTde73UYUGsCVL4OH/+PJ566iksLCwYTXEAaqJ/FhYW6kb5vYyuiQCq4cU33HADJiYmjNfpZVpiljfpdBo33XQTHn/8cXfbD37wA9i27T5Hl112Wc1/LrnkEgD1Dr2qQ7eqRQPgfudnrze48cYbkU6nYds2jh8/jlKphOPHj3NAzzKhr6/P7UMrlUrdu76UiXRC17CwbRv33nsvcrkcrr/+ejz44IN4/PHHMTc3h3K5jEqlglgshlgshlQqhQceeABf/epXMT4+DsuqVqkQYknlwcnn8zVCGBEhlUo1FML27t2Lp59+Gnv37uUpS5gaxsfH3QEMUH1f1OSN+u/P
89 |       "text/plain": [
90 |        "<Figure size ... with 1 Axes>"
" 91 | ] 92 | }, 93 | "metadata": { 94 | "needs_background": "light" 95 | }, 96 | "output_type": "display_data" 97 | } 98 | ], 99 | "source": [ 100 | "# visualize sample\n", 101 | "sample.visualize()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 14, 107 | "metadata": { 108 | "scrolled": true 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "('/home/jasper/Desktop/fyp/HWR/hwr/../data/iamon/lineStrokes(on)/data/a01/a01-001/a01-001w-01.xml',\n", 115 | " ,\n", 116 | " 'By Trevor Williams . A move to ')" 117 | ] 118 | }, 119 | "execution_count": 14, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "# samples attributes, pointset is derived from the xml file\n", 126 | "sample.xml_path, sample.pointset, sample.get_ground_truth_text()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 15, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "(array([[-2.49708771e+01, -2.11171758e+00, 5.45721502e-02,\n", 138 | " 2.26284976e-01, 1.00000000e+00, 0.00000000e+00],\n", 139 | " [-2.49163050e+01, -1.88543260e+00, 5.45721502e-02,\n", 140 | " 2.26284976e-01, 1.00000000e+00, 0.00000000e+00],\n", 141 | " [-2.48617328e+01, -1.65914762e+00, 5.45721502e-02,\n", 142 | " 2.26284976e-01, 1.00000000e+00, 0.00000000e+00],\n", 143 | " ...,\n", 144 | " [ 2.71303422e+01, -4.96609488e-01, -3.68090784e-01,\n", 145 | " -3.75389779e-02, 1.00000000e+00, 0.00000000e+00],\n", 146 | " [ 2.67622514e+01, -5.34148466e-01, -3.68090784e-01,\n", 147 | " -3.75389779e-02, 1.00000000e+00, 0.00000000e+00],\n", 148 | " [ 2.63941606e+01, -5.71687444e-01, -1.85017042e-01,\n", 149 | " -1.88685806e-02, 0.00000000e+00, 1.00000000e+00]]), (348, 6))" 150 | ] 151 | }, 152 | "execution_count": 15, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "# Get features given a preprocess scheme. 
158 |     "# Get features given a preprocess scheme. Dimension: (timestep, number_of_features)\n",
159 |     "features = sample.generate_features(preprocess=PREPROCESS.SCHEME6)\n",
160 |     "features, features.shape"
161 |    ]
162 |   }
163 |  ],
164 |  "metadata": {
165 |   "kernelspec": {
166 |    "display_name": "tf_gpu",
167 |    "language": "python",
168 |    "name": "tf_gpu"
169 |   },
170 |   "varInspector": {
171 |    "cols": {
172 |     "lenName": 16,
173 |     "lenType": 16,
174 |     "lenVar": 40
175 |    },
176 |    "kernels_config": {
177 |     "python": {
178 |      "delete_cmd_postfix": "",
179 |      "delete_cmd_prefix": "del ",
180 |      "library": "var_list.py",
181 |      "varRefreshCmd": "print(var_dic_list())"
182 |     },
183 |     "r": {
184 |      "delete_cmd_postfix": ") ",
185 |      "delete_cmd_prefix": "rm(",
186 |      "library": "var_list.r",
187 |      "varRefreshCmd": "cat(var_dic_list()) "
188 |     }
189 |    },
190 |    "types_to_exclude": [
191 |     "module",
192 |     "function",
193 |     "builtin_function_or_method",
194 |     "instance",
195 |     "_Feature"
196 |    ],
197 |    "window_display": false
198 |   }
199 |  },
200 |  "nbformat": 4,
201 |  "nbformat_minor": 2
202 | }
203 | 
--------------------------------------------------------------------------------
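The training notebook that follows attaches the CTC loss to ONNET as a Lambda layer: in the model summary below, the ctc (Lambda) output is fed by the softmax activations together with the label input ys and the two sequence-length inputs ypred_length and ytrue_length. A minimal sketch of that standard Keras CTC pattern, assuming the TF 1.x-era tf.keras used in the notebook; the single Dense layer is a stand-in for ONNET's convolutional and recurrent stack, everything else mirrors the input names in the summary:

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.models import Model

def ctc_loss(args):
    # K.ctc_batch_cost expects (labels, y_pred, input_length, label_length)
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

xs = Input(name="xs", shape=(None, 6))                 # (timestep, 6) feature rows
softmax = Dense(83, activation="softmax")(xs)          # 83 classes incl. the CTC blank
ys = Input(name="ys", shape=(None,))                   # integer-encoded transcriptions
ypred_length = Input(name="ypred_length", shape=(1,))  # softmax timesteps per sample
ytrue_length = Input(name="ytrue_length", shape=(1,))  # transcription length per sample
ctc = Lambda(ctc_loss, name="ctc")([softmax, ys, ypred_length, ytrue_length])

model = Model([xs, ys, ypred_length, ytrue_length], ctc)
# The Lambda output already is the loss, so compilation just passes it through.
model.compile(optimizer="adam", loss={"ctc": lambda y_true, y_pred: y_pred})

Note that the real network halves the time axis twice with average pooling, so each output timestep covers four input points; that is presumably why the IAMSequence generators below are constructed with inout_ratio=4.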
/demo/training.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "from hwr.data.generator import IAMSequence\n",
10 |     "from hwr.constants import SPLIT\n",
11 |     "from hwr.models.ONNET import ONNET\n",
12 |     "from hwr.decoding.ctc_decoder import BestPathDecoder, TrieBeamSearchDecoder\n",
13 |     "from matplotlib import pyplot as plt\n",
14 |     "import numpy as np"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 3,
20 |    "metadata": {
21 |     "scrolled": true
22 |    },
23 |    "outputs": [
24 |     {
25 |      "name": "stdout",
26 |      "output_type": "stream",
27 |      "text": [
28 |       "preloading model weights from /home/jasper/Desktop/fyp/HWR/hwr/../models/iamon/ONNET/pretrained-deep-lstm/weights.h5\n",
29 |       "__________________________________________________________________________________________________\n",
30 |       "Layer (type) Output Shape Param # Connected to \n",
31 |       "==================================================================================================\n",
32 |       "xs (InputLayer) (None, None, 6) 0 \n",
33 |       "__________________________________________________________________________________________________\n",
34 |       "conv1d (Conv1D) (None, None, 60) 2580 xs[0][0] \n",
35 |       "__________________________________________________________________________________________________\n",
36 |       "batch_normalization (BatchNorma (None, None, 60) 240 conv1d[0][0] \n",
37 |       "__________________________________________________________________________________________________\n",
38 |       "activation (Activation) (None, None, 60) 0 batch_normalization[0][0] \n",
39 |       "__________________________________________________________________________________________________\n",
40 |       "conv1d_1 (Conv1D) (None, None, 90) 27090 activation[0][0] \n",
41 |       "__________________________________________________________________________________________________\n",
42 |       "batch_normalization_1 (BatchNor (None, None, 90) 360 conv1d_1[0][0] \n",
43 |       "__________________________________________________________________________________________________\n",
44 |       "activation_1 (Activation) (None, None, 90) 0 batch_normalization_1[0][0] \n",
45 |       "__________________________________________________________________________________________________\n",
46 |       "conv1d_2 (Conv1D) (None, None, 120) 54120 activation_1[0][0] \n",
47 |       "__________________________________________________________________________________________________\n",
48 |       "batch_normalization_2 (BatchNor (None, None, 120) 480 conv1d_2[0][0] \n",
49 |       "__________________________________________________________________________________________________\n",
50 |       "activation_2 (Activation) (None, None, 120) 0 batch_normalization_2[0][0] \n",
51 |       "__________________________________________________________________________________________________\n",
52 |       "average_pooling1d (AveragePooli (None, None, 120) 0 activation_2[0][0] \n",
53 |       "__________________________________________________________________________________________________\n",
54 |       "conv1d_3 (Conv1D) (None, None, 120) 43320 average_pooling1d[0][0] \n",
55 |       "__________________________________________________________________________________________________\n",
56 |       "batch_normalization_3 (BatchNor (None, None, 120) 480 conv1d_3[0][0] \n",
57 |       "__________________________________________________________________________________________________\n",
58 |       "activation_3 (Activation) (None, None, 120) 0 batch_normalization_3[0][0] \n",
59 |       "__________________________________________________________________________________________________\n",
60 |       "conv1d_4 (Conv1D) (None, None, 160) 57760 activation_3[0][0] \n",
61 |       "__________________________________________________________________________________________________\n",
62 |       "batch_normalization_4 (BatchNor (None, None, 160) 640 conv1d_4[0][0] \n",
63 |       "__________________________________________________________________________________________________\n",
64 |       "activation_4 (Activation) (None, None, 160) 0 batch_normalization_4[0][0] \n",
65 |       "__________________________________________________________________________________________________\n",
66 |       "conv1d_5 (Conv1D) (None, None, 200) 96200 activation_4[0][0] \n",
67 |       "__________________________________________________________________________________________________\n",
68 |       "batch_normalization_5 (BatchNor (None, None, 200) 800 conv1d_5[0][0] \n",
69 |       "__________________________________________________________________________________________________\n",
70 |       "activation_5 (Activation) (None, None, 200) 0 batch_normalization_5[0][0] \n",
71 |       "__________________________________________________________________________________________________\n",
72 |       "average_pooling1d_1 (AveragePoo (None, None, 200) 0 activation_5[0][0] \n",
73 |       "__________________________________________________________________________________________________\n",
74 |       "cu_dnnlstm (CuDNNLSTM) (None, None, 60) 62880 average_pooling1d_1[0][0] \n",
75 |       "__________________________________________________________________________________________________\n",
76 |       "cu_dnnlstm_1 (CuDNNLSTM) (None, None, 60) 62880 average_pooling1d_1[0][0] \n",
77 |       "__________________________________________________________________________________________________\n",
78 |       "concatenate (Concatenate) (None, None, 120) 0 cu_dnnlstm[0][0] \n",
79 |       " cu_dnnlstm_1[0][0] \n",
80 |       "__________________________________________________________________________________________________\n",
81 |       "cu_dnnlstm_2 (CuDNNLSTM) (None, None, 60) 43680 concatenate[0][0] \n",
82 |       "__________________________________________________________________________________________________\n",
83 |       "cu_dnnlstm_3 (CuDNNLSTM) (None, None, 60) 43680 concatenate[0][0] \n",
84 | 
"__________________________________________________________________________________________________\n", 85 | "concatenate_1 (Concatenate) (None, None, 120) 0 cu_dnnlstm_2[0][0] \n", 86 | " cu_dnnlstm_3[0][0] \n", 87 | "__________________________________________________________________________________________________\n", 88 | "cu_dnnlstm_4 (CuDNNLSTM) (None, None, 60) 43680 concatenate_1[0][0] \n", 89 | "__________________________________________________________________________________________________\n", 90 | "cu_dnnlstm_5 (CuDNNLSTM) (None, None, 60) 43680 concatenate_1[0][0] \n", 91 | "__________________________________________________________________________________________________\n", 92 | "concatenate_2 (Concatenate) (None, None, 120) 0 cu_dnnlstm_4[0][0] \n", 93 | " cu_dnnlstm_5[0][0] \n", 94 | "__________________________________________________________________________________________________\n", 95 | "cu_dnnlstm_6 (CuDNNLSTM) (None, None, 60) 43680 concatenate_2[0][0] \n", 96 | "__________________________________________________________________________________________________\n", 97 | "cu_dnnlstm_7 (CuDNNLSTM) (None, None, 60) 43680 concatenate_2[0][0] \n", 98 | "__________________________________________________________________________________________________\n", 99 | "concatenate_3 (Concatenate) (None, None, 120) 0 cu_dnnlstm_6[0][0] \n", 100 | " cu_dnnlstm_7[0][0] \n", 101 | "__________________________________________________________________________________________________\n", 102 | "batch_normalization_6 (BatchNor (None, None, 120) 480 concatenate_3[0][0] \n", 103 | "__________________________________________________________________________________________________\n", 104 | "dense (Dense) (None, None, 83) 10043 batch_normalization_6[0][0] \n", 105 | "__________________________________________________________________________________________________\n", 106 | "softmax (Activation) (None, None, 83) 0 dense[0][0] \n", 107 | "__________________________________________________________________________________________________\n", 108 | "ys (InputLayer) (None, None) 0 \n", 109 | "__________________________________________________________________________________________________\n", 110 | "ypred_length (InputLayer) (None, 1) 0 \n", 111 | "__________________________________________________________________________________________________\n", 112 | "ytrue_length (InputLayer) (None, 1) 0 \n", 113 | "__________________________________________________________________________________________________\n", 114 | "ctc (Lambda) (None, 1) 0 softmax[0][0] \n", 115 | " ys[0][0] \n", 116 | " ypred_length[0][0] \n", 117 | " ytrue_length[0][0] \n", 118 | "==================================================================================================\n", 119 | "Total params: 682,433\n", 120 | "Trainable params: 680,693\n", 121 | "Non-trainable params: 1,740\n", 122 | "__________________________________________________________________________________________________\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "model = ONNET(preload=True, gru=False, gpu=True)\n", 128 | "model.get_model_summary()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "p = 6\n", 138 | "train_seq = IAMSequence(SPLIT.TRAIN, batch_size=20, preprocess=p,\n", 139 | " npz=True, inout_ratio=4)\n", 140 | "val1_seq = IAMSequence(SPLIT.VAL1, batch_size=100, preprocess=p,\n", 141 | " npz=True, inout_ratio=4)\n", 142 | "eval_seq = 
IAMSequence(SPLIT.TEST, batch_size=100, preprocess=p,\n", 143 | " npz=True, pred=True, pad_to=(1200, 300))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Epoch 1/3000\n", 156 | " 19/269 [=>............................] - ETA: 1:12 - loss: 5.0801" 157 | ] 158 | }, 159 | { 160 | "ename": "KeyboardInterrupt", 161 | "evalue": "", 162 | "output_type": "error", 163 | "traceback": [ 164 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 165 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 166 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_seq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval1_seq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mearlystop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 167 | "\u001b[0;32m~/Desktop/fyp/HWR/hwr/models/model.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(self, train_seq, test_seq, epochs, earlystop)\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 78\u001b[0;31m \u001b[0mcallbacks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcp_callback\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mes_callback\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 79\u001b[0m )\n\u001b[1;32m 80\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 168 | "\u001b[0;32m~/anaconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 2175\u001b[0m \u001b[0muse_multiprocessing\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_multiprocessing\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2176\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2177\u001b[0;31m initial_epoch=initial_epoch)\n\u001b[0m\u001b[1;32m 2178\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2179\u001b[0m def evaluate_generator(self,\n", 169 | "\u001b[0;32m~/anaconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_generator.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m outs = model.train_on_batch(\n\u001b[0;32m--> 176\u001b[0;31m x, y, 
sample_weight=sample_weight, class_weight=class_weight)\n\u001b[0m\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 170 | "\u001b[0;32m~/anaconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36mtrain_on_batch\u001b[0;34m(self, x, y, sample_weight, class_weight)\u001b[0m\n\u001b[1;32m 1938\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1939\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_train_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1940\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1941\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1942\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 171 | "\u001b[0;32m~/anaconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/keras/backend.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, inputs)\u001b[0m\n\u001b[1;32m 2984\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2985\u001b[0m fetched = self._callable_fn(*array_vals,\n\u001b[0;32m-> 2986\u001b[0;31m run_metadata=self.run_metadata)\n\u001b[0m\u001b[1;32m 2987\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_fetch_callbacks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fetches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2988\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfetched\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 172 | "\u001b[0;32m~/anaconda3/envs/tf_gpu/lib/python3.6/site-packages/tensorflow/python/client/session.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1437\u001b[0m ret = tf_session.TF_SessionRunCallable(\n\u001b[1;32m 1438\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_handle\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1439\u001b[0;31m run_metadata_ptr)\n\u001b[0m\u001b[1;32m 1440\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mrun_metadata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 173 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "model.train(train_seq, val1_seq, epochs=3000, earlystop=10)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": { 185 | "scrolled": true 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "39/39 [==============================] - 6s 148ms/step\n" 193 | ] 194 | }, 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "{'character_error_rate': 0.08255496354932794,\n", 199 | " 'word_error_rate': 0.3170504844629366}" 200 | ] 201 | }, 202 | "execution_count": 7, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "#TrieBeamSearchDecoder(25, lm=\"sbo\", ngram=7, prune=10, trie=\"100k\", gamma=1)\n", 209 | "model.evaluate(eval_seq, decoder=BestPathDecoder())" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "model.load_weights(\"../models/iamon/checkpoint/ONNET/2019-09-29-13:04:27/weights.h5\", full_path=True)" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "tf_gpu", 225 | "language": "python", 226 | "name": "tf_gpu" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.8" 239 | }, 240 | "varInspector": { 241 | "cols": { 242 | "lenName": 16, 243 | "lenType": 16, 244 | "lenVar": 40 245 | }, 246 | "kernels_config": { 247 | "python": { 248 | "delete_cmd_postfix": "", 249 | "delete_cmd_prefix": "del ", 250 | "library": "var_list.py", 251 | "varRefreshCmd": "print(var_dic_list())" 252 | }, 253 | "r": { 254 | "delete_cmd_postfix": ") ", 255 | "delete_cmd_prefix": "rm(", 256 | "library": "var_list.r", 257 | "varRefreshCmd": "cat(var_dic_list()) " 258 | } 259 | }, 260 | "types_to_exclude": [ 261 | "module", 262 | "function", 263 | "builtin_function_or_method", 264 | "instance", 265 | "_Feature" 266 | ], 267 | "window_display": false 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 2 272 | } 273 | --------------------------------------------------------------------------------
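The evaluation cell in training.ipynb above scores the pretrained model at roughly 8.3% character error rate and 31.7% word error rate using BestPathDecoder; the commented-out TrieBeamSearchDecoder line shows the slower, language-model-guided alternative. Best-path (greedy) CTC decoding simply takes the argmax class at every timestep, collapses consecutive repeats, and drops blanks. A minimal numpy sketch, independent of hwr.decoding.ctc_decoder (the function name best_path_decode is hypothetical; Keras' ctc_batch_cost reserves the last class index for the blank):

import numpy as np

def best_path_decode(probs, blank):
    """Greedy CTC decode; probs has shape (timesteps, n_classes)."""
    best = np.argmax(probs, axis=1)           # most likely class per timestep
    decoded, prev = [], blank
    for label in best:
        if label != prev and label != blank:  # collapse repeats, drop blanks
            decoded.append(int(label))
        prev = label
    return decoded

# Toy run with 3 classes where index 2 is the blank:
# the argmax path [0, 0, 2, 1] decodes to [0, 1].
probs = np.array([[0.9, 0.0, 0.1],
                  [0.8, 0.1, 0.1],
                  [0.1, 0.1, 0.8],
                  [0.1, 0.8, 0.1]])
assert best_path_decode(probs, blank=2) == [0, 1]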