├── .gitignore ├── LICENSE ├── README.md ├── config ├── __init__.py ├── default.py ├── french_large_base.yaml ├── french_small_base.yaml └── old_run_params.py ├── data └── french │ ├── dev.txt │ ├── test.txt │ └── train.txt ├── input.conll ├── neuralnets ├── BiLSTM.py ├── __init__.py ├── keraslayers │ ├── .gitignore │ ├── ChainCRF.py │ └── __init__.py └── utils.py ├── preprocessing.py ├── requirements.txt ├── res ├── network-diagram.PNG └── results.png ├── results ├── analyze_res_folder.py └── plot.py ├── run.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .h5 107 | .vscode 108 | models/ 109 | 110 | # csv results 111 | *.csv 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jacob Krantz, Maxwell Dulin, and Paul De Palma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Language-Agnostic Syllabification with Neural Sequence Labeling 2 | 3 | ### Details 4 | 5 | This syllabifier treats the syllabification problem as a sequence labeling task where syllables can be trivially recovered from boundary labels. Our network uses both an LSTM and a convolutional component. To decode an output sequence, a linear-chain conditional random field (CRF) is used, which improves accuracy over a standard softmax by one to two percentage points. 6 | 7 | Note that this repository contains code used in experimentation for research purposes. There may be issues hidden in places we don't know about. Feel free to contact us or open an issue. 8 |
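In our data files, a boundary label of 1 on a phone means a syllable boundary directly follows that phone (see the Data section below). As a quick illustration of how syllables are recovered from boundary labels, here is a minimal sketch; it is not part of the package:
```python
def labels_to_syllables(phones, boundaries):
    """Rebuild syllables from per-phone labels (1 = boundary after this phone)."""
    syllables, current = [], []
    for phone, boundary in zip(phones, boundaries):
        current.append(phone)
        if boundary == 1:
            syllables.append("".join(current))
            current = []
    if current:  # flush the final syllable
        syllables.append("".join(current))
    return syllables

# Example from the Data section below:
# labels_to_syllables(list("aRboRE"), [0, 1, 0, 1, 0, 0]) -> ['aR', 'bo', 'RE']
```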
[Figure: res/network-diagram.PNG (Syllabification Network Diagram)]
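For reference, the diagram above corresponds roughly to the following condensed sketch of `build_model` in `neuralnets/BiLSTM.py`, using the hyperparameters of `config/french_small_base.yaml` and illustrative vocabulary, label, and word-length sizes. The real implementation additionally handles the RNN-only and CNN-only variants, the softmax classifier, and the optimizer settings:
```python
from neuralnets.utils import try_tensorflow_import

try_tensorflow_import()  # same TF setup BiLSTM.py performs (disables eager execution)

from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D,
    concatenate, TimeDistributed, Dense,
)
from tensorflow.keras.models import Model
from neuralnets.keraslayers.ChainCRF import ChainCRF

vocab_size, n_class_labels, word_length = 50, 2, 28  # illustrative sizes

phones_input = Input(shape=(word_length,), dtype="float32", name="phones_input")
embedded = Embedding(vocab_size, 100, name="phone_embeddings")(phones_input)

# Recurrent branch: a bidirectional LSTM over the phone embeddings.
rnn_branch = Bidirectional(LSTM(50, return_sequences=True, dropout=0.25))(embedded)

# Convolutional branch, run in parallel to the LSTM rather than before it.
cnn_branch = Conv1D(40, 3, padding="same")(embedded)
cnn_branch = MaxPooling1D(pool_size=2, strides=1, padding="same")(cnn_branch)

# Merge both views of the word, score each phone, and decode with the CRF.
merged = concatenate([rnn_branch, cnn_branch])
scores = TimeDistributed(Dense(n_class_labels, activation=None))(merged)
crf = ChainCRF(name="crf")
boundaries = crf(scores)

model = Model(inputs=[phones_input], outputs=[boundaries])
model.compile(loss=crf.sparse_loss, optimizer="adam")
```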
13 | 14 | The repository structure and primary code files are adapted from [2] and can be found [here](https://github.com/UKPLab/emnlp2017-bilstm-cnn-crf). 15 | 16 | #### How well does this system work? 17 | The proposed model achieved accuracies higher than any others we could find on the Dutch, Italian, French, and Basque datasets, and close to the best reported accuracy for English. The results on Manipuri were weaker than for the other languages, likely because less labeled syllable data is available for Manipuri. 18 | 19 |
[Figure: res/results.png]
23 | 24 | ### Data 25 | 26 | The `data` folder should contain all the datasets to be used with the syllabifier. The processed form of the French [dataset](http://www.lexique.org/) is included in this folder. This freely available dataset includes about 140,000 unique words with transcribed syllabification data [1]. To access the processed datasets that were used in the paper, contact the authors; this is much faster than regenerating them from their sources. Included are generation scripts and datasets for English, Dutch, French, Italian, Manipuri, and Basque. Data files are in CoNLL format, where each line contains a phone and either a 1 or a 0 denoting the presence or absence of a syllable boundary, respectively. Blank lines separate words. 27 | 28 | The example phone sequence `aRboRE` would be syllabified as `[aR] [bo] [RE]` and is represented in our data files as such: 29 | ``` 30 | a 0 31 | R 1 32 | b 0 33 | o 1 34 | R 0 35 | E 0 36 | ``` 37 | 38 | ### Citing lstm-syllabify 39 | 40 | If this project contributed to your research, please cite the following [paper](https://arxiv.org/abs/1909.13362): 41 | ``` 42 | @article{krantz2019language, 43 | title={Language-Agnostic Syllabification with Neural Sequence Labeling}, 44 | author={Krantz, Jacob and Dulin, Maxwell and De Palma, Paul}, 45 | journal={arXiv preprint arXiv:1909.13362}, 46 | year={2019} 47 | } 48 | ``` 49 | 50 | ### Contact 51 | 52 | Corresponding author: Jacob Krantz 53 | Email: krantzja [at] oregonstate [dot] edu 54 | 55 | ### Acknowledgments 56 | 57 | This research was supported in part by a Gonzaga University McDonald Work Award by Robert and Claire McDonald and an Amazon Web Services (AWS) grant through the Cloud Credits for Research program. 58 | 59 | ### Citations 60 | 61 | ``` 62 | [1] B. New, C. Pallier, M. Brysbaert, and L. Ferrand, “Lexique 2: A new 63 | French lexical database,” Behavior Research Methods, Instruments, & 64 | Computers, vol. 36, no. 3, pp. 516–524, 2004. 65 | 66 | [2] N. Reimers and I. Gurevych, “Reporting score distributions makes a 67 | difference: Performance study of LSTM-networks for sequence tagging,” 68 | in Proceedings of EMNLP 2017, 2017, pp. 338–348.
69 | ``` 70 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as Config 2 | 3 | from config.default import get_cfg_defaults 4 | 5 | __all__ = ["Config", "get_cfg_defaults"] 6 | -------------------------------------------------------------------------------- /config/default.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | _C = CN() 4 | _C.CONFIG_NAME = "FinalParamsLarge" 5 | # ------------------------------------------------------------ 6 | # MODEL PARAMETERS 7 | # ------------------------------------------------------------ 8 | _C.MODEL = CN() 9 | _C.MODEL.DROPOUT = 0.25 10 | _C.MODEL.RECURRENT_DROPOUT = 0.0 11 | _C.MODEL.USE_RNN = True 12 | _C.MODEL.RNN = "lstm" # either 'lstm' or 'gru' 13 | _C.MODEL.RNN_SIZE = 300 14 | _C.MODEL.USE_CNN = True 15 | _C.MODEL.CNN_LAYERS = 2 # must be greater than 0 16 | _C.MODEL.NUM_FILTERS = 200 17 | _C.MODEL.FILTER_SIZE = 3 18 | _C.MODEL.MAX_POOL_SIZE = 2 # if None or False, do not use MaxPooling 19 | _C.MODEL.CLASSIFIER = "crf" # either 'softmax' or 'crf' (by Philipp Gross) 20 | _C.MODEL.EMBEDDING_SIZE = 300 21 | # ------------------------------------------------------------ 22 | # TRAINING PARAMETERS 23 | # ------------------------------------------------------------ 24 | _C.TRAINING = CN() 25 | # same as the language in ./data folder 26 | # ["english", "italian", "basque", "dutch", "manipuri", "french"] 27 | _C.TRAINING.DATASET = "french" 28 | _C.TRAINING.FEATURE_NAMES = ["tokens"] 29 | _C.TRAINING.MODEL_SAVE_PATH = "models/[DATASET]_[Epoch]_[DevScore]_[TestScore].h5" 30 | # used to get mean and standard deviation of model 31 | _C.TRAINING.TRAINING_REPEATS = 2 # just trains the model once 32 | _C.TRAINING.EPOCHS = 120 33 | _C.TRAINING.MINI_BATCH_SIZE = 64 34 | _C.TRAINING.EARLY_STOPPING = 10 35 | # ------------------------------------------------------------ 36 | # OPTIMIZATION PARAMETERS 37 | # ------------------------------------------------------------ 38 | _C.OPTIMIZER = CN() 39 | _C.OPTIMIZER.OPTIMIZER = "adam" 40 | _C.OPTIMIZER.CLIP_NORM = 0.0 # must be >= 0.0 41 | _C.OPTIMIZER.CLIP_VALUE = 0.0 42 | 43 | # SEE IF DROPOUT WORKS WITH LSTM 44 | 45 | 46 | def get_cfg_defaults(): 47 | """Get a yacs CfgNode object with default values""" 48 | return _C.clone() 49 | -------------------------------------------------------------------------------- /config/french_large_base.yaml: -------------------------------------------------------------------------------- 1 | CONFIG_NAME: "FinalParamsLarge" 2 | MODEL: 3 | DROPOUT: 0.25 4 | RECURRENT_DROPOUT: 0.0 5 | USE_RNN: True 6 | RNN: "lstm" 7 | RNN_SIZE: 300 8 | USE_CNN: True 9 | CNN_LAYERS: 2 10 | NUM_FILTERS: 200 11 | FILTER_SIZE: 3 12 | MAX_POOL_SIZE: 2 13 | CLASSIFIER: "crf" 14 | EMBEDDING_SIZE: 300 15 | TRAINING: 16 | DATASET: "french" 17 | MINI_BATCH_SIZE: 64 18 | EPOCHS: 1 19 | -------------------------------------------------------------------------------- /config/french_small_base.yaml: -------------------------------------------------------------------------------- 1 | CONFIG_NAME: "FinalParamsSmall" 2 | MODEL: 3 | DROPOUT: 0.25 4 | RECURRENT_DROPOUT: 0.0 5 | USE_RNN: True 6 | RNN: "lstm" 7 | RNN_SIZE: 50 8 | USE_CNN: True 9 | CNN_LAYERS: 1 10 | NUM_FILTERS: 40 11 | FILTER_SIZE: 3 12 | MAX_POOL_SIZE: 2 13 | CLASSIFIER: "crf" 14 | EMBEDDING_SIZE: 100 15 
| TRAINING: 16 | DATASET: "french" 17 | MINI_BATCH_SIZE: 64 18 | EPOCHS: 2 19 | -------------------------------------------------------------------------------- /config/old_run_params.py: -------------------------------------------------------------------------------- 1 | cnn_optimization_runs = [ 2 | # [name, use_cnn, use_lstm, cnn layers, cnn number filters, cnn filter size, cnn max pool size, dropout, lstm size, embedding size, mini_batch_size, which_rnn, classifier, language] 3 | ["1", True, True, 6, 40, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 4 | ["2", True, True, 2, 100, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 5 | ["3", True, True, 6, 100, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 6 | ["4", True, True, 2, 40, 2, 2, 100, 100, 64, "LSTM", "crf", "english"], 7 | ["9", True, True, 2, 100, 2, 4, 100, 100, 64, "LSTM", "crf", "english"], 8 | ["10", True, True, 2, 200, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 9 | ["11", True, True, 2, 200, 2, 2, 100, 100, 64, "LSTM", "crf", "english"], 10 | ["12", True, True, 1, 40, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 11 | ["13", True, True, 1, 100, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 12 | ["14", True, True, 1, 200, 3, 2, 100, 100, 64, "LSTM", "crf", "english"], 13 | ["15", True, True, 2, 200, 2, 4, 100, 100, 64, "LSTM", "crf", "english"], 14 | [ 15 | "16", 16 | True, 17 | True, 18 | 2, 19 | 100, 20 | 2, 21 | None, 22 | 100, 23 | 100, 24 | 64, 25 | "LSTM", 26 | "crf", 27 | "english", 28 | ], 29 | [ 30 | "17", 31 | True, 32 | True, 33 | 2, 34 | 200, 35 | 2, 36 | None, 37 | 100, 38 | 100, 39 | 64, 40 | "LSTM", 41 | "crf", 42 | "english", 43 | ], 44 | [ 45 | "18", 46 | True, 47 | True, 48 | 2, 49 | 100, 50 | 3, 51 | None, 52 | 100, 53 | 100, 54 | 64, 55 | "LSTM", 56 | "crf", 57 | "english", 58 | ], 59 | ["19", True, True, 1, 200, 3, 2, 200, 100, 64, "LSTM", "crf", "english"], 60 | ["20", True, True, 1, 200, 3, 2, 100, 200, 64, "LSTM", "crf", "english"], 61 | ["21", True, True, 1, 200, 3, 2, 200, 200, 64, "LSTM", "crf", "english"], 62 | ["22", True, True, 2, 200, 3, 2, 200, 100, 64, "LSTM", "crf", "english"], 63 | ["23", True, True, 2, 200, 3, 2, 100, 200, 64, "LSTM", "crf", "english"], 64 | ["24", True, True, 2, 200, 3, 2, 200, 200, 64, "LSTM", "crf", "english"], 65 | ["25", True, True, 1, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "english"], 66 | ["26", True, True, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "english"], 67 | ["27", True, True, 1, 200, 3, 2, 200, 200, 16, "LSTM", "crf", "english"], 68 | ["28", True, True, 2, 200, 3, 2, 300, 300, 16, "LSTM", "crf", "english"], 69 | ["29", True, True, 1, 200, 3, 2, 300, 300, 16, "LSTM", "crf", "english"], 70 | ["30", True, True, 2, 200, 3, 2, 300, 300, 16, "LSTM", "crf", "english"], 71 | ] 72 | 73 | m_experiment_runs = [ 74 | # [name, use_cnn, use_lstm, cnn layers, cnn number filters, cnn filter size, cnn max pool size, lstm size, embedding size, mini_batch_size, which_rnn, classifier, language] 75 | # ['Base', True, True, 2, 200, 3, 2, 300, 300, 64, 'LSTM', 'crf', 'english'], # same as cnn_optimization_runs #26. 
76 | ["M1", True, True, 2, 200, 3, 2, 300, 300, 64, "GRU", "crf", "english"], 77 | ["M2", False, True, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "english"], 78 | ["M3", False, True, 2, 200, 3, 2, 300, 300, 64, "GRU", "crf", "english"], 79 | [ 80 | "M4", 81 | True, 82 | True, 83 | 2, 84 | 200, 85 | 3, 86 | 2, 87 | 300, 88 | 300, 89 | 64, 90 | "LSTM", 91 | "softmax", 92 | "english", 93 | ], 94 | ["M5", True, False, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "english"], 95 | ["M6", True, True, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "italian"], 96 | ["M7", True, True, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "basque"], 97 | ["M8", True, False, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "NETtalk"], 98 | ["M9", True, False, 2, 200, 3, 2, 300, 300, 16, "LSTM", "crf", "NETtalk"], 99 | [ 100 | "M10", 101 | True, 102 | False, 103 | 2, 104 | 200, 105 | 3, 106 | 2, 107 | 300, 108 | 300, 109 | 64, 110 | "LSTM", 111 | "crf", 112 | "NETtalkTrain", 113 | ], 114 | [ 115 | "M11", 116 | True, 117 | False, 118 | 2, 119 | 200, 120 | 3, 121 | 2, 122 | 300, 123 | 300, 124 | 16, 125 | "LSTM", 126 | "crf", 127 | "NETtalkTrain", 128 | ], 129 | [ 130 | "M12", 131 | True, 132 | False, 133 | 2, 134 | 200, 135 | 3, 136 | 2, 137 | 300, 138 | 300, 139 | 32, 140 | "LSTM", 141 | "crf", 142 | "NETtalkTrain", 143 | ], 144 | ["M13", True, False, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "dutch"], 145 | ["M14", True, True, 2, 200, 3, 2, 300, 300, 64, "LSTM", "crf", "manipuri"], 146 | ] 147 | 148 | small_test_params = [ 149 | [ 150 | "small-1", 151 | True, 152 | True, 153 | 1, 154 | 40, 155 | 3, 156 | 2, 157 | 100, 158 | 100, 159 | 64, 160 | "LSTM", 161 | "crf", 162 | "english", 163 | ], # small-1 is equivalent to cnn_optimization_runs #12 164 | [ 165 | "small-2", 166 | True, 167 | True, 168 | 1, 169 | 40, 170 | 3, 171 | 2, 172 | 100, 173 | 50, 174 | 64, 175 | "LSTM", 176 | "crf", 177 | "english", 178 | ], 179 | [ 180 | "small-3", 181 | True, 182 | True, 183 | 1, 184 | 40, 185 | 3, 186 | 2, 187 | 50, 188 | 100, 189 | 64, 190 | "LSTM", 191 | "crf", 192 | "english", 193 | ], 194 | ["small-4", True, True, 1, 40, 3, 2, 50, 50, 64, "LSTM", "crf", "english"], 195 | ] 196 | -------------------------------------------------------------------------------- /input.conll: -------------------------------------------------------------------------------- 1 | S 2 | V 3 | t 4 | P 5 | d 6 | 7 | d 8 | r 9 | 2 10 | v 11 | I 12 | N 13 | { 14 | t 15 | 16 | d 17 | I 18 | z 19 | 2 20 | @ 21 | r 22 | I 23 | N 24 | -------------------------------------------------------------------------------- /neuralnets/BiLSTM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import time 4 | import os 5 | import random 6 | import logging 7 | 8 | from .utils import try_tensorflow_import 9 | 10 | try_tensorflow_import() 11 | 12 | from tensorflow import keras 13 | from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, Adadelta, Adagrad, SGD 14 | from tensorflow.keras.models import Model 15 | from tensorflow.keras.layers import * 16 | import tensorflow.keras.backend as K 17 | 18 | from config import get_cfg_defaults 19 | from .keraslayers.ChainCRF import ChainCRF 20 | 21 | 22 | class BiLSTM: 23 | """ 24 | A bidirectional LSTM with optional CRF for NLP sequence tagging. 
25 | 26 | Author: Jacob Krantz 27 | Based on work done by Nils Reimers 28 | TODO: do Apache-2.0 properly 29 | 30 | https://github.com/tensorflow/tensorflow/issues/30263#issuecomment-509010526 31 | As of TF 2.0, we no longer need to selecte CudnnLSTM vs LSTM. Warning 32 | "Skipping optimization due to error while loading function libraries" 33 | can be ignored. 34 | """ 35 | 36 | def __init__(self, cfg): 37 | self.cfg = cfg 38 | self.model = None 39 | self.model_save_path = cfg.TRAINING.MODEL_SAVE_PATH 40 | self.results_save_path = None 41 | 42 | def set_vocab(self, vocab_size, n_class_labels, word_length, mappings): 43 | # class labels are syllable boundary labels 44 | self.vocab_size = vocab_size 45 | self.n_class_labels = n_class_labels 46 | self.word_length = word_length 47 | self.mappings = mappings # used indirectly during model reload 48 | 49 | def set_dataset(self, dataset, data): 50 | self.dataset = dataset 51 | self.data = data 52 | 53 | self.epoch = 0 54 | self.learning_rate_updates = {"sgd": {1: 0.1, 3: 0.05, 5: 0.01}} 55 | self.train_mini_batch_ranges = None 56 | self.train_word_length_ranges = None 57 | 58 | self.label_key = self.dataset["label"] 59 | 60 | logging.info("--- Dataset Details ---") 61 | logging.info("%d train words" % len(self.data["train_matrix"])) 62 | logging.info("%d dev words" % len(self.data["dev_matrix"])) 63 | logging.info("%d test words" % len(self.data["test_matrix"])) 64 | 65 | def build_model(self): 66 | if self.word_length <= 0: # variable length words 67 | self.word_length = None 68 | 69 | tokens_input = Input( 70 | shape=(self.word_length,), # use explicit word length for CNNs to work 71 | dtype="float32", 72 | name="phones_input", 73 | ) 74 | 75 | # output shape: (batch_size, word_length, embedding size) 76 | tokens = Embedding( 77 | input_dim=self.vocab_size, 78 | output_dim=self.cfg.MODEL.EMBEDDING_SIZE, 79 | trainable=True, 80 | name="phone_embeddings", 81 | )(tokens_input) 82 | 83 | # Add recurrent layers 84 | if self.cfg.MODEL.USE_RNN: 85 | assert self.cfg.MODEL.RNN in ["gru", "lstm"] 86 | rnn_func = GRU if self.cfg.MODEL.RNN == "gru" else LSTM 87 | 88 | recurrent_layer = Bidirectional( 89 | rnn_func( 90 | units=self.cfg.MODEL.RNN_SIZE, 91 | return_sequences=True, 92 | dropout=self.cfg.MODEL.DROPOUT, 93 | recurrent_dropout=self.cfg.MODEL.RECURRENT_DROPOUT, 94 | ), 95 | name="Bi" + self.cfg.MODEL.RNN, 96 | )(tokens) 97 | 98 | # Add CNNs, inspired by Ma and Hovy, 2016. In our case, 99 | # the CNNs are parallel to LSTM instead of prior. 100 | # TODO: add RELU activation function. 101 | if self.cfg.MODEL.USE_CNN: 102 | cnn_layer = tokens 103 | # how to reshape::: re = Reshape((tokens.shape[1],tokens.shape[2],) + (1, ))(tokens) # + (1, ) 104 | 105 | for i in range(self.cfg.MODEL.CNN_LAYERS): 106 | cnn_layer = Conv1D( 107 | filters=self.cfg.MODEL.NUM_FILTERS, 108 | kernel_size=self.cfg.MODEL.FILTER_SIZE, 109 | padding="same", 110 | name="cnn_" + str(i + 1), 111 | )(cnn_layer) 112 | 113 | if self.cfg.MODEL.MAX_POOL_SIZE: 114 | # maintain dimensionality (stride = 1) 115 | cnn_layer = MaxPooling1D( 116 | pool_size=self.cfg.MODEL.MAX_POOL_SIZE, 117 | strides=1, 118 | padding="same", 119 | name="max_pooling_" + str(i + 1), 120 | )(cnn_layer) 121 | 122 | # concatenating the CNN with the LSTM essentially tacks on the cnn vector to the end of each lstm time-step vector. 
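# Shape note: the bidirectional RNN branch yields (batch, word_length, 2 * RNN_SIZE),
# and each Conv1D / MaxPooling1D pair keeps (batch, word_length, NUM_FILTERS) because
# padding="same" with stride 1 preserves the time dimension, so the concatenation
# below has shape (batch, word_length, 2 * RNN_SIZE + NUM_FILTERS).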
123 | if self.cfg.MODEL.USE_RNN: 124 | concat_layer = concatenate([recurrent_layer, cnn_layer]) 125 | else: 126 | concat_layer = cnn_layer 127 | else: 128 | assert self.cfg.MODEL.USE_RNN, "Either RNN or CNN must be in network." 129 | concat_layer = recurrent_layer 130 | 131 | # Add output classifier 132 | output = concat_layer 133 | assert self.cfg.MODEL.CLASSIFIER in [ 134 | "softmax", 135 | "crf", 136 | ], "classifier must be either 'softmax' or 'crf'" 137 | if self.cfg.MODEL.CLASSIFIER == "softmax": 138 | output = TimeDistributed( 139 | Dense(units=self.n_class_labels, activation="softmax"), name="softmax" 140 | )(output) 141 | loss_function = "sparse_categorical_crossentropy" 142 | 143 | elif self.cfg.MODEL.CLASSIFIER == "crf": # use Philipp Gross' ChainCRF 144 | output = TimeDistributed( 145 | Dense(units=self.n_class_labels, activation=None), 146 | name="hidden_lin_layer", 147 | )(output) 148 | crf = ChainCRF(name="crf") 149 | output = crf(output) 150 | loss_function = crf.sparse_loss 151 | 152 | # :: Parameters for the optimizer :: 153 | optim_params = {} 154 | if self.cfg.OPTIMIZER.CLIP_NORM > 0.0: 155 | optim_params["clipnorm"] = self.cfg.OPTIMIZER.CLIP_NORM 156 | if self.cfg.OPTIMIZER.CLIP_VALUE > 0: 157 | optim_params["clipvalue"] = self.cfg.OPTIMIZER.CLIP_VALUE 158 | 159 | optimizer = self.cfg.OPTIMIZER.OPTIMIZER 160 | if optimizer == "adam": 161 | opt = Adam(**optim_params) 162 | elif optimizer == "nadam": 163 | opt = Nadam(**optim_params) 164 | elif optimizer == "rmsprop": 165 | opt = RMSprop(**optim_params) 166 | elif optimizer == "adadelta": 167 | opt = Adadelta(**optim_params) 168 | elif optimizer == "adagrad": 169 | opt = Adagrad(**optim_params) 170 | elif optimizer == "sgd": 171 | opt = SGD(lr=0.1, **optim_params) 172 | else: 173 | assert False, "Optmizer not in list of allowable optimizers" 174 | 175 | model = Model(inputs=[tokens_input], outputs=[output]) 176 | model.compile(loss=loss_function, optimizer=opt) 177 | model.summary(line_length=100) 178 | self.model = model 179 | 180 | def train_model(self): 181 | self.epoch += 1 182 | 183 | if ( 184 | self.cfg.OPTIMIZER.OPTIMIZER in self.learning_rate_updates 185 | and self.epoch in self.learning_rate_updates[self.cfg.OPTIMIZER.OPTIMIZER] 186 | ): 187 | lr_update = self.learning_rate_updates[self.cfg.OPTIMIZER.OPTIMIZER][ 188 | self.epoch 189 | ] 190 | logging.info("Update Learning Rate to %f" % (lr_update)) 191 | K.set_value(self.model.optimizer.lr, lr_update) 192 | 193 | loss = 0.0 194 | for batch in self.minibatch_iterate_dataset(): 195 | loss += self.model.train_on_batch(x=batch[1:], y=batch[0]) 196 | return loss 197 | 198 | def minibatch_iterate_dataset(self): 199 | """ 200 | Create based on word length mini-batches with approx. the same size. 
201 | Words and mini-batch chunks are shuffled and used to the train the model 202 | """ 203 | if self.train_word_length_ranges == None: 204 | """ Create mini batch ranges """ 205 | self.train_word_length_ranges = {} 206 | self.train_mini_batch_ranges = {} 207 | 208 | train_data = self.data["train_matrix"] 209 | train_data.sort( 210 | key=lambda x: len(x["tokens"]) 211 | ) # Sort train data by word length 212 | train_ranges = [] 213 | old_word_len = len(train_data[0]["tokens"]) 214 | idxStart = 0 215 | 216 | # Find start and end of ranges with words with same length 217 | for idx in range(len(train_data)): 218 | word_len = len(train_data[idx]["tokens"]) 219 | 220 | if word_len != old_word_len: 221 | train_ranges.append((idxStart, idx)) 222 | idxStart = idx 223 | 224 | old_word_len = word_len 225 | 226 | # Add last word 227 | train_ranges.append((idxStart, len(train_data))) 228 | 229 | # Break up ranges into smaller mini batch sizes 230 | mini_batch_ranges = [] 231 | for batch_range in train_ranges: 232 | range_len = batch_range[1] - batch_range[0] 233 | 234 | bins = int( 235 | np.ceil(range_len / float(self.cfg.TRAINING.MINI_BATCH_SIZE)) 236 | ) 237 | bin_size = int(np.ceil(range_len / float(bins))) 238 | 239 | for bin_nr in range(bins): 240 | startIdx = bin_nr * bin_size + batch_range[0] 241 | endIdx = min( 242 | batch_range[1], (bin_nr + 1) * bin_size + batch_range[0] 243 | ) 244 | mini_batch_ranges.append((startIdx, endIdx)) 245 | 246 | self.train_word_length_ranges = train_ranges 247 | self.train_mini_batch_ranges = mini_batch_ranges 248 | 249 | # Shuffle training data 250 | # 1. Shuffle words that have the same length 251 | x = self.data["train_matrix"] 252 | for data_range in self.train_word_length_ranges: 253 | for i in reversed(range(data_range[0] + 1, data_range[1])): 254 | # pick an element in x[:i+1] with which to exchange x[i] 255 | j = random.randint(data_range[0], i) 256 | x[i], x[j] = x[j], x[i] 257 | 258 | # 2. 
Shuffle the order of the mini batch ranges 259 | random.shuffle(self.train_mini_batch_ranges) 260 | 261 | # Iterate over the mini batch ranges 262 | range_length = len(self.train_mini_batch_ranges) 263 | 264 | batches = {} 265 | for idx in range(range_length): 266 | batches.clear() 267 | train_data = self.data["train_matrix"] 268 | data_range = self.train_mini_batch_ranges[ 269 | idx % len(self.train_mini_batch_ranges) 270 | ] 271 | labels = np.asarray( 272 | [ 273 | train_data[idx][self.label_key] 274 | for idx in range(data_range[0], data_range[1]) 275 | ] 276 | ) 277 | labels = np.expand_dims(labels, -1) 278 | 279 | batches = [labels] 280 | 281 | for featureName in self.cfg.TRAINING.FEATURE_NAMES: 282 | inputData = np.asarray( 283 | [ 284 | train_data[idx][featureName] 285 | for idx in range(data_range[0], data_range[1]) 286 | ] 287 | ) 288 | batches.append(inputData) 289 | 290 | yield batches 291 | 292 | def fit(self, epochs): 293 | if self.model is None: 294 | self.build_model() 295 | 296 | train_time_total = 0 297 | eval_time_total = 0 298 | max_dev_score = 0 299 | max_test_score = 0 300 | no_improvement_since = 0 301 | 302 | for epoch in range(epochs): 303 | sys.stdout.flush() 304 | logging.info("\n--------- Epoch %d -----------" % (epoch + 1)) 305 | 306 | start_time_epoch = time.time() 307 | self.train_model() 308 | train_time_epoch = time.time() - start_time_epoch 309 | train_time_total += train_time_epoch 310 | logging.info( 311 | "%.2f sec for training (%.2f total)" 312 | % (train_time_epoch, train_time_total) 313 | ) 314 | 315 | start_time_eval = time.time() 316 | dev_score, test_score = self.compute_acc_scores( 317 | self.data["dev_matrix"], self.data["test_matrix"] 318 | ) 319 | 320 | if dev_score > max_dev_score or epoch == 1: 321 | max_dev_score = dev_score 322 | max_test_score = test_score 323 | no_improvement_since = 0 324 | self.save_model(epoch, dev_score, test_score) 325 | else: 326 | no_improvement_since += 1 327 | 328 | eval_time_epoch = time.time() - start_time_eval 329 | eval_time_total += eval_time_epoch 330 | logging.info( 331 | "\nScores from epoch with best dev-scores:\n Dev-Score: %.4f\n Test-Score %.4f" 332 | % (max_dev_score, max_test_score) 333 | ) 334 | logging.info( 335 | "%.2f sec for eval (%.2f total)" % (eval_time_epoch, eval_time_total) 336 | ) 337 | 338 | if self.results_save_path != None: 339 | self.results_save_path.write( 340 | "\t".join( 341 | map( 342 | str, 343 | [ 344 | epoch + 1, 345 | dev_score, 346 | test_score, 347 | max_dev_score, 348 | max_test_score, 349 | train_time_epoch, 350 | train_time_total, 351 | eval_time_epoch, 352 | ], 353 | ) 354 | ) 355 | ) 356 | self.results_save_path.write("\n") 357 | self.results_save_path.flush() 358 | 359 | if ( 360 | self.cfg.TRAINING.EARLY_STOPPING > 0 361 | and no_improvement_since >= self.cfg.TRAINING.EARLY_STOPPING 362 | ): 363 | logging.info( 364 | "!!! Early stopping, no improvement after " 365 | + str(no_improvement_since) 366 | + " epochs !!!" 367 | ) 368 | break 369 | 370 | def tagWords(self, words): 371 | """ 372 | words: [{ 373 | 'raw_tokens': ['S', 'V', 't', 'P', 'd'], 374 | 'tokens': [11, 5, 43, 36, 8] 375 | }, ...] 
376 | """ 377 | padded_pred_labels = self.predict_labels(self.model, words) 378 | pred_labels = [] 379 | for idx in range(len(words)): 380 | unpadded_pred_labels = [] 381 | for tokenIdx in range(len(words[idx]["tokens"])): 382 | # Skip padding tokens 383 | if words[idx]["tokens"][tokenIdx] != 0: 384 | unpadded_pred_labels.append(padded_pred_labels[idx][tokenIdx]) 385 | 386 | pred_labels.append(unpadded_pred_labels) 387 | 388 | return pred_labels 389 | 390 | def get_word_lengths(self, words): 391 | word_lengths = {} 392 | for idx in range(len(words)): 393 | word = words[idx]["tokens"] 394 | if len(word) not in word_lengths: 395 | word_lengths[len(word)] = [] 396 | word_lengths[len(word)].append(idx) 397 | 398 | return word_lengths 399 | 400 | def predict_labels(self, model, words): 401 | pred_labels = [None] * len(words) 402 | word_lengths = self.get_word_lengths(words) 403 | 404 | for indices in word_lengths.values(): 405 | nn_input = [] 406 | for feature_name in self.cfg.TRAINING.FEATURE_NAMES: 407 | input_data = np.asarray([words[idx][feature_name] for idx in indices]) 408 | nn_input.append(input_data) 409 | 410 | predictions = model.predict(nn_input, verbose=False) 411 | predictions = predictions.argmax(axis=-1) # Predict classes 412 | 413 | predIdx = 0 414 | for idx in indices: 415 | pred_labels[idx] = predictions[predIdx] 416 | predIdx += 1 417 | 418 | return pred_labels 419 | 420 | def compute_acc_scores(self, dev_data, test_data): 421 | """ 422 | Accuracy scores are reported at the word level. This means that if a single 423 | syllable boundary was incorrectly placed, the entire word is marked incorrect. 424 | 425 | Logs the boundary level accuracy as well. 426 | """ 427 | dev_acc, dev_bound = self.compute_acc(dev_data) 428 | test_acc, test_bound = self.compute_acc(test_data) 429 | 430 | logging.info("-- Epoch Accuracies --") 431 | logging.info("Word-Level Accuracy") 432 | logging.info("Dev: %.4f" % (dev_acc)) 433 | logging.info("Test: %.4f" % (test_acc)) 434 | 435 | logging.info("\nBoundary-Level Accuracy") 436 | logging.info("Dev: %.4f" % (dev_bound)) 437 | logging.info("Test: %.4f" % (test_bound)) 438 | 439 | return dev_acc, test_acc 440 | 441 | def compute_acc(self, words): 442 | """ 443 | Returns: 444 | float: word level accuracy. Range: [0.,1.] 445 | float: boundary_level_acc. Range: [0.,1.] 
446 | """ 447 | correct_labels = [words[idx][self.label_key] for idx in range(len(words))] 448 | pred_labels = self.predict_labels(self.model, words) 449 | 450 | num_labels = 0 451 | num_corr_labels = 0 452 | num_corr_words = 0 453 | 454 | for word_id in range(len(correct_labels)): 455 | word_was_wrong = False 456 | for tokenId in range(len(correct_labels[word_id])): 457 | num_labels += 1 458 | if correct_labels[word_id][tokenId] == pred_labels[word_id][tokenId]: 459 | num_corr_labels += 1 460 | else: 461 | word_was_wrong = True 462 | 463 | if not word_was_wrong: 464 | num_corr_words += 1 465 | 466 | boundary_level_acc = num_corr_labels / float(num_labels) 467 | word_level_acc = num_corr_words / len(words) 468 | return word_level_acc, boundary_level_acc 469 | 470 | def store_results(self, results_path): 471 | if results_path != None: 472 | directory = os.path.dirname(results_path) 473 | if not os.path.exists(directory): 474 | os.makedirs(directory) 475 | 476 | self.results_save_path = open(results_path, "w") 477 | else: 478 | self.results_save_path = None 479 | 480 | def save_model(self, epoch, dev_score, test_score): 481 | import json 482 | import h5py 483 | 484 | save_path = ( 485 | self.model_save_path.replace("[DATASET]", self.cfg.TRAINING.DATASET) 486 | .replace("[DevScore]", "%.4f" % dev_score) 487 | .replace("[TestScore]", "%.4f" % test_score) 488 | .replace("[Epoch]", str(epoch + 1)) 489 | ) 490 | 491 | directory = os.path.dirname(save_path) 492 | if not os.path.exists(directory): 493 | os.makedirs(directory) 494 | 495 | if os.path.isfile(save_path): 496 | logging.info(f"Model {save_path} already exists. Model will be overwritten") 497 | 498 | res = keras.models.save_model( 499 | model=self.model, filepath=save_path, overwrite=True, save_format="h5" 500 | ) 501 | 502 | with h5py.File(save_path, "a") as h5file: 503 | h5file.attrs["mappings"] = json.dumps(self.mappings) 504 | h5file.attrs["label_key"] = self.dataset["label"] 505 | h5file.attrs["vocab_size"] = self.vocab_size 506 | h5file.attrs["n_class_labels"] = self.n_class_labels 507 | h5file.attrs["word_length"] = ( 508 | self.word_length if self.word_length != None else -1 509 | ) 510 | 511 | @staticmethod 512 | def load_model(model_path, cfg_path): 513 | import h5py 514 | import json 515 | 516 | cfg = get_cfg_defaults() 517 | cfg.merge_from_file(cfg_path) 518 | cfg.freeze() 519 | logging.info(cfg) 520 | 521 | with h5py.File(model_path, "r") as f: 522 | mappings = json.loads(f.attrs["mappings"]) 523 | label_key = f.attrs["label_key"] 524 | vocab_size = f.attrs["vocab_size"] 525 | n_class_labels = f.attrs["n_class_labels"] 526 | word_length = f.attrs["word_length"] 527 | custom_objects = {}  # ChainCRF custom objects are only needed for the crf classifier 528 | if cfg.MODEL.CLASSIFIER == "crf": 529 | from .keraslayers.ChainCRF import create_custom_objects 530 | 531 | custom_objects = create_custom_objects() 532 | 533 | model = keras.models.load_model(model_path, custom_objects=custom_objects) 534 | bilstm = BiLSTM(cfg) 535 | bilstm.set_vocab(vocab_size, n_class_labels, word_length, mappings) 536 | bilstm.model = model 537 | bilstm.label_key = label_key 538 | return bilstm 539 | -------------------------------------------------------------------------------- /neuralnets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobkrantz/lstm-syllabify/55d8da77a983e3f157c1281e3cd24beb9c0aed51/neuralnets/__init__.py -------------------------------------------------------------------------------- /neuralnets/keraslayers/.gitignore:
-------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /neuralnets/keraslayers/ChainCRF.py: -------------------------------------------------------------------------------- 1 | """ 2 | [Adapted from Philipp Gross 3 | @ https://github.com/phipleg/keras/blob/crf/keras/layers/crf.py] 4 | 5 | """ 6 | from __future__ import absolute_import 7 | 8 | import tensorflow as tf 9 | from tensorflow import keras 10 | from tensorflow.keras import backend as K 11 | from tensorflow.keras import regularizers 12 | from tensorflow.keras import constraints 13 | from tensorflow.keras import initializers 14 | from tensorflow.keras.layers import Layer, InputSpec 15 | 16 | 17 | def path_energy(y, x, U, b_start=None, b_end=None): 18 | """Calculates the energy of a tag path y for a given input x, 19 | transition energies U and boundary energies b_start, b_end.""" 20 | x = add_boundary_energy(x, b_start, b_end) 21 | return path_energy0(y, x, U) 22 | 23 | 24 | def path_energy0(y, x, U): 25 | """Path energy without boundary potential handling.""" 26 | n_classes = K.shape(x)[2] 27 | y_one_hot = K.one_hot(y, n_classes) 28 | 29 | # Tag path energy 30 | energy = K.sum(x * y_one_hot, 2) 31 | energy = K.sum(energy, 1) 32 | 33 | # Transition energy 34 | y_t = y[:, :-1] 35 | y_tp1 = y[:, 1:] 36 | U_flat = K.reshape(U, [-1]) 37 | # Convert 2-dim indices (y_t, y_tp1) of U to 1-dim indices of U_flat: 38 | flat_indices = y_t * n_classes + y_tp1 39 | U_y_t_tp1 = K.gather(U_flat, flat_indices) 40 | 41 | energy += K.sum(U_y_t_tp1, axis=1) 42 | 43 | return energy 44 | 45 | 46 | def sparse_chain_crf_loss(y, x, U, b_start=None, b_end=None): 47 | """Given the true sparsely encoded tag sequence y, input x, 48 | transition energies U, boundary energies b_start and b_end, it computes 49 | the loss function of a Linear Chain Conditional Random Field: 50 | 51 | loss(y, x) = NNL(P(y|x)), where P(y|x) = exp(E(y, x)) / Z. 52 | So, loss(y, x) = - E(y, x) + log(Z) 53 | 54 | Here, E(y, x) is the tag path energy, and Z is the normalization constant. 55 | The values log(Z) is also called free energy. 56 | """ 57 | x = add_boundary_energy(x, b_start, b_end) 58 | energy = path_energy0(y, x, U) 59 | energy -= free_energy0(x, U) 60 | return K.expand_dims(-energy, -1) 61 | 62 | 63 | def chain_crf_loss(y, x, U, b_start=None, b_end=None): 64 | """Variant of sparse_chain_crf_loss but with one-hot encoded tags y.""" 65 | y_sparse = K.argmax(y, -1) 66 | y_sparse = K.cast(y_sparse, "int32") 67 | return sparse_chain_crf_loss(y_sparse, x, U, b_start, b_end) 68 | 69 | 70 | def add_boundary_energy(x, b_start=None, b_end=None): 71 | """Given the observations x, it adds the start boundary energy b_start (resp. 72 | end boundary energy b_end on the start (resp. end) elements. 73 | """ 74 | if b_start is not None: 75 | x = K.concatenate([x[:, :1, :] + b_start, x[:, 1:, :]], axis=1) 76 | if b_end is not None: 77 | x = K.concatenate([x[:, :-1, :], x[:, -1:, :] + b_end], axis=1) 78 | return x 79 | 80 | 81 | def viterbi_decode(x, U, b_start=None, b_end=None): 82 | """Computes the best tag sequence y for a given input x, i.e. 
the one that 83 | maximizes the value of path_energy.""" 84 | x = add_boundary_energy(x, b_start, b_end) 85 | 86 | alpha_0 = x[:, 0, :] 87 | gamma_0 = K.zeros_like(alpha_0) 88 | initial_states = [gamma_0, alpha_0] 89 | _, gamma = _forward( 90 | x, 91 | lambda B: [K.cast(K.argmax(B, axis=1), K.floatx()), K.max(B, axis=1)], 92 | initial_states, 93 | U, 94 | ) 95 | y = _backward(gamma) 96 | return y 97 | 98 | 99 | def free_energy(x, U, b_start=None, b_end=None): 100 | """Computes efficiently the sum of all path energies for input x, when 101 | runs over all possible tag sequences.""" 102 | x = add_boundary_energy(x, b_start, b_end) 103 | return free_energy0(x, U) 104 | 105 | 106 | def free_energy0(x, U): 107 | """Free energy without boundary potential handling.""" 108 | initial_states = [x[:, 0, :]] 109 | last_alpha, _ = _forward( 110 | x, lambda B: [tf.math.reduce_logsumexp(B, axis=1)], initial_states, U 111 | ) 112 | return last_alpha[:, 0] 113 | 114 | 115 | def _forward(x, reduce_step, initial_states, U): 116 | """Forward recurrence of the linear chain crf.""" 117 | 118 | def _forward_step(energy_matrix_t, states): 119 | alpha_tm1 = states[-1] 120 | new_states = reduce_step(K.expand_dims(alpha_tm1, 2) + energy_matrix_t) 121 | return new_states[0], new_states 122 | 123 | U_shared = K.expand_dims(K.expand_dims(U, 0), 0) 124 | 125 | inputs = K.expand_dims(x[:, 1:, :], 2) + U_shared 126 | inputs = K.concatenate([inputs, K.zeros_like(inputs[:, -1:, :, :])], axis=1) 127 | 128 | last, values, _ = K.rnn(_forward_step, inputs, initial_states) 129 | return last, values 130 | 131 | 132 | def batch_gather(reference, indices): 133 | ref_shape = K.shape(reference) 134 | batch_size = ref_shape[0] 135 | n_classes = ref_shape[1] 136 | flat_indices = K.arange(0, batch_size) * n_classes + K.flatten(indices) 137 | return K.gather(K.flatten(reference), flat_indices) 138 | 139 | 140 | def _backward(gamma): 141 | """Backward recurrence of the linear chain crf.""" 142 | gamma = K.cast(gamma, "int32") 143 | 144 | def _backward_step(gamma_t, states): 145 | y_tm1 = K.squeeze(states[0], 0) 146 | y_t = batch_gather(gamma_t, y_tm1) 147 | return y_t, [K.expand_dims(y_t, 0)] 148 | 149 | initial_states = [K.expand_dims(K.zeros_like(gamma[:, 0, 0]), 0)] 150 | _, y_rev, _ = K.rnn(_backward_step, gamma, initial_states, go_backwards=True) 151 | y = K.reverse(y_rev, 1) 152 | return y 153 | 154 | 155 | class ChainCRF(Layer): 156 | """A Linear Chain Conditional Random Field output layer. 157 | 158 | It carries the loss function and its weights for computing 159 | the global tag sequence scores. While training it acts as 160 | the identity function that passes the inputs to the subsequently 161 | used loss function. While testing it applies Viterbi decoding 162 | and returns the best scoring tag sequence as one-hot encoded vectors. 163 | 164 | # Arguments 165 | init: weight initialization function for chain energies U. 166 | Can be the name of an existing function (str), 167 | or a Theano function (see: [initializers](../initializers.md)). 168 | U_regularizer: instance of [WeightRegularizer](../regularizers.md) 169 | (eg. L1 or L2 regularization), applied to the transition weight matrix. 170 | b_start_regularizer: instance of [WeightRegularizer](../regularizers.md), 171 | applied to the start bias b. 172 | b_end_regularizer: instance of [WeightRegularizer](../regularizers.md) 173 | module, applied to the end bias b. 
174 | b_start_constraint: instance of the [constraints](../constraints.md) 175 | module, applied to the start bias b. 176 | b_end_constraint: instance of the [constraints](../constraints.md) 177 | module, applied to the end bias b. 178 | weights: list of Numpy arrays for initializing [U, b_start, b_end]. 179 | Thus it should be a list of 3 elements of shape 180 | [(n_classes, n_classes), (n_classes, ), (n_classes, )] 181 | 182 | # Input shape 183 | 3D tensor with shape `(nb_samples, timesteps, nb_classes)`, where 184 | ´timesteps >= 2`and `nb_classes >= 2`. 185 | 186 | # Output shape 187 | Same shape as input. 188 | 189 | # Masking 190 | This layer does NOT support masking for input sequences of variable length. 191 | 192 | # Example 193 | 194 | ```python 195 | # As the last layer of sequential layer with 196 | # model.output_shape == (None, timesteps, nb_classes) 197 | crf = ChainCRF() 198 | model.add(crf) 199 | # now: model.output_shape == (None, timesteps, nb_classes) 200 | 201 | # Compile model with chain crf loss (and one-hot encoded labels) and accuracy 202 | model.compile(loss=crf.loss, optimizer='sgd', metrics=['accuracy']) 203 | 204 | # Alternatively, compile model with sparsely encoded labels and sparse accuracy: 205 | model.compile(loss=crf.sparse_loss, optimizer='sgd', metrics=['sparse_categorical_accuracy']) 206 | ``` 207 | 208 | # Gotchas 209 | 210 | ## Model loading 211 | 212 | When you want to load a saved model that has a crf output, then loading 213 | the model with 'keras.models.load_model' won't work properly because 214 | the reference of the loss function to the transition parameters is lost. To 215 | fix this, you need to use the parameter 'custom_objects' as follows: 216 | 217 | ```python 218 | from keras.layer.crf import create_custom_objects: 219 | model = keras.models.load_model(filename, custom_objects=create_custom_objects()) 220 | ``` 221 | 222 | ## Temporal sample weights 223 | 224 | Given a ChainCRF instance crf both loss functions, crf.loss and crf.sparse_loss 225 | return a tensor of shape (batch_size, 1) and not (batch_size, maxlen). 226 | that sample weighting in temporal mode. 
227 | """ 228 | 229 | def __init__( 230 | self, 231 | init="glorot_uniform", 232 | U_regularizer=None, 233 | b_start_regularizer=None, 234 | b_end_regularizer=None, 235 | U_constraint=None, 236 | b_start_constraint=None, 237 | b_end_constraint=None, 238 | weights=None, 239 | **kwargs 240 | ): 241 | super(ChainCRF, self).__init__(**kwargs) 242 | self.init = initializers.get(init) 243 | self.U_regularizer = regularizers.get(U_regularizer) 244 | self.b_start_regularizer = regularizers.get(b_start_regularizer) 245 | self.b_end_regularizer = regularizers.get(b_end_regularizer) 246 | self.U_constraint = constraints.get(U_constraint) 247 | self.b_start_constraint = constraints.get(b_start_constraint) 248 | self.b_end_constraint = constraints.get(b_end_constraint) 249 | 250 | self.initial_weights = weights 251 | 252 | self.supports_masking = False 253 | self.uses_learning_phase = True 254 | self.input_spec = [InputSpec(ndim=3)] 255 | 256 | def compute_output_shape(self, input_shape): 257 | assert input_shape and len(input_shape) == 3 258 | return (input_shape[0], input_shape[1], input_shape[2]) 259 | 260 | def build(self, input_shape): 261 | assert len(input_shape) == 3 262 | n_classes = input_shape[2] 263 | n_steps = input_shape[1] 264 | assert n_steps is None or n_steps >= 2 265 | self.input_spec = [ 266 | InputSpec(dtype=K.floatx(), shape=(None, n_steps, n_classes)) 267 | ] 268 | 269 | self.U = self.add_weight( 270 | name="U", 271 | shape=(n_classes, n_classes), 272 | initializer=self.init, 273 | regularizer=self.U_regularizer, 274 | constraint=self.U_constraint, 275 | ) 276 | 277 | self.b_start = self.add_weight( 278 | shape=(n_classes,), 279 | initializer="zero", 280 | name="b_start", 281 | regularizer=self.b_start_regularizer, 282 | constraint=self.b_start_constraint, 283 | ) 284 | 285 | self.b_end = self.add_weight( 286 | name="b_end", 287 | shape=(n_classes,), 288 | initializer="zero", 289 | regularizer=self.b_end_regularizer, 290 | constraint=self.b_end_constraint, 291 | ) 292 | 293 | if self.initial_weights is not None: 294 | self.set_weights(self.initial_weights) 295 | del self.initial_weights 296 | 297 | self.built = True 298 | 299 | def call(self, x): 300 | y_pred = viterbi_decode(x, self.U, self.b_start, self.b_end) 301 | nb_classes = self.input_spec[0].shape[2] 302 | y_pred_one_hot = K.one_hot(y_pred, nb_classes) 303 | return K.in_train_phase(x, y_pred_one_hot) 304 | 305 | def loss(self, y_true, y_pred): 306 | """Linear Chain Conditional Random Field loss function. 307 | """ 308 | return chain_crf_loss(y_true, y_pred, self.U, self.b_start, self.b_end) 309 | 310 | def sparse_loss(self, y_true, y_pred): 311 | """Linear Chain Conditional Random Field loss function with sparse 312 | tag sequences. 
313 | """ 314 | y_true = K.cast(y_true, "int32") 315 | y_true = K.squeeze(y_true, 2) 316 | return sparse_chain_crf_loss(y_true, y_pred, self.U, self.b_start, self.b_end) 317 | 318 | def get_config(self): 319 | config = { 320 | "init": initializers.serialize(self.init), 321 | "U_regularizer": regularizers.serialize(self.U_regularizer), 322 | "b_start_regularizer": regularizers.serialize(self.b_start_regularizer), 323 | "b_end_regularizer": regularizers.serialize(self.b_end_regularizer), 324 | "U_constraint": constraints.serialize(self.U_constraint), 325 | "b_start_constraint": constraints.serialize(self.b_start_constraint), 326 | "b_end_constraint": constraints.serialize(self.b_end_constraint), 327 | } 328 | base_config = super(ChainCRF, self).get_config() 329 | return dict(list(base_config.items()) + list(config.items())) 330 | 331 | 332 | def create_custom_objects(): 333 | """Returns the custom objects, needed for loading a persisted model.""" 334 | instanceHolder = {"instance": None} 335 | 336 | class ChainCRFClassWrapper(ChainCRF): 337 | def __init__(self, *args, **kwargs): 338 | instanceHolder["instance"] = self 339 | super(ChainCRFClassWrapper, self).__init__(*args, **kwargs) 340 | 341 | def loss(*args): 342 | method = getattr(instanceHolder["instance"], "loss") 343 | return method(*args) 344 | 345 | def sparse_loss(*args): 346 | method = getattr(instanceHolder["instance"], "sparse_loss") 347 | return method(*args) 348 | 349 | return { 350 | "ChainCRF": ChainCRFClassWrapper, 351 | "ChainCRFClassWrapper": ChainCRFClassWrapper, 352 | "loss": loss, 353 | "sparse_loss": sparse_loss, 354 | } 355 | -------------------------------------------------------------------------------- /neuralnets/keraslayers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobkrantz/lstm-syllabify/55d8da77a983e3f157c1281e3cd24beb9c0aed51/neuralnets/keraslayers/__init__.py -------------------------------------------------------------------------------- /neuralnets/utils.py: -------------------------------------------------------------------------------- 1 | def try_tensorflow_import(verbose=False): 2 | """ 3 | Sets the GPU device to 1. Sets the memory allocation to grow rather than 4 | allocating the whole VRAM. 5 | https://www.tensorflow.org/guide/gpu 6 | """ 7 | import os 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 10 | 11 | import tensorflow as tf 12 | 13 | tf.compat.v1.disable_eager_execution() 14 | 15 | if verbose: 16 | tf.debugging.set_log_device_placement(True) # logs what device is being used 17 | gpus = tf.config.experimental.list_physical_devices("GPU") 18 | if not gpus: 19 | return 20 | 21 | for gpu in gpus: 22 | tf.config.experimental.set_memory_growth(gpu, True) 23 | 24 | if verbose: 25 | logical_gpus = tf.config.experimental.list_logical_devices("GPU") 26 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 27 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | import logging 3 | 4 | """ 5 | potential problem: running an old model may fail due to mappings 6 | being regenerated every time. 7 | TODO: fix mappings to a specified map by storing them in pkl file. 
8 | """ 9 | 10 | 11 | def load_dataset(dataset, dataset_name, do_pad_words): 12 | """ 13 | if pad_words, then each word in every dataset will be padded to the length 14 | of the longest word. PAD token is integer 0. All fields would be padded, 15 | which include 'tokens', 'raw_tokens', and 'boundaries'. This makes the 16 | training take 75s per epoch on just LSTM (~2x longer). 17 | 18 | Returns: 19 | EMBEDDINGS (not used) 20 | - numpy.ndarray holding 300 dimensional embeddings (each 21 | numpy.ndarray) that are not normalized to 0-1. 22 | - there is not an explicit mapping built into the structure, so 23 | they must be associated with the mappings data structure 24 | - embeddings are for the word inputs. word (raw_tokens) -> tokens 25 | -> embedding 26 | DATA 27 | - raw_tokens are phones in DISC format 28 | shape: 29 | data = { 30 | 'english': { 31 | 'train_matrix': [ 32 | { 33 | 'tokens': [int, int, ... , int], 34 | 'boundaries': [int, int, ... , int], 35 | 'raw_tokens':[str, str, ..., str] 36 | }, ... 37 | ] 38 | 'dev_matrix': same as train_matrix 39 | 'test_matrix': same as train_matrix 40 | } 41 | } 42 | 43 | MAPPINGS 44 | - dictionary that maps tokens to a unique integer 45 | VOCAB_SIZE 46 | - number of possible inputs to the NN. 47 | - Usually is the number of phones in the langage being used. 48 | N_CLASS_LABELS 49 | - number of possible types of syllable boundaries. 50 | - Default is two: either boundary (1) or no boundary (0) 51 | WORD_LENGTH 52 | - length of the longest word in the dataset 53 | """ 54 | embeddings = [] 55 | mappings = {} 56 | data = {} 57 | word_length = -1 58 | 59 | dataset_columns = dataset["columns"] 60 | 61 | train_data = "data/%s/train.txt" % dataset_name 62 | dev_data = "data/%s/dev.txt" % dataset_name 63 | test_data = "data/%s/test.txt" % dataset_name 64 | paths = { 65 | "train_matrix": train_data, 66 | "dev_matrix": dev_data, 67 | "test_matrix": test_data, 68 | } 69 | 70 | logging.info(":: Transform " + dataset_name + " dataset ::") 71 | mappings, vocab_size, n_class_labels = make_mappings(paths.values(), do_pad_words) 72 | data = process_data(paths, dataset_columns, dataset, mappings) 73 | if do_pad_words: 74 | data, word_length = pad_words(data) 75 | 76 | # currently do not have pre-trained phonetic embeddings. 77 | # returning embeddings = []. Embeddings mst be trained. 78 | return (embeddings, data, mappings, vocab_size, n_class_labels, word_length) 79 | 80 | 81 | def pad_words(data): 82 | """ 83 | Pad each word to the length of the longest word. Token for PAD is the integer 0. 84 | """ 85 | 86 | # Find the length of the longest word in the dataset for padding purposes. 
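# Padding note: token id 0 is reserved for PAD (make_mappings starts real phone
# ids at 1 when do_pad_words is set), and padded 'boundaries' positions reuse
# label 0, i.e. "no boundary".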
87 | max_len = 0 88 | tokens = set() 89 | phones = set() 90 | for mat in ["dev_matrix", "train_matrix", "test_matrix"]: 91 | for word in data[mat]: 92 | if len(word["raw_tokens"]) > max_len: 93 | max_len = len(word["raw_tokens"]) 94 | for tok in word["tokens"]: 95 | tokens.add(tok) 96 | for phone in word["raw_tokens"]: 97 | phones.add(phone) 98 | 99 | # pad both 'tokens' with 0 and 'raw_tokens' with 'PAD' 100 | for mat in ["dev_matrix", "train_matrix", "test_matrix"]: 101 | for word in data[mat]: 102 | word["raw_tokens"] += [ 103 | "PAD" for _ in range(max_len - len(word["raw_tokens"])) 104 | ] 105 | word["tokens"] += [0 for _ in range(max_len - len(word["tokens"]))] 106 | word["boundaries"] += [0 for _ in range(max_len - len(word["boundaries"]))] 107 | assert len(word["raw_tokens"]) == max_len 108 | assert len(word["tokens"]) == max_len 109 | assert len(word["boundaries"]) == max_len 110 | 111 | return data, max_len 112 | 113 | 114 | def make_mappings(paths, pad_words): 115 | """ 116 | creates a unique mapping from phone to integer. 117 | Args: 118 | paths (list): file paths that hold all possible phones 119 | Returns: 120 | dict phone->int, 121 | int: vocab size (# phone inputs) 122 | int: number of class labels (possible boundary classifications) 123 | """ 124 | all_phones = set() 125 | class_labels = set() 126 | 127 | for path in paths: 128 | with open(path, "r") as f: 129 | for line in f: 130 | line = line.split("\t") # (phone, boundary) 131 | if len(line) == 1: 132 | continue 133 | all_phones.add(line[0]) 134 | class_labels.add(line[1]) 135 | 136 | mappings = {} 137 | for i, phone in enumerate(all_phones): 138 | mappings[phone] = i + 1 if pad_words else i # reserve 0 for padding 139 | 140 | vocab_size = len(mappings) + 1 if pad_words else len(mappings) 141 | return mappings, vocab_size, len(class_labels) 142 | 143 | 144 | def process_data(paths, dataset_columns, dataset, mappings): 145 | """ 146 | hardcoded for certain columns. Mst be changed by hand. 147 | TODO: leverage details from dataset_columns and dataset. 148 | """ 149 | data = {} 150 | 151 | for name, path in paths.items(): 152 | 153 | # add 'raw_tokens' and 'boundaries' to data 154 | entries = [] 155 | entry = {"raw_tokens": [], "boundaries": []} 156 | with open(path, "r") as f: 157 | for line in f: 158 | line = line.strip("\n").split("\t") 159 | if len(line) == 1: 160 | # TEMP: only include length > 1 161 | if len(entry["raw_tokens"]) > 1: 162 | entries.append(copy(entry)) 163 | entry["raw_tokens"] = [] 164 | entry["boundaries"] = [] 165 | continue 166 | 167 | entry["raw_tokens"].append(line[0]) 168 | entry["boundaries"].append(int(line[1])) 169 | 170 | data[name] = entries 171 | 172 | # add 'tokens' to data 173 | for i, entry in enumerate(data[name]): 174 | data[name][i]["tokens"] = [mappings[raw] for raw in entry["raw_tokens"]] 175 | 176 | return data 177 | 178 | 179 | def read_conll_single(f_name, final_length=28): 180 | # The length is hardcoded for English. Won't work with other languages... 
181 | words = [] 182 | current_length = 0 183 | with open(f_name, "r") as f: 184 | word = [] 185 | for line in f: 186 | line = line.split() 187 | if len(line) == 0: 188 | word += ["PAD" for x in range(final_length - current_length)] 189 | words.append({"tokens": copy(word)}) 190 | word = [] 191 | current_length = 0 192 | continue 193 | current_length += 1 194 | word.append(line[0]) 195 | return words 196 | 197 | 198 | def create_data_matrix(words, mappings): 199 | # TODO: this should be merged with process_data 200 | data = [] 201 | for word in words: 202 | token_transform_lst = [] 203 | for char in word["tokens"]: 204 | if char in mappings: 205 | token_transform_lst.append(mappings[char]) 206 | else: 207 | token_transform_lst.append(0)  # unknown phones fall back to the reserved PAD index 208 | data.append({"raw_tokens": word["tokens"], "tokens": token_transform_lst}) 209 | 210 | return data 211 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire==0.2.1 2 | Keras-Applications==1.0.8 3 | Keras-Preprocessing==1.1.0 4 | numpy==1.17.3 5 | PyYAML==5.1.2 6 | scipy==1.3.1 7 | tensorflow==2.0.0 8 | tensorflow-estimator==2.0.0 9 | yacs==0.1.6 10 | -------------------------------------------------------------------------------- /res/network-diagram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobkrantz/lstm-syllabify/55d8da77a983e3f157c1281e3cd24beb9c0aed51/res/network-diagram.PNG -------------------------------------------------------------------------------- /res/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobkrantz/lstm-syllabify/55d8da77a983e3f157c1281e3cd24beb9c0aed51/res/results.png -------------------------------------------------------------------------------- /results/analyze_res_folder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import statistics 4 | import numpy as np 5 | import fire 6 | 7 | """ 8 | Analyze a directory of results. Each file is a repeated run. Can determine 9 | average scores and standard deviation across runs.
10 | 
11 | How to call:
12 | >>> python3 analyze_res_folder.py (folder-name)
13 | where folder-name is the directory within ./results/ to be analyzed
14 | Example:
15 | >>> python3 analyze_res_folder.py batch_and_variation/2048
16 | 
17 | csv column structure:
18 | (
19 |     0 epoch,
20 |     1 dev word accuracy,
21 |     2 word accuracy,
22 |     3 best dev word accuracy,
23 |     4 best test word accuracy,
24 |     5 training time for epoch (s),
25 |     6 total training time thus far (s),
26 |     7 evaluation time for epoch (s)
27 | )
28 | """
29 | 
30 | 
31 | def analyze(res_dir_name):
32 |     res_dir = os.getcwd() + "/" + str(res_dir_name) + "/"
33 |     epoch_finished_at = []
34 |     training_times = []
35 |     epoch_training_time = []
36 |     epoch_eval_time = []
37 |     best_dev_accuracy = []
38 |     test_accuracy = []
39 | 
40 |     if not os.path.isdir(res_dir):
41 |         raise ValueError(res_dir + " is not a valid directory")
42 | 
43 |     for filename in os.listdir(res_dir):
44 |         with open(res_dir + filename, "r") as f:
45 |             csv_reader = csv.reader(f, delimiter="\t")
46 | 
47 |             # per-epoch times are collected from every row; the summary values below use only the last row
48 |             for epoch in csv_reader:
49 |                 epoch_training_time.append(float(epoch[5]))
50 |                 epoch_eval_time.append(float(epoch[7]))
51 | 
52 |             epoch_finished_at.append(int(epoch[0]))
53 |             training_times.append(int(float(epoch[6])))
54 |             best_dev_accuracy.append(float(epoch[3]))
55 |             test_accuracy.append(float(epoch[4]))
56 | 
57 |     # display analysis
58 |     print("---------------------------------------")
59 |     print("Directory analyzed:\t\t", res_dir_name)
60 |     print("---------------------------------------")
61 |     print(
62 |         "Average dev accuracy:\t\t",
63 |         "{number:.{digits}f}".format(number=100 * np.mean(best_dev_accuracy), digits=3),
64 |     )
65 |     if len(best_dev_accuracy) > 1:
66 |         print(
67 |             "Standard deviation dev:\t\t ",
68 |             "{number:.{digits}f}".format(
69 |                 number=100 * statistics.stdev(best_dev_accuracy), digits=3
70 |             ),
71 |         )
72 |     print(
73 |         "Average test accuracy:\t\t",
74 |         "{number:.{digits}f}".format(number=100 * np.mean(test_accuracy), digits=3),
75 |     )
76 |     if len(test_accuracy) > 1:
77 |         print(
78 |             "Standard deviation test:\t\t ",
79 |             "{number:.{digits}f}".format(
80 |                 number=100 * statistics.stdev(test_accuracy), digits=3
81 |             ),
82 |         )
83 |     print("")
84 |     print(
85 |         "Average total training time:\t",
86 |         "{number:.{digits}f}".format(number=np.mean(training_times), digits=3),
87 |     )
88 |     print(
89 |         "Average epoch training time:\t",
90 |         "{number:.{digits}f}".format(number=np.mean(epoch_training_time), digits=3),
91 |     )
92 |     print(
93 |         "Average epoch eval time:\t",
94 |         "{number:.{digits}f}".format(number=np.mean(epoch_eval_time), digits=3),
95 |     )
96 |     print(
97 |         "Average number of total epochs:\t",
98 |         "{number:.{digits}f}".format(number=np.mean(epoch_finished_at), digits=2),
99 |     )
100 |     print("Number of experiments analyzed:\t", len(epoch_finished_at))
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     fire.Fire(analyze)
105 | 
--------------------------------------------------------------------------------
/results/plot.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import pandas as pd
3 | import matplotlib
4 | import matplotlib.pyplot as plt
5 | 
6 | matplotlib.rc("font", **{"weight": "bold", "size": 22})
7 | 
8 | 
9 | def plot(csv_file):
10 |     """
11 |     Plots a graph of training accuracies as given by the model results output file.
12 |     Args:
13 |         csv_file (str): file name with csv extension.
14 | 
15 |     csv structure:
16 |     (
17 |         0 epoch,
18 |         1 dev word accuracy,
19 |         2 word accuracy,
20 |         3 best dev word accuracy,
21 |         4 best test word accuracy,
22 |         5 training time for epoch (s),
23 |         6 total training time thus far (s),
24 |         7 evaluation time for epoch (s)
25 |     )
26 |     """
27 |     df = pd.read_csv(csv_file, delimiter="\t", header=None, usecols=[1, 2, 3, 4, 5])
28 |     fig = plt.figure()
29 | 
30 |     ax1 = fig.add_subplot(111)
31 |     ax1.set_title(
32 |         next(iter(set(df[1].values))) + " syllabification training accuracy over epochs"
33 |     )
34 |     ax1.set_xlabel("Epochs")
35 |     ax1.set_ylabel("Word-Level Accuracy")
36 |     ax1.plot(
37 |         list(range(len(df[2].values))),
38 |         df[2].values,
39 |         c="r",
40 |         label="Dev Score",
41 |         linewidth=5,
42 |     )
43 |     ax1.plot(
44 |         list(range(len(df[2].values))),
45 |         df[3].values,
46 |         c="b",
47 |         label="Test Score",
48 |         linewidth=5,
49 |     )
50 |     ax1.grid()
51 |     ax1.legend()
52 | 
53 |     plt.show()
54 | 
55 | 
56 | if __name__ == "__main__":
57 |     fire.Fire(plot)
58 | 
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | import fire
4 | 
5 | from preprocessing import read_conll_single, create_data_matrix
6 | from neuralnets.BiLSTM import BiLSTM
7 | 
8 | 
9 | def run(model_path, input_path, config_path):
10 |     """
11 |     This script loads a pretrained model and an input file in CoNLL format (each line a token, words separated by an empty line).
12 |     The input words are passed to the model for tagging. Prints the tokens and the tags in CoNLL format to stdout.
13 |     model_path (string): path to a pretrained model in .h5 format.
14 |     input_path (string): path to the input file in CoNLL format of words to be syllabified.
15 |     config_path (string): path to the YAML config matching the pretrained model.
16 |     """
17 |     words = read_conll_single(
18 |         input_path
19 |     )  # words: list [ { tokens: [ raw_tokens, ... ] } ... ]
20 | 
21 |     model = BiLSTM.load_model(model_path, config_path)
22 |     data_matrix = create_data_matrix(words, model.mappings)
23 |     tags = model.tagWords(data_matrix)["english"]  # dataset key is hardcoded here
24 | 
25 |     print("\nTagged Words: ")
26 |     for i, word in enumerate(words):
27 |         joined = []
28 |         for j, ch in enumerate(word["tokens"]):
29 |             # pad tags with 0 to length of word.
30 |             if len(tags[i]) < len(word["tokens"]):
31 |                 tags[i] += [0] * (len(word["tokens"]) - len(tags[i]))
32 |             joined.append((ch, tags[i][j]))
33 | 
34 |         for tup in joined:
35 |             print(tup[0], end="")
36 |             if tup[1] == 1:
37 |                 print("-", end="")
38 | 
39 |         print("")
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     fire.Fire(run)
44 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # This script trains the BiLSTM-CRF architecture for syllabification.
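# Example invocation (uses the argparse default defined below; any other config
# under config/ should work the same way):
#   python train.py --config config/french_large_base.yaml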
2 | 
3 | import argparse
4 | import os
5 | import logging
6 | import sys
7 | 
8 | from config import get_cfg_defaults
9 | from neuralnets.BiLSTM import BiLSTM
10 | from preprocessing import load_dataset
11 | 
12 | # Change into the working dir of the script
13 | abspath = os.path.abspath(__file__)
14 | dname = os.path.dirname(abspath)
15 | os.chdir(dname)
16 | 
17 | # Logging level
18 | loggingLevel = logging.INFO
19 | logger = logging.getLogger()
20 | logger.setLevel(loggingLevel)
21 | 
22 | ch = logging.StreamHandler(sys.stdout)
23 | ch.setLevel(loggingLevel)
24 | formatter = logging.Formatter("%(message)s")
25 | ch.setFormatter(formatter)
26 | logger.addHandler(ch)
27 | 
28 | # Results directories
29 | PATH = os.getcwd() + "/results/"
30 | 
31 | 
32 | def create_directory(name):
33 |     if not os.path.exists(PATH):
34 |         os.mkdir(PATH)
35 |     if not os.path.exists(PATH + "/" + str(name)):
36 |         os.mkdir(PATH + "/" + str(name))
37 | 
38 | 
39 | def train_and_eval_model(cfg):
40 |     """
41 |     Load data and train the model.
42 |     Args:
43 |         cfg (YACS YAML config)
44 |     """
45 | 
46 |     # Data preprocessing
47 |     dataset = {
48 |         "columns": {0: "raw_tokens", 1: "boundaries"},
49 |         # CoNLL format (tab-delimited)
50 |         #   Column 0: phones
51 |         #   Column 1: syllable boundary
52 |         "label": "boundaries",  # Which column we'd like to predict
53 |     }
54 | 
55 |     # Load the embeddings and the dataset. Choose whether or not to pad the words.
56 |     # Right now, padding must be done if CRF is chosen for output layer.
57 |     # The CRF layer does not support masking.
58 |     embeddings, data, mappings, vocab_size, n_class_labels, word_length = load_dataset(
59 |         dataset, dataset_name=cfg.TRAINING.DATASET, do_pad_words=True
60 |     )
61 | 
62 |     create_directory(cfg.CONFIG_NAME)
63 |     logger.info(f"Starting training of `{cfg.CONFIG_NAME}` on dataset `{dataset}`")
64 | 
65 |     for training_repeat in range(cfg.TRAINING.TRAINING_REPEATS):
66 |         model = BiLSTM(cfg)
67 |         model.set_vocab(vocab_size, n_class_labels, word_length, mappings)
68 |         model.set_dataset(dataset, data)
69 | 
70 |         # Path to store performance scores for dev / test
71 |         model.store_results(
72 |             PATH + "/" + cfg.CONFIG_NAME + "/" + str(training_repeat) + ".csv"
73 |         )
74 |         model.fit(epochs=cfg.TRAINING.EPOCHS)
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     parser = argparse.ArgumentParser()
79 |     parser.add_argument(
80 |         "--config",
81 |         type=str,
82 |         default="config/french_large_base.yaml",
83 |         help="filename of config to run experiment with",
84 |     )
85 |     args = parser.parse_args()
86 | 
87 |     cfg = get_cfg_defaults()
88 |     cfg.merge_from_file(args.config)
89 |     cfg.freeze()
90 |     logger.info(cfg)
91 |     train_and_eval_model(cfg)
92 | 
--------------------------------------------------------------------------------
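A rough end-to-end sketch of how the scripts above fit together. The .h5 checkpoint path is a placeholder (how BiLSTM saves models is not shown here), and the analysis and plotting commands assume training was run with a config whose CONFIG_NAME is french_large_base and that they are invoked from inside results/:

    python train.py --config config/french_large_base.yaml
    python run.py path/to/model.h5 input.conll config/french_large_base.yaml
    cd results && python3 analyze_res_folder.py french_large_base
    python3 plot.py french_large_base/0.csv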