├── .gitignore
├── LICENSE.md
├── README.md
├── core
│   ├── __init__.py
│   ├── callbacks.py
│   ├── ctc_utils.py
│   ├── initializers.py
│   ├── layers.py
│   ├── layers_utils.py
│   ├── metrics.py
│   └── models.py
├── data
│   ├── download_brsmv1.sh
│   └── download_datasets.sh
├── datasets
│   ├── __init__.py
│   ├── brsd.py
│   ├── cslu.py
│   ├── dataset_generator.py
│   ├── dataset_parser.py
│   ├── dummy.py
│   ├── lapsbm.py
│   ├── sid.py
│   └── voxforge.py
├── eval.py
├── extras
│   ├── __init__.py
│   ├── apis.py
│   ├── ctc_viz.py
│   ├── eval_apis.py
│   ├── make_dataset.py
│   ├── print_args.py
│   ├── recognizer.py
│   └── results2xlsx.py
├── imgs
│   ├── best_ler.jpg
│   ├── best_ler.pdf
│   ├── best_loss.jpg
│   └── best_loss.pdf
├── logging.yaml
├── msc.yaml
├── predict.py
├── preprocessing
│   ├── __init__.py
│   ├── audio.py
│   ├── audio_utils.py
│   └── text.py
├── requirements.txt
├── train.py
└── utils
    ├── __init__.py
    ├── core_utils.py
    ├── generic_utils.py
    └── hparams.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore automatically generated folders in data
# Created by https://www.gitignore.io/api/sublimetext,macos,linux,python,ipythonnotebook,windows

### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
bh_unicode_properties.cache

# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings


### macOS ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk


### Linux ###
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*


### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### Windows ###
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# My ignores
timit
*.h5
results/
.datasets/
.envrc
notebooks/
sims/
results*/
data/*/**
data/*
refs/
software
results.json
.vscode
*.tar.gz
*.xls*
*.json

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
# MIT License

Copyright (c) 2016 Igor Macedo Quintanilha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# asr-study: a study of all-neural speech recognition models
This repository contains my efforts on developing an end-to-end ASR system using Keras and TensorFlow.

## Training a character-based all-neural Brazilian Portuguese speech recognition model

Our model was trained using four datasets: [CSLU Spoltech (LDC2006S16)](https://catalog.ldc.upenn.edu/LDC2006S16), Sid, [VoxForge](http://www.voxforge.org), and [LapsBM1.4](http://www.laps.ufpa.br/falabrasil/). Only the CSLU dataset is paid.

#### Set up the (partial) Brazilian Portuguese Speech Dataset (BRSD)

You can download the freely available datasets with the provided script (it may take a while):

```bash
$ cd data; sh download_datasets.sh
```

Next, you can preprocess it into an hdf5 file. Click [here](extras/make_dataset.py) for more information.

```bash
$ python -m extras.make_dataset --parser brsd --input_parser mfcc
```

#### Train the network

You can train the network with the `train.py` script. For more usage information see [this](train.py). To train with the default parameters:

```bash
$ python train.py --dataset .datasets/brsd/data.h5
```
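Checkpoints written with `MetaCheckpoint` (see [core/callbacks.py](core/callbacks.py)) also carry a `meta` group with the training history. A minimal sketch for reading it back, assuming the checkpoint path below is yours and that `val_loss` was among the logged metrics:

```python
import h5py
import yaml

with h5py.File('models/checkpoint.h5', 'r') as f:  # hypothetical path
    meta = f['meta']
    training_args = yaml.load(meta.attrs['training_args'])
    print(training_args)        # arguments the model was trained with
    print(meta['epochs'][:])    # epochs checkpointed so far
    print(meta['val_loss'][:])  # one value per epoch, if it was logged
```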
## Pre-trained model

You may download a [brsm v1.0 model](core/models.py) pre-trained on the full brsd dataset (including the CSLU dataset):

```bash
$ sh data/download_brsmv1.sh
```

Also, you can evaluate the model against the **brsd** test set:

```bash
$ python eval.py --model models/brsmv1.h5 --dataset .datasets/brsd/data.h5
```

#### brsmv1.h5 training
_Training curves: [best_loss](imgs/best_loss.jpg) and [best_ler](imgs/best_ler.jpg)._

Test set: LER **25.13%** (using beam search decoder with beam width of 100)
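The LER above comes from the beam search path of [core/ctc_utils.py](core/ctc_utils.py). A rough sketch of wiring the same decoder by hand (placeholders and shapes are illustrative, assuming the TensorFlow 1.x-era API this repo uses):

```python
import tensorflow as tf
from core import ctc_utils

y_pred = tf.placeholder(tf.float32, shape=(None, None, 28))  # (batch, time, classes)
seq_len = tf.placeholder(tf.int32, shape=(None, 1))          # true timesteps per sample
decoded = ctc_utils.decode([y_pred, seq_len], is_greedy=False, beam_width=100)
```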
## Predicting the outputs

To predict the outputs of a trained model using some dataset:

```bash
$ python predict.py --model MODEL --dataset DATASET
```

## Available dataset parsers
You can see all the available dataset parsers in [datasets/](datasets/).

#### Creating a custom dataset parser

You may create your own dataset parser. Here is an example:

```python
from datasets import DatasetParser


class CustomParser(DatasetParser):

    def __init__(self, dataset_dir, name='default name', **kwargs):
        super(CustomParser, self).__init__(dataset_dir, name, **kwargs)

    def _iter(self):
        for line in dataset:
            yield {'duration': line['duration'],
                   'input': line['input'],
                   'label': line['label'],
                   'non-optional-field': line['non-optional-field']}

    def _report(self, dl):
        args = extract_statistics(dl)  # your own helper computing the numbers below
        report = '''General information
                    Number of utterances: %d
                    Total size (in seconds) of utterances: %.f
                    Number of speakers: %d''' % (args)
        return report
```

## Available models
You can see all the available models in [core/models.py](core/models.py).
#### Creating a custom model

You may create your own model. Here is an example of a CTC-based model:

```python
from keras.layers import Input, Bidirectional, TimeDistributed, Dense
from core.layers import LSTM
from core.models import ctc_model


def custom_model(num_features=26, num_hiddens=100, num_classes=28):

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    o = Bidirectional(LSTM(num_hiddens,
                           return_sequences=True,
                           consume_less='gpu'))(o)
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)
```
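The returned model has two named outputs, `ctc` (the loss computed as a layer) and `decoder`. A hedged sketch of compiling it with the dummy losses from [core/ctc_utils.py](core/ctc_utils.py) and the LER metric (the optimizer and loss weights here are illustrative choices, not the project's exact training setup):

```python
from core import ctc_utils, metrics

model = custom_model()
model.compile(optimizer='adam',
              loss={'ctc': ctc_utils.ctc_dummy_loss,
                    'decoder': ctc_utils.decoder_dummy_loss},
              metrics={'decoder': metrics.ler},
              loss_weights=[1, 0])  # backprop only through the CTC loss
```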
## Contributing
There is plenty of work to be done. All contributions are welcome :).

#### asr-related work
* Add new layers
  * Batch normalized recurrent neural networks [arXiv](https://arxiv.org/abs/1510.01378)
  * Batch recurrent normalization [arXiv](https://arxiv.org/abs/1603.09025)
* Reproduce topologies and results
  * [EESEN](https://arxiv.org/abs/1507.08240)
  * [Deep Speech 2](https://arxiv.org/abs/1512.02595)
  * ConvNet-based architectures
* Add language model
  * [WFST](https://arxiv.org/abs/1507.08240)
  * [RNNLM](http://www.fit.vutbr.cz/~imikolov/rnnlm/)
  * Beam search decoder with LM or CLM
* Encoder-decoder models with attention mechanism
* ASR from raw speech
* Real-time ASR

#### brsd-related work
* Investigate the brsmv1 model with
  * Multiplicative integration [arXiv](https://arxiv.org/abs/1606.06630)
  * Layer normalization [arXiv](https://arxiv.org/abs/1607.06450)
  * Zoneout [arXiv](https://arxiv.org/abs/1606.01305)
* Increase the number of datasets (ideally with free datasets)
* Improve the LER
* Train a language model

#### code-related work
* Test coverage
* Examples
* Better documentation
* Improve the API
* More feature extractors, see [audio](preprocessing/audio.py) and [text](preprocessing/text.py)
* More dataset parsers
  * [LibriSpeech](http://www.openslr.org/12/)
  * [TED-LIUM](http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus)
  * WSJ
  * Switchboard
  * [TIMIT](https://catalog.ldc.upenn.edu/ldc93s1)
  * [VCTK](http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html)
* **Implement a nice wrapper for Kaldi in order to enjoy their feature extractors**
* Better way of storing the entire preprocessed dataset

#### Known bugs
* High memory and CPU consumption
* Predicting with batch size greater than 1 (Keras' bug)
* warp-ctc does not seem to speed up training
* [zoneout](core/layers.py) implementation


## Requirements

#### basic requirements
* Python 2.7
* Numpy
* Scipy
* Pyyaml
* HDF5
* Unidecode
* Librosa
* TensorFlow
* Keras

#### recommended
* [warp-ctc](https://github.com/baidu-research/warp-ctc) (for fast CTC loss calculation)

#### optional
* [SpeechRecognition](https://pypi.python.org/pypi/SpeechRecognition/) (to use the [eval apis](extras/eval_apis.py))
* [openpyxl](https://pypi.python.org/pypi/openpyxl) (to [save the results in an Excel file](extras/results2xlsx.py))

## Acknowledgements
* [python_speech_features](https://github.com/jameslyons/python_speech_features) for the [audio preprocessing](preprocessing/audio.py)
* [Google Magenta](https://github.com/tensorflow/magenta) for the [hparams](utils/hparams.py)
* @robertomest for helping me with everything

## License
See [LICENSE.md](LICENSE.md) for more information

--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
import layers
import layers_utils
import metrics
import ctc_utils
import models
import initializers
import callbacks

--------------------------------------------------------------------------------
/core/callbacks.py:
--------------------------------------------------------------------------------
import keras.callbacks as callbacks

import h5py
import numpy as np
import yaml


class MetaCheckpoint(callbacks.ModelCheckpoint):
    """
    Checkpoints some training information with the model. This should enable
    resuming training and having training information on every checkpoint.

    Thanks to Roberto Estevao @robertomest - robertomest@poli.ufrj.br
    """

    def __init__(self, filepath, monitor='val_loss', verbose=0,
                 save_best_only=False, save_weights_only=False,
                 mode='auto', period=1, training_args=None, meta=None):

        # Pass the constructor arguments through instead of hardcoding them,
        # so that monitor, save_best_only, etc. actually take effect
        super(MetaCheckpoint, self).__init__(filepath, monitor=monitor,
                                             verbose=verbose,
                                             save_best_only=save_best_only,
                                             save_weights_only=save_weights_only,
                                             mode=mode, period=period)

        self.filepath = filepath
        self.meta = meta or {'epochs': []}

        if training_args:
            training_args = vars(training_args)

            self.meta['training_args'] = training_args

    def on_train_begin(self, logs={}):
        super(MetaCheckpoint, self).on_train_begin(logs)

    def on_epoch_end(self, epoch, logs={}):
        super(MetaCheckpoint, self).on_epoch_end(epoch, logs)

        # Get statistics
        self.meta['epochs'].append(epoch)
        for k, v in logs.items():
            # setdefault gets the value or sets (and gets) the default value
            self.meta.setdefault(k, []).append(v)

        # Save to file
        filepath = self.filepath.format(epoch=epoch, **logs)

        if self.epochs_since_last_save == 0:
            with h5py.File(filepath, 'r+') as f:
                meta_group = f.create_group('meta')
                meta_group.attrs['training_args'] = yaml.dump(
                    self.meta.get('training_args', '{}'))
                meta_group.create_dataset('epochs',
                                          data=np.array(self.meta['epochs']))
                for k in logs:
                    meta_group.create_dataset(k, data=np.array(self.meta[k]))


class ProgbarLogger(callbacks.ProgbarLogger):

    def __init__(self, show_metrics=None):
        super(ProgbarLogger, self).__init__()

        self.show_metrics = show_metrics

    def on_train_begin(self, logs=None):
        super(ProgbarLogger, self).on_train_begin(logs)

        if self.show_metrics:
            self.params['metrics'] = self.show_metrics

--------------------------------------------------------------------------------
/core/ctc_utils.py:
--------------------------------------------------------------------------------
import keras
import keras.backend as K

import numpy as np
import tensorflow as tf


def decode(inputs, **kwargs):
    """ Decodes a sequence of probabilities choosing the path with the
    highest probability of occurring

    # Arguments
        is_greedy: if True (default) the greedy decoder will be used;
        otherwise the beam search decoder will be used

        if is_greedy is False:
            see the documentation of tf.nn.ctc_beam_search_decoder for more
            options

    # Inputs
        A tuple (y_pred, seq_len) where:
            y_pred is a tensor (N, T, C) where N is the batch size, T is the
            maximum timestep and C is the number of classes (including the
            blank label)
            seq_len is a tensor (N,) that indicates the real number of
            timesteps of each sequence

    # Outputs
        A sparse tensor with the top path decoded sequence

    """

    # Little hack for load_model
    import tensorflow as tf
    is_greedy = kwargs.get('is_greedy', True)
    y_pred, seq_len = inputs

    seq_len = tf.cast(seq_len[:, 0], tf.int32)
    y_pred = tf.transpose(y_pred, perm=[1, 0, 2])

    if is_greedy:
        decoded = tf.nn.ctc_greedy_decoder(y_pred, seq_len)[0][0]
    else:
        beam_width = kwargs.get('beam_width', 100)
        top_paths = kwargs.get('top_paths', 1)
        merge_repeated = kwargs.get('merge_repeated', True)

        decoded = tf.nn.ctc_beam_search_decoder(y_pred, seq_len, beam_width,
                                                top_paths,
                                                merge_repeated)[0][0]

    return decoded


def decode_output_shape(inputs_shape):
    y_pred_shape, seq_len_shape = inputs_shape
    return (y_pred_shape[:1], None)


def ctc_lambda_func(args):
    """ CTC cost function
    """
    y_pred, labels, inputs_length = args

    # Little hack for load_model
    import tensorflow as tf

    return tf.nn.ctc_loss(labels,
                          tf.transpose(y_pred, perm=[1, 0, 2]),
                          inputs_length[:, 0])


def ctc_dummy_loss(y_true, y_pred):
    """ Little hack to make CTC work with Keras
    """
    return y_pred


def decoder_dummy_loss(y_true, y_pred):
    """ Little hack to make CTC work with Keras
    """
    return K.zeros((1,))
--------------------------------------------------------------------------------
/core/initializers.py:
--------------------------------------------------------------------------------
import numpy as np

import keras.backend as K


def k_init(k):
    def init(shape, name=None):
        return K.variable(k*np.ones(shape), dtype='float32',
                          name=name)
    return init
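# Example (a usage sketch): k_init builds constant initializers; the
# multiplicative-integration and layer-normalization weights in
# core/layers.py are initialized with it:
#
#   from core.initializers import k_init
#
#   init = k_init(1.5)
#   w = init((4,), name='w')  # a Keras variable holding [1.5, 1.5, 1.5, 1.5]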
--------------------------------------------------------------------------------
/core/layers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import copy
import numpy as np

import keras.backend as K
import tensorflow as tf

from keras import activations, initializations, regularizers
import keras.layers as keras_layers
from keras.layers.recurrent import Recurrent
from keras.engine import Layer, InputSpec

from .layers_utils import highway_bias_initializer
from .layers_utils import layer_normalization
from .layers_utils import LN
from .layers_utils import multiplicative_integration_init
from .layers_utils import multiplicative_integration
from .layers_utils import zoneout

from .initializers import k_init

import logging


class LayerNormalization(Layer):
    '''Normalize the activations using statistics computed from all of the
    summed inputs to the neurons in a layer on a single training case.
    Unlike batch normalization, layer normalization performs exactly the
    same computation at training and test time.

    # Arguments
        epsilon: small float > 0. Fuzz parameter
        weights: Initialization weights.
            List of 2 Numpy arrays, with shapes:
            `[(input_shape,), (input_shape,)]`
            Note that the order of this list is [gain, bias]
        gain_init: name of initialization function for gain parameter
            (see [initializations](../initializations.md)), or alternatively,
            Theano/TensorFlow function to use for weights initialization.
            This parameter is only relevant if you don't pass a `weights`
            argument.
        bias_init: name of initialization function for bias parameter
            (see [initializations](../initializations.md)), or alternatively,
            Theano/TensorFlow function to use for weights initialization.
            This parameter is only relevant if you don't pass a `weights`
            argument.

    # Input shape
        2D tensor with shape `(nb_samples, features)` (the statistics are
        computed over the last axis).

    # Output shape
        Same shape as input.

    # References
        - [Layer Normalization](https://arxiv.org/abs/1607.06450)
    '''
    def __init__(self, epsilon=1e-5, weights=None, gain_init='one',
                 bias_init='zero', **kwargs):
        self.epsilon = epsilon
        self.gain_init = initializations.get(gain_init)
        self.bias_init = initializations.get(bias_init)
        self.initial_weights = weights
        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        shape = (input_shape[-1],)

        self.g = self.gain_init(shape, name='{}_gain'.format(self.name))
        self.b = self.bias_init(shape, name='{}_bias'.format(self.name))

        self.trainable_weights = [self.g, self.b]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

        self.built = True

    def call(self, x, mask=None):
        return LN(x, self.g, self.b, epsilon=self.epsilon)

    def get_config(self):
        config = {'epsilon': self.epsilon,
                  'gain_init': self.gain_init.__name__,
                  'bias_init': self.bias_init.__name__}
        base_config = super(LayerNormalization, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

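# Example (a usage sketch; assumes the Keras 1 functional API used in this
# repository. The layer normalizes 2D (batch, features) activations, as
# inside the recurrent steps below):
#
#   from keras.layers import Input, Dense
#   from keras.models import Model
#   from core.layers import LayerNormalization
#
#   x = Input(shape=(26,))
#   o = Dense(100)(x)
#   o = LayerNormalization()(o)
#   model = Model(input=x, output=o)
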
class RHN(Recurrent):
    '''Recurrent Highway Network - Julian Georg Zilly, Rupesh Kumar
    Srivastava, Jan Koutník, Jürgen Schmidhuber - 2016.
    For a step-by-step description of the network, see
    [this paper](https://arxiv.org/abs/1607.03474).

    # Arguments
        output_dim: dimension of the internal projections and the final
            output.
        depth: recurrency depth size.
        init: weight initialization function.
            Can be the name of an existing function (str),
            or a Theano function (see:
            [initializations](../initializations.md)).
        inner_init: initialization function of the inner cells.
        bias_init: initialization function of the bias.
            (see [this
            post](http://people.idsia.ch/~rupesh/very_deep_learning/)
            for more information)
        activation: activation function.
            Can be the name of an existing function (str),
            or a Theano function (see: [activations](../activations.md)).
        inner_activation: activation function for the inner cells.
        coupling: if True, carry gate will be coupled to the transform gate,
            i.e., c = 1 - t
        layer_norm: if True, apply layer normalization to the recurrent
            pre-activations.
        ln_gain_init: initialization function for the layer normalization
            gains.
        ln_bias_init: initialization function for the layer normalization
            biases.
        mi: if True, use multiplicative integration to combine the
            input-to-hidden and hidden-to-hidden terms.
        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the input weights
            matrices.
        U_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the recurrent weights
            matrices.
        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
            applied to the bias.
        dropout_W: float between 0 and 1. Fraction of the input units to
            drop for input gates.
        dropout_U: float between 0 and 1. Fraction of the input units to
            drop for recurrent connections.

    # References
        - [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474)
          (original paper)
        - [Layer Normalization](https://arxiv.org/abs/1607.06450)
        - [A Theoretically Grounded Application of Dropout in Recurrent
          Neural Networks](http://arxiv.org/abs/1512.05287)

    # TODO: different dropout rates for each layer
    '''
    def __init__(self, output_dim, depth=1,
                 init='glorot_uniform', inner_init='orthogonal',
                 bias_init=highway_bias_initializer,
                 activation='tanh', inner_activation='hard_sigmoid',
                 coupling=True, layer_norm=False, ln_gain_init='one',
                 ln_bias_init='zero', mi=False,
                 W_regularizer=None, U_regularizer=None,
                 b_regularizer=None, dropout_W=0., dropout_U=0., **kwargs):
        self.output_dim = output_dim
        self.depth = depth
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.bias_init = initializations.get(bias_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
        self.coupling = coupling
        self.has_layer_norm = layer_norm
        self.ln_gain_init = initializations.get(ln_gain_init)
        self.ln_bias_init = initializations.get(ln_bias_init)
        self.mi = mi
        self.W_regularizer = regularizers.get(W_regularizer)
        self.U_regularizer = regularizers.get(U_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.dropout_W, self.dropout_U = dropout_W, dropout_U

        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        # build() restores these weights, so make sure the attribute exists
        self.initial_weights = kwargs.pop('weights', None)

        if self.dropout_W or self.dropout_U:
            self.uses_learning_phase = True

        super(RHN, self).__init__(**kwargs)

        if self.consume_less != 'gpu':
            self._logger.warning("Ignoring consume_less=%s. Setting to 'gpu'."
                                 % self.consume_less)
            self.consume_less = 'gpu'

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        self.input_dim = input_shape[2]

        if self.stateful:
            self.reset_states()
        else:
            self.states = [None]

        self.W = self.init((self.input_dim, (2 + (not self.coupling)) *
                            self.output_dim), name='{}_W'.format(self.name))
        self.Us = [self.inner_init(
            (self.output_dim, (2 + (not self.coupling)) * self.output_dim),
            name='%s_%d_U' % (self.name, i)) for i in xrange(self.depth)]

        bias_init_value = K.get_value(self.bias_init((self.output_dim,)))
        b = [np.zeros(self.output_dim),
             np.copy(bias_init_value)]

        if not self.coupling:
            b.append(np.copy(bias_init_value))

        self.bs = [K.variable(np.hstack(b),
                              name='%s_%d_b' % (self.name, i)) for i in
                   xrange(self.depth)]

        self.trainable_weights = [self.W] + self.Us + self.bs

        if self.mi:
            self.mi_params = [multiplicative_integration_init(
                ((2 + (not self.coupling)) * self.output_dim,),
                name='%s_%d' % (self.name, i),
                has_input=(i == 0)) for i in xrange(self.depth)]

            for p in self.mi_params:
                if type(p) in {list, tuple}:
                    self.trainable_weights += p
                else:
                    self.trainable_weights += [p]

        if self.has_layer_norm:
            self.ln_weights = []
            ln_names = ['h', 't', 'c']
            for l in xrange(self.depth):

                ln_gains = [self.ln_gain_init(
                    (self.output_dim,), name='%s_%d_ln_gain_%s' %
                    (self.name, l, ln_names[i])) for i in xrange(1)]

                ln_biases = [self.ln_bias_init(
                    (self.output_dim,), name='%s_%d_ln_bias_%s' %
                    (self.name, l, ln_names[i])) for i in xrange(1)]
                self.ln_weights.append([ln_gains, ln_biases])
                self.trainable_weights += ln_gains + ln_biases

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)
        if self.U_regularizer:
            # a WeightRegularizer holds a single param, so use one copy per
            # recurrent weight matrix (the recurrent weights live in the
            # list self.Us, there is no single self.U)
            for U in self.Us:
                U_regularizer = copy.deepcopy(self.U_regularizer)
                U_regularizer.set_param(U)
                self.regularizers.append(U_regularizer)
        if self.b_regularizer:
            # same here: the biases live in the list self.bs
            for b_i in self.bs:
                b_regularizer = copy.deepcopy(self.b_regularizer)
                b_regularizer.set_param(b_i)
                self.regularizers.append(b_regularizer)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def reset_states(self):
        assert self.stateful, 'Layer must be stateful.'
        input_shape = self.input_spec[0].shape
        if not input_shape[0]:
            raise Exception('If a RNN is stateful, a complete '
                            'input_shape must be provided '
                            '(including batch size).')
        if hasattr(self, 'states'):
            K.set_value(self.states[0],
                        np.zeros((input_shape[0], self.output_dim)))
        else:
            self.states = [K.zeros((input_shape[0], self.output_dim))]

    def step(self, x, states):
        s_tm1 = states[0]

        for layer in xrange(self.depth):
            B_U = states[layer + 1][0]
            U, b = self.Us[layer], self.bs[layer]

            if layer == 0:
                B_W = states[layer + 1][1]
                Wx = K.dot(x * B_W, self.W)
            else:
                Wx = 0

            Us = K.dot(s_tm1 * B_U, U)

            if self.mi:
                a = multiplicative_integration(Wx, Us,
                                               self.mi_params[layer]) + b
            else:
                a = Wx + Us + b

            a0 = a[:, :self.output_dim]
            a1 = a[:, self.output_dim: 2 * self.output_dim]
            if not self.coupling:
                a2 = a[:, 2 * self.output_dim:]

            if self.has_layer_norm:
                ln_gains, ln_biases = self.ln_weights[layer]
                a0 = LN(a0, ln_gains[0], ln_biases[0])
                # a1 = LN(a1, ln_gains[1], ln_biases[1])
                # if not self.coupling:
                #     a2 = LN(a2, ln_gains[2], ln_biases[2])

            # Equation 7
            h = self.activation(a0)
            # Equation 8
            t = self.inner_activation(a1)
            # Equation 9
            if not self.coupling:
                c = self.inner_activation(a2)
            else:
                c = 1 - t  # carry gate was coupled to the transform gate

            s = h * t + s_tm1 * c
            s_tm1 = s

        return s, [s]

    def get_constants(self, x):
        constants = []

        for layer in xrange(self.depth):
            constant = []
            if 0 < self.dropout_U < 1:
                ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
                ones = K.tile(ones, (1, self.output_dim))
                B_U = K.in_train_phase(K.dropout(ones, self.dropout_U), ones)
                constant.append(B_U)
            else:
                constant.append(K.cast_to_floatx(1.))

            if layer == 0:
                if 0 < self.dropout_W < 1:
                    input_shape = self.input_spec[0].shape
                    input_dim = input_shape[-1]
                    ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
                    ones = K.tile(ones, (1, input_dim))
                    B_W = K.in_train_phase(K.dropout(ones,
                                                     self.dropout_W), ones)
                    constant.append(B_W)
                else:
                    constant.append(K.cast_to_floatx(1.))

            constants.append(constant)

        return constants

    def get_config(self):
        config = {'output_dim': self.output_dim,
                  'depth': self.depth,
                  'init': self.init.__name__,
                  'inner_init': self.inner_init.__name__,
                  'bias_init': self.bias_init.__name__,
                  'activation': self.activation.__name__,
                  'inner_activation': self.inner_activation.__name__,
                  'coupling': self.coupling,
                  'layer_norm': self.has_layer_norm,
                  'ln_gain_init': self.ln_gain_init.__name__,
                  'ln_bias_init': self.ln_bias_init.__name__,
                  'mi': self.mi,
                  'W_regularizer': self.W_regularizer.get_config() if
                  self.W_regularizer else None,
                  'U_regularizer': self.U_regularizer.get_config() if
                  self.U_regularizer else None,
                  'b_regularizer': self.b_regularizer.get_config() if
                  self.b_regularizer else None,
                  'dropout_W': self.dropout_W,
                  'dropout_U': self.dropout_U}
        base_config = super(RHN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

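# Example (a usage sketch, mirroring the __main__ block at the bottom of
# this file; the sizes are illustrative):
#
#   from keras.models import Sequential
#   from core.layers import RHN
#
#   model = Sequential()
#   model.add(RHN(100, input_dim=26, depth=2, layer_norm=True))
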
class LSTM(keras_layers.LSTM):
    """
    # Arguments
        layer_norm: None or a pair of floats `(ln_gain_init, ln_bias_init)`.
            If given, layer normalization is applied and every gain/bias is
            initialized with these constants; if None, no layer
            normalization is applied.
        mi: None or a triple of floats `(alpha_init, beta1_init,
            beta2_init)`. If given, multiplicative integration is active and
            initialized with these values.
        zoneout_h: float between 0 and 1. Fraction of the hidden/output
            units to maintain their previous values.
        zoneout_c: float between 0 and 1. Fraction of the cell units to
            maintain their previous values.

    # References
        - [Zoneout: Regularizing RNNs by Randomly Preserving Hidden
          Activations](https://arxiv.org/abs/1606.01305)
    """
    def __init__(self, output_dim, zoneout_h=0., zoneout_c=0.,
                 layer_norm=None, mi=None, **kwargs):

        super(LSTM, self).__init__(output_dim, **kwargs)

        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        self.layer_norm = layer_norm
        self.mi = mi

        self.zoneout_c = zoneout_c
        self.zoneout_h = zoneout_h

        if self.zoneout_h or self.zoneout_c:
            self.uses_learning_phase = True

        if self.consume_less != 'gpu':
            self._logger.warning("Invalid option for `consume_less`. "
                                 "Falling back to option `gpu`.")
            self.consume_less = 'gpu'

    def build(self, input_shape):
        super(LSTM, self).build(input_shape)

        if self.mi is not None:
            alpha_init, beta1_init, beta2_init = self.mi

            self.mi_alpha = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(alpha_init),
                name='{}_mi_alpha'.format(self.name))
            self.mi_beta1 = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(beta1_init),
                name='{}_mi_beta1'.format(self.name))
            self.mi_beta2 = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(beta2_init),
                name='{}_mi_beta2'.format(self.name))

        if self.layer_norm is not None:
            ln_gain_init, ln_bias_init = self.layer_norm

            self.layer_norm_params = {}
            for n, i in {'Uh': 4, 'Wx': 4, 'new_c': 1}.items():

                gain = self.add_weight(
                    (i*self.output_dim, ),
                    initializer=k_init(ln_gain_init),
                    name='%s_ln_gain_%s' % (self.name, n))
                bias = self.add_weight(
                    (i*self.output_dim, ),
                    initializer=k_init(ln_bias_init),
                    name='%s_ln_bias_%s' % (self.name, n))

                self.layer_norm_params[n] = [gain, bias]

    def _layer_norm(self, x, param_name):
        if self.layer_norm is None:
            return x

        gain, bias = self.layer_norm_params[param_name]

        return layer_normalization(x, gain, bias)

    def step(self, x, states):
        h_tm1 = states[0]
        c_tm1 = states[1]
        B_U = states[2]
        B_W = states[3]

        Uh = self._layer_norm(K.dot(h_tm1 * B_U[0], self.U), 'Uh')
        Wx = self._layer_norm(K.dot(x * B_W[0], self.W), 'Wx')

        if self.mi is not None:
            z = self.mi_alpha * Wx * Uh + self.mi_beta1 * Uh + \
                self.mi_beta2 * Wx + self.b
        else:
            z = Wx + Uh + self.b

        z_i = z[:, :self.output_dim]
        z_f = z[:, self.output_dim: 2 * self.output_dim]
        z_c = z[:, 2 * self.output_dim: 3 * self.output_dim]
        z_o = z[:, 3 * self.output_dim:]

        i = self.inner_activation(z_i)
        f = self.inner_activation(z_f)
        c = f * c_tm1 + i * self.activation(z_c)
        o = self.inner_activation(z_o)

        if 0 < self.zoneout_c < 1:
            c = zoneout(self.zoneout_c, c_tm1, c,
                        noise_shape=(self.output_dim,))

        # this is returning a lot of nan
        new_c = self._layer_norm(c, 'new_c')

        h = o * self.activation(new_c)
        if 0 < self.zoneout_h < 1:
            h = zoneout(self.zoneout_h, h_tm1, h,
                        noise_shape=(self.output_dim,))

        return h, [h, c]

    def get_config(self):
        config = {'layer_norm': self.layer_norm,
                  'mi': self.mi,
                  'zoneout_h': self.zoneout_h,
                  'zoneout_c': self.zoneout_c
                  }

        base_config = super(LSTM, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def recurrent(output_dim, model='keras_lstm', activation='tanh',
              regularizer=None, dropout=0., **kwargs):
    if model == 'rnn':
        return keras_layers.SimpleRNN(output_dim, activation=activation,
                                      W_regularizer=regularizer,
                                      U_regularizer=regularizer,
                                      dropout_W=dropout, dropout_U=dropout,
                                      consume_less='gpu', **kwargs)
    if model == 'gru':
        return keras_layers.GRU(output_dim, activation=activation,
                                W_regularizer=regularizer,
                                U_regularizer=regularizer, dropout_W=dropout,
                                dropout_U=dropout,
                                consume_less='gpu', **kwargs)
    if model == 'keras_lstm':
        return keras_layers.LSTM(output_dim, activation=activation,
                                 W_regularizer=regularizer,
                                 U_regularizer=regularizer,
                                 dropout_W=dropout, dropout_U=dropout,
                                 consume_less='gpu', **kwargs)
    if model == 'rhn':
        return RHN(output_dim, depth=1,
                   bias_init=highway_bias_initializer,
                   activation=activation, layer_norm=False,
                   ln_gain_init='one', ln_bias_init='zero', mi=False,
                   W_regularizer=regularizer, U_regularizer=regularizer,
                   dropout_W=dropout, dropout_U=dropout, consume_less='gpu',
                   **kwargs)

    if model == 'lstm':
        return LSTM(output_dim, activation=activation,
                    W_regularizer=regularizer, U_regularizer=regularizer,
                    dropout_W=dropout, dropout_U=dropout,
                    consume_less='gpu', **kwargs)
    raise ValueError('model %s was not recognized' % model)


if __name__ == "__main__":
    from keras.models import Sequential
    from keras.utils.visualize_util import plot

    model = Sequential()
    model.add(RHN(10, input_dim=2, depth=2, layer_norm=True))
    # plot(model)

--------------------------------------------------------------------------------
/core/layers_utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import

import keras.backend as K
import tensorflow as tf

from keras import activations, initializations, regularizers
from keras.layers import GRU, SimpleRNN
from keras.layers import LSTM as keras_LSTM


def highway_bias_initializer(shape, name=None):
    return -2 * initializations.one(shape, name=name)


def layer_normalization(x, gain, bias, epsilon=1e-5):
    # tf.nn.moments returns (mean, variance), so take the square root of the
    # variance when normalizing
    mean, variance = tf.nn.moments(x, [1], keep_dims=True)
    x_normed = (x - mean) / K.sqrt(variance + epsilon) * gain + bias
    return x_normed


def multiplicative_integration_init(shape, alpha_init='one',
                                    beta1_init='one', beta2_init='one',
                                    name='mi', has_input=True):
    beta1 = initializations.get(beta1_init)(shape, name='%s_beta1' % name)
    if has_input:
        alpha = initializations.get(alpha_init)(shape, name='%s_alpha' % name)
        beta2 = initializations.get(beta2_init)(shape, name='%s_beta2' % name)
        return alpha, beta1, beta2

    return beta1


def zoneout(level, h_tm1, h, noise_shape):
    '''Apply a zoneout function to preserve a fraction of values from h_tm1
    in h.'''
    h_diff = h - h_tm1
    h = K.in_train_phase(K.dropout(h_diff,
                                   level,
                                   noise_shape=noise_shape), h_diff)
    h = h * (1. - level) + h_tm1
    return h


def multiplicative_integration(Wx, Uz, params, has_input=True):
    if has_input:
        alpha, beta1, beta2 = params
        return alpha * Wx * Uz + beta1 * Uz + beta2 * Wx

    beta1 = params
    return beta1 * Uz


def to_dense(x):
    if K.is_sparse(x):
        return tf.sparse_tensor_to_dense(x, default_value=-1)
    return x


def to_dense_output_shape(input_shape):
    return input_shape


LN = layer_normalization
mi = multiplicative_integration
mi_init = multiplicative_integration_init

--------------------------------------------------------------------------------
/core/metrics.py:
--------------------------------------------------------------------------------
import tensorflow as tf


def ler(y_true, y_pred, **kwargs):
    """
    Label Error Rate. For more information see 'tf.edit_distance'
    """
    return tf.reduce_mean(tf.edit_distance(y_pred, y_true, **kwargs))

--------------------------------------------------------------------------------
/core/models.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import core.ctc_utils as ctc_utils
from utils.hparams import HParams

import keras
import keras.backend as K
from keras.initializations import uniform
from keras.activations import relu

from keras.models import Model

from keras.layers import Input
from keras.layers import GaussianNoise
from keras.layers import TimeDistributed
from keras.layers import Dense
from .layers import LSTM
from keras.layers import Masking
from keras.layers import Bidirectional
from keras.layers import Lambda
from keras.layers import Dropout
from keras.layers import merge
# Activation and SimpleRNN are required by the maas and deep_speech models
from keras.layers import Activation
from keras.layers import SimpleRNN

from keras.regularizers import l1, l2, l1l2

from .layers import recurrent


def ctc_model(inputs, output, **kwargs):
    """ Given the input and output returns a model appending ctc_loss, the
    decoder, labels, and inputs_length

    # Arguments
        see core.ctc_utils.decode for more arguments
    """

    # Define placeholders
    labels = Input(name='labels', shape=(None,), dtype='int32', sparse=True)
    inputs_length = Input(name='inputs_length', shape=(None,), dtype='int32')

    # Define a decoder
    dec = Lambda(ctc_utils.decode, output_shape=ctc_utils.decode_output_shape,
                 arguments={'is_greedy': True}, name='decoder')
    y_pred = dec([output, inputs_length])

    ctc = Lambda(ctc_utils.ctc_lambda_func, output_shape=(1,), name="ctc")
    # Define loss as a layer
    loss = ctc([output, labels, inputs_length])

    return Model(input=[inputs, labels, inputs_length], output=[loss, y_pred])


def graves2006(num_features=26, num_hiddens=100, num_classes=28, std=.6):
    """ Implementation of Graves' model
    Reference:
        [1] Graves, Alex, et al. "Connectionist temporal classification:
        labelling unsegmented sequence data with recurrent neural networks."
        Proceedings of the 23rd international conference on Machine
        learning. ACM, 2006.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    o = GaussianNoise(std)(o)
    o = Bidirectional(LSTM(num_hiddens,
                           return_sequences=True,
                           consume_less='gpu'))(o)
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def eyben(num_features=39, num_hiddens=[78, 120, 27], num_classes=28):
    """ Implementation of Eyben's model
    Reference:
        [1] Eyben, Florian, et al. "From speech to letters-using a novel
        neural network architecture for grapheme based asr." Automatic
        Speech Recognition & Understanding, 2009. ASRU 2009. IEEE Workshop
        on. IEEE, 2009.
    """

    assert len(num_hiddens) == 3

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    if num_hiddens[0]:
        o = TimeDistributed(Dense(num_hiddens[0]))(o)
    if num_hiddens[1]:
        o = Bidirectional(LSTM(num_hiddens[1],
                               return_sequences=True,
                               consume_less='gpu'))(o)
    if num_hiddens[2]:
        o = Bidirectional(LSTM(num_hiddens[2],
                               return_sequences=True,
                               consume_less='gpu'))(o)

    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def maas(num_features=81, num_classes=29, num_hiddens=1824, dropout=0.1,
         max_value=20):
    """ Maas' model.
    Reference:
        [1] Maas, Andrew L., et al. "Lexicon-Free Conversational Speech
        Recognition with Neural Networks." HLT-NAACL. 2015.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    def clipped_relu(x):
        return relu(x, max_value=max_value)

    # First layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Second layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Third layer
    o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
                                dropout_W=dropout,
                                activation=clipped_relu,
                                init='he_normal'), merge_mode='sum')(o)

    # Fourth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Fifth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Output layer
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def deep_speech(num_features=81, num_classes=29, num_hiddens=2048,
                dropout=0.1, max_value=20):
    """ Deep Speech model.

    Contains five layers: 3 FC - BRNN - 1 FC
    Dropout only applied to fully connected layers (between 5% to 10%)

    Note:
        * We are not translating the raw audio files by 5 ms (Sec 2.1 in [1])
        * We are not striding the RNN to halve the timesteps (Sec 3.3 in [1])
        * We are not using frames of context
        * Their output contains {a, ..., z, space, apostrophe, blank}
    Experiment 5.1: Conversational speech: Switchboard Hub5'00 (full)
        * Input - 80 linearly spaced log filter banks and an energy term.
        The filter banks are computed over windows of 20ms strided by 10ms.
        * Speaker adaptation - spectral features are normalized on a per
        speaker basis.
        * Hidden units: {2304, 2048}
        * Ensemble of 4 networks
    Experiment 5.2: Noisy speech
        * Input - 160 linearly spaced log filter banks. The filter banks are
        computed over windows of 20ms strided by 10ms. Global mean and
        standard deviation over training set normalization
        * Speaker adaptation - none
        * Hidden units: 2560
        * Ensemble of 6 networks
    Reference:
        [1] HANNUN, A. Y. et al. Deep Speech: Scaling up end-to-end speech
        recognition. arXiv, 2014.
    """
    x = Input(name='inputs', shape=(None, num_features))
    o = x

    def clipped_relu(x):
        return relu(x, max_value=max_value)

    # First layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Second layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Third layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Fourth layer
    o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
                                dropout_W=dropout,
                                activation=clipped_relu,
                                init='he_normal'), merge_mode='sum')(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Fifth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Output layer
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def brsmv1(num_features=39, num_classes=28, num_hiddens=256, num_layers=5,
           dropout=0.2, zoneout=0., input_dropout=False,
           input_std_noise=.0, weight_decay=1e-4, residual=None,
           layer_norm=None, mi=None, activation='tanh'):
    """ BRSM v1.0
    Improved features:
        * Residual connection
        * Variational Dropout
        * Zoneout
        * Layer Normalization
        * Multiplicative Integration
    Note:
        Dropout, zoneout, and weight decay are tied across layers, in order
        to minimize the number of hyperparameters
    Reference:
        [1] Gal, Y, "A Theoretically Grounded Application of Dropout in
        Recurrent Neural Networks", 2015.
        [2] Graves, Alex, Abdel-rahman Mohamed, and Geoffrey Hinton. "Speech
        recognition with deep recurrent neural networks", 2013.
        [3] Krueger, David, et al. "Zoneout: Regularizing rnns by randomly
        preserving hidden activations", 2016.
        [4] Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer
        normalization.", 2016.
        [5] Wu, Yuhuai, et al. "On multiplicative integration with recurrent
        neural networks." Advances In Neural Information Processing Systems.
        2016.
        [6] Wu, Yonghui, et al. "Google's Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.", 2016.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    if input_std_noise is not None:
        o = GaussianNoise(input_std_noise)(o)

    if residual is not None:
        o = TimeDistributed(Dense(num_hiddens*2,
                                  W_regularizer=l2(weight_decay)))(o)

    if input_dropout:
        o = Dropout(dropout)(o)

    for _ in range(num_layers):
        new_o = Bidirectional(LSTM(num_hiddens,
                                   return_sequences=True,
                                   W_regularizer=l2(weight_decay),
                                   U_regularizer=l2(weight_decay),
                                   dropout_W=dropout,
                                   dropout_U=dropout,
                                   zoneout_c=zoneout,
                                   zoneout_h=zoneout,
                                   mi=mi,
                                   layer_norm=layer_norm,
                                   activation=activation))(o)

        if residual is not None:
            o = merge([new_o, o], mode=residual)
        else:
            o = new_o

    o = TimeDistributed(Dense(num_classes,
                              W_regularizer=l2(weight_decay)))(o)

    return ctc_model(x, o)
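# Example (a construction sketch; the keyword values below are illustrative
# choices, not the published brsmv1.h5 hyperparameters):
#
#   from core import models
#
#   model = models.brsmv1(num_features=39, num_classes=28,
#                         residual='sum',         # Keras 1 merge mode
#                         layer_norm=(1.0, 0.0),  # (gain init, bias init)
#                         mi=(1.0, 0.5, 0.5))     # (alpha, beta1, beta2)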
--------------------------------------------------------------------------------
/data/download_brsmv1.sh:
--------------------------------------------------------------------------------
echo "Downloading the brsmv1 pre-trained model:"
mkdir -p models/
wget -c -q --show-progress -O models/brsmv1.h5 https://www.dropbox.com/s/ink8zxhzysxvzxa/best_ptbr.h5?dl=0

--------------------------------------------------------------------------------
/data/download_datasets.sh:
--------------------------------------------------------------------------------
echo "Downloading pt-br datasets. This may take a while"
echo "Downloading Sid dataset:"
wget -c -q --show-progress -O ./sid.tar.gz https://www.dropbox.com/s/0wxlweatglrr7wl/sid.tar.gz?dl=0
echo "Downloading VoxForge dataset:"
wget -c -q --show-progress -O ./voxforge-ptbr.tar.gz https://www.dropbox.com/s/wrguetal6xmrgta/voxforge-ptbr.tar.gz?dl=0
echo "Downloading LapsBenchmark1.4 dataset:"
wget -c -q --show-progress -O ./lapsbm.tar.gz https://www.dropbox.com/s/8aqm9ktulmnry6d/lapsbm.tar.gz?dl=0

echo "Extracting Sid dataset..."
mkdir -p sid
cd sid; tar -xzf ../sid.tar.gz; cd ..

echo "Extracting VoxForge dataset..."
mkdir -p voxforge
cd voxforge; tar -xzf ../voxforge-ptbr.tar.gz; cd ..

echo "Extracting LapsBenchmark1.4 dataset..."
mkdir -p lapsbm
cd lapsbm; tar -xzf ../lapsbm.tar.gz; cd ..

echo "Finished."

--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

from utils.generic_utils import safe_mkdirs
import os

# '.datasets' directory at the repository root (one level above this package)
DT_ABSPATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.datasets')
safe_mkdirs(DT_ABSPATH)

from datasets.dataset_parser import DatasetParser
from datasets.sid import Sid
from datasets.lapsbm import LapsBM
from datasets.voxforge import VoxForge
from datasets.cslu import CSLU
from datasets.dummy import Dummy
from datasets.brsd import BRSD

--------------------------------------------------------------------------------
/datasets/brsd.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datasets import DatasetParser
from datasets import LapsBM

from utils.generic_utils import get_from_module


class BRSD(DatasetParser):
    """ Brazilian Portuguese Speech dataset reader and parser

    This dataset is a combination of four smaller datasets (voxforge,
    lapsbm, sid, and cslu spoltech port). The dataset was divided in the
    following way:
        * Train: voxforge, sid, and cslu spoltech port
        * Valid: 5 women and 15 men from LapsBM
        * Test: 5 women and 10 men from LapsBM (with no overlap with the
        valid set in either speaker or utterance spoken)

    After cleaning (removing labels with zero length or with numeric
    digits, e.g., 4 instead of four) the training set contains 11702
    utterances from 425 speakers.

    """

    def __init__(self, dataset_dir=None, name='brsd', **kwargs):

        dataset_dir = dataset_dir or {'lapsbm': None,
                                      'voxforge': None,
                                      'sid': None,
                                      'cslu': None}

        super(BRSD, self).__init__(dataset_dir, name, **kwargs)

    @property
    def dataset_dir(self):
        """Filepath to the dataset directory"""
        return self._dataset_dir

    @dataset_dir.setter
    def dataset_dir(self, value):
        """Filepath to the dataset directory"""

        if value is None:
            raise ValueError("You must set the variable dataset_dir"
                             " (the location of dataset) before continue")

        if not isinstance(value, dict):
            raise ValueError("dataset_dir must be a dictionary")

        for key in ('lapsbm', 'voxforge', 'sid'):
            if key not in value:
                raise KeyError("dataset_dir must have the key %s" % key)

        if 'cslu' not in value:
            self._logger.warning('CSLU not found. Ignoring it.')

        self._dataset_dir = value

    def _iter(self):

        for name, path in self.dataset_dir.items():

            if name == 'lapsbm':
                continue

            try:
                dataset_cls = get_from_module('datasets*', name, regex=True)
                dataset = dataset_cls(dataset_dir=path)

                for d in dataset._iter():
                    yield {'duration': d['duration'],
                           'input': d['input'],
                           'label': d['label'],
                           'speaker': '%s_%s' % (str(dataset), d['speaker']),
                           'dataset': 'train'}
            except ValueError, e:
                self._logger.warning('Skipping dataset %s: %s' %
                                     (name, e.message))

        # Test and valid set
        lapsbm = LapsBM(dataset_dir=self.dataset_dir['lapsbm'], split=True)
        for d in lapsbm._iter():
            yield {'duration': d['duration'],
                   'input': d['input'],
                   'label': d['label'],
                   # prefix with the LapsBM parser, not a stale `dataset`
                   # variable left over from the loop above
                   'speaker': '%s_%s' % (str(lapsbm), d['speaker']),
                   'dataset': d['dataset']}

    def _report(self, dl):
        report = '''General information:
            Number of utterances: %d
            Total size (in seconds) of utterances: %.f
            Number of speakers: %d''' % (len(dl['input']),
                                         sum(dl['duration']),
                                         len(set(dl['speaker'])))

        return report

--------------------------------------------------------------------------------
/datasets/cslu.py:
--------------------------------------------------------------------------------
from datasets import DatasetParser

import os
import re
import librosa
import codecs


class CSLU(DatasetParser):
    """ CSLU Spoltech Port dataset reader and parser

    More about the dataset: https://catalog.ldc.upenn.edu/LDC2006S16
    """

    def __init__(self, dataset_dir=None, name='cslu', **kwargs):

        dataset_dir = dataset_dir or 'data/cslu'

        super(CSLU, self).__init__(dataset_dir, name, **kwargs)

    def _iter(self):
        trans_directory = os.path.join(self.dataset_dir, 'trans')

        for speaker_path in os.listdir(trans_directory):

            root_path = os.path.join(os.path.abspath(trans_directory),
                                     speaker_path)

            if not os.path.isdir(os.path.join(root_path)):
                continue

            labels_files = os.listdir(root_path)

            for labels_file in labels_files:

                label = codecs.open(
                    os.path.join(root_path, labels_file), 'r',
                    'latin-1').read().strip().lower()

                audio_file = os.path.join(os.path.abspath(self.dataset_dir),
                                          'speech', speaker_path,
                                          labels_file[:-4])

                audio_file = audio_file + '.wav'
                speaker_id = speaker_path

                try:
                    duration = librosa.audio.get_duration(filename=audio_file)
                except IOError:
                    self._logger.error('File %s not found' % audio_file)
                    continue

                yield {'duration': duration,
                       'input': audio_file,
                       'label': label,
                       'speaker': speaker_id}

    def _report(self, dl):
        report = '''General information:
            Number of utterances: %d
            Total size (in seconds) of utterances: %.f
            Number of speakers: %d''' % (len(dl['input']),
                                         sum(dl['duration']),
                                         len(set(dl['speaker'])))

        return report

--------------------------------------------------------------------------------
/datasets/dataset_generator.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

from keras.preprocessing.image import Iterator
from keras.preprocessing.sequence import pad_sequences

import scipy
import librosa
8 | import h5py 9 | import numpy as np 10 | import codecs 11 | import json 12 | import os 13 | 14 | import time 15 | 16 | from preprocessing import audio, text 17 | from utils import generic_utils as utils 18 | 19 | import logging 20 | 21 | 22 | class DatasetGenerator(object): 23 | """ Dataset generator that handles several forms of input and return an 24 | iterator over it. Only works for a CTC model 25 | 26 | # Arguments 27 | input_parser: instance of Feature [preprocessing.audio.Feature] 28 | feature that is applied to each audio file (or audio data) 29 | label_parser: instance of Parser [preprocessing.text.Parser]. 30 | parser that is applied to each label data 31 | batch_size: number of samples per batch 32 | shuffle: reordering index per epoch. This avoid some bias in training 33 | seed: default None 34 | """ 35 | 36 | def __init__(self, input_parser=None, label_parser=None, batch_size=32, 37 | shuffle=True, seed=None, mode='train'): 38 | self._logger = logging.getLogger('%s.%s' % (__name__, 39 | self.__class__.__name__)) 40 | self.input_parser = input_parser 41 | self.label_parser = label_parser 42 | self.batch_size = batch_size 43 | self.shuffle = shuffle 44 | self.seed = seed 45 | self.mode = mode 46 | 47 | def flow_from_fname(self, fname, datasets=None): 48 | """ Returns an specific iterator given the filename 49 | 50 | # Arguments 51 | datasets: str or list. If str will return one iterator; otherwise 52 | will return len(dataset) iterators for each dataset 53 | 54 | # Inputs 55 | fname: path to a file. 56 | *.h5 (HDF5 format) 57 | *json (JSON format) 58 | 59 | # Outputs 60 | If fname is: 61 | HDF5 format: H5Iterator 62 | JSON format: JSONIterator 63 | """ 64 | out = None 65 | datasets = datasets or ['/'] 66 | if type(datasets) not in (set, list): 67 | datasets = [datasets] 68 | 69 | if h5py.is_hdf5(fname): 70 | h5_f = h5py.File(fname, 'r') 71 | out = [self.flow_from_h5_group(h5_f[dataset]) 72 | for dataset in datasets] 73 | 74 | ext = os.path.splitext(fname)[1] 75 | if ext == '.json': 76 | out = [self.flow_from_json(fname, dataset) for dataset in datasets] 77 | 78 | if out is None: 79 | raise ValueError("Extension not recognized") 80 | 81 | if len(out) == 1: 82 | return out[0] 83 | return out 84 | 85 | def flow_from_json(self, fname, dataset=None): 86 | """ Returns JSONIterator given the filename""" 87 | return JSONIterator( 88 | fname, dataset, batch_size=self.batch_size, 89 | shuffle=self.shuffle, seed=self.seed, 90 | input_parser=self.input_parser, 91 | label_parser=self.label_parser, 92 | mode=self.mode) 93 | 94 | def flow_from_dl(self, dl, dataset=None): 95 | """ Return DictListIterator given a list of dictionaries. 
Each 96 | dictionary must have the keys 'input' and 'label' 97 | """ 98 | return DictListIterator(dl, dataset, batch_size=self.batch_size, 99 | shuffle=self.shuffle, seed=self.seed, 100 | input_parser=self.input_parser, 101 | label_parser=self.label_parser, 102 | mode=self.mode) 103 | 104 | def flow_from_h5_group(self, h5_group=None): 105 | """ Returns H5Iterator given a h5group from a HDF5 data 106 | """ 107 | return H5Iterator(h5_group, batch_size=self.batch_size, 108 | shuffle=self.shuffle, seed=self.seed, 109 | input_parser=self.input_parser, 110 | label_parser=self.label_parser, 111 | mode=self.mode) 112 | 113 | def flow_from_h5_file(self, h5_file, dataset='/'): 114 | h5_f = h5py.File(h5_file, 'r') 115 | return H5Iterator(h5_f[dataset], batch_size=self.batch_size, 116 | shuffle=self.shuffle, seed=self.seed, 117 | input_parser=self.input_parser, 118 | label_parser=self.label_parser, 119 | mode=self.mode) 120 | 121 | def flow(self, inputs, labels): 122 | return DatasetIterator(inputs, labels, batch_size=self.batch_size, 123 | shuffle=self.shuffle, seed=self.seed, 124 | input_parser=self.input_parser, 125 | label_parser=self.label_parser, 126 | mode=self.mode) 127 | 128 | 129 | class DatasetIterator(Iterator): 130 | 131 | def __init__(self, inputs, labels=None, batch_size=32, shuffle=False, 132 | seed=None, input_parser=None, label_parser=None, 133 | standarize=None, mode='train'): 134 | """ DatasetIterator iterates in a batch over a dataset and do some 135 | preprocessing on inputs and labels 136 | 137 | # Arguments 138 | inputs: a list of ndarray 139 | labels: a list of str or ndarray 140 | batch_size: size of each batch 141 | shuffle: if True after each epoch the dataset will shuffle the 142 | indexes 143 | seed: seed the random generator 144 | input_parser: instance of Feature 145 | [preprocessing.audio.Feature] 146 | feature that is applied to each ndarray in batch 147 | label_parser: instance of Parser [preprocessing.text.Parser]. 148 | parser that is applied to each label in batch 149 | standarize: if a set (mean, std), the input will be 150 | normalized 151 | mode: if 'predict', only the inputs is generated 152 | """ 153 | 154 | if labels is not None and len(inputs) != len(labels): 155 | raise ValueError('inputs and labels ' 156 | 'should have the same length. ' 157 | 'Found: len(inputs) = %s, len(labels) = %s' % 158 | (len(inputs), len(labels))) 159 | self._logger = logging.getLogger('%s.%s' % (__name__, 160 | self.__class__.__name__)) 161 | self.inputs = inputs 162 | self.labels = labels 163 | 164 | self.input_parser = input_parser 165 | self.label_parser = label_parser 166 | 167 | self.standarize = standarize 168 | self.mode = mode 169 | 170 | if self.input_parser is not None: 171 | logging.warning('Feature extractor is not None. 
It may slow down' 172 | + ' training') 173 | 174 | super(DatasetIterator, self).__init__(len(inputs), batch_size, 175 | shuffle, seed) 176 | 177 | @property 178 | def len(self): 179 | """ Return the total size of dataset 180 | """ 181 | return len(self.inputs) 182 | 183 | def next(self): 184 | """ Iterates over batches 185 | 186 | # Outputs 187 | Returns a tuple (input, output) that can be fed a CTC model 188 | input: is a list containing the inputs, labels and sequence 189 | length for the current batch 190 | output: is a list containing a vector of zeros (fake data for 191 | the decoder) and the batch labels for the decoder of a CTC 192 | model 193 | """ 194 | 195 | # Copy from DirectoryIterator from keras 196 | with self.lock: 197 | index_array, current_index, current_batch_size = next( 198 | self.index_generator) 199 | 200 | index_array.sort() 201 | 202 | index_array_list = index_array.tolist() 203 | 204 | batch_inputs, batch_inputs_len = self._make_in( 205 | self.inputs[index_array_list], current_batch_size) 206 | 207 | if self.labels is not None: 208 | batch_labels = self._make_out(self.labels[index_array_list], 209 | current_batch_size) 210 | else: 211 | batch_labels = None 212 | 213 | return self._make_in_out(batch_inputs, batch_labels, batch_inputs_len) 214 | 215 | def _make_in_out(self, batch_inputs, batch_labels, batch_inputs_len=None): 216 | # if label is not provided output is not necessary 217 | if batch_labels is None: 218 | return [batch_inputs, batch_inputs_len] 219 | 220 | return ([batch_inputs, batch_labels, batch_inputs_len], 221 | [np.zeros((batch_inputs.shape[0],)), batch_labels]) 222 | 223 | def _make_in(self, inputs, batch_size=None): 224 | if self.input_parser is not None: 225 | inputs = np.asarray([self.input_parser(i) for i in inputs]) 226 | 227 | batch_inputs = pad_sequences(inputs, dtype='float32', padding='post') 228 | 229 | if self.standarize: 230 | mean, std = self.standarize 231 | batch_inputs -= mean 232 | batch_inputs /= (std + self.eps) 233 | 234 | batch_inputs_len = np.asarray([i.shape[0] for i in inputs]) 235 | return batch_inputs, batch_inputs_len 236 | 237 | def _make_out(self, labels, batch_size=None): 238 | if self.labels is None or self.mode == 'predict': 239 | return None 240 | 241 | if self.label_parser is not None: 242 | labels = [self.label_parser(l) for l in labels] 243 | 244 | rows, cols, data = [], [], [] 245 | 246 | for row, label in enumerate(labels): 247 | cols.extend(range(len(label))) 248 | rows.extend(len(label) * [row]) 249 | data.extend(label) 250 | 251 | return scipy.sparse.coo_matrix((data, (rows, cols)), dtype='int32') 252 | 253 | 254 | class H5Iterator(DatasetIterator): 255 | 256 | def __init__(self, h5group, **kwargs): 257 | 258 | inputs = h5group['inputs'] 259 | labels = h5group['labels'] 260 | 261 | if kwargs.get('label_parser') is None: 262 | raise ValueError("label_parser must be set") 263 | 264 | self.num_feats = None 265 | if 'num_feats' in inputs.attrs.keys(): 266 | self.num_feats = inputs.attrs['num_feats'] 267 | 268 | self.durations = h5group['durations'] 269 | 270 | super(H5Iterator, self).__init__(inputs, labels, **kwargs) 271 | 272 | def _make_in(self, inputs, batch_size=None): 273 | 274 | if self.num_feats is not None: 275 | inputs = [i.reshape((-1, self.num_feats)) for i in inputs] 276 | 277 | return super(H5Iterator, self)._make_in(inputs) 278 | 279 | 280 | class JSONIterator(DatasetIterator): 281 | 282 | def __init__(self, fname, dataset=None, **kwargs): 283 | 284 | self._logger = logging.getLogger('%s.%s' % 
(__name__, 285 | self.__class__.__name__)) 286 | 287 | kwargs.setdefault('input_parser', audio.raw) 288 | 289 | if kwargs.get('input_parser') is None: 290 | raise ValueError("input_parser must be set") 291 | 292 | if kwargs.get('label_parser') is None: 293 | raise ValueError("label_parser must be set") 294 | 295 | with codecs.open(fname, 'r', encoding='utf8') as f: 296 | ld = json.load(f) 297 | 298 | data = utils.ld2dl(ld) 299 | 300 | if dataset and 'dataset' not in data: 301 | self._logger.warning('No dataset key found. Falling back to None') 302 | dataset = None 303 | 304 | if dataset: 305 | inputs = np.array([i for i, d in zip( 306 | data['input'], data['dataset']) if d == dataset]) 307 | labels = np.array([l for l, d in zip( 308 | data['label'], data['dataset']) if d == dataset]) 309 | else: 310 | inputs = np.array(data['input']) 311 | labels = np.array(data['label']) 312 | 313 | super(JSONIterator, self).__init__(inputs, labels, **kwargs) 314 | 315 | self.durations = np.array(data['duration']) 316 | 317 | 318 | class DictListIterator(DatasetIterator): 319 | 320 | def __init__(self, dict_list, dataset=None, **kwargs): 321 | 322 | kwargs.setdefault('input_parser', audio.raw) 323 | 324 | if kwargs.get('input_parser') is None: 325 | raise ValueError("input_parser must be set") 326 | 327 | if kwargs.get('label_parser') is None: 328 | raise ValueError("label_parser must be set") 329 | 330 | if dataset: 331 | dict_list = self._get_by_dataset(dict_list, dataset) 332 | 333 | inputs = np.array(dict_list['audio']) 334 | labels = np.array(dict_list['label']) 335 | 336 | super(DictListIterator, self).__init__(inputs, labels, **kwargs) 337 | 338 | self.durations = np.array(dict_list['duration']) 339 | 340 | def _get_by_dataset(self, dl, dataset): 341 | mask = [i for i, d in enumerate(dl['dataset']) if d == dataset] 342 | return {k: np.array(v)[mask] for k, v in dl.iteritems() 343 | if k != 'dataset'} 344 | -------------------------------------------------------------------------------- /datasets/dataset_parser.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import codecs 5 | import json 6 | 7 | import logging 8 | import h5py 9 | 10 | import numpy as np 11 | 12 | from preprocessing import audio, text 13 | from datasets import DT_ABSPATH 14 | from utils.generic_utils import safe_mkdirs, ld2dl 15 | 16 | 17 | class DatasetParser(object): 18 | '''Read data from directory and parser in a proper format 19 | ''' 20 | 21 | def __init__(self, dataset_dir, name=None): 22 | self._logger = logging.getLogger('%s.%s' % (__name__, 23 | self.__class__.__name__)) 24 | self.dataset_dir = dataset_dir 25 | self._name = name 26 | 27 | self.default_output_dir = os.path.join(DT_ABSPATH, self.name) 28 | 29 | @property 30 | def dataset_dir(self): 31 | """Filepath to the dataset directory""" 32 | return self._dataset_dir 33 | 34 | @dataset_dir.setter 35 | def dataset_dir(self, value): 36 | if value is None: 37 | raise ValueError("You must set the variable dataset_dir (the location of dataset) before continue") 38 | 39 | if not os.path.isdir(value): 40 | raise ValueError("Dataset directory provided is not a directory") 41 | self._dataset_dir = value 42 | 43 | def _to_ld(self, label_parser=None): 44 | ''' Transform dataset in a list of dictionary 45 | ''' 46 | data = [] 47 | for d in self._iter(): 48 | if not isinstance(d, dict): 49 | raise TypeError("__loop must return a dict") 50 | 51 | for k in 
['input', 'label', 'duration']:
52 |                 if k not in d:
53 |                     raise KeyError("__loop must return a dict with %s key" % k)
54 |
55 |             if not self._is_valid_label(d['label'], label_parser=label_parser):
56 |                 self._logger.warning(u'File %s has a forbidden label: "%s". Skipping', d['input'], d['label'])
57 |                 continue
58 |
59 |             data.append(d)
60 |         return data
61 |
62 |     def to_json(self, fname=None, override=False):
63 |         ''' Parse the entire dataset to a list of dictionaries containing at
64 |         least three keys:
65 |             `input`: path to audio file
66 |             `duration`: length of the audio
67 |             `label`: transcription of the audio
68 |         '''
69 |         fname = fname or os.path.join(
70 |             self.default_output_dir, 'data.json')
71 |
72 |         if os.path.exists(fname) and override:
73 |             os.remove(fname)
74 |
75 |         if not os.path.isdir(os.path.split(fname)[0]):
76 |             safe_mkdirs(os.path.split(fname)[0])
77 |
78 |         data = self._to_ld()
79 |
80 |         with codecs.open(fname, 'w', encoding='utf8') as f:
81 |             json.dump(data, f)
82 |
83 |         self._logger.info(self._report(ld2dl(data)))
84 |
85 |         return fname
86 |
87 |     def to_h5(self, fname=None, input_parser=audio.raw, label_parser=None,
88 |               split_sets=True, override=False):
89 |         ''' Generates an HDF5 file for the dataset.
90 |         Note that this function will calculate the features rather than store
91 |         the path to the audio file
92 |
93 |         Args
94 |             split_sets: if True and the dataset is split in several sets (e.g.
95 |             train, valid, test) the h5 file will create the corresponding
96 |             datasets; otherwise no dataset is created
97 |         '''
98 |         if not issubclass(input_parser.__class__, audio.Feature):
99 |             raise TypeError("input_parser must be an instance of audio.Feature")
100 |
101 |         fname = fname or os.path.join(self.default_output_dir, 'data.h5')
102 |
103 |         if h5py.is_hdf5(fname) and override:
104 |             os.remove(fname)
105 |
106 |         if not os.path.isdir(os.path.split(fname)[0]):
107 |             safe_mkdirs(os.path.split(fname)[0])
108 |
109 |         feat_name = str(input_parser)
110 |
111 |         data = self._to_ld(label_parser=label_parser)
112 |
113 |         if len(data) == 0:
114 |             raise IndexError("Data is empty")
115 |
116 |         datasets = ['/']
117 |         if 'dataset' in data[0]:
118 |             datasets = list(set([d['dataset'] for d in data]))
119 |
120 |         self._logger.info('Opening %s', fname)
121 |         with h5py.File(fname) as f:
122 |
123 |             # create all datasets
124 |             for dataset in datasets:
125 |
126 |                 group = f['/']
127 |                 if dataset != '/':
128 |                     group = f.create_group(dataset)
129 |
130 |                 inputs = group.create_dataset(
131 |                     'inputs', (0,), maxshape=(None,),
132 |                     dtype=h5py.special_dtype(vlen=np.dtype('float32')))
133 |
134 |                 if input_parser.num_feats:
135 |                     inputs.attrs['num_feats'] = input_parser.num_feats
136 |
137 |                 group.create_dataset(
138 |                     'labels', (0,), maxshape=(None,),
139 |                     dtype=h5py.special_dtype(vlen=unicode))
140 |
141 |                 group.create_dataset(
142 |                     'durations', (0,), maxshape=(None,))
143 |
144 |             for i, d in enumerate(data):
145 |
146 |                 dataset = '/'
147 |                 if dataset not in datasets:
148 |                     dataset = d['dataset']
149 |
150 |                 # HDF5 pointers
151 |                 inputs = f[dataset]['inputs']
152 |                 labels = f[dataset]['labels']
153 |                 durations = f[dataset]['durations']
154 |
155 |                 # Data
156 |                 input_ = input_parser(d['input'])
157 |                 label = d['label']
158 |                 duration = d['duration']
159 |
160 |                 inputs.resize(inputs.shape[0] + 1, axis=0)
161 |                 inputs[inputs.shape[0] - 1] = input_.flatten().astype('float32')
162 |
163 |                 labels.resize(labels.shape[0] + 1, axis=0)
164 |                 labels[labels.shape[0] - 1] = label.encode('utf8')
165 |
166 |                 durations.resize(durations.shape[0] + 1,
axis=0) 167 | durations[durations.shape[0] - 1] = duration 168 | 169 | # Flush to disk only when it reaches 128 samples 170 | if i % 128 == 0: 171 | self._logger.info('%d/%d done.' % (i, len(data))) 172 | f.flush() 173 | 174 | f.flush() 175 | self._logger.info('%d/%d done.' % (len(data), len(data))) 176 | 177 | return fname 178 | 179 | def _iter(self): 180 | raise NotImplementedError("_iter must be implemented") 181 | 182 | def _report(self, dl): 183 | """ 184 | Args 185 | dl: dictionary of list, where the keys were defined in _iter() 186 | """ 187 | raise NotImplementedError("_report must be implemented") 188 | 189 | def _is_valid_label(self, label, label_parser=None): 190 | if len(label) == 0: 191 | return False 192 | 193 | if label_parser is not None: 194 | return label_parser.is_valid(label) 195 | 196 | return True 197 | 198 | @property 199 | def name(self): 200 | return self._name 201 | 202 | def __str__(self): 203 | return self.name 204 | -------------------------------------------------------------------------------- /datasets/dummy.py: -------------------------------------------------------------------------------- 1 | from datasets import DatasetParser 2 | 3 | import os 4 | import re 5 | import librosa 6 | import codecs 7 | import tempfile 8 | 9 | import numpy as np 10 | 11 | 12 | class Dummy(DatasetParser): 13 | """ Fake dataset reader and parser to do some tests 14 | 15 | # Arguments 16 | num_speakers: number of speakers 17 | num_utterances_per_speaker: number of utterances that each speaker will 18 | have 19 | max_duration: max duration in seconds of each fake audio 20 | min_duration: min duration in seconds of each fake audio 21 | max_label_length: max size of each fake label 22 | fs: sampling frequency of each fake audio 23 | split: list with two values. It will divide this dataset in three sets 24 | (train, valid and test) given the proportions 25 | """ 26 | 27 | def __init__(self, dataset_dir=None, num_speakers=10, 28 | num_utterances_per_speaker=10, 29 | max_duration=10.0, min_duration=1.0, max_label_length=50, 30 | fs=16e3, split=None, name='dummy', **kwargs): 31 | ''' 32 | Args: 33 | split: list or nparray of size 2 that splits the data between 34 | train, valid and test. 
example: split = [.8, .15] = 80% train, 15%
35 |             valid and 5% test
36 |         '''
37 |
38 |         super(Dummy, self).__init__(None, name, **kwargs)
39 |
40 |         self.num_speakers = num_speakers
41 |         self.num_utterances_per_speaker = num_utterances_per_speaker
42 |         self.max_duration = max_duration
43 |         self.min_duration = min_duration
44 |         self.fs = fs
45 |         self.max_label_length = max_label_length
46 |         self.split = split
47 |
48 |         if split is not None and (len(split) != 2 or np.sum(split) > 1.):
49 |             raise ValueError('Split must have len = 2 and must sum <= 1')
50 |
51 |     @property
52 |     def dataset_dir(self):
53 |         """Filepath to the dataset directory"""
54 |         return self._dataset_dir
55 |
56 |     @dataset_dir.setter
57 |     def dataset_dir(self, value):
58 |         self._dataset_dir = value
59 |
60 |     def _iter(self):
61 |
62 |         counter = 0
63 |         total = self.num_speakers * self.num_utterances_per_speaker
64 |
65 |         for speaker in range(self.num_speakers):
66 |             for utterance in range(self.num_utterances_per_speaker):
67 |
68 |                 duration = np.random.uniform(low=self.min_duration,
69 |                                              high=self.max_duration)
70 |
71 |                 samples = np.floor(duration * self.fs)
72 |                 audio = np.random.randn(int(samples))
73 |
74 |                 audio_file = tempfile.NamedTemporaryFile(delete=False)
75 |                 audio_fname = audio_file.name
76 |                 audio_file.close()
77 |
78 |                 librosa.output.write_wav(audio_fname, audio, int(self.fs))
79 |
80 |                 label = np.random.randint(
81 |                     low=ord('a'), high=ord('z'),
82 |                     size=(np.random.randint(2, self.max_label_length),))
83 |
84 |                 label = ''.join([chr(l) for l in label])
85 |
86 |                 data = {'duration': duration,
87 |                         'input': audio_fname,
88 |                         'label': label,
89 |                         'speaker': 'speaker_%d' % speaker}
90 |
91 |                 if self.split is not None:
92 |                     if counter < np.floor(self.split[0] * total):
93 |                         dataset = 'train'
94 |                     elif counter < np.floor(np.sum(self.split) * total):
95 |                         dataset = 'valid'
96 |                     else:
97 |                         dataset = 'test'
98 |
99 |                     data['dataset'] = dataset
100 |                 counter += 1
101 |
102 |                 yield data
103 |
104 |     def _report(self, dl):
105 |         report = '''General information
106 |         Number of utterances: %d
107 |         Total size (in seconds) of utterances: %.f
108 |         Number of speakers: %d''' % (len(dl['input']),
109 |                                      sum(dl['duration']),
110 |                                      len(set(dl['speaker'])))
111 |
112 |         return report
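The Dummy parser makes it cheap to smoke-test the full pipeline (parse, write HDF5, iterate batches) without downloading a corpus. A minimal sketch of that round trip; the scratch path and the choice of MFCC features (with default constructor arguments) are illustrative, not something the repo prescribes:

import tempfile

from datasets.dummy import Dummy
from datasets.dataset_generator import DatasetGenerator
from preprocessing import audio, text

# 2 speakers x 3 fake utterances; split = 80% train / 15% valid / 5% test
dummy = Dummy(num_speakers=2, num_utterances_per_speaker=3, split=[.8, .15])
h5_path = tempfile.mktemp(suffix='.h5')  # arbitrary scratch file
dummy.to_h5(fname=h5_path, input_parser=audio.MFCC(),
            label_parser=text.simple_char_parser, override=True)

gen = DatasetGenerator(input_parser=None,  # features already stored in the h5
                       label_parser=text.simple_char_parser, batch_size=2)
train_flow = gen.flow_from_fname(h5_path, datasets='train')
inputs, outputs = train_flow.next()  # CTC-style ([x, y, x_len], [zeros, y])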
-------------------------------------------------------------------------------- /datasets/lapsbm.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 |
9 | class LapsBM(DatasetParser):
10 |     """ Laps benchmark version 1.4 dataset reader and parser
11 |
12 |     More about this dataset: http://www.laps.ufpa.br/falabrasil/downloads.php
13 |     """
14 |
15 |     version = '1.4'
16 |
17 |     # Random separation of LAPSBM1.4 dataset in validation and test if required
18 |     # 5 women, 10 men
19 |     _test_speaker_id = [3, 11, 13, 17, 12,
20 |                         33, 5, 22, 16, 8,
21 |                         4, 0, 20, 10, 9]
22 |
23 |     # 5 women, 15 men
24 |     _valid_speaker_id = [29, 32, 14, 31, 25,
25 |                          23, 19, 26, 6, 2,
26 |                          24, 15, 1, 21, 28,
27 |                          30, 34, 27, 18, 7]
28 |
29 |     def __init__(self, dataset_dir=None, name='lapsbm', split=False, **kwargs):
30 |
31 |         dataset_dir = dataset_dir or 'data/lapsbm'
32 |
33 |         self._split = split
34 |
35 |         super(LapsBM, self).__init__(dataset_dir, name, **kwargs)
36 |
37 |     def _iter(self):
38 |         for speaker_path in os.listdir(self.dataset_dir):
39 |
40 |             root_path = os.path.join(os.path.abspath(self.dataset_dir),
41 |                                      speaker_path)
42 |
43 |             if not os.path.isdir(root_path):
44 |                 continue
45 |
46 |             label_files = [f for f in os.listdir(root_path)
47 |                            if '.txt' in f.lower()]
48 |
49 |             for label_file in label_files:
50 |
51 |                 label = ' '.join(
52 |                     codecs.open(
53 |                         os.path.join(root_path, label_file), 'r',
54 |                         encoding='utf8')
55 |                     .read().strip().split(' ')).lower()
56 |
57 |                 audio_file = os.path.join(root_path,
58 |                                           "%s.wav" % (label_file[:-4]))
59 |                 gender_speaker = speaker_path.split('-')[1]
60 |                 gender = gender_speaker[0].lower()
61 |                 speaker_id = gender_speaker[1:]
62 |
63 |                 try:
64 |                     duration = librosa.get_duration(filename=audio_file)
65 |                 except IOError:
66 |                     self._logger.error('File %s not found' % audio_file)
67 |                     continue
68 |
69 |                 dataset = 'valid'
70 |                 if int(speaker_id) in self._test_speaker_id:
71 |                     dataset = 'test'
72 |
73 |                 data = {'duration': duration,
74 |                         'input': audio_file,
75 |                         'label': label,
76 |                         'gender': gender,
77 |                         'speaker': speaker_id}
78 |
79 |                 if self._split:
80 |                     data['dataset'] = dataset
81 |
82 |                 yield data
83 |
84 |     def _report(self, dl):
85 |         report = '''General information:
86 |         Number of utterances: %d
87 |         Total size (in seconds) of utterances: %.f
88 |         Number of speakers: %d
89 |         %% of female speaker: %.2f%%''' \
90 |             % (len(dl['input']), sum(dl['duration']), len(set(dl['speaker'])),
91 |                100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 |                       (1.0 * len(dl['gender']))))
93 |
94 |         return report
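Because BRSD borrows its validation and test sets from LapsBM's frozen speaker lists above, it is worth sanity-checking the split after downloading the corpus. A quick sketch, assuming the data is already under data/lapsbm:

from collections import Counter

from datasets.lapsbm import LapsBM

laps = LapsBM(dataset_dir='data/lapsbm', split=True)
print(Counter(d['dataset'] for d in laps._iter()))
# Expected: every utterance tagged 'valid' or 'test', never 'train'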
-------------------------------------------------------------------------------- /datasets/sid.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import argparse
6 | import fnmatch
7 | import librosa
8 | import codecs
9 |
10 | import numpy as np
11 |
12 | from shutil import copyfile
13 |
14 | regex = r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<schooling>.*)[\n]+"
15 |
16 |
17 | class Sid(DatasetParser):
18 |     """ Sid dataset reader and parser
19 |     """
20 |
21 |     def __init__(self, dataset_dir=None, name='sid', **kwargs):
22 |
23 |         dataset_dir = dataset_dir or 'data/sid'
24 |
25 |         super(Sid, self).__init__(dataset_dir, name, **kwargs)
26 |
27 |     def _iter(self):
28 |         for speaker_path in os.listdir(self.dataset_dir):
29 |
30 |             root_path = os.path.join(os.path.abspath(self.dataset_dir),
31 |                                      speaker_path)
32 |
33 |             if not os.path.isdir(root_path):
34 |                 continue
35 |
36 |             labels_file = os.path.join(root_path, 'prompts.txt')
37 |
38 |             speaker_info_file = os.path.join(root_path, 'speaker.txt')
39 |
40 |             with open(speaker_info_file) as f:
41 |                 info_text = f.read()
42 |
43 |             pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
44 |
45 |             info = list(re.finditer(pattern, info_text))[0].groupdict()
46 |
47 |             gender = info['gender'][0].lower()
48 |             speaker_id = speaker_path.lower()
49 |
50 |             try:
51 |                 age = int(info['age'])
52 |             except ValueError:
53 |                 self._logger.error('age %s could not be converted to int.',
54 |                                    info['age'])
55 |                 age = 0
56 |
57 |             for line in codecs.open(labels_file, 'r', encoding='utf8'):
58 |
59 |                 split = line.strip().split('=')
60 |                 file_id = int(split[0])
61 |
62 |                 label = split[1].lower()
63 |
64 |                 audio_file = os.path.join(
65 |                     root_path, "%s%03d" % (speaker_path, file_id)) + '.wav'
66 |
67 |                 try:
68 |                     duration = librosa.get_duration(filename=audio_file)
69 |                 except IOError:
70 |                     self._logger.error('File %s not found' % audio_file)
71 |                     continue
72 |
73 |                 yield {'duration': duration,
74 |                        'input': audio_file,
75 |                        'label': label,
76 |                        'gender': gender,
77 |                        'speaker': speaker_id,
78 |                        'age': age}
79 |
80 |     def _report(self, dl):
81 |         args = (len(dl['input']), sum(dl['duration']),
82 |                 len(set(dl['speaker'])),
83 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
84 |                        (1.0 * len(dl['gender']))),
85 |                 min([a for a in dl['age'] if a != 0]),
86 |                 max(dl['age']), np.mean([a for a in dl['age'] if a != 0]))
87 |
88 |         report = '''General information
89 |         Number of utterances: %d
90 |         Total size (in seconds) of utterances: %.f
91 |         Number of speakers: %d
92 |         %% of female speaker: %.2f%%
93 |         age range: from %d to %d. Mean: %.f''' % args
94 |
95 |         return report
96 |
97 |
98 | if __name__ == '__main__':
99 |     """ Script to fix some errors in the sid dataset's folder naming
100 |     convention and in some transcriptions
101 |     """
102 |     parser = argparse.ArgumentParser()
103 |     parser.add_argument('data_directory', type=str,
104 |                         help='Path to data directory')
105 |     parser.add_argument('output_directory', type=str,
106 |                         help='Path to output directory')
107 |     args = parser.parse_args()
108 |
109 |     data_directory = args.data_directory
110 |     output_directory = args.output_directory
111 |
112 |     # fix wav filenames
113 |     matches = []
114 |     for root, dirnames, filenames in os.walk(data_directory):
115 |         for filename in fnmatch.filter(filenames, '*.[Ww][Aa][Vv]'):
116 |             filepath = os.path.join(root, filename)
117 |             number = "%03d" % int(filename[-7:-4])
118 |             prefix = filepath.split(os.path.sep)[-2]
119 |
120 |             new_filename = "%s%s" % (prefix, number) + '.wav'
121 |             new_filepath = os.path.join(output_directory, root, new_filename)
122 |
123 |             if not os.path.exists(os.path.join(output_directory, root)):
124 |                 os.makedirs(os.path.join(output_directory, root))
125 |
126 |             copyfile(filepath, new_filepath)
127 |
128 |     for root, dirnames, filenames in os.walk(data_directory):
129 |         for filename in fnmatch.filter(filenames, '*.[tT][xX][tT]'):
130 |             filepath = os.path.join(root, filename)
131 |
132 |             if filename.lower().startswith('texto'):
133 |                 filename = 'prompts.txt'
134 |
135 |             new_filepath = os.path.join(output_directory,
136 |                                         root, filename.lower())
137 |             copyfile(filepath, new_filepath)
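Both Sid above and VoxForge below recover speaker metadata from loosely formatted description files with a single named-group regex and groupdict(). The idiom in isolation, applied to a made-up speaker file (the sample text and group names here are illustrative):

import re

sample = u"Nome=Maria\nIdade=31\nCidade=Recife\nSexo=Feminino\nEscolaridade=Superior\n"
pattern = re.compile(
    r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+"
    r"Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<schooling>.*)[\n]+",
    re.MULTILINE | re.UNICODE)

# .* does not cross newlines, so the anonymous .*[\n]+ skips exactly one
# unwanted line (here Cidade=...), mirroring what the parsers above do.
info = list(re.finditer(pattern, sample))[0].groupdict()
print(info['gender'][0].lower(), int(info['age']))  # f 31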
-------------------------------------------------------------------------------- /datasets/voxforge.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 | regex = r"User\s+Name\:[\s]*(?P<speaker>.*)[\n]+.*[\n]+Gender\:[\s]*(?P<gender>[a-zA-Z]+)[\w\r\s\n:\/]+Pronunciation dialect\:\s+(?P<dialect>.*)"
9 |
10 |
11 | class VoxForge(DatasetParser):
12 |     """ VoxForge (only Brazilian Portuguese audio files) dataset reader and parser
13 |
14 |     More about the dataset: http://www.voxforge.org/
15 |     """
16 |
17 |     IGNORED_LIST = ['Marcelo-20131106-iqc',
18 |                     'anonymous-20140619-wcy',
19 |                     'ThiagoCastro-20131129-qpn',
20 |                     'anonymous-20131016-uzv']
21 |
22 |     def __init__(self, dataset_dir=None, name='voxforge', **kwargs):
23 |
24 |         dataset_dir = dataset_dir or 'data/voxforge'
25 |
26 |         super(VoxForge, self).__init__(dataset_dir, name, **kwargs)
27 |
28 |         if (self.dataset_dir is not None and
29 |                 os.path.isdir(os.path.join(self.dataset_dir, 'files'))):
30 |
31 |             self.dataset_dir = os.path.join(self.dataset_dir, 'files')
32 |
33 |     def _iter(self):
34 |         for speaker_path in os.listdir(self.dataset_dir):
35 |
36 |             if speaker_path in self.IGNORED_LIST:
37 |                 continue
38 |
39 |             root_path = os.path.join(
40 |                 os.path.abspath(self.dataset_dir), speaker_path)
41 |
42 |             if not os.path.isdir(root_path):
43 |                 continue
44 |
45 |             labels_file = os.path.join(root_path, 'etc', 'PROMPTS')
46 |
47 |             if not os.path.exists(labels_file):
48 |                 labels_file = os.path.join(root_path, 'PROMPTS')
49 |
50 |             speaker_info_file = os.path.join(root_path, 'etc', 'README')
51 |
52 |             if not os.path.exists(speaker_info_file):
53 |                 speaker_info_file = os.path.join(root_path, 'README')
54 |
55 |             with open(speaker_info_file) as f:
56 |                 info_text = f.read()
57 |
58 |             pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
59 |
60 |             info = list(re.finditer(pattern, info_text))[0].groupdict()
61 |
62 |             gender = info['gender'][0].lower()
63 |             speaker_id = info['speaker']
64 |
65 |             for line in codecs.open(labels_file, 'r', encoding='utf8'):
66 |                 split = line.strip().split()
67 |                 file_id = split[0].split('/')[-1]
68 |
69 |                 label = ' '.join(split[1:]).lower()
70 |
71 |                 audio_file = os.path.join(root_path, 'wav', file_id) + '.wav'
72 |
73 |                 if not os.path.exists(audio_file):
74 |                     audio_file = os.path.join(root_path, file_id) + '.wav'
75 |
76 |                 try:
77 |                     duration = librosa.get_duration(filename=audio_file)
78 |                 except IOError:
79 |                     self._logger.error('File %s not found' % audio_file)
80 |                     continue
81 |
82 |                 yield {'duration': duration,
83 |                        'input': audio_file,
84 |                        'label': label,
85 |                        'gender': gender,
86 |                        'speaker': speaker_id}
87 |
88 |     def _report(self, dl):
89 |         args = (len(dl['input']), sum(dl['duration']),
90 |                 len(set(dl['speaker'])),
91 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 |                        (1.0 * len(dl['gender']))),
93 |                 100 * (sum([1 for s in dl['speaker'] if s == 'anonymous']) /
94 |                        (1.0 * len(dl['speaker']))))
95 |
96 |         report = '''General information
97 |         Number of utterances: %d
98 |         Total size (in seconds) of utterances: %.f
99 |         Number of speakers: %d
100 |         %% of female speaker: %.2f%%
101 |         Anonymous speaker: %.2f%%''' % args
102 |
103 |         return report
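A detail shared by eval.py below and predict.py further down: the arguments used at training time are reloaded from the model's meta, and only flags the user explicitly passed on the command line override them (that is what parse_nondefault_args computes). The merge itself, sketched with plain dicts standing in for the real argparse output (the specific values are made up):

from utils.hparams import HParams

training_args = {'batch_size': 32, 'label_parser': 'simple_char_parser'}
nondefault = {'batch_size': 8}  # e.g. the user passed --batch_size 8

args = HParams(**training_args).update(nondefault)
# args.batch_size is now 8 while args.label_parser keeps its trained value,
# exactly the precedence the scripts below rely on.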
-------------------------------------------------------------------------------- /eval.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import codecs
7 | import json
8 | import numpy as np
9 | # Preventing pool_allocator message
10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
11 |
12 | import argparse
13 | import h5py
14 | import inspect
15 |
16 | from preprocessing import audio, text
17 |
18 | from utils import generic_utils as utils
19 | from utils.hparams import HParams
20 |
21 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
22 |
23 | from utils.core_utils import setup_gpu, load_model
24 |
25 | if __name__ == '__main__':
26 |     parser = argparse.ArgumentParser(description='Evaluating an ASR system.')
27 |
28 |     parser.add_argument('--model', required=True, type=str)
29 |     parser.add_argument('--dataset', required=True, type=str)
30 |     parser.add_argument('--subset', type=str, default='test')
31 |
32 |     parser.add_argument('--batch_size', default=32, type=int)
33 |
34 |     # Features generation (if necessary)
35 |     parser.add_argument('--input_parser', type=str, default=None)
36 |     parser.add_argument('--input_parser_params', nargs='+', default=[])
37 |
38 |     # Label generation (if necessary)
39 |     parser.add_argument('--label_parser', type=str,
40 |                         default='simple_char_parser')
41 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
42 |
43 |     # Other configs
44 |     parser.add_argument('--gpu', default='0', type=str)
45 |     parser.add_argument('--allow_growth', default=False, action='store_true')
46 |
47 |     parser.add_argument('--save_transcriptions', default=None, type=str)
48 |
49 |     args = parser.parse_args()
50 |     args_nondefault = utils.parse_nondefault_args(
51 |         args, parser.parse_args(
52 |             ['--model', args.model, '--dataset', args.dataset]))
53 |
54 |     # GPU configuration
55 |     setup_gpu(args.gpu, args.allow_growth)
56 |
57 |     # Loading model
58 |     model, meta = load_model(args.model, return_meta=True, mode='eval')
59 |
60 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
61 |
62 |     # Features extractor
63 |     input_parser = utils.get_from_module('preprocessing.audio',
64 |                                          args.input_parser,
65 |                                          params=args.input_parser_params)
66 |
67 |     # Recovering text parser
68 |     label_parser = utils.get_from_module('preprocessing.text',
69 |                                          args.label_parser,
70 |                                          params=args.label_parser_params)
71 |
72 |     data_gen = DatasetGenerator(input_parser, label_parser,
73 |                                 batch_size=args.batch_size, seed=0)
74 |     test_flow = data_gen.flow_from_fname(args.dataset, datasets=args.subset)
75 |
76 |     metrics = model.evaluate_generator(test_flow, test_flow.len,
77 |                                        max_q_size=10, nb_worker=1)
78 |
79 |     for m, v in zip(model.metrics_names, metrics):
80 |         print('%s: %4f' % (m, v))
81 |
82 |     from keras import backend as K; K.clear_session()
-------------------------------------------------------------------------------- /extras/__init__.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from utils.generic_utils import setup_logging
4 | setup_logging()
-------------------------------------------------------------------------------- /extras/apis.py: --------------------------------------------------------------------------------
1 | import os
2 | import speech_recognition as sr
3 |
4 | r = sr.Recognizer()
5 |
6 |
7 | def recognize_from_api(audio, api, name='API', safe=True, **kwargs):
8 |     if not isinstance(audio, sr.AudioData):
9 |         with sr.AudioFile(audio) as source:
10 |             audio = r.record(source)
11 |     try:
12 |         return api(audio, **kwargs)
13 |     except sr.UnknownValueError as e:
14 |         if not safe:
15 |             raise e
16 |         return "\t%s could not understand audio" % name
17 |     except sr.RequestError as e:
18 |         if not safe:
19 |             raise e
20 |         return ("\tCould not request results from %s "
21 |                 "service; %s" % (name, e))
22 |
23 |
24 | def recognize_google(audio,
25 |                      credentials=os.environ['GOOGLE_CLOUD_API'],
26 |                      **kwargs):
27 |
28 |     return recognize_from_api(audio, r.recognize_google_cloud,
29 |                               name='Google Cloud Speech',
30 |                               credentials_json=credentials,
31 |                               **kwargs)
32 |
33 |
34 | def recognize_bing(audio, key=os.environ['BING_API'], **kwargs):
35 |     return recognize_from_api(audio, r.recognize_bing,
36 |                               name='Microsoft Bing Voice',
37 |                               key=key, **kwargs)
38 |
39 |
40 | def recognize_ibm(audio,
41 |                   username=os.environ['IBM_USERNAME'],
42 |                   password=os.environ['IBM_PASSWORD'], **kwargs):
43 |     return recognize_from_api(audio, r.recognize_ibm,
44 |                               name='IBM Speech to Text',
45 |                               username=username, password=password,
46 |                               **kwargs)
-------------------------------------------------------------------------------- /extras/ctc_viz.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import numpy as np
4 |
5 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
6 |
7 | from utils.core_utils
import setup_gpu, load_model 8 | 9 | from utils.hparams import HParams 10 | from utils import generic_utils as utils 11 | 12 | from preprocessing import audio, text 13 | 14 | import matplotlib 15 | import matplotlib.pyplot as plt 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.') 20 | 21 | parser.add_argument('--model', required=True, type=str) 22 | parser.add_argument('--dataset', default=None, type=str) 23 | parser.add_argument('--files', default=[], type=str, nargs='+') 24 | parser.add_argument('--labels', default=[], nargs='+', type=str) 25 | parser.add_argument('--subset', type=str, default='test') 26 | 27 | # Features generation (if necessary) 28 | parser.add_argument('--input_parser', type=str, default=None) 29 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 30 | 31 | # Label generation (if necessary) 32 | parser.add_argument('--label_parser', type=str, 33 | default='simple_char_parser') 34 | parser.add_argument('--label_parser_params', nargs='+', default=[]) 35 | 36 | # Other configs 37 | parser.add_argument('--gpu', default='0', type=str) 38 | parser.add_argument('--allow_growth', default=False, action='store_true') 39 | 40 | 41 | parser.add_argument('--plt_backend', type=str, default="Qt5Agg") 42 | 43 | parser.add_argument('--save', default=None, type=str) 44 | 45 | args = parser.parse_args() 46 | args_nondefault = utils.parse_nondefault_args( 47 | args, parser.parse_args( 48 | ['--model', args.model, '--dataset', args.dataset])) 49 | 50 | matplotlib.use(args.plt_backend) 51 | 52 | if args.dataset is None and len(args.files) == 0: 53 | raise ValueError('dataset or file args must be set.') 54 | 55 | if args.dataset and args.files: 56 | print('Both dataset and file args was set. 
Ignoring file args.')
57 |
58 |     # GPU configuration
59 |     setup_gpu(args.gpu, args.allow_growth)
60 |
61 |     # Loading model
62 |     model, meta = load_model(args.model, return_meta=True, mode='eval')
63 |
64 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
65 |
66 |     # Features extractor
67 |     input_parser = utils.get_from_module('preprocessing.audio',
68 |                                          args.input_parser,
69 |                                          params=args.input_parser_params)
70 |
71 |     # Recovering text parser
72 |     label_parser = utils.get_from_module('preprocessing.text',
73 |                                          args.label_parser,
74 |                                          params=args.label_parser_params)
75 |
76 |     if args.dataset is not None:
77 |         data_gen = DatasetGenerator(input_parser, label_parser,
78 |                                     batch_size=1, seed=0, mode='predict')
79 |         test_flow = data_gen.flow_from_fname(args.dataset,
80 |                                              datasets=args.subset)
81 |     else:
82 |         if len(args.files) == 0:
83 |             raise ValueError("files arg must be > 0")
84 |
85 |         test_flow = DatasetIterator(np.array(args.files), None,
86 |                                     input_parser=input_parser,
87 |                                     label_parser=label_parser, mode='predict')
88 |         test_flow.labels = np.array([u''] * len(args.files))
89 |
90 |         model = load_model(args.model, mode='predict', decoder=False)
91 |
92 |     results = []
93 |
94 |     plt.figure()
95 |     for index in range(test_flow.len):
96 |         prediction = model.predict(test_flow.next())
97 |
98 |         truth = label_parser._sanitize(test_flow.labels[0])
99 |
100 |         plt.plot(prediction[0, ...])
101 |         plt.show()
102 |
103 |     from keras import backend as K; K.clear_session()
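ctc_viz.py plots the raw per-frame class posteriors but never collapses them into a transcription. For reference, a minimal greedy (best-path) CTC decoder that does; this helper is not part of the repo, and it assumes the blank symbol is the last class, as in TensorFlow's CTC loss:

import itertools

import numpy as np


def greedy_ctc_decode(posteriors, blank=None):
    """ posteriors: (time, num_classes) array, one softmax row per frame.
    Best path: argmax per frame, collapse repeated symbols, drop blanks. """
    blank = posteriors.shape[1] - 1 if blank is None else blank
    best_path = np.argmax(posteriors, axis=1)
    collapsed = (k for k, _ in itertools.groupby(best_path))
    return [k for k in collapsed if k != blank]

# e.g. label_parser.imap(greedy_ctc_decode(prediction[0])) would map the
# surviving indices back to characters.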
-------------------------------------------------------------------------------- /extras/eval_apis.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import argparse
7 | import codecs
8 | import json
9 | import time
10 |
11 | from preprocessing import audio, text
12 | from utils import generic_utils as utils
13 |
14 | import apis
15 | import speech_recognition as sr
16 |
17 | if __name__ == '__main__':
18 |     parser = argparse.ArgumentParser(description='Evaluating an ASR system '
19 |                                      'over an API.')
20 |
21 |     parser.add_argument('--dataset', required=True, type=str)
22 |     parser.add_argument('--language', default='pt-BR', type=str)
23 |     parser.add_argument('--all', action='store_true', help='Evaluate over the '
24 |                         'whole dataset, not only entries whose dt key equals test.')
25 |
26 |     # Label generation (if necessary)
27 |     parser.add_argument('--label_parser', type=str,
28 |                         default='simple_char_parser')
29 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
30 |
31 |     # Other configs
32 |     parser.add_argument('--save_every', default=10, type=int)
33 |     parser.add_argument('--resume', action='store_true')
34 |     parser.add_argument('--save', default=None, type=str)
35 |     parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'],
36 |                         nargs='+')
37 |
38 |     args = parser.parse_args()
39 |
40 |     # If save is not defined, it will use the folder name of dataset location
41 |     save = args.save
42 |     if args.save is None:
43 |         save = '%s_eval_apis.json' % args.dataset.split(os.path.sep)[-2]
44 |
45 |     # Recovering text parser
46 |     label_parser = utils.get_from_module('preprocessing.text',
47 |                                          args.label_parser,
48 |                                          params=args.label_parser_params)
49 |
50 |     if not utils.check_ext(args.dataset, 'json'):
51 |         raise ValueError('dataset must be a json file')
52 |
53 |     dataset = json.load(codecs.open(args.dataset, 'r', encoding='utf8'))
54 |
55 |     if not args.all and 'dt' in dataset[0]:
56 |         dataset = [d for d in dataset if d['dt'] == 'test']
57 |
58 |     apis = {'google': apis.recognize_google,
59 |             'ibm': apis.recognize_ibm,
60 |             'microsoft': apis.recognize_bing}
61 |
62 |     eval_apis = []
63 |     if args.resume:
64 |         with codecs.open(save, 'r', encoding='utf8') as f:
65 |             eval_apis = json.load(f)
66 |
67 |     for i, data in enumerate(dataset):
68 |
69 |         if len(eval_apis) > i:
70 |             result = eval_apis[i]
71 |         else:
72 |             result = {}
73 |             result['label'] = data['label']
74 |             result['audio'] = data['audio']
75 |
76 |             if args.all and 'dt' in data:
77 |                 result['dt'] = data['dt']
78 |
79 |         for api_name in args.apis:
80 |             if api_name in result and result[api_name] != '':
81 |                 continue
82 |             try:
83 |                 result[api_name] = apis[api_name](data['audio'], safe=False,
84 |                                                   language=args.language)
85 |             except Exception as e:
86 |                 result[api_name] = ''
87 |                 print(e)
88 |
89 |         if len(eval_apis) > i:
90 |             eval_apis[i] = result
91 |         else:
92 |             eval_apis.append(result)
93 |
94 |         if ((i + 1) % args.save_every) == 0:
95 |             with codecs.open(save, 'w', encoding='utf8') as f:
96 |                 json.dump(eval_apis, f)
97 |
98 |         print('Done %d/%d' % (i + 1, len(dataset)))
99 |         time.sleep(.1)
100 |
101 |     with codecs.open(save, 'w', encoding='utf8') as f:
102 |         json.dump(eval_apis, f)
-------------------------------------------------------------------------------- /extras/make_dataset.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import argparse
4 |
5 | from utils import generic_utils as utils
6 | from utils.hparams import HParams
7 |
8 | import preprocessing
9 | import datasets
10 |
11 | if __name__ == '__main__':
12 |     parser = argparse.ArgumentParser(description='Generates a preprocessed '
13 |                                      'dataset (hdf5 file) given the path to '
14 |                                      'the dataset and the correct parser.')
15 |
16 |     parser.add_argument('--dataset_dir', type=str, default=None)
17 |     parser.add_argument('--parser', type=str, required=True)
18 |     parser.add_argument('--parser_params', nargs='+', default=[])
19 |
20 |     parser.add_argument('--output_file', type=str, default=None)
21 |
22 |     parser.add_argument('--input_parser', type=str, default=None)
23 |     parser.add_argument('--input_parser_params', nargs='+', default=[])
24 |
25 |     parser.add_argument('--label_parser', type=str,
26 |                         default=None)
27 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
28 |
29 |     parser.add_argument('--override', action='store_true')
30 |
31 |     args = parser.parse_args()
32 |
33 |     parser_cls = utils.get_from_module('datasets*',
34 |                                        args.parser,
35 |                                        regex=True)
36 |
37 |     input_parser = utils.get_from_module('preprocessing.audio',
38 |                                          args.input_parser,
39 |                                          params=args.input_parser_params)
40 |     label_parser = utils.get_from_module('preprocessing.text',
41 |                                          args.label_parser,
42 |                                          params=args.label_parser_params)
43 |
44 |     dataset = parser_cls(args.dataset_dir,
45 |                          **HParams().parse(args.parser_params).values())
46 |
47 |     output_file = dataset.to_h5(fname=args.output_file,
48 |                                 input_parser=input_parser,
49 |                                 label_parser=label_parser,
50 |                                 override=args.override)
51 |
52 |     print('Dataset %s saved at %s' % (dataset.name, output_file))
-------------------------------------------------------------------------------- /extras/print_args.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import
argparse 4 | 5 | from utils.core_utils import load_meta 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Print training arguments') 9 | parser.add_argument('--model', required=True, type=str) 10 | args = parser.parse_args() 11 | 12 | meta = load_meta(args.model) 13 | 14 | for k, v in meta['training_args'].items(): 15 | print('%s: %s' % (k, v)) 16 | -------------------------------------------------------------------------------- /extras/recognizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | # NOTE: this example requires PyAudio because it uses the Microphone class 4 | 5 | import sys 6 | import os 7 | import json 8 | import argparse 9 | import preprocessing 10 | import inspect 11 | import numpy as np 12 | 13 | import speech_recognition as sr 14 | 15 | import utils.generic_utils as utils 16 | 17 | from core.dataset_generator import DatasetIterator 18 | from utils.core_utils import setup_gpu 19 | 20 | import keras.backend as K 21 | from keras.models import Model 22 | from keras.layers import Lambda 23 | 24 | import tensorflow as tf 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('source', type=str, nargs='+', default=['mic']) 30 | parser.add_argument('--language', default='pt-BR', type=str) 31 | 32 | # Custom asr 33 | parser.add_argument('--model', default=None, type=str) 34 | parser.add_argument('--gpu', default='0', type=str) 35 | parser.add_argument('--allow_growth', default=False, action='store_true') 36 | 37 | parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'], nargs='+') 38 | 39 | args = parser.parse_args() 40 | 41 | r = sr.Recognizer() 42 | 43 | audios = [] 44 | if len(args.source) == 1 and args.source[0] == 'mic': 45 | # obtain audio from the microphone 46 | with sr.Microphone() as source: 47 | print("Say something! 
(language %s)" % args.language) 48 | mic_audio = r.listen(source) 49 | 50 | with tempfile.NamedTemporaryFile(delete=False) as f: 51 | f.write(mic_audio.get_wav_data()) 52 | audios.append((f.name, 'microphone')) 53 | else: 54 | for audio_fname in args.source: 55 | with sr.AudioFile(audio_fname) as source: 56 | audios.append((r.record(source), audio_fname)) 57 | # read the entire audio file 58 | 59 | if args.model is not None: 60 | setup_gpu(args.gpu, args.allow_growth) 61 | 62 | model, meta = utils.load_model(args.model, 63 | return_meta=True, 64 | mode='predict') 65 | training_args = meta['training_args'] 66 | 67 | # Features extractor 68 | input_parser = utils.get_from_module('preprocessing.audio', 69 | training_args['feats'], 70 | params=training_args['feats_params']) 71 | 72 | # Recovering text parser 73 | label_parser = utils.get_from_module('preprocessing.text', 74 | training_args['label_parser'], 75 | params=training_args['label_parser_params'] 76 | ) 77 | 78 | data_it = DatasetIterator(np.array([f for a, f in audios]), 79 | label_parser=input_parser, 80 | label_parser=label_parser) 81 | 82 | model_predictions = model.predict_generator( 83 | data_it, val_samples=len(audios)) 84 | 85 | model_predictions = [label_parser.imap(p[:(np.argmax(p == -1) or len(p))]) for p in model_predictions] 86 | 87 | for i, (audio, name) in enumerate(audios): 88 | 89 | print('Recognizing from: %s' % name) 90 | 91 | if 'google' in args.apis: 92 | rec = apis.recognize_google(audio, language=args.language) 93 | print("\tGoogle Cloud Speech:\n\t\t'%s'" % rec) 94 | 95 | if 'microsoft' in args.apis: 96 | rec = apis.recognize_bing(audio, language=args.language) 97 | print("\tMicrosoft Bing:\n\t\t'%s'" % rec) 98 | 99 | if 'ibm' in args.apis: 100 | rec = apis.recognize_ibm(audio, language=args.language) 101 | print("\tIBM Speech to Text:\n\t\t'%s'" % rec) 102 | 103 | if args.model is not None: 104 | print("\tTrained model:\n\t\t'%s'" % model_predictions[i]) 105 | -------------------------------------------------------------------------------- /extras/results2xlsx.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import argparse 5 | import yaml 6 | import numpy as np 7 | 8 | import openpyxl 9 | from openpyxl import Workbook 10 | 11 | from utils.core_utils import load_meta 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser() 15 | 16 | parser.add_argument('--folder', default='results', type=str) 17 | parser.add_argument('--del_empty_dir', action='store_true') 18 | 19 | args = parser.parse_args() 20 | 21 | metas = {} 22 | 23 | for subdir, dirs, files in os.walk(args.folder): 24 | if len(dirs): 25 | continue 26 | 27 | if not len(files): 28 | if args.del_empty_dir: 29 | print('deleting folder %s' % subdir) 30 | os.rmdir(os.path.abspath(subdir)) 31 | 32 | if 'model.h5' not in files: 33 | print('model.h5 not found in %s' % subdir) 34 | continue 35 | 36 | try: 37 | meta = load_meta(os.path.join(subdir, 'model.h5')) 38 | metas[subdir.split(os.sep)[-1]] = meta 39 | except KeyError: 40 | print('meta not found in %s' % os.path.join(subdir, 'model.h5')) 41 | 42 | training_args = list(set([arg for model in metas for arg in 43 | metas[model]['training_args']])) 44 | 45 | datasets = {} 46 | for model in metas: 47 | args = metas[model]['training_args'] 48 | meta = metas[model] 49 | 50 | try: 51 | key = args['dataset'] 52 | if type(key) in (list, set): 53 | key = key[0] 54 | key = 
-------------------------------------------------------------------------------- /extras/results2xlsx.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import os
4 | import argparse
5 | import yaml
6 | import numpy as np
7 |
8 | import openpyxl
9 | from openpyxl import Workbook
10 |
11 | from utils.core_utils import load_meta
12 |
13 | if __name__ == "__main__":
14 |     parser = argparse.ArgumentParser()
15 |
16 |     parser.add_argument('--folder', default='results', type=str)
17 |     parser.add_argument('--del_empty_dir', action='store_true')
18 |
19 |     args = parser.parse_args()
20 |
21 |     metas = {}
22 |
23 |     for subdir, dirs, files in os.walk(args.folder):
24 |         if len(dirs):
25 |             continue
26 |
27 |         if not len(files):
28 |             if args.del_empty_dir:
29 |                 print('deleting folder %s' % subdir)
30 |                 os.rmdir(os.path.abspath(subdir))
31 |             continue
32 |
33 |         if 'model.h5' not in files:
34 |             print('model.h5 not found in %s' % subdir)
35 |             continue
36 |
37 |         try:
38 |             meta = load_meta(os.path.join(subdir, 'model.h5'))
39 |             metas[subdir.split(os.sep)[-1]] = meta
40 |         except KeyError:
41 |             print('meta not found in %s' % os.path.join(subdir, 'model.h5'))
42 |
43 |     training_args = list(set([arg for model in metas for arg in
44 |                               metas[model]['training_args']]))
45 |
46 |     datasets = {}
47 |     for model in metas:
48 |         model_args = metas[model]['training_args']
49 |         meta = metas[model]
50 |
51 |         try:
52 |             key = model_args['dataset']
53 |             if type(key) in (list, set):
54 |                 key = key[0]
55 |             key = key.split(os.sep)[-2]
56 |         except KeyError:
57 |             key = 'unknown'
58 |
59 |         if key not in datasets:
60 |             datasets[key] = {}
61 |
62 |         datasets[key][model] = meta
63 |
64 |     wb = Workbook()
65 |
66 |     columns = ['path'] + ['epoch', 'best_val_ler'] + training_args
67 |
68 |     for name in datasets:
69 |         ws = wb.create_sheet(name)
70 |
71 |         cell_range = ws['A1':'%s1'
72 |                         % openpyxl.utils.get_column_letter(len(columns))][0]
73 |
74 |         for i, cell in zip(range(len(cell_range)), cell_range):
75 |             cell.value = columns[i]
76 |
77 |         for row, (model, meta) in enumerate(datasets[name].items(), start=2):
78 |
79 |             ws['A%d' % row] = model
80 |             for key in ('epoch', 'epochs'):
81 |                 if key in meta:
82 |                     ws['B%d' % row] = meta[key][np.argmin(meta['val_decoder_ler'])]
83 |                     break
84 |             ws['C%d' % row] = np.min(meta['val_decoder_ler'])
85 |
86 |             for arg, val in meta['training_args'].items():
87 |                 col = openpyxl.utils.get_column_letter(
88 |                     training_args.index(arg) + 4)
89 |
90 |                 if type(val) in (list, set):
91 |                     val = ', '.join(val)
92 |
93 |                 ws['%s%d' % (col, row)] = val
94 |
95 |     wb.save('results.xlsx')
-------------------------------------------------------------------------------- /imgs/best_ler.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.jpg
-------------------------------------------------------------------------------- /imgs/best_ler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.pdf
-------------------------------------------------------------------------------- /imgs/best_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.jpg
-------------------------------------------------------------------------------- /imgs/best_loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.pdf
-------------------------------------------------------------------------------- /logging.yaml: --------------------------------------------------------------------------------
1 | version: 1
2 | disable_existing_loggers: False
3 | formatters:
4 |   complete:
5 |     format: "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
6 |     datefmt: "%m-%d %H:%M"
7 |   simple:
8 |     format: "%(name)-12s: %(levelname)-8s %(message)s"
9 | handlers:
10 |   console:
11 |     class: logging.StreamHandler
12 |     level: WARNING
13 |     formatter: simple
14 |     stream: ext://sys.stdout
15 |   file_handler:
16 |     class: logging.handlers.RotatingFileHandler
17 |     level: INFO
18 |     formatter: complete
19 |     filename: info.log
20 |     maxBytes: 10485760 # 10MB
21 |     backupCount: 20
22 |     encoding: utf8
23 | root:
24 |   level: INFO
25 |   handlers: [console, file_handler]
-------------------------------------------------------------------------------- /msc.yaml: --------------------------------------------------------------------------------
1 | name: msc
2 | channels:
3 | - !!python/unicode
4 |   'defaults'
5 | dependencies:
6 | - !!python/unicode
7 |   'certifi=2016.2.28=py27_0'
8 | - !!python/unicode
9 |   'cycler=0.10.0=py27_0'
10 | - !!python/unicode
11 |   'freetype=2.5.5=2'
12 | - !!python/unicode
13 | 'functools32=3.2.3.2=py27_0' 14 | - !!python/unicode 15 | 'h5py=2.7.0=np113py27_0' 16 | - !!python/unicode 17 | 'hdf5=1.8.17=2' 18 | - !!python/unicode 19 | 'icu=54.1=0' 20 | - !!python/unicode 21 | 'jbig=2.1=0' 22 | - !!python/unicode 23 | 'jpeg=9b=0' 24 | - !!python/unicode 25 | 'libpng=1.6.30=1' 26 | - !!python/unicode 27 | 'libtiff=4.0.6=3' 28 | - !!python/unicode 29 | 'matplotlib=2.0.2=np113py27_0' 30 | - !!python/unicode 31 | 'mkl=2017.0.3=0' 32 | - !!python/unicode 33 | 'numpy=1.13.1=py27_0' 34 | - !!python/unicode 35 | 'olefile=0.44=py27_0' 36 | - !!python/unicode 37 | 'openssl=1.0.2l=0' 38 | - !!python/unicode 39 | 'pillow=4.2.1=py27_0' 40 | - !!python/unicode 41 | 'pip=9.0.1=py27_1' 42 | - !!python/unicode 43 | 'pyparsing=2.2.0=py27_0' 44 | - !!python/unicode 45 | 'pyqt=5.6.0=py27_2' 46 | - !!python/unicode 47 | 'python=2.7.13=0' 48 | - !!python/unicode 49 | 'python-dateutil=2.6.1=py27_0' 50 | - !!python/unicode 51 | 'pytz=2017.2=py27_0' 52 | - !!python/unicode 53 | 'pyyaml=3.12=py27_0' 54 | - !!python/unicode 55 | 'qt=5.6.2=2' 56 | - !!python/unicode 57 | 'readline=6.2=2' 58 | - !!python/unicode 59 | 'scipy=0.19.1=np113py27_0' 60 | - !!python/unicode 61 | 'setuptools=36.4.0=py27_1' 62 | - !!python/unicode 63 | 'sip=4.18=py27_0' 64 | - !!python/unicode 65 | 'six=1.10.0=py27_0' 66 | - !!python/unicode 67 | 'sqlite=3.13.0=0' 68 | - !!python/unicode 69 | 'subprocess32=3.2.7=py27_0' 70 | - !!python/unicode 71 | 'tk=8.5.18=0' 72 | - !!python/unicode 73 | 'wheel=0.29.0=py27_0' 74 | - !!python/unicode 75 | 'xz=5.2.3=0' 76 | - !!python/unicode 77 | 'yaml=0.1.6=0' 78 | - !!python/unicode 79 | 'zlib=1.2.11=0' 80 | - pip: 81 | - audioread==2.1.5 82 | - backports.weakref==1.0.post1 83 | - bleach==1.5.0 84 | - decorator==4.1.2 85 | - enum34==1.1.6 86 | - funcsigs==1.0.2 87 | - html5lib==0.9999999 88 | - joblib==0.11 89 | - keras==1.2.2 90 | - librosa==0.5.1 91 | - llvmlite==0.20.0 92 | - markdown==2.6.9 93 | - mock==2.0.0 94 | - numba==0.35.0 95 | - pbr==3.1.1 96 | - protobuf==3.4.0 97 | - resampy==0.2.0 98 | - scikit-learn==0.19.0 99 | - singledispatch==3.4.0.3 100 | - tensorflow==1.3.0 101 | - tensorflow-tensorboard==0.1.8 102 | - theano==0.9.0 103 | - unidecode==0.4.21 104 | - werkzeug==0.12.2 105 | prefix: !!python/unicode '/Users/igormq/miniconda2/envs/msc' 106 | 107 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import h5py 4 | import os 5 | import numpy as np 6 | import codecs 7 | 8 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator 9 | 10 | from utils.core_utils import setup_gpu, load_model 11 | 12 | from utils.hparams import HParams 13 | from utils import generic_utils as utils 14 | 15 | from preprocessing import audio, text 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.') 20 | 21 | parser.add_argument('--model', required=True, type=str) 22 | parser.add_argument('--dataset', default=None, type=str) 23 | parser.add_argument('--file', default=None, type=str) 24 | parser.add_argument('--subset', type=str, default='test') 25 | 26 | # Features generation (if necessary) 27 | parser.add_argument('--input_parser', type=str, default=None) 28 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 29 | 30 | # Label generation (if necessary) 31 | parser.add_argument('--label_parser', type=str, 32 | 
default='simple_char_parser')
33 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
34 |     parser.add_argument('--no_decoder', action='store_true', default=False)
35 | 
36 |     # Other configs
37 |     parser.add_argument('--gpu', default='0', type=str)
38 |     parser.add_argument('--allow_growth', default=False, action='store_true')
39 | 
40 |     parser.add_argument('--save', default=None, type=str)
41 |     parser.add_argument('--override', default=False, action='store_true')
42 | 
43 |     args = parser.parse_args()
44 |     args_nondefault = utils.parse_nondefault_args(
45 |         args, parser.parse_args(
46 |             ['--model', args.model, '--dataset', args.dataset]))
47 | 
48 |     if args.dataset is None and args.file is None:
49 |         raise ValueError('dataset or file args must be set.')
50 | 
51 |     if args.dataset and args.file:
52 |         print('Both dataset and file args were set. Ignoring the file arg.')
53 | 
54 |     # GPU configuration
55 |     setup_gpu(args.gpu, args.allow_growth)
56 | 
57 |     # Loading model
58 |     model, meta = load_model(args.model, return_meta=True,
59 |                              mode='predict', decoder=(not args.no_decoder))
60 | 
61 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
62 | 
63 |     # Features extractor
64 |     input_parser = utils.get_from_module('preprocessing.audio',
65 |                                          args.input_parser,
66 |                                          params=args.input_parser_params)
67 | 
68 |     # Recovering text parser
69 |     label_parser = utils.get_from_module('preprocessing.text',
70 |                                          args.label_parser,
71 |                                          params=args.label_parser_params)
72 | 
73 |     if args.dataset is not None:
74 |         data_gen = DatasetGenerator(input_parser, label_parser,
75 |                                     batch_size=1, seed=0, mode='predict',
76 |                                     shuffle=False)
77 |         test_flow = data_gen.flow_from_fname(args.dataset,
78 |                                              datasets=args.subset)
79 |     else:
80 |         test_flow = DatasetIterator(np.array([args.file]), None,
81 |                                     input_parser=input_parser,
82 |                                     label_parser=label_parser, mode='predict',
83 |                                     shuffle=False)
84 |         test_flow.labels = np.array([u''])
85 | 
86 |     results = []
87 |     for index in range(test_flow.len):
88 |         prediction = model.predict(test_flow.next())
89 |         if not args.no_decoder:
90 |             prediction = label_parser.imap(prediction[0])
91 |         results.append({'input': test_flow.inputs[0].tolist(), 'label': test_flow.labels[0], 'prediction': prediction})
92 |         print('Ground Truth: %s' % (label_parser._sanitize(test_flow.labels[0])))
93 |         print(' Predicted: %s\n\n' % prediction)
94 | 
95 |     if args.no_decoder and args.save is None:
96 |         raise ValueError('save param must be set if no_decoder is True')
97 | 
98 |     if args.save is not None:
99 |         if os.path.exists(args.save):
100 |             if not args.override:
101 |                 raise IOError('File %s already exists. Set --override to replace it.' % args.save)
102 |             os.remove(args.save)
103 | 
104 |         if args.no_decoder:
105 |             with h5py.File(args.save) as f:
106 |                 predictions = f.create_dataset(
107 |                     'predictions', (0,), maxshape=(None,),
108 |                     dtype=h5py.special_dtype(vlen=np.dtype('float32')))
109 |                 predictions.attrs['num_labels'] = results[0]['prediction'].shape[-1]
110 | 
111 |                 labels = f.create_dataset(
112 |                     'labels', (0,), maxshape=(None,),
113 |                     dtype=h5py.special_dtype(vlen=unicode))
114 | 
115 |                 inputs = f.create_dataset(
116 |                     'inputs', (0,), maxshape=(None,),
117 |                     dtype=h5py.special_dtype(vlen=unicode))
118 | 
119 |                 for index, result in enumerate(results):
120 | 
121 |                     label = result['label']
122 |                     prediction = result['prediction']
123 |                     input_ = result['input']
124 | 
125 |                     inputs.resize(inputs.shape[0] + 1, axis=0)
126 |                     inputs[inputs.shape[0] - 1] = input_
127 | 
128 |                     labels.resize(labels.shape[0] + 1, axis=0)
129 |                     labels[labels.shape[0] - 1] = label.encode('utf8')
130 | 
131 |                     predictions.resize(predictions.shape[0] + 1, axis=0)
132 |                     predictions[predictions.shape[0] - 1] = prediction.flatten().astype('float32')
133 | 
134 |                     # Flush to disk every 128 samples
135 |                     if index % 128 == 0:
136 |                         print('%d/%d done.' % (index, len(results)))
137 |                         f.flush()
138 | 
139 |                 f.flush()
140 |                 print('%d/%d done.' % (len(results), len(results)))
141 |         else:
142 |             with codecs.open(args.save, 'w', encoding='utf8') as f:
143 |                 json.dump(results, f)
144 | 
145 |     from keras import backend as K
146 |     K.clear_session()
147 | 
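The --no_decoder branch above appends one flattened float32 vector per utterance to resizable HDF5 datasets. A minimal sketch of reading such a file back (the path 'preds.h5' is hypothetical; each stored row was flattened from a (timesteps, num_labels) array, and num_labels is recovered from the dataset attribute written above):

    from __future__ import print_function

    import h5py
    import numpy as np

    # Sketch: read a file written by predict.py --no_decoder --save preds.h5
    with h5py.File('preds.h5', 'r') as f:
        num_labels = f['predictions'].attrs['num_labels']
        for flat, label in zip(f['predictions'], f['labels']):
            # undo the flatten done at save time: back to (timesteps, num_labels)
            probs = np.asarray(flat, dtype='float32').reshape(-1, num_labels)
            print(label, probs.shape)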
--------------------------------------------------------------------------------
/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .audio import MFCC, FBank, LogFbank, Raw
3 | from .text import CharParser, simple_char_parser, complex_char_parser
4 | 
--------------------------------------------------------------------------------
/preprocessing/audio.py:
--------------------------------------------------------------------------------
1 | ''' Code partially copied from python_speech_features package
2 | '''
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | 
7 | from . import audio_utils as sigproc
8 | 
9 | import os
10 | import numpy as np
11 | import logging
12 | 
13 | from scipy import signal
14 | from scipy.fftpack import dct
15 | import librosa
16 | 
17 | 
18 | class Feature(object):
19 |     """ Base class for feature calculation.
20 |     All child classes must implement the __str__ and _call functions.
21 | 
22 |     # Arguments
23 |         fs: sampling frequency of the audio signal. If the audio does not
24 |             have this fs, it will be resampled
25 |         eps: small constant added to denominators to avoid division by zero
26 |     """
27 | 
28 |     def __init__(self, fs=16e3, eps=1e-8, stride=1, num_context=0,
29 |                  mean_norm=True, var_norm=True):
30 |         self.fs = fs
31 |         self.eps = eps
32 | 
33 |         self.mean_norm = mean_norm
34 |         self.var_norm = var_norm
35 | 
36 |         self.stride = stride
37 |         self.num_context = num_context
38 |         self._logger = logging.getLogger('%s.%s' % (__name__,
39 |                                          self.__class__.__name__))
40 | 
41 |     def __call__(self, audio):
42 |         """ Loads the audio and applies the feature transformation
43 | 
44 |         # Inputs
45 |             audio:
46 |                 if audio is a string and the file exists, the wave file will
47 |                 be loaded and resampled (if necessary) to fs
48 |                 if audio is a ndarray or list and is not empty, it will make
49 |                 the transformation without any resampling
50 | 
51 |         # Exception
52 |             TypeError if the audio type is not recognized
53 | 
54 |         """
55 |         if ((isinstance(audio, str) or isinstance(audio, unicode))
56 |                 and os.path.isfile(audio)):
57 |             audio, current_fs = librosa.load(audio, sr=None)
58 |             audio = librosa.core.resample(audio, current_fs, self.fs)
59 |             feats = self._call(audio)
60 |         elif type(audio) in (np.ndarray, list) and len(audio) > 1:
61 |             feats = self._call(audio)
62 |         else:
63 |             raise TypeError("audio type is not supported")
64 | 
65 |         return self._standarize(self._postprocessing(feats))
66 | 
67 |     def _call(self, data):
68 |         raise NotImplementedError("_call must be overridden")
69 | 
70 |     def _standarize(self, feats):
71 |         if self.mean_norm:
72 |             feats -= np.mean(feats, axis=0, keepdims=True)
73 |         if self.var_norm:
74 |             feats /= (np.std(feats, axis=0, keepdims=True) + self.eps)
75 |         return feats
76 | 
77 |     def _postprocessing(self, feats):
78 |         # Code adapted from
79 |         # https://github.com/mozilla/DeepSpeech/blob/master/util/audio.py
80 | 
81 |         # We only keep every second feature
(BiRNN stride = 2) 82 | feats = feats[::self.stride] 83 | 84 | if self.num_context == 0: 85 | return feats 86 | num_feats = feats.shape[1] 87 | 88 | train_inputs = np.array([], np.float32) 89 | train_inputs.resize((feats.shape[0], 90 | num_feats + 2*num_feats*self.num_context)) 91 | 92 | # Prepare pre-fix post fix context 93 | # (TODO: Fill empty_mfcc with MCFF of silence) 94 | empty_mfcc = np.array([]) 95 | empty_mfcc.resize((num_feats)) 96 | 97 | # Prepare train_inputs with past and future contexts 98 | time_slices = range(train_inputs.shape[0]) 99 | context_past_min = time_slices[0] + self.num_context 100 | context_future_max = time_slices[-1] - self.num_context 101 | for time_slice in time_slices: 102 | # Reminder: array[start:stop:step] 103 | # slices from indice |start| up to |stop| (not included), every 104 | # |step| 105 | # Pick up to self.num_context time slices in the past, and complete 106 | # with empty 107 | # mfcc features 108 | need_empty_past = max(0, (context_past_min - time_slice)) 109 | empty_source_past = list(empty_mfcc for empty_slots 110 | in range(need_empty_past)) 111 | data_source_past = feats[max(0, time_slice - 112 | self.num_context):time_slice] 113 | assert(len(empty_source_past) + 114 | len(data_source_past) == self.num_context) 115 | 116 | # Pick up to self.num_context time slices in the future, and 117 | # complete with empty 118 | # mfcc features 119 | need_empty_future = max(0, (time_slice - context_future_max)) 120 | empty_source_future = list(empty_mfcc 121 | for empty_slots in 122 | range(need_empty_future)) 123 | data_source_future = feats[time_slice + 1:time_slice + 124 | self.num_context + 1] 125 | 126 | assert(len(empty_source_future) + 127 | len(data_source_future) == self.num_context) 128 | 129 | if need_empty_past: 130 | past = np.concatenate((empty_source_past, data_source_past)) 131 | else: 132 | past = data_source_past 133 | 134 | if need_empty_future: 135 | future = np.concatenate((data_source_future, 136 | empty_source_future)) 137 | else: 138 | future = data_source_future 139 | 140 | past = np.reshape(past, self.num_context*num_feats) 141 | now = feats[time_slice] 142 | future = np.reshape(future, self.num_context*num_feats) 143 | 144 | train_inputs[time_slice] = np.concatenate((past, now, future)) 145 | assert(len(train_inputs[time_slice]) 146 | == num_feats + 2*num_feats*self.num_context) 147 | 148 | self._num_feats = num_feats + 2*num_feats*self.num_context 149 | 150 | return train_inputs 151 | 152 | def __str__(self): 153 | raise NotImplementedError("__str__ must be overrided") 154 | 155 | @property 156 | def num_feats(self): 157 | return self._num_feats 158 | 159 | 160 | class FBank(Feature): 161 | """Compute Mel-filterbank energy features from an audio signal. 162 | 163 | # Arguments 164 | win_len: the length of the analysis window in seconds. 165 | Default is 0.025s (25 milliseconds) 166 | win_step: the step between successive windows in seconds. 167 | Default is 0.01s (10 milliseconds) 168 | num_filt: the number of filters in the filterbank, default 40. 169 | nfft: the FFT size. Default is 512. 170 | low_freq: lowest band edge of mel filters in Hz. 171 | Default is 20. 172 | high_freq: highest band edge of mel filters in Hz. 173 | Default is 7800 174 | pre_emph: apply preemphasis filter with preemph as coefficient. 175 | 0 is no filter. Default is 0.97. 176 | win_func: the analysis window to apply to each frame. 177 | By default hamming window is applied. 
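
    # Example (illustrative)
        With the defaults (fs=16e3, win_len=0.025, win_step=0.01,
        num_filt=40), one second of 16 kHz audio is framed into
        400-sample windows every 160 samples and mapped to a
        (99, 40) matrix of filterbank energies.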
178 | """ 179 | 180 | def __init__(self, win_len=0.025, win_step=0.01, 181 | num_filt=40, nfft=512, low_freq=20, high_freq=7800, 182 | pre_emph=0.97, win_fun=signal.hamming, **kwargs): 183 | 184 | super(FBank, self).__init__(**kwargs) 185 | 186 | if high_freq > self.fs / 2: 187 | raise ValueError("high_freq must be less or equal than fs/2") 188 | 189 | self.win_len = win_len 190 | self.win_step = win_step 191 | self.num_filt = num_filt 192 | self.nfft = nfft 193 | self.low_freq = low_freq 194 | self.high_freq = high_freq or self.fs / 2 195 | self.pre_emph = pre_emph 196 | self.win_fun = win_fun 197 | self._filterbanks = self._get_filterbanks() 198 | 199 | self._num_feats = self.num_filt 200 | 201 | @property 202 | def mel_points(self): 203 | return np.linspace(self._low_mel, self._high_mel, self.num_filt + 2) 204 | 205 | @property 206 | def low_freq(self): 207 | return self._low_freq 208 | 209 | @low_freq.setter 210 | def low_freq(self, value): 211 | self._low_mel = self._hz2mel(value) 212 | self._low_freq = value 213 | 214 | @property 215 | def high_freq(self): 216 | return self._high_freq 217 | 218 | @high_freq.setter 219 | def high_freq(self, value): 220 | self._high_mel = self._hz2mel(value) 221 | self._high_freq = value 222 | 223 | def _call(self, signal): 224 | """Compute Mel-filterbank energy features from an audio signal. 225 | :param signal: the audio signal from which to compute features. Should 226 | be an N*1 array 227 | 228 | Returns: 229 | 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) 230 | containing features. Each row holds 1 feature vector. The 231 | second return value is the energy in each frame (total energy, 232 | unwindowed) 233 | """ 234 | 235 | signal = sigproc.preemphasis(signal, self.pre_emph) 236 | 237 | frames = sigproc.framesig(signal, 238 | self.win_len * self.fs, 239 | self.win_step * self.fs, 240 | self.win_fun) 241 | 242 | pspec = sigproc.powspec(frames, self.nfft) 243 | # this stores the total energy in each frame 244 | energy = np.sum(pspec, 1) 245 | # if energy is zero, we get problems with log 246 | energy = np.where(energy == 0, np.finfo(float).eps, energy) 247 | 248 | # compute the filterbank energies 249 | feat = np.dot(pspec, self._filterbanks.T) 250 | # if feat is zero, we get problems with log 251 | feat = np.where(feat == 0, np.finfo(float).eps, feat) 252 | 253 | return feat, energy 254 | 255 | def _get_filterbanks(self): 256 | """Compute a Mel-filterbank. The filters are stored in the rows, the 257 | columns correspond 258 | to fft bins. The filters are returned as an array of size nfilt * 259 | (nfft / 2 + 1) 260 | 261 | Returns: 262 | A numpy array of size num_filt * (nfft/2 + 1) containing 263 | filterbank. Each row holds 1 filter. 264 | """ 265 | 266 | # our points are in Hz, but we use fft bins, so we have to convert 267 | # from Hz to fft bin number 268 | bin = np.floor((self.nfft + 1) * self._mel2hz(self.mel_points) / 269 | self.fs) 270 | 271 | fbank = np.zeros([self.num_filt, int(self.nfft / 2 + 1)]) 272 | for j in xrange(0, self.num_filt): 273 | for i in xrange(int(bin[j]), int(bin[j + 1])): 274 | fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j]) 275 | for i in xrange(int(bin[j + 1]), int(bin[j + 2])): 276 | fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1]) 277 | return fbank 278 | 279 | def _hz2mel(self, hz): 280 | """Convert a value in Hertz to Mels 281 | 282 | Args: 283 | hz: a value in Hz. This can also be a numpy array, conversion 284 | proceeds element-wise. 285 | 286 | Returns: 287 | A value in Mels. 
If an array was passed in, an identical sized 288 | array is returned. 289 | """ 290 | return 2595 * np.log10(1 + hz / 700.0) 291 | 292 | def _mel2hz(self, mel): 293 | """Convert a value in Mels to Hertz 294 | 295 | Args: 296 | mel: a value in Mels. This can also be a numpy array, conversion 297 | proceeds element-wise. 298 | 299 | Returns: 300 | A value in Hertz. If an array was passed in, an identical sized 301 | array is returned. 302 | """ 303 | return 700 * (10**(mel / 2595.0) - 1) 304 | 305 | def __str__(self): 306 | return "fbank" 307 | 308 | 309 | class MFCC(FBank): 310 | """Compute MFCC features from an audio signal. 311 | 312 | # Arguments 313 | num_cep: the number of cepstrum to return. Default 13. 314 | cep_lifter: apply a lifter to final cepstral coefficients. 0 is 315 | no lifter. Default is 22. 316 | append_energy: if this is true, the zeroth cepstral coefficient 317 | is replaced with the log of the total frame energy. 318 | d: if True add deltas coeficients. Default True 319 | dd: if True add delta-deltas coeficients. Default True 320 | norm: if 'cmn' performs the cepstral mean normalization. elif 'cmvn' 321 | performs the cepstral mean and variance normalizastion. Default 'cmn' 322 | """ 323 | 324 | def __init__(self, num_cep=13, cep_lifter=22, append_energy=True, 325 | d=True, dd=True, **kwargs): 326 | 327 | super(MFCC, self).__init__(**kwargs) 328 | 329 | self.num_cep = num_cep 330 | self.cep_lifter = cep_lifter 331 | self.append_energy = append_energy 332 | self.d = d 333 | self.dd = dd 334 | self._num_feats = (1 + self.d + self.dd) * self.num_cep 335 | 336 | self._logger = logging.getLogger('%s.%s' % (__name__, 337 | self.__class__.__name__)) 338 | 339 | def _call(self, signal): 340 | """Compute MFCC features from an audio signal. 341 | 342 | Args: 343 | signal: the audio signal from which to compute features. Should be 344 | an N*1 array 345 | 346 | Returns: 347 | A numpy array of size (NUMFRAMES by numcep) containing features. 348 | Each row holds 1 feature vector. 349 | """ 350 | feat, energy = super(MFCC, self)._call(signal) 351 | 352 | feat = np.log(feat) 353 | feat = dct(feat, type=2, axis=1, norm='ortho')[:, :self.num_cep] 354 | feat = self._lifter(feat, self.cep_lifter) 355 | 356 | if self.append_energy: 357 | # replace first cepstral coefficient with log of frame energy 358 | feat[:, 0] = np.log(energy + self.eps) 359 | 360 | if self.d: 361 | d = sigproc.delta(feat, 2) 362 | feat = np.hstack([feat, d]) 363 | 364 | if self.dd: 365 | feat = np.hstack([feat, sigproc.delta(d, 2)]) 366 | 367 | return feat 368 | 369 | def _lifter(self, cepstra, L=22): 370 | """Apply a cepstral lifter the the matrix of cepstra. 371 | 372 | This has the effect of increasing the magnitude of the high frequency 373 | DCT coeffs. 374 | 375 | Args: 376 | cepstra: the matrix of mel-cepstra, will be numframes * numcep in 377 | size. 378 | L: the liftering coefficient to use. Default is 22. L <= 0 disables 379 | lifter. 380 | """ 381 | if L > 0: 382 | nframes, ncoeff = np.shape(cepstra) 383 | n = np.arange(ncoeff) 384 | lift = 1 + (L / 2) * np.sin(np.pi * n / L) 385 | return lift * cepstra 386 | else: 387 | # values of L <= 0, do nothing 388 | return cepstra 389 | 390 | def __str__(self): 391 | return "mfcc" 392 | 393 | 394 | class LogFbank(FBank): 395 | """Compute Mel-filterbank energy features from an audio signal. 396 | 397 | # Arguments 398 | append_energy: if this is true, log of the total frame energy is 399 | append to the features vector. 
Default False 400 | d: if True add deltas coeficients. Default False 401 | dd: if True add delta-deltas coeficients. Default False 402 | """ 403 | 404 | def __init__(self, d=False, dd=False, append_energy=False, **kwargs): 405 | """Constructor 406 | """ 407 | 408 | super(LogFbank, self).__init__(**kwargs) 409 | 410 | self.d = d 411 | self.dd = dd 412 | self.append_energy = append_energy 413 | self._num_feats = ((1 + self.d + self.dd) 414 | * (self.num_filt + self.append_energy)) 415 | 416 | self._logger = logging.getLogger('%s.%s' % (__name__, 417 | self.__class__.__name__)) 418 | 419 | def _call(self, signal): 420 | """Compute log Mel-filterbank energy features from an audio signal. 421 | :param signal: the audio signal from which to compute features. Should 422 | be an N*1 array 423 | 424 | Returns: 425 | A numpy array of size (NUMFRAMES by nfilt) containing features. 426 | Each row holds 1 feature vector. 427 | """ 428 | feat, energy = super(LogFbank, self)._call(signal) 429 | 430 | feat = np.log(feat) 431 | 432 | if self.append_energy: 433 | feat = np.hstack([feat, np.log(energy + self.eps)[:, np.newaxis]]) 434 | 435 | if self.d: 436 | d = sigproc.delta(feat, 2) 437 | feat = np.hstack([feat, d]) 438 | 439 | if self.dd: 440 | feat = np.hstack([feat, sigproc.delta(d, 2)]) 441 | 442 | return feat 443 | 444 | def __str__(self): 445 | return "logfbank" 446 | 447 | 448 | class Raw(Feature): 449 | """ Raw features extractor 450 | """ 451 | def __init__(self, **kwargs): 452 | super(Raw, self).__init__(**kwargs) 453 | self._num_feats = None 454 | 455 | def _call(self, x): 456 | return x 457 | 458 | def _postprocessing(self, x): 459 | return x 460 | 461 | def __str__(self): 462 | return "raw" 463 | 464 | 465 | raw = Raw() 466 | -------------------------------------------------------------------------------- /preprocessing/audio_utils.py: -------------------------------------------------------------------------------- 1 | """ Code based on package python_speech_features 2 | 3 | Author: James Lyons 2012 4 | """ 5 | import decimal 6 | 7 | import numpy 8 | import math 9 | 10 | 11 | def round_half_up(number): 12 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), 13 | rounding=decimal.ROUND_HALF_UP 14 | )) 15 | 16 | 17 | def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): 18 | """Frame a signal into overlapping frames. 19 | :param sig: the audio signal to frame. 20 | :param frame_len: length of each frame measured in samples. 21 | :param frame_step: number of samples after the start of the previous frame 22 | that the next frame should begin. 23 | :param winfunc: the analysis window to apply to each frame. By default no 24 | window is applied. 25 | :returns: an array of frames. Size is NUMFRAMES by frame_len. 
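
    Illustrative example: a 16000-sample signal framed with
    frame_len=400 and frame_step=160 is zero-padded and sliced into
    1 + ceil((16000 - 400) / 160) = 99 frames of 400 samples each.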
26 | """ 27 | slen = len(sig) 28 | frame_len = int(round_half_up(frame_len)) 29 | frame_step = int(round_half_up(frame_step)) 30 | if slen <= frame_len: 31 | numframes = 1 32 | else: 33 | numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) 34 | 35 | padlen = int((numframes - 1) * frame_step + frame_len) 36 | 37 | zeros = numpy.zeros((padlen - slen,)) 38 | padsignal = numpy.concatenate((sig, zeros)) 39 | 40 | indices = numpy.tile( 41 | numpy.arange( 42 | 0, frame_len), 43 | (numframes, 1)) + numpy.tile( 44 | numpy.arange( 45 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T 46 | 47 | indices = numpy.array(indices, dtype=numpy.int32) 48 | frames = padsignal[indices] 49 | win = numpy.tile(winfunc(frame_len), (numframes, 1)) 50 | return frames * win 51 | 52 | 53 | def deframesig(frames, siglen, frame_len, frame_step, 54 | winfunc=lambda x: numpy.ones((x,))): 55 | """Does overlap-add procedure to undo the action of framesig. 56 | :param frames: the array of frames. 57 | :param siglen: the length of the desired signal, use 0 if unknown. Output 58 | will be truncated to siglen samples. 59 | :param frame_len: length of each frame measured in samples. 60 | :param frame_step: number of samples after the start of the previous frame 61 | that the next frame should begin. 62 | :param winfunc: the analysis window to apply to each frame. By default no 63 | window is applied. 64 | :returns: a 1-D signal. 65 | """ 66 | frame_len = round_half_up(frame_len) 67 | frame_step = round_half_up(frame_step) 68 | numframes = numpy.shape(frames)[0] 69 | assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong\ 70 | size, 2nd dim is not equal to frame_len' 71 | 72 | indices = numpy.tile( 73 | numpy.arange( 74 | 0, frame_len), (numframes, 1)) + numpy.tile( 75 | numpy.arange( 76 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T 77 | 78 | indices = numpy.array(indices, dtype=numpy.int32) 79 | padlen = (numframes - 1) * frame_step + frame_len 80 | 81 | if siglen <= 0: 82 | siglen = padlen 83 | 84 | rec_signal = numpy.zeros((padlen,)) 85 | window_correction = numpy.zeros((padlen,)) 86 | win = winfunc(frame_len) 87 | 88 | for i in range(0, numframes): 89 | # add a little bit so it is never zero 90 | window_correction[indices[i, :]] = window_correction[indices[i, :]] + \ 91 | win + 1e-15 92 | rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] 93 | 94 | rec_signal = rec_signal / window_correction 95 | return rec_signal[0:siglen] 96 | 97 | 98 | def magspec(frames, NFFT): 99 | """Compute the magnitude spectrum of each frame in frames. If frames is an 100 | NxD matrix, output will be NxNFFT. 101 | :param frames: the array of frames. Each row is a frame. 102 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 103 | zero-padded. 104 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 105 | be the magnitude spectrum of the corresponding frame. 106 | """ 107 | complex_spec = numpy.fft.rfft(frames, NFFT) 108 | return numpy.absolute(complex_spec) 109 | 110 | 111 | def powspec(frames, NFFT): 112 | """Compute the power spectrum of each frame in frames. If frames is an NxD 113 | matrix, output will be NxNFFT. 114 | :param frames: the array of frames. Each row is a frame. 115 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 116 | zero-padded. 117 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 118 | be the power spectrum of the corresponding frame. 
119 | """ 120 | return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) 121 | 122 | 123 | def logpowspec(frames, NFFT, norm=1): 124 | """Compute the log power spectrum of each frame in frames. If frames is an 125 | NxD matrix, output will be NxNFFT. 126 | :param frames: the array of frames. Each row is a frame. 127 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 128 | zero-padded. 129 | :param norm: If norm=1, the log power spectrum is normalised so that the 130 | max value (across all frames) is 1. 131 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 132 | be the log power spectrum of the corresponding frame. 133 | """ 134 | ps = powspec(frames, NFFT) 135 | ps[ps <= 1e-30] = 1e-30 136 | lps = 10 * numpy.log10(ps) 137 | if norm: 138 | return lps - numpy.max(lps) 139 | else: 140 | return lps 141 | 142 | 143 | def preemphasis(signal, coeff=0.95): 144 | """perform preemphasis on the input signal. 145 | 146 | :param signal: The signal to filter. 147 | :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. 148 | :returns: the filtered signal. 149 | """ 150 | return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) 151 | 152 | 153 | def delta(feat, N): 154 | """Compute delta features from a feature vector sequence. 155 | 156 | :param feat: A numpy array of size (NUMFRAMES by number of features) 157 | containing features. Each row holds 1 feature vector. 158 | :param N: For each frame, calculate delta features based on preceding and 159 | following N frames 160 | :returns: A numpy array of size (NUMFRAMES by number of features) 161 | containing delta features. Each row holds 1 delta feature vector. 162 | """ 163 | NUMFRAMES = len(feat) 164 | feat = numpy.concatenate(([feat[0] for i in range(N)], feat, [feat[-1] for 165 | i in 166 | range(N)])) 167 | denom = sum([2 * i * i for i in range(1, N + 1)]) 168 | dfeat = [] 169 | for j in range(NUMFRAMES): 170 | dfeat.append(numpy.sum([n * feat[N + j + n] 171 | for n in range(-1 * N, N + 1)], axis=0) / 172 | denom) 173 | return dfeat 174 | -------------------------------------------------------------------------------- /preprocessing/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import string 5 | from unidecode import unidecode 6 | import logging 7 | import numpy as np 8 | 9 | PUNCTUATIONS = "'""-,.!?:;" 10 | ACCENTS = u'ãõçâêôáíóúàüóé' 11 | 12 | 13 | class BaseParser(object): 14 | """ Interface class for all parsers 15 | """ 16 | 17 | def __init__(self): 18 | self._logger = logging.getLogger('%s.%s' % (__name__, 19 | self.__class__.__name__)) 20 | 21 | def __call__(self, _input): 22 | return self.map(_input) 23 | 24 | def map(self, _input): 25 | pass 26 | 27 | def imap(self, _input): 28 | pass 29 | 30 | def is_valid(self, _input): 31 | pass 32 | 33 | 34 | class CharParser(BaseParser): 35 | """ Class responsible to map any text in a certain character vocabulary 36 | 37 | # Arguments 38 | mode: Which type of vacabulary will be generated. 
Modes can be 39 | concatenated by using pipeline '|' 40 | 'space' or 's': accepts space character 41 | 'accents' or 'a': accepts pt-br accents 42 | 'punctuation' or 'p': accepts punctuation defined in 43 | string.punctuation 44 | 'digits': accepts all digits 45 | 'sensitive' or 'S': characters will be case sensitive 46 | 'all': shortcut that enables all modes 47 | """ 48 | 49 | def __init__(self, mode='space'): 50 | self._permitted_modes = {'sensitive': 'S', 'space': 's', 'accents': 51 | 'a', 'punctuation': 'p', 'digits': 'd'} 52 | 53 | if mode == 'all': 54 | self.mode = self._permitted_modes.values() 55 | else: 56 | self.mode = [] 57 | for m in mode.split('|'): 58 | try: 59 | self.mode.append(self._permitted_modes[m]) 60 | except KeyError: 61 | if m not in self._permitted_modes.values(): 62 | raise ValueError('Unknown mode %s' % m) 63 | 64 | self.mode.append(m) 65 | 66 | self._vocab, self._inv_vocab = self._gen_vocab() 67 | 68 | def map(self, txt, sanitize=True): 69 | if sanitize: 70 | label = np.array([self._vocab[c] for c in self._sanitize(txt)], 71 | dtype='int32') 72 | else: 73 | label = np.array([self._vocab[c] for c in txt], dtype='int32') 74 | 75 | return label 76 | 77 | def imap(self, labels): 78 | txt = ''.join([self._inv_vocab[l] for l in labels]) 79 | 80 | return txt 81 | 82 | def _sanitize(self, text): 83 | # removing duplicated spaces 84 | text = ' '.join(text.split()) 85 | 86 | if not('d' in self.mode): 87 | text = ''.join([c for c in text if not c.isdigit()]) 88 | 89 | if not('a' in self.mode): 90 | text = unidecode(text) 91 | 92 | if not('p' in self.mode): 93 | text = text.translate( 94 | string.maketrans("-'", ' ')).translate(None, 95 | string.punctuation) 96 | 97 | if not ('s' in self.mode): 98 | text = text.replace(' ', '') 99 | 100 | if not('S' in self.mode): 101 | text = text.lower() 102 | 103 | return text 104 | 105 | def is_valid(self, text): 106 | # verify if the text is valid without sanitization 107 | try: 108 | _ = self.map(text, sanitize=False) 109 | return True 110 | except KeyError: 111 | return False 112 | 113 | def _gen_vocab(self): 114 | 115 | vocab = {chr(value + ord('a')): (value) 116 | for value in xrange(ord('z') - ord('a') + 1)} 117 | 118 | if 'a' in self.mode: 119 | for a in ACCENTS: 120 | vocab[a] = len(vocab) 121 | 122 | if 'S' in self.mode: 123 | for char in vocab.keys(): 124 | vocab[char.upper()] = len(vocab) 125 | 126 | if 's' in self.mode: 127 | # Inserts space label 128 | vocab[' '] = len(vocab) 129 | 130 | if 'p' in self.mode: 131 | for p in PUNCTUATIONS: 132 | vocab[p] = len(vocab) 133 | 134 | if 'd' in self.mode: 135 | for num in range(10): 136 | vocab[str(num)] = len(vocab) 137 | 138 | inv_vocab = {v: k for (k, v) in vocab.iteritems()} 139 | 140 | # Add blank label 141 | inv_vocab[len(inv_vocab)] = '' 142 | 143 | return vocab, inv_vocab 144 | 145 | 146 | simple_char_parser = CharParser() 147 | complex_char_parser = CharParser(mode='s|p|a|d') 148 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: osx-64 4 | certifi=2016.2.28=py27_0 5 | cycler=0.10.0=py27_0 6 | freetype=2.5.5=2 7 | functools32=3.2.3.2=py27_0 8 | h5py=2.7.0=np113py27_0 9 | hdf5=1.8.17=2 10 | icu=54.1=0 11 | jbig=2.1=0 12 | jpeg=9b=0 13 | libpng=1.6.30=1 14 | libtiff=4.0.6=3 15 | matplotlib=2.0.2=np113py27_0 16 | mkl=2017.0.3=0 17 | 
numpy=1.13.1=py27_0 18 | olefile=0.44=py27_0 19 | openssl=1.0.2l=0 20 | pillow=4.2.1=py27_0 21 | pip=9.0.1=py27_1 22 | pyparsing=2.2.0=py27_0 23 | pyqt=5.6.0=py27_2 24 | python=2.7.13=0 25 | python-dateutil=2.6.1=py27_0 26 | pytz=2017.2=py27_0 27 | pyyaml=3.12=py27_0 28 | qt=5.6.2=2 29 | readline=6.2=2 30 | scipy=0.19.1=np113py27_0 31 | setuptools=36.4.0=py27_1 32 | sip=4.18=py27_0 33 | six=1.10.0=py27_0 34 | sqlite=3.13.0=0 35 | subprocess32=3.2.7=py27_0 36 | tk=8.5.18=0 37 | wheel=0.29.0=py27_0 38 | xz=5.2.3=0 39 | yaml=0.1.6=0 40 | zlib=1.2.11=0 41 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | # Preventing pool_allocator message 7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 8 | 9 | import argparse 10 | import uuid 11 | import sys 12 | import json 13 | import datetime 14 | import inspect 15 | import codecs 16 | 17 | import logging 18 | try: 19 | import warpctc_tensorflow 20 | except ImportError: 21 | logging.warning('warpctc binding for tensorflow not found. :(') 22 | import tensorflow as tf 23 | 24 | import keras 25 | 26 | import keras.backend as K 27 | from keras.optimizers import SGD, Adam 28 | from keras.callbacks import ReduceLROnPlateau 29 | 30 | from core import metrics 31 | from core.ctc_utils import ctc_dummy_loss, decoder_dummy_loss 32 | from core.callbacks import MetaCheckpoint, ProgbarLogger 33 | from utils.core_utils import setup_gpu 34 | 35 | from preprocessing import audio, text 36 | 37 | from datasets.dataset_generator import DatasetGenerator 38 | from utils.hparams import HParams 39 | 40 | import utils.generic_utils as utils 41 | 42 | from utils.core_utils import load_model 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser(description='Training an ASR system.') 47 | 48 | # Resume training 49 | parser.add_argument('--load', default=None, type=str) 50 | 51 | # Model settings 52 | parser.add_argument('--model', default='brsmv1', type=str) 53 | parser.add_argument('--model_params', nargs='+', default=[]) 54 | 55 | # Hyper parameters 56 | parser.add_argument('--num_epochs', default=100, type=int) 57 | parser.add_argument('--lr', default=0.001, type=float) 58 | parser.add_argument('--momentum', default=0.9, type=float) 59 | parser.add_argument('--clipnorm', default=400, type=float) 60 | parser.add_argument('--batch_size', default=32, type=int) 61 | parser.add_argument('--opt', default='adam', type=str, 62 | choices=['sgd', 'adam']) 63 | # End of hyper parameters 64 | 65 | # Dataset definitions 66 | parser.add_argument('--dataset', default=None, type=str, nargs='+') 67 | 68 | # Features generation (if necessary) 69 | parser.add_argument('--input_parser', type=str, default=None) 70 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 71 | 72 | # Label generation (if necessary) 73 | parser.add_argument('--label_parser', type=str, 74 | default='simple_char_parser') 75 | parser.add_argument('--label_parser_params', nargs='+', default=[]) 76 | 77 | # Callbacks 78 | parser.add_argument('--lr_schedule', default=None) 79 | parser.add_argument('--lr_params', nargs='+', default=[]) 80 | 81 | # Other configs 82 | parser.add_argument('--save', default=None, type=str) 83 | parser.add_argument('--gpu', default='0', type=str) 84 | parser.add_argument('--allow_growth', 
default=False, action='store_true') 85 | parser.add_argument('--verbose', default=0, type=int) 86 | parser.add_argument('--seed', default=None, type=float) 87 | 88 | args = parser.parse_args() 89 | 90 | # Setup logging 91 | utils.setup_logging() 92 | logger = logging.getLogger(__name__) 93 | tf.logging.set_verbosity(tf.logging.ERROR) 94 | 95 | # hack in ProgbarLogger: avoid logger.infoing the dummy losses 96 | keras.callbacks.ProgbarLogger = lambda: ProgbarLogger( 97 | show_metrics=['loss', 'decoder_ler', 'val_loss', 'val_decoder_ler']) 98 | 99 | # GPU configuration 100 | setup_gpu(args.gpu, args.allow_growth, 101 | log_device_placement=args.verbose > 1) 102 | 103 | # Initial configuration 104 | epoch_offset = 0 105 | meta = None 106 | 107 | if args.load: 108 | args_nondefault = utils.parse_nondefault_args(args, 109 | parser.parse_args([])) 110 | 111 | logger.info('Loading model...') 112 | model, meta = load_model(args.load, return_meta=True) 113 | 114 | logger.info('Loading parameters...') 115 | args = HParams(**meta['training_args']).update(vars(args_nondefault)) 116 | 117 | epoch_offset = len(meta['epochs']) 118 | logger.info('Current epoch: %d' % epoch_offset) 119 | 120 | if args_nondefault.lr: 121 | logger.info('Setting current learning rate to %f...' % args.lr) 122 | K.set_value(model.optimizer.lr, args.lr) 123 | 124 | else: 125 | logger.info('Creating model...') 126 | # Recovering all valid models 127 | model_fn = utils.get_from_module('core.models', args.model) 128 | # Loading model 129 | model = model_fn(**(HParams().parse(args.model_params).values())) 130 | 131 | logger.info('Setting the optimizer...') 132 | # Optimization 133 | if args.opt.strip().lower() == 'sgd': 134 | opt = SGD(lr=args.lr, momentum=args.momentum, 135 | clipnorm=args.clipnorm) 136 | elif args.opt.strip().lower() == 'adam': 137 | opt = Adam(lr=args.lr, clipnorm=args.clipnorm) 138 | 139 | # Compile with dummy loss 140 | model.compile(loss={'ctc': ctc_dummy_loss, 141 | 'decoder': decoder_dummy_loss}, 142 | optimizer=opt, metrics={'decoder': metrics.ler}, 143 | loss_weights=[1, 0]) 144 | 145 | logger.info('Creating results folder...') 146 | # Creating the results folder 147 | output_dir = args.save 148 | if output_dir is None: 149 | output_dir = os.path.join('results', 150 | '%s_%s' % (args.model, 151 | datetime.datetime.now())) 152 | if not os.path.isdir(output_dir): 153 | os.makedirs(output_dir) 154 | 155 | logger.info('Adding callbacks') 156 | # Callbacks 157 | model_ckpt = MetaCheckpoint(os.path.join(output_dir, 'model.h5'), 158 | training_args=args, meta=meta) 159 | best_ckpt = MetaCheckpoint( 160 | os.path.join(output_dir, 'best.h5'), monitor='val_decoder_ler', 161 | save_best_only=True, mode='min', training_args=args, meta=meta) 162 | callback_list = [model_ckpt, best_ckpt] 163 | 164 | # LR schedules 165 | if args.lr_schedule: 166 | lr_schedule_fn = utils.get_from_module('keras.callbacks', 167 | args.lr_schedule) 168 | if lr_schedule_fn: 169 | lr_schedule = lr_schedule_fn(**HParams().parse(args.lr_params).values()) 170 | callback_list.append(lr_schedule) 171 | else: 172 | raise ValueError('Learning rate schedule unrecognized') 173 | 174 | logger.info('Getting the feature extractor...') 175 | # Features extractor 176 | input_parser = utils.get_from_module('preprocessing.audio', 177 | args.input_parser, 178 | params=args.input_parser_params) 179 | 180 | logger.info('Getting the text parser...') 181 | # Recovering text parser 182 | label_parser = utils.get_from_module('preprocessing.text', 183 | 
args.label_parser, 184 | params=args.label_parser_params) 185 | 186 | logger.info('Getting the data generator...') 187 | # Data generator 188 | data_gen = DatasetGenerator(input_parser, label_parser, 189 | batch_size=args.batch_size, 190 | seed=args.seed) 191 | # iterators over datasets 192 | train_flow, valid_flow, test_flow = None, None, None 193 | num_val_samples = num_test_samples = 0 194 | 195 | logger.info('Generating flow...') 196 | if len(args.dataset) == 1: 197 | train_flow, valid_flow, test_flow = data_gen.flow_from_fname( 198 | args.dataset[0], datasets=['train', 'valid', 'test']) 199 | num_val_samples = valid_flow.len 200 | else: 201 | train_flow = data_gen.flow_from_fname(args.dataset[0]) 202 | valid_flow = data_gen.flow_from_fname(args.dataset[1]) 203 | 204 | num_val_samples = valid_flow.len 205 | if len(args.dataset) == 3: 206 | test_flow = data_gen.flow_from_fname(args.dataset[2]) 207 | num_test_samples = test_flow.len 208 | 209 | logger.info(str(vars(args))) 210 | print(str(vars(args))) 211 | logger.info('Initialzing training...') 212 | # Fit the model 213 | model.fit_generator(train_flow, samples_per_epoch=train_flow.len, 214 | nb_epoch=args.num_epochs, validation_data=valid_flow, 215 | nb_val_samples=num_val_samples, max_q_size=10, 216 | nb_worker=1, callbacks=callback_list, verbose=1, 217 | initial_epoch=epoch_offset) 218 | 219 | if test_flow: 220 | del model 221 | model = load_model(os.path.join(output_dir, 'best.h5'), mode='eval') 222 | logger.info('Evaluating best model on test set') 223 | metrics = model.evaluate_generator(test_flow, test_flow.len, 224 | max_q_size=10, nb_worker=1) 225 | 226 | msg = 'Total loss: %.4f\n\ 227 | CTC Loss: %.4f\nLER: %.2f%%' % (metrics[0], metrics[1], metrics[3]*100) 228 | logger.info(msg) 229 | 230 | with open(os.path.join(output_dir, 'results.txt'), 'w') as f: 231 | f.write(msg) 232 | 233 | print(msg) 234 | 235 | K.clear_session() 236 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | from .hparams import HParams 4 | -------------------------------------------------------------------------------- /utils/core_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import h5py 6 | import yaml 7 | 8 | import keras 9 | import keras.backend as K 10 | from keras.models import Model 11 | from keras.layers import Lambda 12 | import tensorflow as tf 13 | 14 | import core 15 | from core import layers_utils 16 | from core import ctc_utils 17 | from core import metrics 18 | 19 | from utils.generic_utils import inspect_module 20 | 21 | 22 | def setup_gpu(gpu, allow_growth=False, log_device_placement=False): 23 | # Choosing gpu 24 | if gpu == '-1': 25 | config = tf.ConfigProto(device_count={'GPU': 0}, 26 | log_device_placement=log_device_placement) 27 | else: 28 | if gpu == 'all': 29 | gpu = '' 30 | config = tf.ConfigProto(log_device_placement=log_device_placement) 31 | config.gpu_options.visible_device_list = gpu 32 | if allow_growth: # dynamic gpu memory allocation 33 | config.gpu_options.allow_growth = True 34 | session = tf.Session(config=config) 35 | K.set_session(session) 36 | 37 | 38 | def get_custom_objects(): 39 | """ Verify all custom object that may be used to load a 
keras model 40 | """ 41 | all_custom_objects = [] 42 | for module in ['core.layers', 'core.layers_utils', 43 | 'core.metrics', 'core.ctc_utils', 44 | 'core.initializers']: 45 | all_custom_objects.extend(inspect_module(module, to_dict=False)) 46 | 47 | return dict(all_custom_objects) 48 | 49 | def load_model(model_fname, return_meta=False, mode='train', **kwargs): 50 | """ Loading keras model with custom objects 51 | 52 | Args 53 | mode: 54 | if 'train', model will follow the definition in core.models 55 | if 'predict', beamsearch decoder will be used and the model return 56 | a np array with -1 filled in no data area 57 | if 'eval', greedy decoder will be replaced by beam search decoder 58 | of predictions 59 | """ 60 | if mode not in ('train', 'predict', 'eval'): 61 | raise ValueError('mode must be one of (train, predict, eval)') 62 | 63 | model = keras.models.load_model(model_fname, 64 | custom_objects=get_custom_objects()) 65 | 66 | # Define the new decoder and the to_dense layer 67 | if kwargs.get('decoder', True): 68 | dec = Lambda(ctc_utils.decode, 69 | output_shape=ctc_utils.decode_output_shape, 70 | arguments={'is_greedy': kwargs.get('is_greedy', False), 71 | 'beam_width': kwargs.get('beam_width', 400)}, 72 | name='beam_search') 73 | else: 74 | dec = Lambda(lambda x: x[0]) 75 | 76 | if mode == 'predict': 77 | y_pred = (model.get_layer('y_pred') or 78 | model.get_layer('decoder').input[0]) 79 | 80 | input_ = model.get_layer('inputs').input 81 | inputs_length = model.get_layer('inputs_length').input 82 | 83 | to_dense_layer = Lambda( 84 | layers_utils.to_dense, 85 | output_shape=layers_utils.to_dense_output_shape, 86 | name="to_dense") 87 | 88 | y_pred = dec([y_pred, inputs_length]) 89 | 90 | y_pred = to_dense_layer(y_pred) 91 | 92 | model = Model(input=[input_, inputs_length], 93 | output=[y_pred]) 94 | elif mode == 'eval': 95 | dec_layer = model.get_layer('decoder') 96 | 97 | y_pred_bs = dec(dec_layer.input) 98 | 99 | model = Model(input=model.inputs, output=[model.outputs[0], y_pred_bs]) 100 | 101 | # Freezing layers 102 | for l in model.layers: 103 | l.trainable = False 104 | 105 | model.compile('sgd', 106 | loss={'ctc': ctc_utils.ctc_dummy_loss, 107 | 'beam_search': ctc_utils.decoder_dummy_loss}, 108 | metrics={'beam_search': metrics.ler}, 109 | loss_weights=[1, 0]) 110 | 111 | if return_meta: 112 | meta = load_meta(model_fname) 113 | return model, meta 114 | 115 | return model 116 | 117 | 118 | def load_meta(model_fname): 119 | ''' Load meta configuration 120 | ''' 121 | meta = {} 122 | 123 | with h5py.File(model_fname, 'r') as f: 124 | meta_group = f['meta'] 125 | 126 | meta['training_args'] = yaml.load( 127 | meta_group.attrs['training_args']) 128 | for k in meta_group.keys(): 129 | meta[k] = list(meta_group[k]) 130 | 131 | return meta 132 | -------------------------------------------------------------------------------- /utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import h5py 6 | import sys 7 | import os 8 | 9 | import logging 10 | import logging.config 11 | import yaml 12 | 13 | import numpy as np 14 | from scipy import sparse 15 | 16 | import inspect 17 | import yaml 18 | 19 | from .hparams import HParams 20 | 21 | import re 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def safe_mkdirs(path): 27 | ''' Safe makedirs 28 | Directory is created with command `makedir -p`. 
29 | Returns: 30 | `path` if the directory already exists or is created 31 | Exception: 32 | OSError if something is wrong 33 | ''' 34 | try: 35 | os.makedirs(path) 36 | except OSError, e: 37 | if e.errno != 17: # 17 = file exists 38 | raise 39 | 40 | return path 41 | 42 | 43 | def get_from_module(module, name, params=None, regex=False): 44 | """ Get a class or method from a module given its name 45 | """ 46 | members = inspect_module(module, regex=regex) 47 | 48 | if name is None or name.lower() == 'none': 49 | return None 50 | 51 | members = {k.lower().strip(): v for k, v in members.items()} 52 | 53 | try: 54 | member = members[name.lower().strip()] 55 | # is a class and must be instantiate if params is not none 56 | if (member and params is not None) and inspect.isclass(member): 57 | return member(**HParams().parse(params).values()) 58 | 59 | return member 60 | except KeyError, e: 61 | raise KeyError("%s not found in %s.\n Valid values are: %s" % 62 | (name, module, ', '.join(members.keys()))) 63 | 64 | 65 | def inspect_module(module, to_dict=True, regex=False): 66 | modules = {} 67 | if regex: 68 | pattern = re.compile(module) 69 | for key, value in sys.modules.items(): 70 | if pattern.match(key): 71 | modules[key] = value 72 | else: 73 | modules = {module: sys.modules[module]} 74 | 75 | members = [] 76 | for key, value in modules.items(): 77 | members.extend(inspect.getmembers(value, lambda member: 78 | hasattr(member, '__module__') and 79 | member.__module__ == key)) 80 | 81 | if to_dict: 82 | return dict(members) 83 | 84 | return members 85 | 86 | 87 | def ld2dl(ld): 88 | '''Transform a list of dictionaries in a dictionaries with lists 89 | # Note 90 | All dictionaries have the same keys 91 | ''' 92 | return dict(zip(ld[0], zip(*[d.values() for d in ld]))) 93 | 94 | def check_ext(fname, ext): 95 | # Adding dot 96 | ext = ext if ext[0] == '.' else '.' + ext 97 | fname, f_ext = os.path.splitext(fname) 98 | 99 | if f_ext == ext: 100 | return True 101 | 102 | return False 103 | 104 | 105 | def parse_nondefault_args(args, default_args): 106 | # removing default arguments 107 | args_default = {k: v for k, v in vars(default_args).items() 108 | if k not in [arg.split('-')[-1] for arg in sys.argv 109 | if arg.startswith('-')]} 110 | args_nondefault = {k: v for k, v in vars(args).items() 111 | if k not in args_default or args_default[k] != v} 112 | 113 | args_nondefault = HParams().parse(args_nondefault) 114 | 115 | return args_nondefault 116 | 117 | 118 | def setup_logging(default_path='logging.yaml', default_level=logging.INFO, 119 | env_key='LOG_CFG'): 120 | """Setup logging configuration 121 | 122 | """ 123 | path = default_path 124 | value = os.getenv(env_key, None) 125 | if value: 126 | path = value 127 | if os.path.exists(path): 128 | with open(path, 'rt') as f: 129 | config = yaml.safe_load(f.read()) 130 | logging.config.dictConfig(config) 131 | else: 132 | logging.basicConfig(level=default_level) 133 | -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | class HParams(object): 4 | """Creates an object for passing around hyperparameter values. 5 | Use the parse method to overwrite the default hyperparameters with values 6 | passed in as a string representation of a Python dictionary mapping 7 | hyperparameters to values. 
8 | 
9 |     # Example
10 |         hparams = HParams(batch_size=128, hidden_size=256)
11 |         hparams.parse('{"hidden_size":512}')
12 |         assert hparams.batch_size == 128
13 |         assert hparams.hidden_size == 512
14 | 
15 | 
16 |     Code adapted from Google Magenta
17 |     """
18 | 
19 |     def __init__(self, **init_hparams):
20 |         object.__setattr__(self, 'keyvals', init_hparams)
21 | 
22 |     def __getitem__(self, key):
23 |         """Returns the value of the given hyperparameter, or None if it
24 |         does not exist."""
25 |         return self.keyvals.get(key)
26 | 
27 |     def __getattribute__(self, attribute):
28 |         if attribute == '__dict__':
29 |             return self.keyvals
30 |         else:
31 |             return object.__getattribute__(self, attribute)
32 | 
33 |     def __getattr__(self, key):
34 |         """Returns the value of the given hyperparameter, or None if it
35 |         does not exist."""
36 |         return self.keyvals.get(key)
37 | 
38 |     def __setattr__(self, key, value):
39 |         """Sets the value for the hyperparameter."""
40 |         self.keyvals[key] = value
41 | 
42 |     def update(self, values_dict):
43 |         """Merges in new hyperparameters, replacing existing with same key."""
44 |         self.keyvals.update(values_dict)
45 | 
46 |         return self
47 | 
48 |     def parse(self, values):
49 |         """Merges in new hyperparameters, replacing existing with same key."""
50 | 
51 |         if type(values) == dict:
52 |             return self.update(values)
53 | 
54 |         if type(values) in (tuple, list):
55 |             tmp = {}
56 |             for k, v in zip(values[::2], values[1::2]):
57 |                 try:
58 |                     tmp[k] = ast.literal_eval(v)
59 |                 except ValueError:
60 |                     tmp[k] = v
61 |             return self.update(tmp)
62 | 
63 |         return self.update(ast.literal_eval(values))
64 | 
65 |     def values(self):
66 |         """Return the hyperparameter values as a Python dictionary."""
67 |         return self.keyvals
68 | 
69 |     def __str__(self):
70 |         return str(self.keyvals)
71 | 
--------------------------------------------------------------------------------
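Taken together, the helpers above compose as follows — a minimal sketch, assuming the Python 2 environment pinned in msc.yaml and a hypothetical 16 kHz mono file sample.wav:

    from __future__ import print_function

    from preprocessing.audio import MFCC
    from preprocessing.text import CharParser
    from utils.hparams import HParams

    # HParams.parse accepts a dict, a flat key/value list (as produced by
    # argparse with nargs='+'), or a dict literal given as a string
    hparams = HParams(win_len=0.025, win_step=0.01)
    hparams.parse(['num_cep', '13'])

    # 13 cepstra plus deltas and delta-deltas -> 39 features per frame
    mfcc = MFCC(num_cep=hparams.num_cep, win_len=hparams.win_len,
                win_step=hparams.win_step)
    feats = mfcc('sample.wav')  # (num_frames, 39), mean/variance normalized

    parser = CharParser(mode='space')
    labels = parser.map(u'ola mundo')  # int32 label ids
    print(feats.shape, parser.imap(labels))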