├── .gitignore
├── LICENSE.md
├── README.md
├── core
│   ├── __init__.py
│   ├── callbacks.py
│   ├── ctc_utils.py
│   ├── initializers.py
│   ├── layers.py
│   ├── layers_utils.py
│   ├── metrics.py
│   └── models.py
├── data
│   ├── download_brsmv1.sh
│   └── download_datasets.sh
├── datasets
│   ├── __init__.py
│   ├── brsd.py
│   ├── cslu.py
│   ├── dataset_generator.py
│   ├── dataset_parser.py
│   ├── dummy.py
│   ├── lapsbm.py
│   ├── sid.py
│   └── voxforge.py
├── eval.py
├── extras
│   ├── __init__.py
│   ├── apis.py
│   ├── ctc_viz.py
│   ├── eval_apis.py
│   ├── make_dataset.py
│   ├── print_args.py
│   ├── recognizer.py
│   └── results2xlsx.py
├── imgs
│   ├── best_ler.jpg
│   ├── best_ler.pdf
│   ├── best_loss.jpg
│   └── best_loss.pdf
├── logging.yaml
├── msc.yaml
├── predict.py
├── preprocessing
│   ├── __init__.py
│   ├── audio.py
│   ├── audio_utils.py
│   └── text.py
├── requirements.txt
├── train.py
└── utils
    ├── __init__.py
    ├── core_utils.py
    ├── generic_utils.py
    └── hparams.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignoring automatically generated folders in data
2 | # Created by https://www.gitignore.io/api/sublimetext,macos,linux,python,ipythonnotebook,windows
3 |
4 | ### SublimeText ###
5 | # cache files for sublime text
6 | *.tmlanguage.cache
7 | *.tmPreferences.cache
8 | *.stTheme.cache
9 |
10 | # workspace files are user-specific
11 | *.sublime-workspace
12 |
13 | # project files should be checked into the repository, unless a significant
14 | # proportion of contributors will probably not be using SublimeText
15 | # *.sublime-project
16 |
17 | # sftp configuration file
18 | sftp-config.json
19 |
20 | # Package control specific files
21 | Package Control.last-run
22 | Package Control.ca-list
23 | Package Control.ca-bundle
24 | Package Control.system-ca-bundle
25 | Package Control.cache/
26 | Package Control.ca-certs/
27 | bh_unicode_properties.cache
28 |
29 | # Sublime-github package stores a github token in this file
30 | # https://packagecontrol.io/packages/sublime-github
31 | GitHub.sublime-settings
32 |
33 |
34 | ### macOS ###
35 | *.DS_Store
36 | .AppleDouble
37 | .LSOverride
38 |
39 | # Icon must end with two \r
40 | Icon
41 | # Thumbnails
42 | ._*
43 | # Files that might appear in the root of a volume
44 | .DocumentRevisions-V100
45 | .fseventsd
46 | .Spotlight-V100
47 | .TemporaryItems
48 | .Trashes
49 | .VolumeIcon.icns
50 | .com.apple.timemachine.donotpresent
51 | # Directories potentially created on remote AFP share
52 | .AppleDB
53 | .AppleDesktop
54 | Network Trash Folder
55 | Temporary Items
56 | .apdisk
57 |
58 |
59 | ### Linux ###
60 | *~
61 |
62 | # temporary files which can be created if a process still has a handle open of a deleted file
63 | .fuse_hidden*
64 |
65 | # KDE directory preferences
66 | .directory
67 |
68 | # Linux trash folder which might appear on any partition or disk
69 | .Trash-*
70 |
71 | # .nfs files are created when an open file is removed but is still being accessed
72 | .nfs*
73 |
74 |
75 | ### Python ###
76 | # Byte-compiled / optimized / DLL files
77 | __pycache__/
78 | *.py[cod]
79 | *$py.class
80 |
81 | # C extensions
82 | *.so
83 |
84 | # Distribution / packaging
85 | .Python
86 | env/
87 | build/
88 | develop-eggs/
89 | dist/
90 | downloads/
91 | eggs/
92 | .eggs/
93 | lib/
94 | lib64/
95 | parts/
96 | sdist/
97 | var/
98 | *.egg-info/
99 | .installed.cfg
100 | *.egg
101 |
102 | # PyInstaller
103 | # Usually these files are written by a python script from a template
104 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
105 | *.manifest
106 | *.spec
107 |
108 | # Installer logs
109 | pip-log.txt
110 | pip-delete-this-directory.txt
111 |
112 | # Unit test / coverage reports
113 | htmlcov/
114 | .tox/
115 | .coverage
116 | .coverage.*
117 | .cache
118 | nosetests.xml
119 | coverage.xml
120 | *,cover
121 | .hypothesis/
122 |
123 | # Translations
124 | *.mo
125 | *.pot
126 |
127 | # Django stuff:
128 | *.log
129 | local_settings.py
130 |
131 | # Flask stuff:
132 | instance/
133 | .webassets-cache
134 |
135 | # Scrapy stuff:
136 | .scrapy
137 |
138 | # Sphinx documentation
139 | docs/_build/
140 |
141 | # PyBuilder
142 | target/
143 |
144 | # IPython Notebook
145 | .ipynb_checkpoints
146 |
147 | # pyenv
148 | .python-version
149 |
150 | # celery beat schedule file
151 | celerybeat-schedule
152 |
153 | # dotenv
154 | .env
155 |
156 | # virtualenv
157 | .venv/
158 | venv/
159 | ENV/
160 |
161 | # Spyder project settings
162 | .spyderproject
163 |
164 | # Rope project settings
165 | .ropeproject
166 |
167 |
168 | ### IPythonNotebook ###
169 | # Temporary data
170 | .ipynb_checkpoints/
171 |
172 |
173 | ### Windows ###
174 | # Windows image file caches
175 | Thumbs.db
176 | ehthumbs.db
177 |
178 | # Folder config file
179 | Desktop.ini
180 |
181 | # Recycle Bin used on file shares
182 | $RECYCLE.BIN/
183 |
184 | # Windows Installer files
185 | *.cab
186 | *.msi
187 | *.msm
188 | *.msp
189 |
190 | # Windows shortcuts
191 | *.lnk
192 |
193 | # My ignores
194 | timit
195 | *.h5
196 | results/
197 | .datasets/
198 | .envrc
199 | notebooks/
200 | sims/
201 | results*/
203 | data/*/**
204 | data/*
205 | refs/
206 | software
207 | results.json
208 | .vscode
209 | *.tar.gz
210 | *.xls*
211 | *.json
212 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2016 Igor Macedo Quintanilha
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # asr-study: a study of all-neural speech recognition models
2 | This repository contains my efforts on developing an end-to-end ASR system using Keras and Tensorflow.
3 |
4 | ## Training a character-based all-neural Brazilian Portuguese speech recognition model
5 |
6 | Our model was trained using four datasets: [CSLU Spoltech (LDC2006S16)](https://catalog.ldc.upenn.edu/LDC2006S16), Sid, [VoxForge](http://www.voxforge.org), and [LapsBM1.4](http://www.laps.ufpa.br/falabrasil/). Only the CSLU dataset requires payment.
7 |
8 | #### Set up the (partial) Brazilian Portuguese Speech Dataset (BRSD)
9 |
10 | You can download the freely available datasets with the provided script (it may take a while):
11 |
12 | ```bash
13 | $ cd data; sh download_datasets.sh
14 | ```
15 |
16 | Next, you can preprocess it into an HDF5 file. Click [here](extras/make_dataset.py) for more information.
17 |
18 | ```bash
19 | $ python -m extras.make_dataset --parser brsd --input_parser mfcc
20 | ```
21 |
22 | #### Train the network
23 |
24 | You can train the network with the `train.py` script. For more usage information see [this](train.py). To train with the default parameters:
25 |
26 | ```bash
27 | $ python train.py --dataset .datasets/brsd/data.h5
28 | ```
29 |
30 | ## Pre-trained model
31 |
32 | You may download the [brsm v1.0 model](core/models.py) pre-trained on the full brsd dataset (including the CSLU dataset):
33 |
34 | ```bash
35 | $ sh data/download_brsmv1.sh
36 | ```
37 |
38 | Also, you can evaluate the model against the **brsd** test set:
39 |
40 | ```bash
41 | $ python eval.py --model models/brsmv1.h5 --dataset .datasets/brsd/data.h5
42 | ```
43 |
44 | #### brsmv1.h5 training
45 |
46 | ![best loss](imgs/best_loss.jpg)
47 |
48 | ![best LER](imgs/best_ler.jpg)
49 |
50 |
51 | Test set: LER **25.13%** (using a beam search decoder with a beam width of 100)
52 |
53 |
54 | ## Predicting the outputs
55 |
56 | To predict the outputs of a trained model using some dataset:
57 |
58 | ```bash
59 | $ python predict.py --model MODEL --dataset DATASET
60 | ```
61 |
62 | ## Available dataset parsers
63 | You can see all the available dataset parsers in [datasets/](datasets/).
64 |
65 | #### Creating a custom dataset parser
66 |
67 | You may create your own dataset parser. Here is an example:
68 |
69 | ```python
70 | class CustomParser(DatasetParser):
71 |
72 | def __init__(self, dataset_dir, name='default name', **kwargs):
73 | super(CustomParser, self).__init__(dataset_dir, name, **kwargs)
74 |
75 | def _iter(self):
76 | for line in dataset:  # iterate over your data source
77 | yield {'duration': line['duration'],
78 | 'input': line['input'],
79 | 'label': line['label'],
80 | 'non-optional-field': line['non-optional-field']}
81 |
82 | def _report(self, dl):
83 | args = extract_statistics(dl)
84 | report = '''General information
85 | Number of utterances: %d
86 | Total size (in seconds) of utterances: %.f
87 | Number of speakers: %d''' % args
88 | return report
88 | ```
89 |
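90 | Once implemented, the parser can be converted into an HDF5 file with the same script used for **brsd**. The parser name below is hypothetical; it assumes the new class is registered in [datasets/](datasets/) like the built-in ones:
91 |
92 | ```bash
93 | $ python -m extras.make_dataset --parser custom --input_parser mfcc
94 | ```
95 |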
90 | ## Available models
91 | You can see all the available models in [core/models.py](core/models.py)
92 | #### Creating a custom model
93 |
94 | You may create your own custom model. Here is an example of a CTC-based model:
95 |
96 | ```python
97 | def custom_model(num_features=26, num_hiddens=100, num_classes=28):
98 |
99 | x = Input(name='inputs', shape=(None, num_features))
100 | o = x
101 |
102 | o = Bidirectional(LSTM(num_hiddens,
103 | return_sequences=True,
104 | consume_less='gpu'))(o)
105 | o = TimeDistributed(Dense(num_classes))(o)
106 |
107 | return ctc_model(x, o)
108 | ```
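109 |
110 | The model returned by `ctc_model` has two outputs, the CTC loss (`'ctc'`) and the decoded sequence (`'decoder'`), so it is compiled with the dummy losses from [core/ctc_utils.py](core/ctc_utils.py). A minimal sketch (the optimizer choice and the snippet itself are illustrative, not taken verbatim from this repository):
111 |
112 | ```python
113 | from core import ctc_utils, metrics
114 |
115 | model = custom_model()
116 | model.compile(optimizer='adam',
117 |               loss={'ctc': ctc_utils.ctc_dummy_loss,
118 |                     'decoder': ctc_utils.decoder_dummy_loss},
119 |               metrics={'decoder': metrics.ler},
120 |               loss_weights=[1, 0])  # backprop only through the CTC loss
121 | ```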
109 | ## Contributing
110 | There is plenty of work to be done. All contributions are welcome :).
111 |
112 | #### asr-related work
113 | * Add new layers
114 | * Batch normalized recurrent neural networks [arXiv](https://arxiv.org/abs/1510.01378)
115 | * Batch recurrent normalization [arXiv](https://arxiv.org/abs/1603.09025)
116 | * Reproduce topologies and results
117 | * [EESEN](https://arxiv.org/abs/1507.08240)
118 | * [Deep Speech 2](https://arxiv.org/abs/1512.02595)
119 | * ConvNet-based architectures
120 | * Add language model
121 | * [WFST](https://arxiv.org/abs/1507.08240)
122 | * [RNNLM](http://www.fit.vutbr.cz/~imikolov/rnnlm/)
123 | * Beam search decoder with LM or CLM
124 | * Encoder-decoder models with attention mechanism
125 | * ASR from raw speech
126 | * Real-time ASR
127 |
128 | #### brsp-related work
129 | * Investigate the brsdv1 model with
130 | * Multiplicative integration [arXiv](https://arxiv.org/abs/1606.06630)
131 | * Layer normalization [arXiv](https://arxiv.org/abs/1607.06450)
132 | * Zoneout [arXiv](https://arxiv.org/abs/1606.01305)
133 | * Increase the number of datasets (ideally with free datasets)
134 | * Improve the LER
135 | * Train a language model
136 |
137 | #### code-related work
138 | * Test coverage
139 | * Examples
140 | * Better documentation
141 | * Improve the API
142 | * More features extractors, see [audio](preprocessing/audio.py) and [text](preprocessing/text.py)
143 | * More datasets parsers
144 | * [LibriSpeech](http://www.openslr.org/12/)
145 | * [TED-LIUM](http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus)
146 | * WSJ
147 | * Switchboard
148 | * [TIMIT](https://catalog.ldc.upenn.edu/ldc93s1)
149 | * [VCTK](http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html)
150 | * **Implement a nice wrapper for Kaldi in order to use its feature extractors**
151 | * Better way of storing the entire preprocessed dataset
152 |
153 | #### Known bugs
154 | * High memory and CPU consumption
155 | * Predicting with batch size greater than 1 (Keras' bug)
156 | * warp-ctc does not seem to speed up training
157 | * [zoneout](core/layers.py) implementation
158 |
159 |
160 | ## Requirements
161 |
162 | #### basic requirements
163 | * Python 2.7
164 | * Numpy
165 | * Scipy
166 | * Pyyaml
167 | * HDF5
168 | * Unidecode
169 | * Librosa
170 | * Tensorflow
171 | * Keras
172 |
173 | #### recommended
174 | * [warp-ctc](https://github.com/baidu-research/warp-ctc) (for fast CTC loss calculation)
175 |
176 | #### optional
177 | * [SpeechRecognition](https://pypi.python.org/pypi/SpeechRecognition/) (to use the [eval apis](extras/eval_apis.py))
178 | * [openpyxl](https://pypi.python.org/pypi/openpyxl) (to [save the results in an Excel file](extras/results2xlsx.py))
179 |
180 | ## Acknowledgements
181 | * [python_speech_features](https://github.com/jameslyons/python_speech_features) for the [audio preprocessing](preprocessing/audio.py)
182 | * [Google Magenta](https://github.com/tensorflow/magenta) for the [hparams](utils/hparams.py)
183 | * @robertomest for helping me with everything
184 |
185 | ## License
186 | See [LICENSE.md](LICENSE.md) for more information
187 |
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
1 | import layers
2 | import layers_utils
3 | import metrics
4 | import ctc_utils
5 | import models
6 | import initializers
7 | import callbacks
8 |
--------------------------------------------------------------------------------
/core/callbacks.py:
--------------------------------------------------------------------------------
1 | import keras.callbacks as callbacks
2 |
3 | import h5py
4 | import numpy as np
5 | import yaml
6 |
7 |
8 | class MetaCheckpoint(callbacks.ModelCheckpoint):
9 | """
10 | Checkpoints some training information with the model. This should enable
11 | resuming training and having training information on every checkpoint.
12 |
13 | Thanks to Roberto Estevao @robertomest - robertomest@poli.ufrj.br
14 | """
15 |
16 | def __init__(self, filepath, monitor='val_loss', verbose=0,
17 | save_best_only=False, save_weights_only=False,
18 | mode='auto', period=1, training_args=None, meta=None):
19 |
20 | super(MetaCheckpoint, self).__init__(filepath, monitor=monitor,
21 |                                      verbose=verbose,
22 |                                      save_best_only=save_best_only,
23 |                                      save_weights_only=save_weights_only,
24 |                                      mode=mode, period=period)
24 |
25 | self.filepath = filepath
26 | self.meta = meta or {'epochs': []}
27 |
28 | if training_args:
29 | training_args = vars(training_args)
30 |
31 | self.meta['training_args'] = training_args
32 |
33 | def on_train_begin(self, logs={}):
34 | super(MetaCheckpoint, self).on_train_begin(logs)
35 |
36 | def on_epoch_end(self, epoch, logs={}):
37 | super(MetaCheckpoint, self).on_epoch_end(epoch, logs)
38 |
39 | # Get statistics
40 | self.meta['epochs'].append(epoch)
41 | for k, v in logs.items():
42 | # Get default gets the value or sets (and gets) the default value
43 | self.meta.setdefault(k, []).append(v)
44 |
45 | # Save to file
46 | filepath = self.filepath.format(epoch=epoch, **logs)
47 |
48 | if self.epochs_since_last_save == 0:
49 | with h5py.File(filepath, 'r+') as f:
50 | meta_group = f.create_group('meta')
51 | meta_group.attrs['training_args'] = yaml.dump(
52 | self.meta.get('training_args', '{}'))
53 | meta_group.create_dataset('epochs',
54 | data=np.array(self.meta['epochs']))
55 | for k in logs:
56 | meta_group.create_dataset(k, data=np.array(self.meta[k]))
57 |
58 |
59 | class ProgbarLogger(callbacks.ProgbarLogger):
60 |
61 | def __init__(self, show_metrics=None):
62 | super(ProgbarLogger, self).__init__()
63 |
64 | self.show_metrics = show_metrics
65 |
66 | def on_train_begin(self, logs=None):
67 | super(ProgbarLogger, self).on_train_begin(logs)
68 |
69 | if self.show_metrics:
70 | self.params['metrics'] = self.show_metrics
71 |
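72 | # Usage sketch (hypothetical training script): MetaCheckpoint stores
73 | # training metadata alongside the weights so a run can be resumed later.
74 | #
75 | #   ckpt = MetaCheckpoint('model.h5', training_args=args)
76 | #   model.fit(x, y, nb_epoch=10, callbacks=[ckpt])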
--------------------------------------------------------------------------------
/core/ctc_utils.py:
--------------------------------------------------------------------------------
1 | import keras
2 | import keras.backend as K
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 |
8 | def decode(inputs, **kwargs):
9 | """ Decodes a sequence of probabilities choosing the path with highest
10 | probability of occur
11 |
12 | # Arguments
13 | is_greedy: if True (default) the greedy decoder will be used;
14 | otherwise beam search decoder will be used
15 |
16 | if is_greedy is False:
17 | see the documentation of tf.nn.ctc_beam_search_decoder for more
18 | options
19 |
20 | # Inputs
21 | A tuple (y_pred, seq_len) where:
22 | y_pred is a tensor (N, T, C) where N is the batch size, T is the
23 | maximum timestep and C is the number of classes (including the
24 | blank label)
25 | seq_len is a tensor (N,) that indicates the real number of
26 | timesteps of each sequence
27 |
28 | # Outputs
29 | A sparse tensor with the top path decoded sequence
30 |
31 | """
32 |
33 | # Little hack for load_model
34 | import tensorflow as tf
35 | is_greedy = kwargs.get('is_greedy', True)
36 | y_pred, seq_len = inputs
37 |
38 | seq_len = tf.cast(seq_len[:, 0], tf.int32)
39 | y_pred = tf.transpose(y_pred, perm=[1, 0, 2])
40 |
41 | if is_greedy:
42 | decoded = tf.nn.ctc_greedy_decoder(y_pred, seq_len)[0][0]
43 | else:
44 | beam_width = kwargs.get('beam_width', 100)
45 | top_paths = kwargs.get('top_paths', 1)
46 | merge_repeated = kwargs.get('merge_repeated', True)
47 |
48 | decoded = tf.nn.ctc_beam_search_decoder(y_pred, seq_len, beam_width,
49 | top_paths,
50 | merge_repeated)[0][0]
51 |
52 | return decoded
53 |
54 |
55 | def decode_output_shape(inputs_shape):
56 | y_pred_shape, seq_len_shape = inputs_shape
57 | return (y_pred_shape[:1], None)
58 |
59 |
60 | def ctc_lambda_func(args):
61 | """ CTC cost function
62 | """
63 | y_pred, labels, inputs_length = args
64 |
65 | # Little hack for load_model
66 | import tensorflow as tf
67 |
68 | return tf.nn.ctc_loss(labels,
69 | tf.transpose(y_pred, perm=[1, 0, 2]),
70 | inputs_length[:, 0])
71 |
72 |
73 | def ctc_dummy_loss(y_true, y_pred):
74 | """ Little hack to make CTC working with Keras
75 | """
76 | return y_pred
77 |
78 |
79 | def decoder_dummy_loss(y_true, y_pred):
80 | """ Little hack to make CTC working with Keras
81 | """
82 | return K.zeros((1,))
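83 |
84 | # Usage sketch: `decode` is meant to be wrapped in a Keras Lambda layer
85 | # (see core/models.py). For beam search decoding instead of greedy:
86 | #
87 | #   dec = Lambda(decode, output_shape=decode_output_shape,
88 | #                arguments={'is_greedy': False, 'beam_width': 100},
89 | #                name='decoder')
90 | #   y_pred = dec([output, inputs_length])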
83 |
--------------------------------------------------------------------------------
/core/initializers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import keras.backend as K
4 |
5 |
6 | def k_init(k):
7 | def init(shape, name=None):
8 | return K.variable(k*np.ones(shape), dtype='float32',
9 | name=name)
10 | return init
11 |
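12 | # Example: k_init(0.5) builds an initializer that fills a weight tensor
13 | # with the constant 0.5 (used for the MI and LN parameters in layers.py).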
--------------------------------------------------------------------------------
/core/layers.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | import numpy as np
4 |
5 | import keras.backend as K
6 | import tensorflow as tf
7 |
8 | from keras import activations, initializations, regularizers
9 | import keras.layers as keras_layers
10 | from keras.layers.recurrent import Recurrent
11 | from keras.engine import Layer, InputSpec
12 |
13 | from .layers_utils import highway_bias_initializer
14 | from .layers_utils import layer_normalization
15 | from .layers_utils import multiplicative_integration_init
16 | from .layers_utils import multiplicative_integration
17 | from .layers_utils import zoneout
18 |
19 | from .initializers import k_init
20 |
21 | import logging
22 |
23 |
24 | class LayerNormalization(Layer):
25 | '''Normalizes the summed inputs to the neurons in a layer on a single
26 | training case. Unlike batch normalization, layer normalization
27 | performs exactly the same computation at training and test time.
28 |
29 | # Arguments
30 | epsilon: small float > 0. Fuzz parameter
32 | weights: Initialization weights.
33 | List of 2 Numpy arrays, with shapes:
34 | `[(input_shape,), (input_shape,)]`
35 | Note that the order of this list is [gain, bias]
36 | gain_init: name of initialization function for gain parameter
37 | (see [initializations](../initializations.md)), or alternatively,
38 | Theano/TensorFlow function to use for weights initialization.
39 | This parameter is only relevant if you don't pass a `weights`
40 | argument.
41 | bias_init: name of initialization function for bias parameter
42 | (see [initializations](../initializations.md)), or alternatively,
43 | Theano/TensorFlow function to use for weights initialization.
44 | This parameter is only relevant if you don't pass a `weights`
45 | argument.
46 | # Input shape
47 |
48 | # Output shape
49 | Same shape as input.
50 |
51 | # References
52 | - [Layer Normalization](https://arxiv.org/abs/1607.06450)
53 | '''
54 | def __init__(self, epsilon=1e-5, weights=None, gain_init='one',
55 | bias_init='zero', **kwargs):
56 | self.epsilon = epsilon
57 | self.gain_init = initializations.get(gain_init)
58 | self.bias_init = initializations.get(bias_init)
59 | self.initial_weights = weights
60 | self._logger = logging.getLogger('%s.%s' % (__name__,
61 | self.__class__.__name__))
62 |
63 | super(LayerNormalization, self).__init__(**kwargs)
64 |
65 | def build(self, input_shape):
66 | self.input_spec = [InputSpec(shape=input_shape)]
67 | shape = (input_shape[-1],)
68 |
69 | self.g = self.gain_init(shape, name='{}_gain'.format(self.name))
70 | self.b = self.bias_init(shape, name='{}_bias'.format(self.name))
71 |
72 | self.trainable_weights = [self.g, self.b]
73 |
74 | if self.initial_weights is not None:
75 | self.set_weights(self.initial_weights)
76 | del self.initial_weights
77 |
78 | self.built = True
79 |
80 | def call(self, x, mask=None):
81 | return layer_normalization(x, self.g, self.b, epsilon=self.epsilon)
82 |
83 | def get_config(self):
84 | config = {"epsilon": self.epsilon,
85 | 'num_var': self.num_var,
86 | 'gain_init': self.gain_init.__name__,
87 | 'bias_init': self.bias_init.__name__}
88 | base_config = super(LayerNormalization, self).get_config()
89 | return dict(list(base_config.items()) + list(config.items()))
90 |
91 |
92 | class RHN(Recurrent):
93 | '''Recurrent Highway Network - Julian Georg Zilly, Rupesh Kumar Srivastava,
94 | Jan Koutník, Jürgen Schmidhuber - 2016.
95 | For a step-by-step description of the network, see
96 | [this paper](https://arxiv.org/abs/1607.03474).
97 | # Arguments
98 | output_dim: dimension of the internal projections and the final output.
99 | depth: recurrency depth size.
100 | init: weight initialization function.
101 | Can be the name of an existing function (str),
102 | or a Theano function (see:
103 | [initializations](../initializations.md)).
104 | inner_init: initialization function of the inner cells.
105 | bias_init: initialization function of the bias.
106 | (see [this
107 | post](http://people.idsia.ch/~rupesh/very_deep_learning/)
108 | for more information)
109 | activation: activation function.
110 | Can be the name of an existing function (str),
111 | or a Theano function (see: [activations](../activations.md)).
112 | inner_activation: activation function for the inner cells.
113 | coupling: if True, carry gate will be coupled to the transform gate,
114 | i.e., c = 1 - t
115 | W_regularizer: instance of [WeightRegularizer](../regularizers.md)
116 | (eg. L1 or L2 regularization), applied to the input weights
117 | matrices.
118 | U_regularizer: instance of [WeightRegularizer](../regularizers.md)
119 | (eg. L1 or L2 regularization), applied to the recurrent weights
120 | matrices.
121 | b_regularizer: instance of [WeightRegularizer](../regularizers.md),
122 | applied to the bias.
123 | dropout_W: float between 0 and 1. Fraction of the input units to drop
124 | for input gates.
125 | dropout_U: float between 0 and 1. Fraction of the input units to drop
126 | for recurrent connections.
127 | # References
128 | - [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474)
129 | (original paper)
130 | - [Layer Normalization](https://arxiv.org/abs/1607.06450)
131 | - [A Theoretically Grounded Application of Dropout in Recurrent Neural
132 | Networks](http://arxiv.org/abs/1512.05287)
133 | # TODO: different dropout rates for each layer
134 | '''
135 | def __init__(self, output_dim, depth=1,
136 | init='glorot_uniform', inner_init='orthogonal',
137 | bias_init=highway_bias_initializer,
138 | activation='tanh', inner_activation='hard_sigmoid',
139 | coupling=True, layer_norm=False, ln_gain_init='one',
140 | ln_bias_init='zero', mi=False,
141 | W_regularizer=None, U_regularizer=None,
142 | b_regularizer=None, dropout_W=0., dropout_U=0., **kwargs):
143 | self.output_dim = output_dim
144 | self.depth = depth
145 | self.init = initializations.get(init)
146 | self.inner_init = initializations.get(inner_init)
147 | self.bias_init = initializations.get(bias_init)
148 | self.activation = activations.get(activation)
149 | self.inner_activation = activations.get(inner_activation)
150 | self.coupling = coupling
151 | self.has_layer_norm = layer_norm
152 | self.ln_gain_init = initializations.get(ln_gain_init)
153 | self.ln_bias_init = initializations.get(ln_bias_init)
154 | self.mi = mi
155 | self.W_regularizer = regularizers.get(W_regularizer)
156 | self.U_regularizer = regularizers.get(U_regularizer)
157 | self.b_regularizer = regularizers.get(b_regularizer)
158 | self.dropout_W, self.dropout_U = dropout_W, dropout_U
159 |
160 | self._logger = logging.getLogger('%s.%s' % (__name__,
161 | self.__class__.__name__))
162 |
163 | if self.dropout_W or self.dropout_U:
164 | self.uses_learning_phase = True
165 |
166 | super(RHN, self).__init__(**kwargs)
167 |
168 | if self.consume_less != 'gpu':
169 |     self._logger.warning("Ignoring consume_less=%s. Setting"
170 |                          " to 'gpu'." % self.consume_less)
171 |     self.consume_less = 'gpu'
170 |
171 | def build(self, input_shape):
172 | self.input_spec = [InputSpec(shape=input_shape)]
173 | self.input_dim = input_shape[2]
174 |
175 | if self.stateful:
176 | self.reset_states()
177 | else:
178 | self.states = [None]
179 |
180 | self.W = self.init((self.input_dim, (2 + (not self.coupling)) *
181 | self.output_dim), name='{}_W'.format(self.name))
182 | self.Us = [self.inner_init(
183 | (self.output_dim, (2 + (not self.coupling)) * self.output_dim),
184 | name='%s_%d_U' % (self.name, i)) for i in xrange(self.depth)]
185 |
186 | bias_init_value = K.get_value(self.bias_init((self.output_dim,)))
187 | b = [np.zeros(self.output_dim),
188 | np.copy(bias_init_value)]
189 |
190 | if not self.coupling:
191 | b.append(np.copy(bias_init_value))
192 |
193 | self.bs = [K.variable(np.hstack(b),
194 | name='%s_%d_b' % (self.name, i)) for i in
195 | xrange(self.depth)]
196 |
197 | self.trainable_weights = [self.W] + self.Us + self.bs
198 |
199 | if self.mi:
200 | self.mi_params = [multiplicative_integration_init(
201 | ((2 + (not self.coupling)) * self.output_dim,),
202 | name='%s_%d' % (self.name, i),
203 | has_input=(i == 0)) for i in xrange(self.depth)]
204 |
205 | for p in self.mi_params:
206 | if type(p) in {list, tuple}:
207 | self.trainable_weights += p
208 | else:
209 | self.trainable_weights += [p]
210 |
211 | if self.has_layer_norm:
212 | self.ln_weights = []
213 | ln_names = ['h', 't', 'c']
214 | for l in xrange(self.depth):
215 |
216 | ln_gains = [self.ln_gain_init(
217 | (self.output_dim,), name='%s_%d_ln_gain_%s' %
218 | (self.name, l, ln_names[i])) for i in xrange(1)]
219 |
220 | ln_biases = [self.ln_bias_init(
221 | (self.output_dim,), name='%s_%d_ln_bias_%s' %
222 | (self.name, l, ln_names[i])) for i in xrange(1)]
223 | self.ln_weights.append([ln_gains, ln_biases])
224 | self.trainable_weights += ln_gains + ln_biases
225 |
226 | self.regularizers = []
227 | if self.W_regularizer:
228 | self.W_regularizer.set_param(self.W)
229 | self.regularizers.append(self.W_regularizer)
230 | if self.U_regularizer:
231 | self.U_regularizer.set_param(self.Us[0])
232 | self.regularizers.append(self.U_regularizer)
233 | if self.b_regularizer:
234 | self.b_regularizer.set_param(self.bs[0])
235 | self.regularizers.append(self.b_regularizer)
236 |
237 | if self.initial_weights is not None:
238 | self.set_weights(self.initial_weights)
239 | del self.initial_weights
240 |
241 | def reset_states(self):
242 | assert self.stateful, 'Layer must be stateful.'
243 | input_shape = self.input_spec[0].shape
244 | if not input_shape[0]:
245 | raise Exception('If a RNN is stateful, a complete ' +
246 | 'input_shape must be provided (including batch \
247 | size).')
248 | if hasattr(self, 'states'):
249 | K.set_value(self.states[0],
250 | np.zeros((input_shape[0], self.output_dim)))
251 | else:
252 | self.states = [K.zeros((input_shape[0], self.output_dim))]
253 |
254 | def step(self, x, states):
255 | s_tm1 = states[0]
256 |
257 | for layer in xrange(self.depth):
258 | B_U = states[layer + 1][0]
259 | U, b = self.Us[layer], self.bs[layer]
260 |
261 | if layer == 0:
262 | B_W = states[layer + 1][1]
263 | Wx = K.dot(x * B_W, self.W)
264 | else:
265 | Wx = 0
266 |
267 | Us = K.dot(s_tm1 * B_U, U)
268 |
269 | if self.mi:
270 | a = multiplicative_integration(Wx, Us,
271 | self.mi_params[layer]) + b
272 | else:
273 | a = Wx + Us + b
274 |
275 | a0 = a[:, :self.output_dim]
276 | a1 = a[:, self.output_dim: 2 * self.output_dim]
277 | if not self.coupling:
278 | a2 = a[:, 2 * self.output_dim:]
279 |
280 | if self.has_layer_norm:
281 | ln_gains, ln_biases = self.ln_weights[layer]
282 | a0 = layer_normalization(a0, ln_gains[0], ln_biases[0])
283 | # a1 = LN(a1, ln_gains[1], ln_biases[1])
284 | # if not self.coupling:
285 | # a2 = LN(a2, ln_gains[2], ln_biases[2])
286 |
287 | # Equation 7
288 | h = self.activation(a0)
289 | # Equation 8
290 | t = self.inner_activation(a1)
291 | # Equation 9
292 | if not self.coupling:
293 | c = self.inner_activation(a2)
294 | else:
295 | c = 1 - t # carry gate was coupled to the transform gate
296 |
297 | s = h * t + s_tm1 * c
298 | s_tm1 = s
299 |
300 | return s, [s]
301 |
302 | def get_constants(self, x):
303 | constants = []
304 |
305 | for layer in xrange(self.depth):
306 | constant = []
307 | if 0 < self.dropout_U < 1:
308 | ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
309 | ones = K.tile(ones, (1, self.output_dim))
310 | B_U = K.in_train_phase(K.dropout(ones, self.dropout_U), ones)
311 | constant.append(B_U)
312 | else:
313 | constant.append(K.cast_to_floatx(1.))
314 |
315 | if layer == 0:
316 | if 0 < self.dropout_W < 1:
317 | input_shape = self.input_spec[0].shape
318 | input_dim = input_shape[-1]
319 | ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
320 | ones = K.tile(ones, (1, input_dim))
321 | B_W = K.in_train_phase(K.dropout(ones,
322 | self.dropout_W), ones)
323 | constant.append(B_W)
324 | else:
325 | constant.append(K.cast_to_floatx(1.))
326 |
327 | constants.append(constant)
328 |
329 | return constants
330 |
331 | def get_config(self):
332 | config = {'output_dim': self.output_dim,
333 | 'depth': self.depth,
334 | 'init': self.init.__name__,
335 | 'inner_init': self.inner_init.__name__,
336 | 'bias_init': self.bias_init.__name__,
337 | 'activation': self.activation.__name__,
338 | 'inner_activation': self.inner_activation.__name__,
339 | 'coupling': self.coupling,
340 | 'layer_norm': self.has_layer_norm,
341 | 'ln_gain_init': self.ln_gain_init.__name__,
342 | 'ln_bias_init': self.ln_bias_init.__name__,
343 | 'mi': self.mi,
344 | 'W_regularizer': self.W_regularizer.get_config() if
345 | self.W_regularizer else None,
346 | 'U_regularizer': self.U_regularizer.get_config() if
347 | self.U_regularizer else None,
348 | 'b_regularizer': self.b_regularizer.get_config() if
349 | self.b_regularizer else None,
350 | 'dropout_W': self.dropout_W,
351 | 'dropout_U': self.dropout_U}
352 | base_config = super(RHN, self).get_config()
353 | return dict(list(base_config.items()) + list(config.items()))
354 |
355 |
356 | class LSTM(keras_layers.LSTM):
357 | """
358 | # Arguments
359 | layer_norm: None or a pair of floats. If a pair is given, layer normalization is applied, with the two values used to initialize the LN gain and bias.
360 | mi: None or a triple of floats (alpha, beta1, beta2). If given, multiplicative integration is enabled and initialized with these values.
361 | zoneout_h: float between 0 and 1. Fraction of the hidden/output units to maintain their previous values.
362 | zoneout_c: float between 0 and 1. Fraction of the cell units to maintain their previous values.
363 | # References
364 | - [Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations](https://arxiv.org/abs/1606.01305)
365 | """
366 | def __init__(self, output_dim, zoneout_h=0., zoneout_c=0.,
367 | layer_norm=None, mi=None, **kwargs):
368 |
369 | super(LSTM, self).__init__(output_dim, **kwargs)
370 |
371 | self._logger = logging.getLogger('%s.%s' % (__name__,
372 | self.__class__.__name__))
373 |
374 | self.layer_norm = layer_norm
375 | self.mi = mi
376 |
377 | self.zoneout_c = zoneout_c
378 | self.zoneout_h = zoneout_h
379 |
380 | if self.zoneout_h or self.zoneout_c:
381 | self.uses_learning_phase = True
382 |
383 | if self.consume_less != 'gpu':
384 | self._logger.warn("Invalid option for `consume_less`. Falling back \
385 | to option `gpu`.")
386 | self.consume_less = 'gpu'
387 |
388 | def build(self, input_shape):
389 | super(LSTM, self).build(input_shape)
390 |
391 | if self.mi is not None:
392 | alpha_init, beta1_init, beta2_init = self.mi
393 |
394 | self.mi_alpha = self.add_weight(
395 | (4 * self.output_dim, ),
396 | initializer=k_init(alpha_init),
397 | name='{}_mi_alpha'.format(self.name))
398 | self.mi_beta1 = self.add_weight(
399 | (4 * self.output_dim, ),
400 | initializer=k_init(beta1_init),
401 | name='{}_mi_beta1'.format(self.name))
402 | self.mi_beta2 = self.add_weight(
403 | (4 * self.output_dim, ),
404 | initializer=k_init(beta2_init),
405 | name='{}_mi_beta2'.format(self.name))
406 |
407 | if self.layer_norm is not None:
408 | ln_gain_init, ln_bias_init = self.layer_norm
409 |
410 | self.layer_norm_params = {}
411 | for n, i in {'Uh': 4, 'Wx': 4, 'new_c': 1}.items():
412 |
413 | gain = self.add_weight(
414 | (i*self.output_dim, ),
415 | initializer=k_init(ln_gain_init),
416 | name='%s_ln_gain_%s' % (self.name, n))
417 | bias = self.add_weight(
418 | (i*self.output_dim, ),
419 | initializer=k_init(ln_bias_init),
420 | name='%s_ln_bias_%s' % (self.name, n))
421 |
422 | self.layer_norm_params[n] = [gain, bias]
423 |
424 | def _layer_norm(self, x, param_name):
425 | if self.layer_norm is None:
426 | return x
427 |
428 | gain, bias = self.layer_norm_params[param_name]
429 |
430 | return layer_normalization(x, gain, bias)
431 |
432 | def step(self, x, states):
433 | h_tm1 = states[0]
434 | c_tm1 = states[1]
435 | B_U = states[2]
436 | B_W = states[3]
437 |
438 | Uh = self._layer_norm(K.dot(h_tm1 * B_U[0], self.U), 'Uh')
439 | Wx = self._layer_norm(K.dot(x * B_W[0], self.W), 'Wx')
440 |
441 | if self.mi is not None:
442 | z = self.mi_alpha * Wx * Uh + self.mi_beta1 * Uh + \
443 | self.mi_beta2 * Wx + self.b
444 | else:
445 | z = Wx + Uh + self.b
446 |
447 | z_i = z[:, :self.output_dim]
448 | z_f = z[:, self.output_dim: 2 * self.output_dim]
449 | z_c = z[:, 2 * self.output_dim: 3 * self.output_dim]
450 | z_o = z[:, 3 * self.output_dim:]
451 |
452 | i = self.inner_activation(z_i)
453 | f = self.inner_activation(z_f)
454 | c = f * c_tm1 + i * self.activation(z_c)
455 | o = self.inner_activation(z_o)
456 |
457 | if 0 < self.zoneout_c < 1:
458 | c = zoneout(self.zoneout_c, c_tm1, c,
459 | noise_shape=(self.output_dim,))
460 |
461 | # this is returning a lot of nan
462 | new_c = self._layer_norm(c, 'new_c')
463 |
464 | h = o * self.activation(new_c)
465 | if 0 < self.zoneout_h < 1:
466 | h = zoneout(self.zoneout_h, h_tm1, h,
467 | noise_shape=(self.output_dim,))
468 |
469 | return h, [h, c]
470 |
471 | def get_config(self):
472 | config = {'layer_norm': self.layer_norm,
473 | 'mi': self.mi,
474 | 'zoneout_h': self.zoneout_h,
475 | 'zoneout_c': self.zoneout_c
476 | }
477 |
478 | base_config = super(LSTM, self).get_config()
479 | return dict(list(base_config.items()) + list(config.items()))
480 |
481 |
482 | def recurrent(output_dim, model='keras_lstm', activation='tanh',
483 | regularizer=None, dropout=0., **kwargs):
484 | if model == 'rnn':
485 | return keras_layers.SimpleRNN(output_dim, activation=activation,
486 | W_regularizer=regularizer,
487 | U_regularizer=regularizer,
488 | dropout_W=dropout, dropout_U=dropout, consume_less='gpu',
489 | **kwargs)
490 | if model == 'gru':
491 | return keras_layers.GRU(output_dim, activation=activation,
492 | W_regularizer=regularizer,
493 | U_regularizer=regularizer, dropout_W=dropout,
494 | dropout_U=dropout,
495 | consume_less='gpu', **kwargs)
496 | if model == 'keras_lstm':
497 | return keras_layers.LSTM(output_dim, activation=activation,
498 | W_regularizer=regularizer,
499 | U_regularizer=regularizer,
500 | dropout_W=dropout, dropout_U=dropout,
501 | consume_less='gpu', **kwargs)
502 | if model == 'rhn':
503 | return RHN(output_dim, depth=1,
504 | bias_init=highway_bias_initializer,
505 | activation=activation, layer_norm=False, ln_gain_init='one',
506 | ln_bias_init='zero', mi=False,
507 | W_regularizer=regularizer, U_regularizer=regularizer,
508 | dropout_W=dropout, dropout_U=dropout, consume_less='gpu',
509 | **kwargs)
510 |
511 | if model == 'lstm':
512 | return LSTM(output_dim, activation=activation,
513 | W_regularizer=regularizer, U_regularizer=regularizer,
514 | dropout_W=dropout, dropout_U=dropout,
515 | consume_less='gpu', **kwargs)
516 | raise ValueError('model %s was not recognized' % model)
517 |
518 |
519 | if __name__ == "__main__":
520 | from keras.models import Sequential
521 | from keras.utils.visualize_util import plot
522 |
523 | model = Sequential()
524 | model.add(RHN(10, input_dim=2, depth=2, layer_norm=True))
525 | # plot(model)
526 |
--------------------------------------------------------------------------------
/core/layers_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import keras.backend as K
5 | import tensorflow as tf
6 |
7 | from keras import activations, initializations, regularizers
8 | from keras.layers import GRU, SimpleRNN
9 | from keras.layers import LSTM as keras_LSTM
10 |
11 |
12 | def highway_bias_initializer(shape, name=None):
13 | return -2 * initializations.one(shape, name=name)
14 |
15 |
16 | def layer_normalization(x, gain, bias, epsilon=1e-5):
17 |     # tf.nn.moments returns (mean, variance); normalize over the feature
18 |     # axis, then apply the learned gain and bias
19 |     mean, variance = tf.nn.moments(x, [1], keep_dims=True)
20 |     x_normed = (x - mean) / K.sqrt(variance + epsilon) * gain + bias
21 |     return x_normed
20 |
21 |
22 | def multiplicative_integration_init(shape, alpha_init='one',
23 | beta1_init='one', beta2_init='one',
24 | name='mi', has_input=True):
25 | beta1 = initializations.get(beta1_init)(shape, name='%s_beta1' % name)
26 | if has_input:
27 | alpha = initializations.get(alpha_init)(shape, name='%s_alpha' % name)
28 | beta2 = initializations.get(beta2_init)(shape, name='%s_beta2' % name)
29 | return alpha, beta1, beta2
30 |
31 | return beta1
32 |
33 |
34 | def zoneout(level, h_tm1, h, noise_shape):
35 | '''Apply a zoneout function to preserve a fraction of values from h_tm1 in
36 | h.'''
37 | h_diff = h - h_tm1
38 | h = K.in_train_phase(K.dropout(h_diff,
39 | level,
40 | noise_shape=noise_shape), h_diff)
41 | h = h * (1. - level) + h_tm1
42 | return h
43 |
44 |
45 | def multiplicative_integration(Wx, Uz, params, has_input=True):
46 | if has_input:
47 | alpha, beta1, beta2 = params
48 | return alpha * Wx * Uz + beta1 * Uz + beta2 * Wx
49 |
50 | beta1 = params
51 | return beta1 * Uz
52 |
53 |
54 | def to_dense(x):
55 | if K.is_sparse(x):
56 | return tf.sparse_tensor_to_dense(x, default_value=-1)
57 | return x
58 |
59 |
60 | def to_dense_output_shape(input_shape):
61 | return input_shape
62 |
63 |
64 | LN = layer_normalization
65 | mi = multiplicative_integration
66 | mi_init = multiplicative_integration_init
67 |
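68 | # Usage sketch: inside an RNN step function, keep a fraction `level` of
69 | # the previous state (see the LSTM layer in core/layers.py):
70 | #
71 | #   h = zoneout(0.1, h_tm1, h, noise_shape=(output_dim,))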
--------------------------------------------------------------------------------
/core/metrics.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 |
4 | def ler(y_true, y_pred, **kwargs):
5 | """
6 | Label Error Rate. For more information see 'tf.edit_distance'
7 | """
8 | return tf.reduce_mean(tf.edit_distance(y_pred, y_true, **kwargs))
9 |
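10 | # Usage sketch: both arguments must be sparse tensors, e.g. the sparse
11 | # 'labels' input and the 'decoder' output of core.models.ctc_model:
12 | #
13 | #   model.compile(..., metrics={'decoder': ler})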
--------------------------------------------------------------------------------
/core/models.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import core.ctc_utils as ctc_utils
6 | from utils.hparams import HParams
7 |
8 | import keras
9 | import keras.backend as K
10 | from keras.initializations import uniform
11 | from keras.activations import relu
12 |
13 | from keras.models import Model
14 |
15 | from keras.layers import Input
16 | from keras.layers import GaussianNoise
17 | from keras.layers import TimeDistributed
18 | from keras.layers import Dense
19 | from keras.layers import Activation
20 | from keras.layers import SimpleRNN
19 | from .layers import LSTM
20 | from keras.layers import Masking
21 | from keras.layers import Bidirectional
22 | from keras.layers import Lambda
23 | from keras.layers import Dropout
24 | from keras.layers import merge
25 |
26 | from keras.regularizers import l1, l2, l1l2
27 |
28 | from .layers import recurrent
29 |
30 |
31 | def ctc_model(inputs, output, **kwargs):
32 | """ Given the input and output returns a model appending ctc_loss, the
33 | decoder, labels, and inputs_length
34 |
35 | # Arguments
36 | see core.ctc_utils.layer_utils.decode for more arguments
37 | """
38 |
39 | # Define placeholders
40 | labels = Input(name='labels', shape=(None,), dtype='int32', sparse=True)
41 | inputs_length = Input(name='inputs_length', shape=(None,), dtype='int32')
42 |
43 | # Define a decoder
44 | dec = Lambda(ctc_utils.decode, output_shape=ctc_utils.decode_output_shape,
45 | arguments={'is_greedy': True}, name='decoder')
46 | y_pred = dec([output, inputs_length])
47 |
48 | ctc = Lambda(ctc_utils.ctc_lambda_func, output_shape=(1,), name="ctc")
49 | # Define loss as a layer
50 | loss = ctc([output, labels, inputs_length])
51 |
52 | return Model(input=[inputs, labels, inputs_length], output=[loss, y_pred])
53 |
54 |
55 | def graves2006(num_features=26, num_hiddens=100, num_classes=28, std=.6):
56 | """ Implementation of Graves' model
57 | Reference:
58 | [1] Graves, Alex, et al. "Connectionist temporal classification:
59 | labelling unsegmented sequence data with recurrent neural networks."
60 | Proceedings of the 23rd international conference on Machine learning.
61 | ACM, 2006.
62 | """
63 |
64 | x = Input(name='inputs', shape=(None, num_features))
65 | o = x
66 |
67 | o = GaussianNoise(std)(o)
68 | o = Bidirectional(LSTM(num_hiddens,
69 | return_sequences=True,
70 | consume_less='gpu'))(o)
71 | o = TimeDistributed(Dense(num_classes))(o)
72 |
73 | return ctc_model(x, o)
74 |
75 |
76 | def eyben(num_features=39, num_hiddens=[78, 120, 27], num_classes=28):
77 | """ Implementation of Eybens' model
78 | Reference:
79 | [1] Eyben, Florian, et al. "From speech to letters-using a novel neural
80 | network architecture for grapheme based asr." Automatic Speech
81 | Recognition & Understanding, 2009. ASRU 2009. IEEE Workshop on. IEEE,
82 | 2009.
83 | """
84 |
85 | assert len(num_hiddens) == 3
86 |
87 | x = Input(name='inputs', shape=(None, num_features))
88 | o = x
89 |
90 | if num_hiddens[0]:
91 | o = TimeDistributed(Dense(num_hiddens[0]))(o)
92 | if num_hiddens[1]:
93 | o = Bidirectional(LSTM(num_hiddens[1],
94 | return_sequences=True,
95 | consume_less='gpu'))(o)
96 | if num_hiddens[2]:
97 | o = Bidirectional(LSTM(num_hiddens[2],
98 | return_sequences=True,
99 | consume_less='gpu'))(o)
100 |
101 | o = TimeDistributed(Dense(num_classes))(o)
102 |
103 | return ctc_model(x, o)
104 |
105 |
106 | def maas(num_features=81, num_classes=29, num_hiddens=1824, dropout=0.1,
107 | max_value=20):
108 | """ Maas' model.
109 | Reference:
110 | [1] Maas, Andrew L., et al. "Lexicon-Free Conversational Speech
111 | Recognition with Neural Networks." HLT-NAACL. 2015.
112 | """
113 |
114 | x = Input(name='inputs', shape=(None, num_features))
115 | o = x
116 |
117 | def clipped_relu(x):
118 | return relu(x, max_value=max_value)
119 |
120 | # First layer
121 | o = TimeDistributed(Dense(num_hiddens))(o)
122 | o = TimeDistributed(Activation(clipped_relu))(o)
123 |
124 | # Second layer
125 | o = TimeDistributed(Dense(num_hiddens))(o)
126 | o = TimeDistributed(Activation(clipped_relu))(o)
127 |
128 | # Third layer
129 | o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
130 | dropout_W=dropout,
131 | activation=clipped_relu,
132 | init='he_normal'), merge_mode='sum')(o)
133 |
134 | # Fourth layer
135 | o = TimeDistributed(Dense(num_hiddens))(o)
136 | o = TimeDistributed(Activation(clipped_relu))(o)
137 |
138 | # Fifth layer
139 | o = TimeDistributed(Dense(num_hiddens))(o)
140 | o = TimeDistributed(Activation(clipped_relu))(o)
141 |
142 | # Output layer
143 | o = TimeDistributed(Dense(num_classes))(o)
144 |
145 | return ctc_model(x, o)
146 |
147 |
148 | def deep_speech(num_features=81, num_classes=29, num_hiddens=2048, dropout=0.1,
149 | max_value=20):
150 | """ Deep Speech model.
151 |
152 | Contains five layers: 3 FC - BRNN - 1 FC
153 | Dropout only applied to fully connected layers (between 5% to 10%)
154 |
155 | Note:
156 | * We are not translating the raw audio files by 5 ms (Sec 2.1 in [1])
157 | * We are not striding the RNN to halve the timesteps (Sec 3.3 in [1])
158 | * We are not using frames of context
159 | * Their output contains {a, ..., z, space, apostrophe, blank}
160 | Experiment 5.1: Conversational speech: Switchboard Hub5'00 (full)
161 | * Input - 80 linearly spaced log filter banks and an energy term. The
162 | filter banks are computed over windows of 20ms strided by 10ms.
163 | * Speaker adaptation - spectral features are normalized on a per
164 | speaker basis.
165 | * Hidden units: {2304, 2048}
166 | * Ensemble of 4 networks
167 | Experiment 5.2: Noisy speech
168 | * Input - 160 linearly spaced log filter banks. The filter banks are
169 | computed over windows of 20ms strided by 10ms. Global mean and standard
170 | deviation over training set normalization
171 | * Speaker adaptation - none
172 | * Hidden units: 2560
173 | * Ensemble of 6 networks
174 | Reference:
175 | [1] HANNUN, A. Y. et al. Deep Speech: Scaling up end-to-end speech
176 | recognition. arXiv, 2014.
177 | """
178 | x = Input(name='inputs', shape=(None, num_features))
179 | o = x
180 |
181 | def clipped_relu(x):
182 | return relu(x, max_value=max_value)
183 |
184 | # First layer
185 | o = TimeDistributed(Dense(num_hiddens))(o)
186 | o = TimeDistributed(Activation(clipped_relu))(o)
187 | o = TimeDistributed(Dropout(dropout))(o)
188 |
189 | # Second layer
190 | o = TimeDistributed(Dense(num_hiddens))(o)
191 | o = TimeDistributed(Activation(clipped_relu))(o)
192 | o = TimeDistributed(Dropout(dropout))(o)
193 |
194 | # Third layer
195 | o = TimeDistributed(Dense(num_hiddens))(o)
196 | o = TimeDistributed(Activation(clipped_relu))(o)
197 | o = TimeDistributed(Dropout(dropout))(o)
198 |
199 | # Fourth layer
200 | o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
201 | dropout_W=dropout,
202 | activation=clipped_relu,
203 | init='he_normal'), merge_mode='sum')(o)
204 | o = TimeDistributed(Dropout(dropout))(o)
205 |
206 | # Fifth layer
207 | o = TimeDistributed(Dense(num_hiddens))(o)
208 | o = TimeDistributed(Activation(clipped_relu))(o)
209 | o = TimeDistributed(Dropout(dropout))(o)
210 |
211 | # Output layer
212 | o = TimeDistributed(Dense(num_classes))(o)
213 |
214 | return ctc_model(x, o)
215 |
216 |
217 | def brsmv1(num_features=39, num_classes=28, num_hiddens=256, num_layers=5,
218 | dropout=0.2, zoneout=0., input_dropout=False,
219 | input_std_noise=.0, weight_decay=1e-4, residual=None,
220 | layer_norm=None, mi=None, activation='tanh'):
221 | """ BRSM v1.0
222 | Improved features:
223 | * Residual connection
224 | * Variational Dropout
225 | * Zoneout
226 | * Layer Normalization
227 | * Multiplicative Integration
228 | Note:
229 | Dropout, zoneout, and weight decay are tied across layers in order
230 | to minimize the number of hyperparameters
231 | Reference:
232 | [1] Gal, Y, "A Theoretically Grounded Application of Dropout in
233 | Recurrent Neural Networks", 2015.
234 | [2] Graves, Alex, Abdel-rahman Mohamed, and Geoffrey Hinton. "Speech
235 | recognition with deep recurrent neural networks", 2013.
236 | [3] Krueger, David, et al. "Zoneout: Regularizing rnns by randomly
237 | preserving hidden activations", 2016.
238 | [4] Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer
239 | normalization.", 2016.
240 | [5] Wu, Yuhuai, et al. "On multiplicative integration with recurrent
241 | neural networks." Advances In Neural Information Processing Systems.
242 | 2016.
243 | [6] Wu, Yonghui, et al. "Google's Neural Machine Translation System:
244 | Bridging the Gap between Human and Machine Translation.", 2016.
245 | """
246 |
247 | x = Input(name='inputs', shape=(None, num_features))
248 | o = x
249 |
250 | if input_std_noise is not None:
251 | o = GaussianNoise(input_std_noise)(o)
252 |
253 | if residual is not None:
254 | o = TimeDistributed(Dense(num_hiddens*2,
255 | W_regularizer=l2(weight_decay)))(o)
256 |
257 | if input_dropout:
258 | o = Dropout(dropout)(o)
259 |
260 | for i, _ in enumerate(range(num_layers)):
261 | new_o = Bidirectional(LSTM(num_hiddens,
262 | return_sequences=True,
263 | W_regularizer=l2(weight_decay),
264 | U_regularizer=l2(weight_decay),
265 | dropout_W=dropout,
266 | dropout_U=dropout,
267 | zoneout_c=zoneout,
268 | zoneout_h=zoneout,
269 | mi=mi,
270 | layer_norm=layer_norm,
271 | activation=activation))(o)
272 |
273 | if residual is not None:
274 | o = merge([new_o, o], mode=residual)
275 | else:
276 | o = new_o
277 |
278 | o = TimeDistributed(Dense(num_classes,
279 | W_regularizer=l2(weight_decay)))(o)
280 |
281 | return ctc_model(x, o)
282 |
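283 | # Usage sketch: instantiate an architecture and inspect it (the arguments
284 | # shown are the defaults above):
285 | #
286 | #   model = brsmv1(num_features=39, num_classes=28)
287 | #   model.summary()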
--------------------------------------------------------------------------------
/data/download_brsmv1.sh:
--------------------------------------------------------------------------------
1 | echo "Downloading the brsmv1 pre-trained model:"
2 | mkdir -p models/
3 | wget -c -q --show-progress -O models/brsmv1.h5 https://www.dropbox.com/s/ink8zxhzysxvzxa/best_ptbr.h5?dl=0
4 |
--------------------------------------------------------------------------------
/data/download_datasets.sh:
--------------------------------------------------------------------------------
1 | echo "Downloading pt-br datasets. This may take a while"
2 | echo "Downloading Sid dataset:"
3 | wget -c -q --show-progress -O ./sid.tar.gz https://www.dropbox.com/s/0wxlweatglrr7wl/sid.tar.gz?dl=0
4 | echo "Downloading VoxForge dataset:"
5 | wget -c -q --show-progress -O ./voxforge-ptbr.tar.gz https://www.dropbox.com/s/wrguetal6xmrgta/voxforge-ptbr.tar.gz?dl=0
6 | echo "Downloading LapsBenchmark1.4 dataset:"
7 | wget -c -q --show-progress -O ./lapsbm.tar.gz https://www.dropbox.com/s/8aqm9ktulmnry6d/lapsbm.tar.gz?dl=0
8 |
9 | echo "Extracting Sid dataset..."
10 | mkdir -p sid
11 | cd sid; tar -xzf ../sid.tar.gz; cd ..
12 |
13 | echo "Extracting VoxForge dataset..."
14 | mkdir -p voxforge
15 | cd voxforge; tar -xzf ../voxforge-ptbr.tar.gz; cd ..
16 |
17 | echo "Extracting LapsBenchmark1.4 dataset..."
18 | mkdir -p lapsbm
19 | cd lapsbm; tar -xzf ../lapsbm.tar.gz; cd ..
20 |
21 | echo "Finished."
22 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from utils.generic_utils import safe_mkdirs
4 | import os
5 |
6 | DT_ABSPATH = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.datasets')
7 | safe_mkdirs(DT_ABSPATH)
8 |
9 | from datasets.dataset_parser import DatasetParser
10 | from datasets.sid import Sid
11 | from datasets.lapsbm import LapsBM
12 | from datasets.voxforge import VoxForge
13 | from datasets.cslu import CSLU
14 | from datasets.dummy import Dummy
15 | from datasets.brsd import BRSD
16 |
--------------------------------------------------------------------------------
/datasets/brsd.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | from datasets import DatasetParser
6 | from datasets import LapsBM
7 |
8 | from utils.generic_utils import get_from_module
9 |
10 |
11 | class BRSD(DatasetParser):
12 | """ Brazilian Portuguese Speech dataset reader and parser
13 |
14 | This dataset is a combination of four smaller datasets (voxforge, lapsbm,
15 | sid, and cslu spoltech port). The dataset was divided in the following
16 | way:
17 | * Train: voxforge, sid, and cslu spoltech port
18 | * Valid: 5 women and 15 men from LapsBM
19 | * Test: 5 women and 10 men from LapsBM (no overlap with the valid set
20 | in either speaker or spoken utterance)
21 |
22 | After cleaning (removing labels with zero length and labels containing
23 | numeric digits, e.g., 4 instead of four), the training set contains
24 | 11702 utterances from 425 speakers.
25 |
26 | """
27 |
28 | def __init__(self, dataset_dir=None, name='brsd', **kwargs):
29 |
30 | dataset_dir = dataset_dir or {'lapsbm': None,
31 | 'voxforge': None,
32 | 'sid': None,
33 | 'cslu': None}
34 |
35 | super(BRSD, self).__init__(dataset_dir, name, **kwargs)
36 |
37 | @property
38 | def dataset_dir(self):
39 | """Filepath to the dataset directory"""
40 | return self._dataset_dir
41 |
42 | @dataset_dir.setter
43 | def dataset_dir(self, value):
44 | """Filepath to the dataset directory"""
45 |
46 | if value is None:
47 | raise ValueError("You must set the variable dataset_dir"
48 | " (the location of dataset) before continue")
49 |
50 | if not isinstance(value, dict):
51 | raise ValueError("dataset_dir must be a dictionary")
52 |
53 | for key in ('lapsbm', 'voxforge', 'sid'):
54 | if key not in value:
55 | raise KeyError("dataset_dir must have the key %s" % key)
56 |
57 | if 'cslu' not in value:
58 | self._logger.warning('CSLU not found. Ignoring it.')
59 |
60 | self._dataset_dir = value
61 |
62 | def _iter(self):
63 |
64 | for name, path in self.dataset_dir.items():
65 |
66 | if name == 'lapsbm':
67 | continue
68 |
69 | try:
70 | dataset_cls = get_from_module('datasets*', name, regex=True)
71 | dataset = dataset_cls(dataset_dir=path)
72 |
73 | for d in dataset._iter():
74 | yield {'duration': d['duration'],
75 | 'input': d['input'],
76 | 'label': d['label'],
77 | 'speaker': '%s_%s' % (str(dataset), d['speaker']),
78 | 'dataset': 'train'}
79 | except ValueError, e:
80 | self._logger.warning('Skipping dataset %s: %s' % (name, e.message))
81 | # Test and valid set
82 | lapsbm = LapsBM(dataset_dir=self.dataset_dir['lapsbm'], split=True)
83 | for d in lapsbm._iter():
84 | yield {'duration': d['duration'],
85 | 'input': d['input'],
86 | 'label': d['label'],
87 | 'speaker': '%s_%s' % (str(lapsbm), d['speaker']),
88 | 'dataset': d['dataset']}
89 |
90 | def _report(self, dl):
91 | report = '''General information:
92 | Number of utterances: %d
93 | Total size (in seconds) of utterances: %.f
94 | Number of speakers: %d''' % (len(dl['input']),
95 | sum(dl['duration']),
96 | len(set(dl['speaker'])))
97 |
98 | return report
99 |
--------------------------------------------------------------------------------
/datasets/cslu.py:
--------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 |
9 | class CSLU(DatasetParser):
10 | """ CSLU Spoltech Port dataset reader and parser
11 |
12 | More about the dataset: https://catalog.ldc.upenn.edu/LDC2006S16
13 | """
14 |
15 | def __init__(self, dataset_dir=None, name='cslu', **kwargs):
16 |
17 | dataset_dir = dataset_dir or 'data/cslu'
18 |
19 | super(CSLU, self).__init__(dataset_dir, name, **kwargs)
20 |
21 | def _iter(self):
22 | trans_directory = os.path.join(self.dataset_dir, 'trans')
23 |
24 | for speaker_path in os.listdir(trans_directory):
25 |
26 | root_path = os.path.join(os.path.abspath(trans_directory),
27 | speaker_path)
28 |
29 | if not os.path.isdir(os.path.join(root_path)):
30 | continue
31 |
32 | labels_files = os.listdir(root_path)
33 |
34 | for labels_file in labels_files:
35 |
36 | label = codecs.open(
37 | os.path.join(root_path, labels_file), 'r',
38 | 'latin-1').read().strip().lower()
39 |
40 | audio_file = os.path.join(os.path.abspath(self.dataset_dir),
41 | 'speech', speaker_path,
42 | labels_file[:-4])
43 |
44 | audio_file = audio_file + '.wav'
45 | speaker_id = speaker_path
46 |
47 | try:
48 | duration = librosa.audio.get_duration(filename=audio_file)
49 | except IOError:
50 | self._logger.error('File %s not found' % audio_file)
51 | continue
52 |
53 | yield {'duration': duration,
54 | 'input': audio_file,
55 | 'label': label,
56 | 'speaker': speaker_id}
57 |
58 | def _report(self, dl):
59 | report = '''General information:
60 | Number of utterances: %d
61 | Total size (in seconds) of utterances: %.f
62 |     Number of speakers: %d''' % (len(dl['input']), sum(dl['duration']),
63 | len(set(dl['speaker'])))
64 |
65 | return report
66 |
--------------------------------------------------------------------------------
/datasets/dataset_generator.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from keras.preprocessing.image import Iterator
4 | from keras.preprocessing.sequence import pad_sequences
5 |
6 | import scipy
7 | import librosa
8 | import h5py
9 | import numpy as np
10 | import codecs
11 | import json
12 | import os
13 |
14 | import time
15 |
16 | from preprocessing import audio, text
17 | from utils import generic_utils as utils
18 |
19 | import logging
20 |
21 |
22 | class DatasetGenerator(object):
23 |     """ Dataset generator that handles several forms of input and returns
24 |     an iterator over it. Only works for a CTC model
25 |
26 |     # Arguments
27 |         input_parser: instance of Feature [preprocessing.audio.Feature]
28 |             feature that is applied to each audio file (or audio data)
29 |         label_parser: instance of Parser [preprocessing.text.Parser].
30 |             parser that is applied to each label
31 |         batch_size: number of samples per batch
32 |         shuffle: reorder indexes at each epoch (avoids some bias in training)
33 |         seed: seed for the random generator (default None)
34 |     """
35 |
36 | def __init__(self, input_parser=None, label_parser=None, batch_size=32,
37 | shuffle=True, seed=None, mode='train'):
38 | self._logger = logging.getLogger('%s.%s' % (__name__,
39 | self.__class__.__name__))
40 | self.input_parser = input_parser
41 | self.label_parser = label_parser
42 | self.batch_size = batch_size
43 | self.shuffle = shuffle
44 | self.seed = seed
45 | self.mode = mode
46 |
47 | def flow_from_fname(self, fname, datasets=None):
48 |         """ Returns a specific iterator given the filename
49 |
50 |         # Arguments
51 |             datasets: str or list. If str, returns a single iterator;
52 |                 otherwise returns one iterator per dataset
53 |
54 |         # Inputs
55 |             fname: path to a file.
56 |                 *.h5 (HDF5 format)
57 |                 *.json (JSON format)
58 |
59 |         # Outputs
60 |             If fname is:
61 |                 HDF5 format: H5Iterator
62 |                 JSON format: JSONIterator
63 |         """
64 | out = None
65 | datasets = datasets or ['/']
66 | if type(datasets) not in (set, list):
67 | datasets = [datasets]
68 |
69 | if h5py.is_hdf5(fname):
70 | h5_f = h5py.File(fname, 'r')
71 | out = [self.flow_from_h5_group(h5_f[dataset])
72 | for dataset in datasets]
73 |
74 | ext = os.path.splitext(fname)[1]
75 | if ext == '.json':
76 | out = [self.flow_from_json(fname, dataset) for dataset in datasets]
77 |
78 | if out is None:
79 | raise ValueError("Extension not recognized")
80 |
81 | if len(out) == 1:
82 | return out[0]
83 | return out
84 |
85 | def flow_from_json(self, fname, dataset=None):
86 | """ Returns JSONIterator given the filename"""
87 | return JSONIterator(
88 | fname, dataset, batch_size=self.batch_size,
89 | shuffle=self.shuffle, seed=self.seed,
90 | input_parser=self.input_parser,
91 | label_parser=self.label_parser,
92 | mode=self.mode)
93 |
94 | def flow_from_dl(self, dl, dataset=None):
95 | """ Return DictListIterator given a list of dictionaries. Each
96 | dictionary must have the keys 'input' and 'label'
97 | """
98 | return DictListIterator(dl, dataset, batch_size=self.batch_size,
99 | shuffle=self.shuffle, seed=self.seed,
100 | input_parser=self.input_parser,
101 | label_parser=self.label_parser,
102 | mode=self.mode)
103 |
104 | def flow_from_h5_group(self, h5_group=None):
105 |         """ Returns H5Iterator given an h5 group from an HDF5 file
106 | """
107 | return H5Iterator(h5_group, batch_size=self.batch_size,
108 | shuffle=self.shuffle, seed=self.seed,
109 | input_parser=self.input_parser,
110 | label_parser=self.label_parser,
111 | mode=self.mode)
112 |
113 | def flow_from_h5_file(self, h5_file, dataset='/'):
114 | h5_f = h5py.File(h5_file, 'r')
115 | return H5Iterator(h5_f[dataset], batch_size=self.batch_size,
116 | shuffle=self.shuffle, seed=self.seed,
117 | input_parser=self.input_parser,
118 | label_parser=self.label_parser,
119 | mode=self.mode)
120 |
121 | def flow(self, inputs, labels):
122 | return DatasetIterator(inputs, labels, batch_size=self.batch_size,
123 | shuffle=self.shuffle, seed=self.seed,
124 | input_parser=self.input_parser,
125 | label_parser=self.label_parser,
126 | mode=self.mode)
127 |
128 |
129 | class DatasetIterator(Iterator):
130 |
131 | def __init__(self, inputs, labels=None, batch_size=32, shuffle=False,
132 | seed=None, input_parser=None, label_parser=None,
133 | standarize=None, mode='train'):
134 |         """ DatasetIterator iterates over a dataset in batches and does some
135 |         preprocessing on inputs and labels
136 |
137 |         # Arguments
138 |             inputs: a list of ndarray
139 |             labels: a list of str or ndarray
140 |             batch_size: size of each batch
141 |             shuffle: if True, the dataset indexes will be shuffled after
142 |                 each epoch
143 |             seed: seed for the random generator
144 |             input_parser: instance of Feature
145 |                 [preprocessing.audio.Feature]
146 |                 feature that is applied to each ndarray in the batch
147 |             label_parser: instance of Parser [preprocessing.text.Parser].
148 |                 parser that is applied to each label in the batch
149 |             standarize: if a tuple (mean, std), the inputs will be
150 |                 standardized
151 |             mode: if 'predict', only the inputs are generated
152 |         """
153 |
154 | if labels is not None and len(inputs) != len(labels):
155 | raise ValueError('inputs and labels '
156 | 'should have the same length. '
157 | 'Found: len(inputs) = %s, len(labels) = %s' %
158 | (len(inputs), len(labels)))
159 | self._logger = logging.getLogger('%s.%s' % (__name__,
160 | self.__class__.__name__))
161 | self.inputs = inputs
162 | self.labels = labels
163 |
164 | self.input_parser = input_parser
165 | self.label_parser = label_parser
166 |
167 | self.standarize = standarize
168 | self.mode = mode
169 |
170 | if self.input_parser is not None:
171 | logging.warning('Feature extractor is not None. It may slow down'
172 | + ' training')
173 |
174 | super(DatasetIterator, self).__init__(len(inputs), batch_size,
175 | shuffle, seed)
176 |
177 | @property
178 | def len(self):
179 | """ Return the total size of dataset
180 | """
181 | return len(self.inputs)
182 |
183 | def next(self):
184 | """ Iterates over batches
185 |
186 | # Outputs
187 |             Returns a tuple (input, output) that can be fed to a CTC model
188 | input: is a list containing the inputs, labels and sequence
189 | length for the current batch
190 | output: is a list containing a vector of zeros (fake data for
191 | the decoder) and the batch labels for the decoder of a CTC
192 | model
193 | """
194 |
195 |         # Copied from Keras' DirectoryIterator
196 | with self.lock:
197 | index_array, current_index, current_batch_size = next(
198 | self.index_generator)
199 |
200 | index_array.sort()
201 |
202 | index_array_list = index_array.tolist()
203 |
204 | batch_inputs, batch_inputs_len = self._make_in(
205 | self.inputs[index_array_list], current_batch_size)
206 |
207 | if self.labels is not None:
208 | batch_labels = self._make_out(self.labels[index_array_list],
209 | current_batch_size)
210 | else:
211 | batch_labels = None
212 |
213 | return self._make_in_out(batch_inputs, batch_labels, batch_inputs_len)
214 |
215 | def _make_in_out(self, batch_inputs, batch_labels, batch_inputs_len=None):
216 | # if label is not provided output is not necessary
217 | if batch_labels is None:
218 | return [batch_inputs, batch_inputs_len]
219 |
220 | return ([batch_inputs, batch_labels, batch_inputs_len],
221 | [np.zeros((batch_inputs.shape[0],)), batch_labels])
222 |
223 | def _make_in(self, inputs, batch_size=None):
224 | if self.input_parser is not None:
225 | inputs = np.asarray([self.input_parser(i) for i in inputs])
226 |
227 | batch_inputs = pad_sequences(inputs, dtype='float32', padding='post')
228 |
229 | if self.standarize:
230 | mean, std = self.standarize
231 | batch_inputs -= mean
232 |             batch_inputs /= (std + 1e-8)  # eps avoids division by zero
233 |
234 | batch_inputs_len = np.asarray([i.shape[0] for i in inputs])
235 | return batch_inputs, batch_inputs_len
236 |
237 | def _make_out(self, labels, batch_size=None):
238 | if self.labels is None or self.mode == 'predict':
239 | return None
240 |
241 | if self.label_parser is not None:
242 | labels = [self.label_parser(l) for l in labels]
243 |
244 | rows, cols, data = [], [], []
245 |
246 | for row, label in enumerate(labels):
247 | cols.extend(range(len(label)))
248 | rows.extend(len(label) * [row])
249 | data.extend(label)
250 |
251 | return scipy.sparse.coo_matrix((data, (rows, cols)), dtype='int32')
252 |
253 |
254 | class H5Iterator(DatasetIterator):
255 |
256 | def __init__(self, h5group, **kwargs):
257 |
258 | inputs = h5group['inputs']
259 | labels = h5group['labels']
260 |
261 | if kwargs.get('label_parser') is None:
262 | raise ValueError("label_parser must be set")
263 |
264 | self.num_feats = None
265 | if 'num_feats' in inputs.attrs.keys():
266 | self.num_feats = inputs.attrs['num_feats']
267 |
268 | self.durations = h5group['durations']
269 |
270 | super(H5Iterator, self).__init__(inputs, labels, **kwargs)
271 |
272 | def _make_in(self, inputs, batch_size=None):
273 |
274 | if self.num_feats is not None:
275 | inputs = [i.reshape((-1, self.num_feats)) for i in inputs]
276 |
277 | return super(H5Iterator, self)._make_in(inputs)
278 |
279 |
280 | class JSONIterator(DatasetIterator):
281 |
282 | def __init__(self, fname, dataset=None, **kwargs):
283 |
284 | self._logger = logging.getLogger('%s.%s' % (__name__,
285 | self.__class__.__name__))
286 |
287 | kwargs.setdefault('input_parser', audio.raw)
288 |
289 | if kwargs.get('input_parser') is None:
290 | raise ValueError("input_parser must be set")
291 |
292 | if kwargs.get('label_parser') is None:
293 | raise ValueError("label_parser must be set")
294 |
295 | with codecs.open(fname, 'r', encoding='utf8') as f:
296 | ld = json.load(f)
297 |
298 | data = utils.ld2dl(ld)
299 |
300 | if dataset and 'dataset' not in data:
301 | self._logger.warning('No dataset key found. Falling back to None')
302 | dataset = None
303 |
304 | if dataset:
305 | inputs = np.array([i for i, d in zip(
306 | data['input'], data['dataset']) if d == dataset])
307 | labels = np.array([l for l, d in zip(
308 | data['label'], data['dataset']) if d == dataset])
309 | else:
310 | inputs = np.array(data['input'])
311 | labels = np.array(data['label'])
312 |
313 | super(JSONIterator, self).__init__(inputs, labels, **kwargs)
314 |
315 | self.durations = np.array(data['duration'])
316 |
317 |
318 | class DictListIterator(DatasetIterator):
319 |
320 | def __init__(self, dict_list, dataset=None, **kwargs):
321 |
322 | kwargs.setdefault('input_parser', audio.raw)
323 |
324 | if kwargs.get('input_parser') is None:
325 | raise ValueError("input_parser must be set")
326 |
327 | if kwargs.get('label_parser') is None:
328 | raise ValueError("label_parser must be set")
329 |
330 | if dataset:
331 | dict_list = self._get_by_dataset(dict_list, dataset)
332 |
333 |         inputs = np.array(dict_list['input'])
334 | labels = np.array(dict_list['label'])
335 |
336 | super(DictListIterator, self).__init__(inputs, labels, **kwargs)
337 |
338 | self.durations = np.array(dict_list['duration'])
339 |
340 | def _get_by_dataset(self, dl, dataset):
341 | mask = [i for i, d in enumerate(dl['dataset']) if d == dataset]
342 | return {k: np.array(v)[mask] for k, v in dl.iteritems()
343 | if k != 'dataset'}
344 |
--------------------------------------------------------------------------------
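A hedged sketch of wiring DatasetGenerator to a preprocessed HDF5 file (the file name is illustrative, and simple_char_parser is assumed to be exposed by preprocessing.text, as the CLI defaults elsewhere in the repo suggest):

    from datasets.dataset_generator import DatasetGenerator
    from preprocessing import text

    data_gen = DatasetGenerator(input_parser=None,
                                label_parser=text.simple_char_parser,
                                batch_size=32, shuffle=True, seed=0)

    # One iterator per requested group inside the HDF5 file
    train_flow, valid_flow = data_gen.flow_from_fname(
        'data.h5', datasets=['train', 'valid'])

    # Each batch is ([inputs, labels, input_lens], [zeros, labels]): the
    # zeros vector is fake data for the CTC loss output and the sparse
    # labels feed both the loss and the decoder
    inputs, outputs = train_flow.next()
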
/datasets/dataset_parser.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import os
4 | import codecs
5 | import json
6 |
7 | import logging
8 | import h5py
9 |
10 | import numpy as np
11 |
12 | from preprocessing import audio, text
13 | from datasets import DT_ABSPATH
14 | from utils.generic_utils import safe_mkdirs, ld2dl
15 |
16 |
17 | class DatasetParser(object):
18 |     '''Reads data from a directory and parses it into a proper format
19 | '''
20 |
21 | def __init__(self, dataset_dir, name=None):
22 | self._logger = logging.getLogger('%s.%s' % (__name__,
23 | self.__class__.__name__))
24 | self.dataset_dir = dataset_dir
25 | self._name = name
26 |
27 | self.default_output_dir = os.path.join(DT_ABSPATH, self.name)
28 |
29 | @property
30 | def dataset_dir(self):
31 | """Filepath to the dataset directory"""
32 | return self._dataset_dir
33 |
34 | @dataset_dir.setter
35 | def dataset_dir(self, value):
36 | if value is None:
37 |             raise ValueError("You must set the variable dataset_dir (the location of the dataset) before continuing")
38 |
39 | if not os.path.isdir(value):
40 | raise ValueError("Dataset directory provided is not a directory")
41 | self._dataset_dir = value
42 |
43 | def _to_ld(self, label_parser=None):
44 |         ''' Transforms the dataset into a list of dictionaries
45 | '''
46 | data = []
47 | for d in self._iter():
48 | if not isinstance(d, dict):
49 |                 raise TypeError("_iter must return a dict")
50 |
51 | for k in ['input', 'label', 'duration']:
52 | if k not in d:
53 |                     raise KeyError("_iter must return a dict with the %s key" % k)
54 |
55 | if not self._is_valid_label(d['label'], label_parser=label_parser):
56 | self._logger.warning(u'File %s has a forbidden label: "%s". Skipping', d['input'], d['label'])
57 | continue
58 |
59 | data.append(d)
60 | return data
61 |
62 |     def to_json(self, fname=None, override=False):
63 |         ''' Parses the entire dataset to a list of dictionaries containing
64 |         at least three keys:
65 | `input`: path to audio file
66 | `duration`: length of the audio
67 | `label`: transcription of the audio
68 | '''
69 | fname = fname or os.path.join(
70 | self.default_output_dir, 'data.json')
71 |
72 | if os.path.exists(fname) and override:
73 | os.remove(fname)
74 |
75 | if not os.path.isdir(os.path.split(fname)[0]):
76 | safe_mkdirs(os.path.split(fname)[0])
77 |
78 | data = self._to_ld()
79 |
80 | with codecs.open(fname, 'w', encoding='utf8') as f:
81 | json.dump(data, f)
82 |
83 | self._logger.info(self._report(ld2dl(data)))
84 |
85 | return fname
86 |
87 | def to_h5(self, fname=None, input_parser=audio.raw, label_parser=None,
88 | split_sets=True, override=False):
89 |         ''' Generates an HDF5 file for the dataset.
90 | Note that this function will calculate the features rather than store
91 | the path to the audio file
92 |
93 | Args
94 | split_sets: if True and dataset is split in several sets (e.g.
95 | train, valid, test) the h5 file will create the corresponding
96 |                 datasets; otherwise no dataset is created
97 | '''
98 | if not issubclass(input_parser.__class__, audio.Feature):
99 | raise TypeError("input_parser must be an instance of audio.Feature")
100 |
101 | fname = fname or os.path.join(self.default_output_dir, 'data.h5')
102 |
103 | if h5py.is_hdf5(fname) and override:
104 | os.remove(fname)
105 |
106 | if not os.path.isdir(os.path.split(fname)[0]):
107 | safe_mkdirs(os.path.split(fname)[0])
108 |
109 | feat_name = str(input_parser)
110 |
111 | data = self._to_ld(label_parser=label_parser)
112 |
113 | if len(data) == 0:
114 | raise IndexError("Data is empty")
115 |
116 | datasets = ['/']
117 | if 'dataset' in data[0]:
118 | datasets = list(set([d['dataset'] for d in data]))
119 |
120 | self._logger.info('Opening %s', fname)
121 | with h5py.File(fname) as f:
122 |
123 | # create all datasets
124 | for dataset in datasets:
125 |
126 | group = f['/']
127 | if dataset != '/':
128 | group = f.create_group(dataset)
129 |
130 | inputs = group.create_dataset(
131 | 'inputs', (0,), maxshape=(None,),
132 | dtype=h5py.special_dtype(vlen=np.dtype('float32')))
133 |
134 | if input_parser.num_feats:
135 | inputs.attrs['num_feats'] = input_parser.num_feats
136 |
137 | group.create_dataset(
138 | 'labels', (0,), maxshape=(None,),
139 | dtype=h5py.special_dtype(vlen=unicode))
140 |
141 | group.create_dataset(
142 | 'durations', (0,), maxshape=(None,))
143 |
144 | for i, d in enumerate(data):
145 |
146 | dataset = '/'
147 | if dataset not in datasets:
148 | dataset = d['dataset']
149 |
150 | # HDF5 pointers
151 | inputs = f[dataset]['inputs']
152 | labels = f[dataset]['labels']
153 | durations = f[dataset]['durations']
154 |
155 | # Data
156 | input_ = input_parser(d['input'])
157 | label = d['label']
158 | duration = d['duration']
159 |
160 | inputs.resize(inputs.shape[0] + 1, axis=0)
161 | inputs[inputs.shape[0] - 1] = input_.flatten().astype('float32')
162 |
163 | labels.resize(labels.shape[0] + 1, axis=0)
164 | labels[labels.shape[0] - 1] = label.encode('utf8')
165 |
166 | durations.resize(durations.shape[0] + 1, axis=0)
167 | durations[durations.shape[0] - 1] = duration
168 |
169 |                 # Flush to disk every 128 samples
170 | if i % 128 == 0:
171 | self._logger.info('%d/%d done.' % (i, len(data)))
172 | f.flush()
173 |
174 | f.flush()
175 | self._logger.info('%d/%d done.' % (len(data), len(data)))
176 |
177 | return fname
178 |
179 | def _iter(self):
180 | raise NotImplementedError("_iter must be implemented")
181 |
182 | def _report(self, dl):
183 | """
184 | Args
185 | dl: dictionary of list, where the keys were defined in _iter()
186 | """
187 | raise NotImplementedError("_report must be implemented")
188 |
189 | def _is_valid_label(self, label, label_parser=None):
190 | if len(label) == 0:
191 | return False
192 |
193 | if label_parser is not None:
194 | return label_parser.is_valid(label)
195 |
196 | return True
197 |
198 | @property
199 | def name(self):
200 | return self._name
201 |
202 | def __str__(self):
203 | return self.name
204 |
--------------------------------------------------------------------------------
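For reference, a hypothetical minimal subclass sketching the contract DatasetParser expects (the corpus name and paths are invented; dataset_dir must point to an existing directory because the setter validates it):

    from datasets import DatasetParser


    class MyCorpus(DatasetParser):

        def __init__(self, dataset_dir=None, name='mycorpus', **kwargs):
            super(MyCorpus, self).__init__(dataset_dir or 'data/mycorpus',
                                           name, **kwargs)

        def _iter(self):
            # A real parser walks dataset_dir; one hard-coded sample here
            yield {'duration': 1.0,
                   'input': 'data/mycorpus/sample.wav',
                   'label': 'um exemplo',
                   'speaker': 'spk0'}

        def _report(self, dl):
            return 'Number of utterances: %d' % len(dl['input'])

With that in place, MyCorpus('data/mycorpus').to_h5() would compute features for each yielded utterance and write them to an HDF5 file.
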
/datasets/dummy.py:
--------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 | import tempfile
8 |
9 | import numpy as np
10 |
11 |
12 | class Dummy(DatasetParser):
13 | """ Fake dataset reader and parser to do some tests
14 |
15 | # Arguments
16 | num_speakers: number of speakers
17 | num_utterances_per_speaker: number of utterances that each speaker will
18 | have
19 | max_duration: max duration in seconds of each fake audio
20 | min_duration: min duration in seconds of each fake audio
21 | max_label_length: max size of each fake label
22 | fs: sampling frequency of each fake audio
23 | split: list with two values. It will divide this dataset in three sets
24 | (train, valid and test) given the proportions
25 | """
26 |
27 | def __init__(self, dataset_dir=None, num_speakers=10,
28 | num_utterances_per_speaker=10,
29 | max_duration=10.0, min_duration=1.0, max_label_length=50,
30 | fs=16e3, split=None, name='dummy', **kwargs):
31 | '''
32 | Args:
33 |             split: list or ndarray of size 2 that splits the data between
34 |             train, valid and test. Example: split = [.8, .15] means 80%
35 |             train, 15% valid and 5% test
36 | '''
37 |
38 | super(Dummy, self).__init__(None, name, **kwargs)
39 |
40 | self.num_speakers = num_speakers
41 | self.num_utterances_per_speaker = num_utterances_per_speaker
42 | self.max_duration = max_duration
43 | self.min_duration = min_duration
44 | self.fs = fs
45 | self.max_label_length = max_label_length
46 | self.split = split
47 |
48 | if split is not None and (len(split) != 2 or np.sum(split) > 1.):
49 | raise ValueError('Split must have len = 2 and must sum <= 1')
50 |
51 | @property
52 | def dataset_dir(self):
53 | """Filepath to the dataset directory"""
54 | return self._dataset_dir
55 |
56 | @dataset_dir.setter
57 | def dataset_dir(self, value):
58 | self._dataset_dir = value
59 |
60 | def _iter(self):
61 |
62 | counter = 0
63 | total = self.num_speakers * self.num_utterances_per_speaker
64 |
65 | for speaker in range(self.num_speakers):
66 | for utterance in range(self.num_utterances_per_speaker):
67 |
68 | duration = np.random.uniform(low=self.min_duration,
69 | high=self.max_duration)
70 |
71 | samples = np.floor(duration * self.fs)
72 | audio = np.random.randn(int(samples))
73 |
74 | audio_file = tempfile.NamedTemporaryFile(delete=False)
75 | audio_fname = audio_file.name
76 | audio_file.close()
77 |
78 | librosa.output.write_wav(audio_fname, audio, self.fs)
79 |
80 | label = np.random.randint(
81 | low=ord('a'), high=ord('z'),
82 | size=(np.random.randint(2, self.max_label_length),))
83 |
84 | label = ''.join([chr(l) for l in label])
85 |
86 | data = {'duration': duration,
87 | 'input': audio_fname,
88 | 'label': label,
89 | 'speaker': 'speaker_%d' % speaker}
90 |
91 | if self.split is not None:
92 | if counter < np.floor(self.split[0] * total):
93 | dataset = 'train'
94 | elif counter < np.floor(np.sum(self.split) * total):
95 | dataset = 'valid'
96 | else:
97 | dataset = 'test'
98 |
99 | data['dataset'] = dataset
100 | counter += 1
101 |
102 | yield data
103 |
104 | def _report(self, dl):
105 | report = '''General information
106 | Number of utterances: %d
107 | Total size (in seconds) of utterances: %.f
108 |     Number of speakers: %d''' % (len(dl['input']),
109 | sum(dl['duration']),
110 | len(set(dl['speaker'])))
111 |
112 | return report
113 |
--------------------------------------------------------------------------------
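Because Dummy synthesizes its own audio, it is handy for smoke-testing the whole pipeline. A small sketch (audio.raw is the same default feature that to_h5 uses):

    from datasets.dummy import Dummy
    from preprocessing import audio

    dummy = Dummy(num_speakers=2, num_utterances_per_speaker=3,
                  max_duration=2.0, split=[.8, .15])

    # split was given, so the HDF5 file gets train/valid/test groups
    fname = dummy.to_h5(input_parser=audio.raw)
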
/datasets/lapsbm.py:
--------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 |
9 | class LapsBM(DatasetParser):
10 | """ Laps benchmark version 1.4 dataset reader and parser
11 |
12 | More about this dataset: http://www.laps.ufpa.br/falabrasil/downloads.php
13 | """
14 |
15 | version = '1.4'
16 |
17 | # Random separation of LAPSBM1.4 dataset in validation and test if required
18 | # 5 women, 10 men
19 | _test_speaker_id = [3, 11, 13, 17, 12,
20 | 33, 5, 22, 16, 8,
21 | 4, 0, 20, 10, 9]
22 |
23 | # 5 women, 15 men
24 | _valid_speaker_id = [29, 32, 14, 31, 25,
25 | 23, 19, 26, 6, 2,
26 | 24, 15, 1, 21, 28,
27 | 30, 34, 27, 18, 7]
28 |
29 | def __init__(self, dataset_dir=None, name='lapsbm', split=False, **kwargs):
30 |
31 | dataset_dir = dataset_dir or 'data/lapsbm'
32 |
33 | self._split = split
34 |
35 | super(LapsBM, self).__init__(dataset_dir, name, **kwargs)
36 |
37 | def _iter(self):
38 | for speaker_path in os.listdir(self.dataset_dir):
39 |
40 | root_path = os.path.join(os.path.abspath(self.dataset_dir),
41 | speaker_path)
42 |
43 | if not os.path.isdir(os.path.join(root_path)):
44 | continue
45 |
46 | label_files = [f for f in os.listdir(root_path)
47 | if '.txt' in f.lower()]
48 |
49 | for label_file in label_files:
50 |
51 | label = ' '.join(
52 | codecs.open(
53 | os.path.join(root_path, label_file), 'r',
54 | encoding='utf8')
55 | .read().strip().split(' ')).lower()
56 |
57 | audio_file = os.path.join(root_path,
58 | "%s.wav" % (label_file[:-4]))
59 | gender_speaker = speaker_path.split('-')[1]
60 | gender = gender_speaker[0].lower()
61 | speaker_id = gender_speaker[1:]
62 |
63 | try:
64 | duration = librosa.audio.get_duration(filename=audio_file)
65 | except IOError:
66 |                     self._logger.error('File %s not found' % audio_file)
67 | continue
68 |
69 | dataset = 'valid'
70 | if int(speaker_id) in self._test_speaker_id:
71 | dataset = 'test'
72 |
73 | data = {'duration': duration,
74 | 'input': audio_file,
75 | 'label': label,
76 | 'gender': gender,
77 | 'speaker': speaker_id}
78 |
79 | if self._split:
80 | data['dataset'] = dataset
81 |
82 | yield data
83 |
84 | def _report(self, dl):
85 | report = '''General information:
86 | Number of utterances: %d
87 | Total size (in seconds) of utterances: %.f
88 |     Number of speakers: %d
89 |     %% of female speakers: %.2f%%''' \
90 |             % (len(dl['input']), sum(dl['duration']), len(set(dl['speaker'])),
91 | 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 | (1.0 * len(dl['gender']))))
93 |
94 | return report
95 |
--------------------------------------------------------------------------------
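A short sketch of the split flag: with split=True every yielded dict carries a 'dataset' key routing the speaker to the fixed valid/test partition listed in the class attributes above (the path is illustrative; BRSD consumes _iter directly in the same way):

    from datasets.lapsbm import LapsBM

    lapsbm = LapsBM(dataset_dir='data/lapsbm', split=True)
    for d in lapsbm._iter():
        assert d['dataset'] in ('valid', 'test')
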
/datasets/sid.py:
--------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 | import numpy as np
9 |
10 | regex = r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<education>.*)[\n]+"
11 |
12 |
13 | class Sid(DatasetParser):
14 | """ Sid dataset reader and parser
15 | """
16 |
17 | def __init__(self, dataset_dir=None, name='sid', **kwargs):
18 |
19 | dataset_dir = dataset_dir or 'data/sid'
20 |
21 | super(Sid, self).__init__(dataset_dir, name, **kwargs)
22 |
23 | def _iter(self):
24 | for speaker_path in os.listdir(self.dataset_dir):
25 |
26 | root_path = os.path.join(os.path.abspath(self.dataset_dir),
27 | speaker_path)
28 |
29 | if not os.path.isdir(os.path.join(root_path)):
30 | continue
31 |
32 | labels_file = os.path.join(root_path, 'prompts.txt')
33 |
34 | speaker_info_file = os.path.join(root_path, 'speaker.txt')
35 |
36 | with open(speaker_info_file) as f:
37 | info_text = f.read()
38 |
39 | pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
40 |
41 | info = list(re.finditer(pattern, info_text))[0].groupdict()
42 |
43 | gender = info['gender'][0].lower()
44 | speaker_id = speaker_path.lower()
45 |
46 | try:
47 | age = int(info['age'])
48 | except ValueError:
49 |                 self._logger.error('age %s could not be converted to int.',
50 | (info['age']))
51 | age = 0
52 |
53 | for line in codecs.open(labels_file, 'r', encoding='utf8'):
54 |
55 | split = line.strip().split('=')
56 | file_id = int(split[0])
57 |
58 | label = split[1].lower()
59 |
60 | audio_file = os.path.join(
61 | root_path, "%s%03d" % (speaker_path, file_id)) + '.wav'
62 |
63 | try:
64 | duration = librosa.audio.get_duration(filename=audio_file)
65 | except IOError:
66 | self._logger.error('File %s not found' % audio_file)
67 | continue
68 |
69 | yield {'duration': duration,
70 | 'input': audio_file,
71 | 'label': label,
72 | 'gender': gender,
73 | 'speaker': speaker_id,
74 | 'age': age}
75 |
76 | def _report(self, dl):
77 |         args = (len(dl['input']), sum(dl['duration']),
78 |                 len(set(dl['speaker'])),
79 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
80 |                        (1.0 * len(dl['gender']))),
81 |                 min([a for a in dl['age'] if a != 0]),
82 |                 max(dl['age']), np.mean([a for a in dl['age'] if a != 0]))
83 |
84 | report = '''General information
85 | Number of utterances: %d
86 | Total size (in seconds) of utterances: %.f
87 | Number of speakers: %d
88 |     %% of female speakers: %.2f%%
89 | age range: from %d to %d. Mean: %.f''' % (args)
90 |
91 | return report
92 |
93 |
94 | if __name__ == '__main__':
95 |     # Script that fixes naming-convention and transcription errors in the
96 |     # sid dataset
97 |     import argparse, fnmatch; from shutil import copyfile
98 |     parser = argparse.ArgumentParser()
99 | parser.add_argument('data_directory', type=str,
100 | help='Path to data directory')
101 | parser.add_argument('output_directory', type=str,
102 | help='Path to data directory')
103 | args = parser.parse_args()
104 |
105 | data_directory = args.data_directory
106 | output_directory = args.output_directory
107 |
108 |     # fix wav filenames
109 | matches = []
110 | for root, dirnames, filenames in os.walk(data_directory):
111 | for filename in fnmatch.filter(filenames, '*.[Ww][Aa][Vv]'):
112 | filepath = os.path.join(root, filename)
113 | number = "%03d" % int(filename[-7:-4])
114 | prefix = filepath.split(os.path.sep)[-2]
115 |
116 | new_filename = "%s%s" % (prefix, number) + '.wav'
117 | new_filepath = os.path.join(output_directory, root, new_filename)
118 |
119 | if not os.path.exists(os.path.join(output_directory, root)):
120 | os.makedirs(os.path.join(output_directory, root))
121 |
122 | copyfile(filepath, new_filepath)
123 |
124 | for root, dirnames, filenames in os.walk(data_directory):
125 | for filename in fnmatch.filter(filenames, '*.[tT][xX][tT]'):
126 | filepath = os.path.join(root, filename)
127 |
128 | if filename.lower().startswith('texto'):
129 | filename = 'prompts.txt'
130 |
131 | new_filepath = os.path.join(output_directory,
132 | root, filename.lower())
133 | copyfile(filepath, new_filepath)
134 |
--------------------------------------------------------------------------------
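A self-contained check of the speaker.txt regex above. The group names are reconstructed: only 'gender' and 'age' are pinned down by how info is used in _iter, while 'name' and 'education' are inferred from the Portuguese field names; the sample file content is invented:

    import re

    regex = (r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+"
             r"Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<education>.*)[\n]+")

    # Invented speaker.txt contents following the expected field order
    sample = (u"Nome=Fulano de Tal\n"
              u"Idade=30\n"
              u"Cidade=Porto Alegre\n"
              u"Sexo=Masculino\n"
              u"Escolaridade=Superior\n")

    pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
    info = list(re.finditer(pattern, sample))[0].groupdict()
    assert info['gender'][0].lower() == 'm'
    assert int(info['age']) == 30
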
/datasets/voxforge.py:
--------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 | regex = r"User\s+Name\:[\s]*(?P<speaker>.*)[\n]+.*[\n]+Gender\:[\s]*(?P<gender>[a-zA-Z]+)[\w\r\s\n:\/]+Pronunciation dialect\:\s+(?P<dialect>.*)"
9 |
10 |
11 | class VoxForge(DatasetParser):
12 |     """ VoxForge (only Brazilian Portuguese audio files) dataset reader and parser
13 |
14 | More about the dataset: http://www.voxforge.org/
15 | """
16 |
17 | IGNORED_LIST = ['Marcelo-20131106-iqc',
18 | 'anonymous-20140619-wcy',
19 | 'ThiagoCastro-20131129-qpn',
20 | 'anonymous-20131016-uzv']
21 |
22 | def __init__(self, dataset_dir=None, name='voxforge', **kwargs):
23 |
24 | dataset_dir = dataset_dir or 'data/voxforge'
25 |
26 | super(VoxForge, self).__init__(dataset_dir, name, **kwargs)
27 |
28 | if (self.dataset_dir is not None and
29 | os.path.isdir(os.path.join(self.dataset_dir, 'files'))):
30 |
31 | self.dataset_dir = os.path.join(self.dataset_dir, 'files')
32 |
33 | def _iter(self):
34 | for speaker_path in os.listdir(self.dataset_dir):
35 |
36 | if speaker_path in self.IGNORED_LIST:
37 | continue
38 |
39 | root_path = os.path.join(
40 | os.path.abspath(self.dataset_dir), speaker_path)
41 |
42 | if not os.path.isdir(os.path.join(root_path)):
43 | continue
44 |
45 | labels_file = os.path.join(root_path, 'etc', 'PROMPTS')
46 |
47 | if not os.path.exists(labels_file):
48 | labels_file = os.path.join(root_path, 'PROMPTS')
49 |
50 | speaker_info_file = os.path.join(root_path, 'etc', 'README')
51 |
52 | if not os.path.exists(speaker_info_file):
53 | speaker_info_file = os.path.join(root_path, 'README')
54 |
55 | with open(speaker_info_file) as f:
56 | info_text = f.read()
57 |
58 | pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
59 |
60 | info = list(re.finditer(pattern, info_text))[0].groupdict()
61 |
62 | gender = info['gender'][0].lower()
63 | speaker_id = info['speaker']
64 |
65 | for line in codecs.open(labels_file, 'r', encoding='utf8'):
66 | split = line.strip().split()
67 | file_id = split[0].split('/')[-1]
68 |
69 | label = ' '.join(split[1:]).lower()
70 |
71 | audio_file = os.path.join(root_path, 'wav', file_id) + '.wav'
72 |
73 | if not os.path.exists(audio_file):
74 | audio_file = os.path.join(root_path, file_id) + '.wav'
75 |
76 | try:
77 | duration = librosa.audio.get_duration(filename=audio_file)
78 | except IOError:
79 | self._logger.error('File %s not found' % audio_file)
80 | continue
81 |
82 | yield {'duration': duration,
83 | 'input': audio_file,
84 | 'label': label,
85 | 'gender': gender,
86 | 'speaker': speaker_id}
87 |
88 | def _report(self, dl):
89 |         args = (len(dl['input']), sum(dl['duration']),
90 |                 len(set(dl['speaker'])),
91 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 |                        (1.0 * len(dl['gender']))),
93 |                 100 * (sum([1 for s in dl['speaker'] if s == 'anonymous']) /
94 |                        (1.0 * len(dl['speaker']))))
95 |
96 | report = '''General information
97 | Number of utterances: %d
98 | Total size (in seconds) of utterances: %.f
99 | Number of speakers: %d
100 |     %% of female speakers: %.2f%%
101 |     Anonymous speakers: %.2f%%''' % (args)
102 |
103 | return report
104 |
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import codecs
7 | import json
8 | import numpy as np
9 | # Preventing pool_allocator message
10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
11 |
12 | import argparse
13 | import h5py
14 | import inspect
15 |
16 | from preprocessing import audio, text
17 |
18 | from utils import generic_utils as utils
19 | from utils.hparams import HParams
20 |
21 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
22 |
23 | from utils.core_utils import setup_gpu, load_model
24 |
25 | if __name__ == '__main__':
26 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.')
27 |
28 | parser.add_argument('--model', required=True, type=str)
29 | parser.add_argument('--dataset', required=True, type=str)
30 | parser.add_argument('--subset', type=str, default='test')
31 |
32 | parser.add_argument('--batch_size', default=32, type=int)
33 |
34 | # Features generation (if necessary)
35 | parser.add_argument('--input_parser', type=str, default=None)
36 | parser.add_argument('--input_parser_params', nargs='+', default=[])
37 |
38 | # Label generation (if necessary)
39 | parser.add_argument('--label_parser', type=str,
40 | default='simple_char_parser')
41 | parser.add_argument('--label_parser_params', nargs='+', default=[])
42 |
43 | # Other configs
44 | parser.add_argument('--gpu', default='0', type=str)
45 | parser.add_argument('--allow_growth', default=False, action='store_true')
46 |
47 | parser.add_argument('--save_transcriptions', default=None, type=str)
48 |
49 | args = parser.parse_args()
50 | args_nondefault = utils.parse_nondefault_args(
51 | args, parser.parse_args(
52 | ['--model', args.model, '--dataset', args.dataset]))
53 |
54 | # GPU configuration
55 | setup_gpu(args.gpu, args.allow_growth)
56 |
57 | # Loading model
58 | model, meta = load_model(args.model, return_meta=True, mode='eval')
59 |
60 | args = HParams(**meta['training_args']).update(vars(args_nondefault))
61 |
62 | # Features extractor
63 | input_parser = utils.get_from_module('preprocessing.audio',
64 | args.input_parser,
65 | params=args.input_parser_params)
66 |
67 | # Recovering text parser
68 | label_parser = utils.get_from_module('preprocessing.text',
69 | args.label_parser,
70 | params=args.label_parser_params)
71 |
72 | data_gen = DatasetGenerator(input_parser, label_parser,
73 | batch_size=args.batch_size, seed=0)
74 | test_flow = data_gen.flow_from_fname(args.dataset, datasets=args.subset)
75 |
76 | metrics = model.evaluate_generator(test_flow, test_flow.len,
77 | max_q_size=10, nb_worker=1)
78 |
79 | for m, v in zip(model.metrics_names, metrics):
80 | print('%s: %4f' % (m, v))
81 |
82 | from keras import backend as K; K.clear_session()
83 |
--------------------------------------------------------------------------------
/extras/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from utils.generic_utils import setup_logging
4 | setup_logging()
5 |
--------------------------------------------------------------------------------
/extras/apis.py:
--------------------------------------------------------------------------------
1 | import os
2 | import speech_recognition as sr
3 |
4 | r = sr.Recognizer()
5 |
6 | def recognize_from_api(audio, api, name='API', safe=True, **kwargs):
7 | if not isinstance(audio, sr.AudioData):
8 | with sr.AudioFile(audio) as source:
9 | audio = r.record(source)
10 | try:
11 | return api(audio, **kwargs)
12 | except sr.UnknownValueError as e:
13 | if not safe:
14 | raise e
15 | return "\t%s could not understand audio" % name
16 | except sr.RequestError as e:
17 | if not safe:
18 | raise e
19 |         return "\tCould not request results from %s" \
20 |                " service; %s" % (name, e)
21 |
22 |
23 | def recognize_google(audio,
24 |                      credentials=os.environ.get('GOOGLE_CLOUD_API'),
25 | **kwargs):
26 |
27 | return recognize_from_api(audio, r.recognize_google_cloud,
28 | name='Google Cloud Speech',
29 | credentials_json=credentials,
30 | **kwargs)
31 |
32 |
33 | def recognize_bing(audio, key=os.environ.get('BING_API'), **kwargs):
34 | return recognize_from_api(audio, r.recognize_bing,
35 | name='Microsoft Bing Voice',
36 | key=key, **kwargs)
37 |
38 |
39 | def recognize_ibm(audio,
40 |                   username=os.environ.get('IBM_USERNAME'),
41 |                   password=os.environ.get('IBM_PASSWORD'), **kwargs):
42 | return recognize_from_api(audio, r.recognize_ibm,
43 | name='IBM Speech to Text',
44 | username=username, password=password,
45 | **kwargs)
46 |
--------------------------------------------------------------------------------
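A hedged usage sketch (run from the extras/ directory so that 'import apis' resolves; 'audio.wav' is a placeholder, and the credentials are read from the GOOGLE_CLOUD_API, BING_API, IBM_USERNAME and IBM_PASSWORD environment variables):

    import apis

    # recognize_from_api also accepts a plain path: it is loaded through
    # sr.AudioFile before being sent to the service
    print(apis.recognize_google('audio.wav', language='pt-BR'))
    print(apis.recognize_ibm('audio.wav', language='pt-BR'))
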
/extras/ctc_viz.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import numpy as np
4 |
5 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
6 |
7 | from utils.core_utils import setup_gpu, load_model
8 |
9 | from utils.hparams import HParams
10 | from utils import generic_utils as utils
11 |
12 | from preprocessing import audio, text
13 |
14 | import matplotlib
15 | import matplotlib.pyplot as plt
16 |
17 | if __name__ == '__main__':
18 |
19 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.')
20 |
21 | parser.add_argument('--model', required=True, type=str)
22 | parser.add_argument('--dataset', default=None, type=str)
23 | parser.add_argument('--files', default=[], type=str, nargs='+')
24 | parser.add_argument('--labels', default=[], nargs='+', type=str)
25 | parser.add_argument('--subset', type=str, default='test')
26 |
27 | # Features generation (if necessary)
28 | parser.add_argument('--input_parser', type=str, default=None)
29 | parser.add_argument('--input_parser_params', nargs='+', default=[])
30 |
31 | # Label generation (if necessary)
32 | parser.add_argument('--label_parser', type=str,
33 | default='simple_char_parser')
34 | parser.add_argument('--label_parser_params', nargs='+', default=[])
35 |
36 | # Other configs
37 | parser.add_argument('--gpu', default='0', type=str)
38 | parser.add_argument('--allow_growth', default=False, action='store_true')
39 |
40 |
41 | parser.add_argument('--plt_backend', type=str, default="Qt5Agg")
42 |
43 | parser.add_argument('--save', default=None, type=str)
44 |
45 | args = parser.parse_args()
46 | args_nondefault = utils.parse_nondefault_args(
47 | args, parser.parse_args(
48 | ['--model', args.model, '--dataset', args.dataset]))
49 |
50 | matplotlib.use(args.plt_backend)
51 |
52 | if args.dataset is None and len(args.files) == 0:
53 | raise ValueError('dataset or file args must be set.')
54 |
55 | if args.dataset and args.files:
56 |         print('Both dataset and file args were set. Ignoring file args.')
57 |
58 | # GPU configuration
59 | setup_gpu(args.gpu, args.allow_growth)
60 |
61 | # Loading model
62 | model, meta = load_model(args.model, return_meta=True, mode='eval')
63 |
64 | args = HParams(**meta['training_args']).update(vars(args_nondefault))
65 |
66 | # Features extractor
67 | input_parser = utils.get_from_module('preprocessing.audio',
68 | args.input_parser,
69 | params=args.input_parser_params)
70 |
71 | # Recovering text parser
72 | label_parser = utils.get_from_module('preprocessing.text',
73 | args.label_parser,
74 | params=args.label_parser_params)
75 |
76 | if args.dataset is not None:
77 | data_gen = DatasetGenerator(input_parser, label_parser,
78 | batch_size=1, seed=0, mode='predict')
79 | test_flow = data_gen.flow_from_fname(args.dataset,
80 | datasets=args.subset)
81 | else:
82 | if len(args.files) == 0:
83 | raise ValueError("files arg must be > 0")
84 |
85 | test_flow = DatasetIterator(np.array(args.files), None,
86 | input_parser=input_parser,
87 | label_parser=label_parser, mode='predict')
88 |         test_flow.labels = np.array([u'']*len(args.files))
89 |
90 | model = load_model(args.model, mode='predict', decoder=False)
91 |
92 | results = []
93 |
94 | plt.figure()
95 | for index in range(test_flow.len):
96 | prediction = model.predict(test_flow.next())
97 |
98 | truth = label_parser._sanitize(test_flow.labels[0])
99 |
100 | plt.plot(prediction[0,...])
101 | plt.show()
102 |
103 |
104 | from keras import backend as K; K.clear_session()
105 |
--------------------------------------------------------------------------------
/extras/eval_apis.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import argparse
7 | import codecs
8 | import json
9 | import time
10 |
11 | from preprocessing import audio, text
12 | from utils import generic_utils as utils
13 |
14 | import apis
15 | import speech_recognition as sr
16 |
17 | if __name__ == '__main__':
18 | parser = argparse.ArgumentParser(description='Evaluating an ASR system \
19 | over an API.')
20 |
21 | parser.add_argument('--dataset', required=True, type=str)
22 | parser.add_argument('--language', default='pt-BR', type=str)
23 |     parser.add_argument('--all', action='store_true', help='Evaluate over \
24 | the whole dataset, not only entries whose dt key equals test.')
25 |
26 | # Label generation (if necessary)
27 | parser.add_argument('--label_parser', type=str,
28 | default='simple_char_parser')
29 | parser.add_argument('--label_parser_params',nargs='+', default=[])
30 |
31 | # Other configs
32 | parser.add_argument('--save_every', default=10, type=int)
33 | parser.add_argument('--resume', action='store_true')
34 | parser.add_argument('--save', default=None, type=str)
35 | parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'],
36 | nargs='+')
37 |
38 | args = parser.parse_args()
39 |
40 | # If save is not defined, it will use the folder name of dataset location
41 | save = args.save
42 | if args.save is None:
43 | save = '%s_eval_apis.json' % args.dataset.split(os.path.sep)[-2]
44 |
45 | # Recovering text parser
46 | label_parser = utils.get_from_module('preprocessing.text',
47 | args.label_parser,
48 | params=args.label_parser_params)
49 |
50 | if not utils.check_ext(args.dataset, 'json'):
51 | raise ValueError('dataset must be a json file')
52 |
53 | dataset = json.load(codecs.open(args.dataset, 'r', encoding='utf8'))
54 |
55 | if not args.all and 'dt' in dataset[0]:
56 | dataset = [d for d in dataset if d['dt'] == 'test']
57 |
58 | apis = {'google': apis.recognize_google,
59 | 'ibm': apis.recognize_ibm,
60 | 'microsoft': apis.recognize_bing}
61 |
62 | eval_apis = []
63 | if args.resume:
64 | with codecs.open(save, 'r', encoding='utf8') as f:
65 | eval_apis = json.load(f)
66 |
67 | for i, data in enumerate(dataset):
68 |
69 | if len(eval_apis) > i:
70 | result = eval_apis[i]
71 | else:
72 | result = {}
73 | result['label'] = data['label']
74 | result['audio'] = data['audio']
75 |
76 | if args.all and 'dt' in data:
77 | result['dt'] = data['dt']
78 |
79 | for api_name in args.apis:
80 | if api_name in result and result[api_name] != '':
81 | continue
82 | try:
83 | result[api_name] = apis[api_name](data['audio'], safe=False,
84 | language=args.language)
85 | except Exception as e:
86 | result[api_name] = ''
87 | print(e)
88 |
89 | if len(eval_apis) > i:
90 | eval_apis[i] = result
91 | else:
92 | eval_apis.append(result)
93 |
94 |         if ((i + 1) % args.save_every) == 0:
95 | with codecs.open(save, 'w', encoding='utf8') as f:
96 | json.dump(eval_apis, f)
97 |
98 | print('Done %d/%d' % (i + 1, len(dataset)))
99 | time.sleep(.1)
100 |
101 | with codecs.open(save, 'w', encoding='utf8') as f:
102 | json.dump(eval_apis, f)
103 |
--------------------------------------------------------------------------------
/extras/make_dataset.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import argparse
4 |
5 | from utils import generic_utils as utils
6 | from utils.hparams import HParams
7 |
8 | import preprocessing
9 | import datasets
10 |
11 | if __name__ == '__main__':
12 | parser = argparse.ArgumentParser(description='Generates a preprocessed dataset (hdf5 file) by providing the path to the dataset and the correct parser.')
13 |
14 | parser.add_argument('--dataset_dir', type=str, default=None)
15 | parser.add_argument('--parser', type=str, required=True)
16 | parser.add_argument('--parser_params', nargs='+', default=[])
17 |
18 | parser.add_argument('--output_file', type=str, default=None)
19 |
20 | parser.add_argument('--input_parser', type=str, default=None)
21 | parser.add_argument('--input_parser_params', nargs='+', default=[])
22 |
23 | parser.add_argument('--label_parser', type=str,
24 | default=None)
25 | parser.add_argument('--label_parser_params', nargs='+', default=[])
26 |
27 | parser.add_argument('--override', action='store_true')
28 |
29 | args = parser.parse_args()
30 |
31 | parser = utils.get_from_module('datasets*',
32 | args.parser,
33 | regex=True)
34 |
35 | input_parser = utils.get_from_module('preprocessing.audio',
36 | args.input_parser,
37 | params=args.input_parser_params)
38 | label_parser = utils.get_from_module('preprocessing.text',
39 | args.label_parser,
40 | params=args.label_parser_params)
41 |
42 | dataset = parser(args.dataset_dir,
43 | **HParams().parse(args.parser_params).values())
44 |
45 | output_file = dataset.to_h5(fname=args.output_file,
46 | input_parser=input_parser,
47 | label_parser=label_parser,
48 | override=args.override)
49 |
50 | print('Dataset %s saved at %s' % (parser.name, output_file))
51 |
--------------------------------------------------------------------------------
/extras/print_args.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import argparse
4 |
5 | from utils.core_utils import load_meta
6 |
7 | if __name__ == '__main__':
8 | parser = argparse.ArgumentParser(description='Print training arguments')
9 | parser.add_argument('--model', required=True, type=str)
10 | args = parser.parse_args()
11 |
12 | meta = load_meta(args.model)
13 |
14 | for k, v in meta['training_args'].items():
15 | print('%s: %s' % (k, v))
16 |
--------------------------------------------------------------------------------
/extras/recognizer.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | # NOTE: this example requires PyAudio because it uses the Microphone class
4 |
5 | import sys
6 | import os
7 | import tempfile
8 | import argparse
9 | import preprocessing
10 | import apis
11 | import numpy as np
12 |
13 | import speech_recognition as sr
14 |
15 | import utils.generic_utils as utils
16 |
17 | from datasets.dataset_generator import DatasetIterator
18 | from utils.core_utils import setup_gpu, load_model
19 |
20 | import keras.backend as K
21 | from keras.models import Model
22 | from keras.layers import Lambda
23 |
24 | import tensorflow as tf
25 |
26 | if __name__ == "__main__":
27 | parser = argparse.ArgumentParser()
28 |
29 | parser.add_argument('source', type=str, nargs='+', default=['mic'])
30 | parser.add_argument('--language', default='pt-BR', type=str)
31 |
32 | # Custom asr
33 | parser.add_argument('--model', default=None, type=str)
34 | parser.add_argument('--gpu', default='0', type=str)
35 | parser.add_argument('--allow_growth', default=False, action='store_true')
36 |
37 | parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'], nargs='+')
38 |
39 | args = parser.parse_args()
40 |
41 | r = sr.Recognizer()
42 |
43 | audios = []
44 | if len(args.source) == 1 and args.source[0] == 'mic':
45 | # obtain audio from the microphone
46 | with sr.Microphone() as source:
47 | print("Say something! (language %s)" % args.language)
48 | mic_audio = r.listen(source)
49 |
50 | with tempfile.NamedTemporaryFile(delete=False) as f:
51 | f.write(mic_audio.get_wav_data())
52 | audios.append((f.name, 'microphone'))
53 | else:
54 | for audio_fname in args.source:
55 | with sr.AudioFile(audio_fname) as source:
56 | audios.append((r.record(source), audio_fname))
57 | # read the entire audio file
58 |
59 | if args.model is not None:
60 | setup_gpu(args.gpu, args.allow_growth)
61 |
62 |         model, meta = load_model(args.model,
63 | return_meta=True,
64 | mode='predict')
65 | training_args = meta['training_args']
66 |
67 | # Features extractor
68 | input_parser = utils.get_from_module('preprocessing.audio',
69 | training_args['feats'],
70 | params=training_args['feats_params'])
71 |
72 | # Recovering text parser
73 | label_parser = utils.get_from_module('preprocessing.text',
74 | training_args['label_parser'],
75 | params=training_args['label_parser_params']
76 | )
77 |
78 | data_it = DatasetIterator(np.array([f for a, f in audios]),
79 |                                   input_parser=input_parser,
80 | label_parser=label_parser)
81 |
82 | model_predictions = model.predict_generator(
83 | data_it, val_samples=len(audios))
84 |
85 | model_predictions = [label_parser.imap(p[:(np.argmax(p == -1) or len(p))]) for p in model_predictions]
86 |
87 | for i, (audio, name) in enumerate(audios):
88 |
89 | print('Recognizing from: %s' % name)
90 |
91 | if 'google' in args.apis:
92 | rec = apis.recognize_google(audio, language=args.language)
93 | print("\tGoogle Cloud Speech:\n\t\t'%s'" % rec)
94 |
95 | if 'microsoft' in args.apis:
96 | rec = apis.recognize_bing(audio, language=args.language)
97 | print("\tMicrosoft Bing:\n\t\t'%s'" % rec)
98 |
99 | if 'ibm' in args.apis:
100 | rec = apis.recognize_ibm(audio, language=args.language)
101 | print("\tIBM Speech to Text:\n\t\t'%s'" % rec)
102 |
103 | if args.model is not None:
104 | print("\tTrained model:\n\t\t'%s'" % model_predictions[i])
105 |
--------------------------------------------------------------------------------
/extras/results2xlsx.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import os
4 | import argparse
5 | import yaml
6 | import numpy as np
7 |
8 | import openpyxl
9 | from openpyxl import Workbook
10 |
11 | from utils.core_utils import load_meta
12 |
13 | if __name__ == "__main__":
14 | parser = argparse.ArgumentParser()
15 |
16 | parser.add_argument('--folder', default='results', type=str)
17 | parser.add_argument('--del_empty_dir', action='store_true')
18 |
19 | args = parser.parse_args()
20 |
21 | metas = {}
22 |
23 | for subdir, dirs, files in os.walk(args.folder):
24 | if len(dirs):
25 | continue
26 |
27 |         if not len(files) and args.del_empty_dir:
28 |             print('deleting folder %s' % subdir)
29 |             os.rmdir(os.path.abspath(subdir))
30 |             continue
31 |
32 | if 'model.h5' not in files:
33 | print('model.h5 not found in %s' % subdir)
34 | continue
35 |
36 | try:
37 | meta = load_meta(os.path.join(subdir, 'model.h5'))
38 | metas[subdir.split(os.sep)[-1]] = meta
39 | except KeyError:
40 | print('meta not found in %s' % os.path.join(subdir, 'model.h5'))
41 |
42 | training_args = list(set([arg for model in metas for arg in
43 | metas[model]['training_args']]))
44 |
45 | datasets = {}
46 | for model in metas:
47 | args = metas[model]['training_args']
48 | meta = metas[model]
49 |
50 | try:
51 | key = args['dataset']
52 | if type(key) in (list, set):
53 | key = key[0]
54 | key = key.split(os.sep)[-2]
55 | except KeyError:
56 | key = 'unknown'
57 |
58 | if key not in datasets:
59 | datasets[key] = {}
60 |
61 | datasets[key][model] = meta
62 |
63 | wb = Workbook()
64 |
65 | columns = ['path'] + ['epoch', 'best_val_ler'] + training_args
66 |
67 | for name in datasets:
68 | ws = wb.create_sheet(name)
69 |
70 | cell_range = ws['A1':'%s1'
71 | % openpyxl.utils.get_column_letter(len(columns))][0]
72 |
73 | for i, cell in zip(range(len(cell_range)), cell_range):
74 | cell.value = columns[i]
75 |
76 | for row, (model, meta) in enumerate(datasets[name].items(), start=2):
77 |
78 | ws['A%d' % row] = model
79 | for key in ('epoch', 'epochs'):
80 | if key in meta:
81 | ws['B%d' % row] = meta[key][np.argmin(meta['val_decoder_ler'])]
82 | break
83 | ws['C%d' % row] = np.min(meta['val_decoder_ler'])
84 |
85 | for arg, val in meta['training_args'].items():
86 | col = openpyxl.utils.get_column_letter(
87 | training_args.index(arg) + 4)
88 |
89 | if type(val) in (list, set):
90 | val = ', '.join(val)
91 |
92 | ws['%s%d' % (col, row)] = val
93 |
94 |
95 | wb.save('results.xlsx')
96 |
--------------------------------------------------------------------------------
/imgs/best_ler.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.jpg
--------------------------------------------------------------------------------
/imgs/best_ler.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.pdf
--------------------------------------------------------------------------------
/imgs/best_loss.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.jpg
--------------------------------------------------------------------------------
/imgs/best_loss.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.pdf
--------------------------------------------------------------------------------
/logging.yaml:
--------------------------------------------------------------------------------
1 | version: 1
2 | disable_existing_loggers: False
3 | formatters:
4 | complete:
5 |     format: "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
6 | datefmt: "%m-%d %H:%M"
7 | simple:
8 | format: "%(name)-12s: %(levelname)-8s %(message)s"
9 | handlers:
10 | console:
11 | class: logging.StreamHandler
12 | level: WARNING
13 | formatter: simple
14 | stream: ext://sys.stdout
15 | file_handler:
16 | class: logging.handlers.RotatingFileHandler
17 | level: INFO
18 | formatter: complete
19 | filename: info.log
20 | maxBytes: 10485760 # 10MB
21 | backupCount: 20
22 | encoding: utf8
23 | root:
24 | level: INFO
25 | handlers: [console, file_handler]
26 |
--------------------------------------------------------------------------------
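This config is presumably applied by the setup_logging helper in utils.generic_utils; a minimal sketch of loading such a YAML file by hand, assuming PyYAML is available:

    import logging
    import logging.config

    import yaml

    # dictConfig consumes exactly the structure defined in logging.yaml
    with open('logging.yaml') as f:
        logging.config.dictConfig(yaml.safe_load(f))

    logging.getLogger(__name__).info('logging configured')
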
/msc.yaml:
--------------------------------------------------------------------------------
1 | name: msc
2 | channels:
3 | - defaults
4 | dependencies:
5 | - certifi=2016.2.28=py27_0
6 | - cycler=0.10.0=py27_0
7 | - freetype=2.5.5=2
8 | - functools32=3.2.3.2=py27_0
9 | - h5py=2.7.0=np113py27_0
10 | - hdf5=1.8.17=2
11 | - icu=54.1=0
12 | - jbig=2.1=0
13 | - jpeg=9b=0
14 | - libpng=1.6.30=1
15 | - libtiff=4.0.6=3
16 | - matplotlib=2.0.2=np113py27_0
17 | - mkl=2017.0.3=0
18 | - numpy=1.13.1=py27_0
19 | - olefile=0.44=py27_0
20 | - openssl=1.0.2l=0
21 | - pillow=4.2.1=py27_0
22 | - pip=9.0.1=py27_1
23 | - pyparsing=2.2.0=py27_0
24 | - pyqt=5.6.0=py27_2
25 | - python=2.7.13=0
26 | - python-dateutil=2.6.1=py27_0
27 | - pytz=2017.2=py27_0
28 | - pyyaml=3.12=py27_0
29 | - qt=5.6.2=2
30 | - readline=6.2=2
31 | - scipy=0.19.1=np113py27_0
32 | - setuptools=36.4.0=py27_1
33 | - sip=4.18=py27_0
34 | - six=1.10.0=py27_0
35 | - sqlite=3.13.0=0
36 | - subprocess32=3.2.7=py27_0
37 | - tk=8.5.18=0
38 | - wheel=0.29.0=py27_0
39 | - xz=5.2.3=0
40 | - yaml=0.1.6=0
41 | - zlib=1.2.11=0
42 | - pip:
43 |   - audioread==2.1.5
44 |   - backports.weakref==1.0.post1
45 |   - bleach==1.5.0
46 |   - decorator==4.1.2
47 |   - enum34==1.1.6
48 |   - funcsigs==1.0.2
49 |   - html5lib==0.9999999
50 |   - joblib==0.11
51 |   - keras==1.2.2
52 |   - librosa==0.5.1
53 |   - llvmlite==0.20.0
54 |   - markdown==2.6.9
55 |   - mock==2.0.0
56 |   - numba==0.35.0
57 |   - pbr==3.1.1
58 |   - protobuf==3.4.0
59 |   - resampy==0.2.0
60 |   - scikit-learn==0.19.0
61 |   - singledispatch==3.4.0.3
62 |   - tensorflow==1.3.0
63 |   - tensorflow-tensorboard==0.1.8
64 |   - theano==0.9.0
65 |   - unidecode==0.4.21
66 |   - werkzeug==0.12.2
67 | prefix: /Users/igormq/miniconda2/envs/msc
--------------------------------------------------------------------------------
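Note: this file is a conda environment export of the author's setup; assuming a working conda install, the environment can presumably be recreated with `conda env create -f msc.yaml`.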
/predict.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import h5py
4 | import os
5 | import numpy as np
6 | import codecs
7 |
8 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
9 |
10 | from utils.core_utils import setup_gpu, load_model
11 |
12 | from utils.hparams import HParams
13 | from utils import generic_utils as utils
14 |
15 | from preprocessing import audio, text
16 |
17 | if __name__ == '__main__':
18 |
19 |     parser = argparse.ArgumentParser(description='Predicting with an ASR system.')
20 |
21 | parser.add_argument('--model', required=True, type=str)
22 | parser.add_argument('--dataset', default=None, type=str)
23 | parser.add_argument('--file', default=None, type=str)
24 | parser.add_argument('--subset', type=str, default='test')
25 |
26 | # Features generation (if necessary)
27 | parser.add_argument('--input_parser', type=str, default=None)
28 | parser.add_argument('--input_parser_params', nargs='+', default=[])
29 |
30 | # Label generation (if necessary)
31 | parser.add_argument('--label_parser', type=str,
32 | default='simple_char_parser')
33 | parser.add_argument('--label_parser_params', nargs='+', default=[])
34 | parser.add_argument('--no_decoder', action='store_true', default=False)
35 |
36 | # Other configs
37 | parser.add_argument('--gpu', default='0', type=str)
38 | parser.add_argument('--allow_growth', default=False, action='store_true')
39 |
40 | parser.add_argument('--save', default=None, type=str)
41 | parser.add_argument('--override', default=False, action='store_true')
42 |
43 | args = parser.parse_args()
44 | args_nondefault = utils.parse_nondefault_args(
45 | args, parser.parse_args(
46 | ['--model', args.model, '--dataset', args.dataset]))
47 |
48 | if args.dataset is None and args.file is None:
49 | raise ValueError('dataset or file args must be set.')
50 |
51 | if args.dataset and args.file:
52 |         print('Both dataset and file args were set. Ignoring the file arg.')
53 |
54 | # GPU configuration
55 | setup_gpu(args.gpu, args.allow_growth)
56 |
57 | # Loading model
58 | model, meta = load_model(args.model, return_meta=True,
59 | mode='predict', decoder=(not args.no_decoder))
60 |
61 | args = HParams(**meta['training_args']).update(vars(args_nondefault))
62 |
63 | # Features extractor
64 | input_parser = utils.get_from_module('preprocessing.audio',
65 | args.input_parser,
66 | params=args.input_parser_params)
67 |
68 | # Recovering text parser
69 | label_parser = utils.get_from_module('preprocessing.text',
70 | args.label_parser,
71 | params=args.label_parser_params)
72 |
73 | if args.dataset is not None:
74 | data_gen = DatasetGenerator(input_parser, label_parser,
75 | batch_size=1, seed=0, mode='predict',
76 | shuffle=False)
77 | test_flow = data_gen.flow_from_fname(args.dataset,
78 | datasets=args.subset)
79 | else:
80 | test_flow = DatasetIterator(np.array([args.file]), None,
81 | input_parser=input_parser,
82 | label_parser=label_parser, mode='predict',
83 | shuffle=False)
84 | test_flow.labels = np.array([u''])
85 |
86 | results = []
87 | for index in range(test_flow.len):
88 | prediction = model.predict(test_flow.next())
89 | if not args.no_decoder:
90 | prediction = label_parser.imap(prediction[0])
91 |         results.append({'input': test_flow.inputs[index].tolist(), 'label': test_flow.labels[index], 'prediction': prediction})
92 |         print('Ground Truth: %s' % (label_parser._sanitize(test_flow.labels[index])))
93 | print(' Predicted: %s\n\n' % prediction)
94 |
95 | if args.save is not None:
96 | if os.path.exists(args.save):
97 | if not args.override:
98 |                 raise IOError('File %s already exists; use --override to replace it' % args.save)
99 | os.remove(args.save)
100 |
101 | if args.no_decoder:
102 | with h5py.File(args.save) as f:
103 | predictions = f.create_dataset(
104 | 'predictions', (0,), maxshape=(None,),
105 | dtype=h5py.special_dtype(vlen=np.dtype('float32')))
106 | predictions.attrs['num_labels'] = results[0]['prediction'].shape[-1]
107 |
108 | labels = f.create_dataset(
109 | 'labels', (0,), maxshape=(None,),
110 | dtype=h5py.special_dtype(vlen=unicode))
111 |
112 | inputs = f.create_dataset(
113 | 'inputs', (0,), maxshape=(None,),
114 | dtype=h5py.special_dtype(vlen=unicode))
115 |
116 | for index, result in enumerate(results):
117 |
118 | label = result['label']
119 | prediction = result['prediction']
120 | input_ = result['input']
121 |
122 | inputs.resize(inputs.shape[0] + 1, axis=0)
123 | inputs[inputs.shape[0] - 1] = input_
124 |
125 | labels.resize(labels.shape[0] + 1, axis=0)
126 | labels[labels.shape[0] - 1] = label.encode('utf8')
127 |
128 | predictions.resize(predictions.shape[0] + 1, axis=0)
129 | predictions[predictions.shape[0] - 1] = prediction.flatten().astype('float32')
130 |
131 |                 # Flush to disk every 128 samples
132 | if index % 128 == 0:
133 | print('%d/%d done.' % (index, len(results)))
134 | f.flush()
135 |
136 | f.flush()
137 | print('%d/%d done.' % (len(results), len(results)))
138 |         else:
139 |             with codecs.open(args.save, 'w', encoding='utf8') as f:
140 |                 json.dump(results, f)
141 |
142 |     elif args.no_decoder:
143 |         raise ValueError('save arg must be set if no_decoder is True')
144 |
145 | from keras import backend as K
146 | K.clear_session()
147 |
--------------------------------------------------------------------------------
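A hypothetical single-file invocation (paths are placeholders; 'mfcc' is resolved to preprocessing.audio.MFCC through get_from_module):

    $ python predict.py --model results/brsmv1/best.h5 --file sample.wav \
          --input_parser mfcc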
/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .audio import MFCC, FBank, LogFbank, Raw
3 | from .text import CharParser, simple_char_parser, complex_char_parser
4 |
--------------------------------------------------------------------------------
/preprocessing/audio.py:
--------------------------------------------------------------------------------
1 | ''' Code partially copied from python_speech_features package
2 | '''
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 | from . import audio_utils as sigproc
8 |
9 | import os
10 | import numpy as np
11 | import logging
12 |
13 | from scipy import signal
14 | from scipy.fftpack import dct
15 | import librosa
16 |
17 |
18 | class Feature(object):
19 | """ Base class for features calculation
20 | All children class must implement __str__ and _call function.
21 |
22 | # Arguments
23 | fs: sampling frequency of audio signal. If the audio has not this fs,
24 | it will be resampled
25 | eps
26 | """
27 |
28 | def __init__(self, fs=16e3, eps=1e-8, stride=1, num_context=0,
29 | mean_norm=True, var_norm=True):
30 | self.fs = fs
31 | self.eps = eps
32 |
33 | self.mean_norm = mean_norm
34 | self.var_norm = var_norm
35 |
36 | self.stride = stride
37 | self.num_context = num_context
38 | self._logger = logging.getLogger('%s.%s' % (__name__,
39 | self.__class__.__name__))
40 |
41 | def __call__(self, audio):
42 | """ This method load the audio and do the transformation of signal
43 |
44 | # Inputs
45 | audio:
46 | if audio is a string and the file exists, the wave file will
47 | be loaded and resampled (if necessary) to fs
48 | if audio is a ndarray or list and is not empty, it will make
49 | the transformation without any resampling
50 |
51 | # Exception
52 | TypeError if audio were not recognized
53 |
54 | """
55 | if ((isinstance(audio, str) or isinstance(audio, unicode))
56 | and os.path.isfile(audio)):
57 |             audio, current_fs = librosa.load(audio, sr=None)
58 | audio = librosa.core.resample(audio, current_fs, self.fs)
59 | feats = self._call(audio)
60 | elif type(audio) in (np.ndarray, list) and len(audio) > 1:
61 | feats = self._call(audio)
62 | else:
63 |             raise TypeError("audio type is not supported")
64 |
65 | return self._standarize(self._postprocessing(feats))
66 |
67 | def _call(self, data):
68 |         raise NotImplementedError("_call must be overridden")
69 |
70 | def _standarize(self, feats):
71 | if self.mean_norm:
72 | feats -= np.mean(feats, axis=0, keepdims=True)
73 | if self.var_norm:
74 | feats /= (np.std(feats, axis=0, keepdims=True) + self.eps)
75 | return feats
76 |
77 | def _postprocessing(self, feats):
78 | # Code adapted from
79 | # https://github.com/mozilla/DeepSpeech/blob/master/util/audio.py
80 |
81 |         # Keep only every stride-th feature (e.g. BiRNN stride = 2)
82 | feats = feats[::self.stride]
83 |
84 | if self.num_context == 0:
85 | return feats
86 | num_feats = feats.shape[1]
87 |
88 | train_inputs = np.array([], np.float32)
89 | train_inputs.resize((feats.shape[0],
90 | num_feats + 2*num_feats*self.num_context))
91 |
92 |         # Prepare prefix and postfix context
93 |         # (TODO: fill empty_mfcc with the MFCC of silence)
94 | empty_mfcc = np.array([])
95 | empty_mfcc.resize((num_feats))
96 |
97 | # Prepare train_inputs with past and future contexts
98 | time_slices = range(train_inputs.shape[0])
99 | context_past_min = time_slices[0] + self.num_context
100 | context_future_max = time_slices[-1] - self.num_context
101 | for time_slice in time_slices:
102 | # Reminder: array[start:stop:step]
103 | # slices from indice |start| up to |stop| (not included), every
104 | # |step|
105 | # Pick up to self.num_context time slices in the past, and complete
106 | # with empty
107 | # mfcc features
108 | need_empty_past = max(0, (context_past_min - time_slice))
109 | empty_source_past = list(empty_mfcc for empty_slots
110 | in range(need_empty_past))
111 | data_source_past = feats[max(0, time_slice -
112 | self.num_context):time_slice]
113 | assert(len(empty_source_past) +
114 | len(data_source_past) == self.num_context)
115 |
116 | # Pick up to self.num_context time slices in the future, and
117 | # complete with empty
118 | # mfcc features
119 | need_empty_future = max(0, (time_slice - context_future_max))
120 | empty_source_future = list(empty_mfcc
121 | for empty_slots in
122 | range(need_empty_future))
123 | data_source_future = feats[time_slice + 1:time_slice +
124 | self.num_context + 1]
125 |
126 | assert(len(empty_source_future) +
127 | len(data_source_future) == self.num_context)
128 |
129 | if need_empty_past:
130 | past = np.concatenate((empty_source_past, data_source_past))
131 | else:
132 | past = data_source_past
133 |
134 | if need_empty_future:
135 | future = np.concatenate((data_source_future,
136 | empty_source_future))
137 | else:
138 | future = data_source_future
139 |
140 | past = np.reshape(past, self.num_context*num_feats)
141 | now = feats[time_slice]
142 | future = np.reshape(future, self.num_context*num_feats)
143 |
144 | train_inputs[time_slice] = np.concatenate((past, now, future))
145 | assert(len(train_inputs[time_slice])
146 | == num_feats + 2*num_feats*self.num_context)
147 |
148 | self._num_feats = num_feats + 2*num_feats*self.num_context
149 |
150 | return train_inputs
151 |
152 | def __str__(self):
153 |         raise NotImplementedError("__str__ must be overridden")
154 |
155 | @property
156 | def num_feats(self):
157 | return self._num_feats
158 |
159 |
160 | class FBank(Feature):
161 | """Compute Mel-filterbank energy features from an audio signal.
162 |
163 | # Arguments
164 | win_len: the length of the analysis window in seconds.
165 | Default is 0.025s (25 milliseconds)
166 | win_step: the step between successive windows in seconds.
167 | Default is 0.01s (10 milliseconds)
168 | num_filt: the number of filters in the filterbank, default 40.
169 | nfft: the FFT size. Default is 512.
170 | low_freq: lowest band edge of mel filters in Hz.
171 | Default is 20.
172 | high_freq: highest band edge of mel filters in Hz.
173 | Default is 7800
174 | pre_emph: apply preemphasis filter with preemph as coefficient.
175 | 0 is no filter. Default is 0.97.
176 | win_func: the analysis window to apply to each frame.
177 | By default hamming window is applied.
178 | """
179 |
180 | def __init__(self, win_len=0.025, win_step=0.01,
181 | num_filt=40, nfft=512, low_freq=20, high_freq=7800,
182 | pre_emph=0.97, win_fun=signal.hamming, **kwargs):
183 |
184 | super(FBank, self).__init__(**kwargs)
185 |
186 | if high_freq > self.fs / 2:
187 |             raise ValueError("high_freq must be less than or equal to fs/2")
188 |
189 | self.win_len = win_len
190 | self.win_step = win_step
191 | self.num_filt = num_filt
192 | self.nfft = nfft
193 | self.low_freq = low_freq
194 | self.high_freq = high_freq or self.fs / 2
195 | self.pre_emph = pre_emph
196 | self.win_fun = win_fun
197 | self._filterbanks = self._get_filterbanks()
198 |
199 | self._num_feats = self.num_filt
200 |
201 | @property
202 | def mel_points(self):
203 | return np.linspace(self._low_mel, self._high_mel, self.num_filt + 2)
204 |
205 | @property
206 | def low_freq(self):
207 | return self._low_freq
208 |
209 | @low_freq.setter
210 | def low_freq(self, value):
211 | self._low_mel = self._hz2mel(value)
212 | self._low_freq = value
213 |
214 | @property
215 | def high_freq(self):
216 | return self._high_freq
217 |
218 | @high_freq.setter
219 | def high_freq(self, value):
220 | self._high_mel = self._hz2mel(value)
221 | self._high_freq = value
222 |
223 | def _call(self, signal):
224 | """Compute Mel-filterbank energy features from an audio signal.
225 | :param signal: the audio signal from which to compute features. Should
226 | be an N*1 array
227 |
228 | Returns:
229 | 2 values. The first is a numpy array of size (NUMFRAMES by nfilt)
230 | containing features. Each row holds 1 feature vector. The
231 | second return value is the energy in each frame (total energy,
232 | unwindowed)
233 | """
234 |
235 | signal = sigproc.preemphasis(signal, self.pre_emph)
236 |
237 | frames = sigproc.framesig(signal,
238 | self.win_len * self.fs,
239 | self.win_step * self.fs,
240 | self.win_fun)
241 |
242 | pspec = sigproc.powspec(frames, self.nfft)
243 | # this stores the total energy in each frame
244 | energy = np.sum(pspec, 1)
245 | # if energy is zero, we get problems with log
246 | energy = np.where(energy == 0, np.finfo(float).eps, energy)
247 |
248 | # compute the filterbank energies
249 | feat = np.dot(pspec, self._filterbanks.T)
250 | # if feat is zero, we get problems with log
251 | feat = np.where(feat == 0, np.finfo(float).eps, feat)
252 |
253 | return feat, energy
254 |
255 | def _get_filterbanks(self):
256 | """Compute a Mel-filterbank. The filters are stored in the rows, the
257 | columns correspond
258 | to fft bins. The filters are returned as an array of size nfilt *
259 | (nfft / 2 + 1)
260 |
261 | Returns:
262 | A numpy array of size num_filt * (nfft/2 + 1) containing
263 | filterbank. Each row holds 1 filter.
264 | """
265 |
266 | # our points are in Hz, but we use fft bins, so we have to convert
267 | # from Hz to fft bin number
268 | bin = np.floor((self.nfft + 1) * self._mel2hz(self.mel_points) /
269 | self.fs)
270 |
271 | fbank = np.zeros([self.num_filt, int(self.nfft / 2 + 1)])
272 | for j in xrange(0, self.num_filt):
273 | for i in xrange(int(bin[j]), int(bin[j + 1])):
274 | fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j])
275 | for i in xrange(int(bin[j + 1]), int(bin[j + 2])):
276 | fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1])
277 | return fbank
278 |
279 | def _hz2mel(self, hz):
280 | """Convert a value in Hertz to Mels
281 |
282 | Args:
283 | hz: a value in Hz. This can also be a numpy array, conversion
284 | proceeds element-wise.
285 |
286 | Returns:
287 | A value in Mels. If an array was passed in, an identical sized
288 | array is returned.
289 | """
290 | return 2595 * np.log10(1 + hz / 700.0)
291 |
292 | def _mel2hz(self, mel):
293 | """Convert a value in Mels to Hertz
294 |
295 | Args:
296 | mel: a value in Mels. This can also be a numpy array, conversion
297 | proceeds element-wise.
298 |
299 | Returns:
300 | A value in Hertz. If an array was passed in, an identical sized
301 | array is returned.
302 | """
303 | return 700 * (10**(mel / 2595.0) - 1)
304 |
305 | def __str__(self):
306 | return "fbank"
307 |
308 |
309 | class MFCC(FBank):
310 | """Compute MFCC features from an audio signal.
311 |
312 | # Arguments
313 | num_cep: the number of cepstrum to return. Default 13.
314 | cep_lifter: apply a lifter to final cepstral coefficients. 0 is
315 | no lifter. Default is 22.
316 | append_energy: if this is true, the zeroth cepstral coefficient
317 | is replaced with the log of the total frame energy.
318 |         d: if True, appends delta coefficients. Default True
319 |         dd: if True, appends delta-delta coefficients. Default True
320 |         (cepstral mean and variance normalization is controlled by the
321 |         mean_norm and var_norm arguments inherited from Feature)
322 |     """
323 |
324 | def __init__(self, num_cep=13, cep_lifter=22, append_energy=True,
325 | d=True, dd=True, **kwargs):
326 |
327 | super(MFCC, self).__init__(**kwargs)
328 |
329 | self.num_cep = num_cep
330 | self.cep_lifter = cep_lifter
331 | self.append_energy = append_energy
332 | self.d = d
333 | self.dd = dd
334 | self._num_feats = (1 + self.d + self.dd) * self.num_cep
335 |
336 | self._logger = logging.getLogger('%s.%s' % (__name__,
337 | self.__class__.__name__))
338 |
339 | def _call(self, signal):
340 | """Compute MFCC features from an audio signal.
341 |
342 | Args:
343 | signal: the audio signal from which to compute features. Should be
344 | an N*1 array
345 |
346 | Returns:
347 | A numpy array of size (NUMFRAMES by numcep) containing features.
348 | Each row holds 1 feature vector.
349 | """
350 | feat, energy = super(MFCC, self)._call(signal)
351 |
352 | feat = np.log(feat)
353 | feat = dct(feat, type=2, axis=1, norm='ortho')[:, :self.num_cep]
354 | feat = self._lifter(feat, self.cep_lifter)
355 |
356 | if self.append_energy:
357 | # replace first cepstral coefficient with log of frame energy
358 | feat[:, 0] = np.log(energy + self.eps)
359 |
360 |         if self.d or self.dd:
361 |             d = sigproc.delta(feat, 2)
362 |         if self.d:
363 |             feat = np.hstack([feat, d])
364 |         if self.dd:
365 |             feat = np.hstack([feat, sigproc.delta(d, 2)])
366 |
367 | return feat
368 |
369 | def _lifter(self, cepstra, L=22):
370 | """Apply a cepstral lifter the the matrix of cepstra.
371 |
372 | This has the effect of increasing the magnitude of the high frequency
373 | DCT coeffs.
374 |
375 | Args:
376 | cepstra: the matrix of mel-cepstra, will be numframes * numcep in
377 | size.
378 | L: the liftering coefficient to use. Default is 22. L <= 0 disables
379 | lifter.
380 | """
381 | if L > 0:
382 | nframes, ncoeff = np.shape(cepstra)
383 | n = np.arange(ncoeff)
384 | lift = 1 + (L / 2) * np.sin(np.pi * n / L)
385 | return lift * cepstra
386 | else:
387 | # values of L <= 0, do nothing
388 | return cepstra
389 |
390 | def __str__(self):
391 | return "mfcc"
392 |
393 |
394 | class LogFbank(FBank):
395 | """Compute Mel-filterbank energy features from an audio signal.
396 |
397 | # Arguments
398 |         append_energy: if this is true, the log of the total frame energy
399 |         is appended to the feature vector. Default False
400 |         d: if True, appends delta coefficients. Default False
401 |         dd: if True, appends delta-delta coefficients. Default False
402 | """
403 |
404 | def __init__(self, d=False, dd=False, append_energy=False, **kwargs):
405 | """Constructor
406 | """
407 |
408 | super(LogFbank, self).__init__(**kwargs)
409 |
410 | self.d = d
411 | self.dd = dd
412 | self.append_energy = append_energy
413 | self._num_feats = ((1 + self.d + self.dd)
414 | * (self.num_filt + self.append_energy))
415 |
416 | self._logger = logging.getLogger('%s.%s' % (__name__,
417 | self.__class__.__name__))
418 |
419 | def _call(self, signal):
420 | """Compute log Mel-filterbank energy features from an audio signal.
421 | :param signal: the audio signal from which to compute features. Should
422 | be an N*1 array
423 |
424 | Returns:
425 | A numpy array of size (NUMFRAMES by nfilt) containing features.
426 | Each row holds 1 feature vector.
427 | """
428 | feat, energy = super(LogFbank, self)._call(signal)
429 |
430 | feat = np.log(feat)
431 |
432 | if self.append_energy:
433 | feat = np.hstack([feat, np.log(energy + self.eps)[:, np.newaxis]])
434 |
435 |         if self.d or self.dd:
436 |             d = sigproc.delta(feat, 2)
437 |         if self.d:
438 |             feat = np.hstack([feat, d])
439 |         if self.dd:
440 |             feat = np.hstack([feat, sigproc.delta(d, 2)])
441 |
442 | return feat
443 |
444 | def __str__(self):
445 | return "logfbank"
446 |
447 |
448 | class Raw(Feature):
449 | """ Raw features extractor
450 | """
451 | def __init__(self, **kwargs):
452 | super(Raw, self).__init__(**kwargs)
453 | self._num_feats = None
454 |
455 | def _call(self, x):
456 | return x
457 |
458 | def _postprocessing(self, x):
459 | return x
460 |
461 | def __str__(self):
462 | return "raw"
463 |
464 |
465 | raw = Raw()
466 |
--------------------------------------------------------------------------------
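A minimal sketch of the extractors above on a synthetic tone (all values are illustrative; passing an ndarray skips loading and resampling):

    import numpy as np

    from preprocessing.audio import MFCC

    # One second of a 440 Hz tone at the default sampling rate (fs=16e3).
    fs = 16000
    t = np.linspace(0, 1, fs, endpoint=False)
    tone = np.sin(2 * np.pi * 440.0 * t)

    # 13 cepstra plus deltas and delta-deltas -> 39 features per frame.
    mfcc = MFCC()
    feats = mfcc(tone)
    print(feats.shape)  # (num_frames, 39)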
/preprocessing/audio_utils.py:
--------------------------------------------------------------------------------
1 | """ Code based on package python_speech_features
2 |
3 | Author: James Lyons 2012
4 | """
5 | import decimal
6 |
7 | import numpy
8 | import math
9 |
10 |
11 | def round_half_up(number):
12 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'),
13 | rounding=decimal.ROUND_HALF_UP
14 | ))
15 |
16 |
17 | def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
18 | """Frame a signal into overlapping frames.
19 | :param sig: the audio signal to frame.
20 | :param frame_len: length of each frame measured in samples.
21 | :param frame_step: number of samples after the start of the previous frame
22 | that the next frame should begin.
23 | :param winfunc: the analysis window to apply to each frame. By default no
24 | window is applied.
25 | :returns: an array of frames. Size is NUMFRAMES by frame_len.
26 | """
27 | slen = len(sig)
28 | frame_len = int(round_half_up(frame_len))
29 | frame_step = int(round_half_up(frame_step))
30 | if slen <= frame_len:
31 | numframes = 1
32 | else:
33 | numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
34 |
35 | padlen = int((numframes - 1) * frame_step + frame_len)
36 |
37 | zeros = numpy.zeros((padlen - slen,))
38 | padsignal = numpy.concatenate((sig, zeros))
39 |
40 | indices = numpy.tile(
41 | numpy.arange(
42 | 0, frame_len),
43 | (numframes, 1)) + numpy.tile(
44 | numpy.arange(
45 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T
46 |
47 | indices = numpy.array(indices, dtype=numpy.int32)
48 | frames = padsignal[indices]
49 | win = numpy.tile(winfunc(frame_len), (numframes, 1))
50 | return frames * win
51 |
52 |
53 | def deframesig(frames, siglen, frame_len, frame_step,
54 | winfunc=lambda x: numpy.ones((x,))):
55 | """Does overlap-add procedure to undo the action of framesig.
56 | :param frames: the array of frames.
57 | :param siglen: the length of the desired signal, use 0 if unknown. Output
58 | will be truncated to siglen samples.
59 | :param frame_len: length of each frame measured in samples.
60 | :param frame_step: number of samples after the start of the previous frame
61 | that the next frame should begin.
62 | :param winfunc: the analysis window to apply to each frame. By default no
63 | window is applied.
64 | :returns: a 1-D signal.
65 | """
66 | frame_len = round_half_up(frame_len)
67 | frame_step = round_half_up(frame_step)
68 | numframes = numpy.shape(frames)[0]
69 |     assert numpy.shape(frames)[1] == frame_len, \
70 |         '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
71 |
72 | indices = numpy.tile(
73 | numpy.arange(
74 | 0, frame_len), (numframes, 1)) + numpy.tile(
75 | numpy.arange(
76 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T
77 |
78 | indices = numpy.array(indices, dtype=numpy.int32)
79 | padlen = (numframes - 1) * frame_step + frame_len
80 |
81 | if siglen <= 0:
82 | siglen = padlen
83 |
84 | rec_signal = numpy.zeros((padlen,))
85 | window_correction = numpy.zeros((padlen,))
86 | win = winfunc(frame_len)
87 |
88 | for i in range(0, numframes):
89 | # add a little bit so it is never zero
90 | window_correction[indices[i, :]] = window_correction[indices[i, :]] + \
91 | win + 1e-15
92 | rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
93 |
94 | rec_signal = rec_signal / window_correction
95 | return rec_signal[0:siglen]
96 |
97 |
98 | def magspec(frames, NFFT):
99 | """Compute the magnitude spectrum of each frame in frames. If frames is an
100 | NxD matrix, output will be NxNFFT.
101 | :param frames: the array of frames. Each row is a frame.
102 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are
103 | zero-padded.
104 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will
105 | be the magnitude spectrum of the corresponding frame.
106 | """
107 | complex_spec = numpy.fft.rfft(frames, NFFT)
108 | return numpy.absolute(complex_spec)
109 |
110 |
111 | def powspec(frames, NFFT):
112 | """Compute the power spectrum of each frame in frames. If frames is an NxD
113 | matrix, output will be NxNFFT.
114 | :param frames: the array of frames. Each row is a frame.
115 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are
116 | zero-padded.
117 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will
118 | be the power spectrum of the corresponding frame.
119 | """
120 | return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
121 |
122 |
123 | def logpowspec(frames, NFFT, norm=1):
124 | """Compute the log power spectrum of each frame in frames. If frames is an
125 | NxD matrix, output will be NxNFFT.
126 | :param frames: the array of frames. Each row is a frame.
127 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are
128 | zero-padded.
129 | :param norm: If norm=1, the log power spectrum is normalised so that the
130 | max value (across all frames) is 1.
131 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will
132 | be the log power spectrum of the corresponding frame.
133 | """
134 | ps = powspec(frames, NFFT)
135 | ps[ps <= 1e-30] = 1e-30
136 | lps = 10 * numpy.log10(ps)
137 | if norm:
138 | return lps - numpy.max(lps)
139 | else:
140 | return lps
141 |
142 |
143 | def preemphasis(signal, coeff=0.95):
144 | """perform preemphasis on the input signal.
145 |
146 | :param signal: The signal to filter.
147 | :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
148 | :returns: the filtered signal.
149 | """
150 | return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
151 |
152 |
153 | def delta(feat, N):
154 | """Compute delta features from a feature vector sequence.
155 |
156 | :param feat: A numpy array of size (NUMFRAMES by number of features)
157 | containing features. Each row holds 1 feature vector.
158 | :param N: For each frame, calculate delta features based on preceding and
159 | following N frames
160 | :returns: A numpy array of size (NUMFRAMES by number of features)
161 | containing delta features. Each row holds 1 delta feature vector.
162 | """
163 | NUMFRAMES = len(feat)
164 | feat = numpy.concatenate(([feat[0] for i in range(N)], feat, [feat[-1] for
165 | i in
166 | range(N)]))
167 | denom = sum([2 * i * i for i in range(1, N + 1)])
168 | dfeat = []
169 | for j in range(NUMFRAMES):
170 | dfeat.append(numpy.sum([n * feat[N + j + n]
171 | for n in range(-1 * N, N + 1)], axis=0) /
172 | denom)
173 |     return numpy.array(dfeat)
174 |
--------------------------------------------------------------------------------
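A small sketch of the framing helper; the frame count follows directly from numframes = 1 + ceil((slen - frame_len) / frame_step):

    import numpy as np

    from preprocessing import audio_utils

    # 1000 samples cut into 400-sample windows every 160 samples
    # (25 ms / 10 ms at 16 kHz): numframes = 1 + ceil(600 / 160) = 5.
    sig = np.arange(1000, dtype=np.float64)
    frames = audio_utils.framesig(sig, frame_len=400, frame_step=160)
    print(frames.shape)  # (5, 400); the last frame is zero-padded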
/preprocessing/text.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import, division, print_function
3 |
4 | import string
5 | from unidecode import unidecode
6 | import logging
7 | import numpy as np
8 |
9 | PUNCTUATIONS = "'\"-,.!?:;"
10 | ACCENTS = u'ãõçâêôáíóúàüóé'
11 |
12 |
13 | class BaseParser(object):
14 | """ Interface class for all parsers
15 | """
16 |
17 | def __init__(self):
18 | self._logger = logging.getLogger('%s.%s' % (__name__,
19 | self.__class__.__name__))
20 |
21 | def __call__(self, _input):
22 | return self.map(_input)
23 |
24 | def map(self, _input):
25 | pass
26 |
27 | def imap(self, _input):
28 | pass
29 |
30 | def is_valid(self, _input):
31 | pass
32 |
33 |
34 | class CharParser(BaseParser):
35 | """ Class responsible to map any text in a certain character vocabulary
36 |
37 | # Arguments
38 | mode: Which type of vacabulary will be generated. Modes can be
39 | concatenated by using pipeline '|'
40 | 'space' or 's': accepts space character
41 | 'accents' or 'a': accepts pt-br accents
42 | 'punctuation' or 'p': accepts punctuation defined in
43 | string.punctuation
44 | 'digits': accepts all digits
45 | 'sensitive' or 'S': characters will be case sensitive
46 | 'all': shortcut that enables all modes
47 | """
48 |
49 | def __init__(self, mode='space'):
50 | self._permitted_modes = {'sensitive': 'S', 'space': 's', 'accents':
51 | 'a', 'punctuation': 'p', 'digits': 'd'}
52 |
53 | if mode == 'all':
54 | self.mode = self._permitted_modes.values()
55 | else:
56 | self.mode = []
57 | for m in mode.split('|'):
58 | try:
59 | self.mode.append(self._permitted_modes[m])
60 | except KeyError:
61 | if m not in self._permitted_modes.values():
62 | raise ValueError('Unknown mode %s' % m)
63 |
64 | self.mode.append(m)
65 |
66 | self._vocab, self._inv_vocab = self._gen_vocab()
67 |
68 | def map(self, txt, sanitize=True):
69 | if sanitize:
70 | label = np.array([self._vocab[c] for c in self._sanitize(txt)],
71 | dtype='int32')
72 | else:
73 | label = np.array([self._vocab[c] for c in txt], dtype='int32')
74 |
75 | return label
76 |
77 | def imap(self, labels):
78 | txt = ''.join([self._inv_vocab[l] for l in labels])
79 |
80 | return txt
81 |
82 | def _sanitize(self, text):
83 | # removing duplicated spaces
84 | text = ' '.join(text.split())
85 |
86 | if not('d' in self.mode):
87 | text = ''.join([c for c in text if not c.isdigit()])
88 |
89 | if not('a' in self.mode):
90 | text = unidecode(text)
91 |
92 | if not('p' in self.mode):
93 | text = text.translate(
94 |                 string.maketrans("-'", '  ')).translate(None,
95 | string.punctuation)
96 |
97 | if not ('s' in self.mode):
98 | text = text.replace(' ', '')
99 |
100 | if not('S' in self.mode):
101 | text = text.lower()
102 |
103 | return text
104 |
105 | def is_valid(self, text):
106 | # verify if the text is valid without sanitization
107 | try:
108 | _ = self.map(text, sanitize=False)
109 | return True
110 | except KeyError:
111 | return False
112 |
113 | def _gen_vocab(self):
114 |
115 | vocab = {chr(value + ord('a')): (value)
116 | for value in xrange(ord('z') - ord('a') + 1)}
117 |
118 | if 'a' in self.mode:
119 | for a in ACCENTS:
120 | vocab[a] = len(vocab)
121 |
122 | if 'S' in self.mode:
123 | for char in vocab.keys():
124 | vocab[char.upper()] = len(vocab)
125 |
126 | if 's' in self.mode:
127 | # Inserts space label
128 | vocab[' '] = len(vocab)
129 |
130 | if 'p' in self.mode:
131 | for p in PUNCTUATIONS:
132 | vocab[p] = len(vocab)
133 |
134 | if 'd' in self.mode:
135 | for num in range(10):
136 | vocab[str(num)] = len(vocab)
137 |
138 | inv_vocab = {v: k for (k, v) in vocab.iteritems()}
139 |
140 | # Add blank label
141 | inv_vocab[len(inv_vocab)] = ''
142 |
143 | return vocab, inv_vocab
144 |
145 |
146 | simple_char_parser = CharParser()
147 | complex_char_parser = CharParser(mode='s|p|a|d')
148 |
--------------------------------------------------------------------------------
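A short sketch of the parser round trip. In the default 'space' mode, sanitization strips accents and punctuation and lowercases the text, and the vocabulary maps 'a'..'z' to 0..25 and space to 26:

    # -*- coding: utf-8 -*-
    from preprocessing.text import simple_char_parser

    label = simple_char_parser(u'Olá, mundo!')  # sanitized to 'ola mundo'
    print(label)                                # [14 11  0 26 12 20 13  3 14]
    print(simple_char_parser.imap(label))       # ola mundo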
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: osx-64
4 | certifi=2016.2.28=py27_0
5 | cycler=0.10.0=py27_0
6 | freetype=2.5.5=2
7 | functools32=3.2.3.2=py27_0
8 | h5py=2.7.0=np113py27_0
9 | hdf5=1.8.17=2
10 | icu=54.1=0
11 | jbig=2.1=0
12 | jpeg=9b=0
13 | libpng=1.6.30=1
14 | libtiff=4.0.6=3
15 | matplotlib=2.0.2=np113py27_0
16 | mkl=2017.0.3=0
17 | numpy=1.13.1=py27_0
18 | olefile=0.44=py27_0
19 | openssl=1.0.2l=0
20 | pillow=4.2.1=py27_0
21 | pip=9.0.1=py27_1
22 | pyparsing=2.2.0=py27_0
23 | pyqt=5.6.0=py27_2
24 | python=2.7.13=0
25 | python-dateutil=2.6.1=py27_0
26 | pytz=2017.2=py27_0
27 | pyyaml=3.12=py27_0
28 | qt=5.6.2=2
29 | readline=6.2=2
30 | scipy=0.19.1=np113py27_0
31 | setuptools=36.4.0=py27_1
32 | sip=4.18=py27_0
33 | six=1.10.0=py27_0
34 | sqlite=3.13.0=0
35 | subprocess32=3.2.7=py27_0
36 | tk=8.5.18=0
37 | wheel=0.29.0=py27_0
38 | xz=5.2.3=0
39 | yaml=0.1.6=0
40 | zlib=1.2.11=0
41 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | # Preventing pool_allocator message
7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
8 |
9 | import argparse
10 | import uuid
11 | import sys
12 | import json
13 | import datetime
14 | import inspect
15 | import codecs
16 |
17 | import logging
18 | try:
19 | import warpctc_tensorflow
20 | except ImportError:
21 | logging.warning('warpctc binding for tensorflow not found. :(')
22 | import tensorflow as tf
23 |
24 | import keras
25 |
26 | import keras.backend as K
27 | from keras.optimizers import SGD, Adam
28 | from keras.callbacks import ReduceLROnPlateau
29 |
30 | from core import metrics
31 | from core.ctc_utils import ctc_dummy_loss, decoder_dummy_loss
32 | from core.callbacks import MetaCheckpoint, ProgbarLogger
33 | from utils.core_utils import setup_gpu
34 |
35 | from preprocessing import audio, text
36 |
37 | from datasets.dataset_generator import DatasetGenerator
38 | from utils.hparams import HParams
39 |
40 | import utils.generic_utils as utils
41 |
42 | from utils.core_utils import load_model
43 |
44 | if __name__ == '__main__':
45 |
46 | parser = argparse.ArgumentParser(description='Training an ASR system.')
47 |
48 | # Resume training
49 | parser.add_argument('--load', default=None, type=str)
50 |
51 | # Model settings
52 | parser.add_argument('--model', default='brsmv1', type=str)
53 | parser.add_argument('--model_params', nargs='+', default=[])
54 |
55 | # Hyper parameters
56 | parser.add_argument('--num_epochs', default=100, type=int)
57 | parser.add_argument('--lr', default=0.001, type=float)
58 | parser.add_argument('--momentum', default=0.9, type=float)
59 | parser.add_argument('--clipnorm', default=400, type=float)
60 | parser.add_argument('--batch_size', default=32, type=int)
61 | parser.add_argument('--opt', default='adam', type=str,
62 | choices=['sgd', 'adam'])
63 | # End of hyper parameters
64 |
65 | # Dataset definitions
66 | parser.add_argument('--dataset', default=None, type=str, nargs='+')
67 |
68 | # Features generation (if necessary)
69 | parser.add_argument('--input_parser', type=str, default=None)
70 | parser.add_argument('--input_parser_params', nargs='+', default=[])
71 |
72 | # Label generation (if necessary)
73 | parser.add_argument('--label_parser', type=str,
74 | default='simple_char_parser')
75 | parser.add_argument('--label_parser_params', nargs='+', default=[])
76 |
77 | # Callbacks
78 | parser.add_argument('--lr_schedule', default=None)
79 | parser.add_argument('--lr_params', nargs='+', default=[])
80 |
81 | # Other configs
82 | parser.add_argument('--save', default=None, type=str)
83 | parser.add_argument('--gpu', default='0', type=str)
84 | parser.add_argument('--allow_growth', default=False, action='store_true')
85 | parser.add_argument('--verbose', default=0, type=int)
86 | parser.add_argument('--seed', default=None, type=float)
87 |
88 | args = parser.parse_args()
89 |
90 | # Setup logging
91 | utils.setup_logging()
92 | logger = logging.getLogger(__name__)
93 | tf.logging.set_verbosity(tf.logging.ERROR)
94 |
95 |     # hack in ProgbarLogger: avoid logging the dummy losses in the progress bar
96 | keras.callbacks.ProgbarLogger = lambda: ProgbarLogger(
97 | show_metrics=['loss', 'decoder_ler', 'val_loss', 'val_decoder_ler'])
98 |
99 | # GPU configuration
100 | setup_gpu(args.gpu, args.allow_growth,
101 | log_device_placement=args.verbose > 1)
102 |
103 | # Initial configuration
104 | epoch_offset = 0
105 | meta = None
106 |
107 | if args.load:
108 | args_nondefault = utils.parse_nondefault_args(args,
109 | parser.parse_args([]))
110 |
111 | logger.info('Loading model...')
112 | model, meta = load_model(args.load, return_meta=True)
113 |
114 | logger.info('Loading parameters...')
115 | args = HParams(**meta['training_args']).update(vars(args_nondefault))
116 |
117 | epoch_offset = len(meta['epochs'])
118 | logger.info('Current epoch: %d' % epoch_offset)
119 |
120 | if args_nondefault.lr:
121 | logger.info('Setting current learning rate to %f...' % args.lr)
122 | K.set_value(model.optimizer.lr, args.lr)
123 |
124 | else:
125 | logger.info('Creating model...')
126 | # Recovering all valid models
127 | model_fn = utils.get_from_module('core.models', args.model)
128 | # Loading model
129 | model = model_fn(**(HParams().parse(args.model_params).values()))
130 |
131 | logger.info('Setting the optimizer...')
132 | # Optimization
133 | if args.opt.strip().lower() == 'sgd':
134 | opt = SGD(lr=args.lr, momentum=args.momentum,
135 | clipnorm=args.clipnorm)
136 | elif args.opt.strip().lower() == 'adam':
137 | opt = Adam(lr=args.lr, clipnorm=args.clipnorm)
138 |
139 | # Compile with dummy loss
140 | model.compile(loss={'ctc': ctc_dummy_loss,
141 | 'decoder': decoder_dummy_loss},
142 | optimizer=opt, metrics={'decoder': metrics.ler},
143 | loss_weights=[1, 0])
144 |
145 | logger.info('Creating results folder...')
146 | # Creating the results folder
147 | output_dir = args.save
148 | if output_dir is None:
149 | output_dir = os.path.join('results',
150 | '%s_%s' % (args.model,
151 | datetime.datetime.now()))
152 | if not os.path.isdir(output_dir):
153 | os.makedirs(output_dir)
154 |
155 | logger.info('Adding callbacks')
156 | # Callbacks
157 | model_ckpt = MetaCheckpoint(os.path.join(output_dir, 'model.h5'),
158 | training_args=args, meta=meta)
159 | best_ckpt = MetaCheckpoint(
160 | os.path.join(output_dir, 'best.h5'), monitor='val_decoder_ler',
161 | save_best_only=True, mode='min', training_args=args, meta=meta)
162 | callback_list = [model_ckpt, best_ckpt]
163 |
164 | # LR schedules
165 | if args.lr_schedule:
166 | lr_schedule_fn = utils.get_from_module('keras.callbacks',
167 | args.lr_schedule)
168 | if lr_schedule_fn:
169 | lr_schedule = lr_schedule_fn(**HParams().parse(args.lr_params).values())
170 | callback_list.append(lr_schedule)
171 | else:
172 | raise ValueError('Learning rate schedule unrecognized')
173 |
174 | logger.info('Getting the feature extractor...')
175 | # Features extractor
176 | input_parser = utils.get_from_module('preprocessing.audio',
177 | args.input_parser,
178 | params=args.input_parser_params)
179 |
180 | logger.info('Getting the text parser...')
181 | # Recovering text parser
182 | label_parser = utils.get_from_module('preprocessing.text',
183 | args.label_parser,
184 | params=args.label_parser_params)
185 |
186 | logger.info('Getting the data generator...')
187 | # Data generator
188 | data_gen = DatasetGenerator(input_parser, label_parser,
189 | batch_size=args.batch_size,
190 | seed=args.seed)
191 | # iterators over datasets
192 | train_flow, valid_flow, test_flow = None, None, None
193 | num_val_samples = num_test_samples = 0
194 |
195 | logger.info('Generating flow...')
196 | if len(args.dataset) == 1:
197 | train_flow, valid_flow, test_flow = data_gen.flow_from_fname(
198 | args.dataset[0], datasets=['train', 'valid', 'test'])
199 | num_val_samples = valid_flow.len
200 | else:
201 | train_flow = data_gen.flow_from_fname(args.dataset[0])
202 | valid_flow = data_gen.flow_from_fname(args.dataset[1])
203 |
204 | num_val_samples = valid_flow.len
205 | if len(args.dataset) == 3:
206 | test_flow = data_gen.flow_from_fname(args.dataset[2])
207 | num_test_samples = test_flow.len
208 |
209 | logger.info(str(vars(args)))
210 | print(str(vars(args)))
211 |     logger.info('Initializing training...')
212 | # Fit the model
213 | model.fit_generator(train_flow, samples_per_epoch=train_flow.len,
214 | nb_epoch=args.num_epochs, validation_data=valid_flow,
215 | nb_val_samples=num_val_samples, max_q_size=10,
216 | nb_worker=1, callbacks=callback_list, verbose=1,
217 | initial_epoch=epoch_offset)
218 |
219 | if test_flow:
220 | del model
221 | model = load_model(os.path.join(output_dir, 'best.h5'), mode='eval')
222 | logger.info('Evaluating best model on test set')
223 | metrics = model.evaluate_generator(test_flow, test_flow.len,
224 | max_q_size=10, nb_worker=1)
225 |
226 | msg = 'Total loss: %.4f\n\
227 | CTC Loss: %.4f\nLER: %.2f%%' % (metrics[0], metrics[1], metrics[3]*100)
228 | logger.info(msg)
229 |
230 | with open(os.path.join(output_dir, 'results.txt'), 'w') as f:
231 | f.write(msg)
232 |
233 | print(msg)
234 |
235 | K.clear_session()
236 |
--------------------------------------------------------------------------------
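A hypothetical training run on a single HDF5 dataset (the dataset path is a placeholder; brsmv1 is the default --model and is expected to live in core.models):

    $ python train.py --dataset data/brsd.h5 --model brsmv1 \
          --input_parser mfcc --batch_size 32 --num_epochs 100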
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from .hparams import HParams
4 |
--------------------------------------------------------------------------------
/utils/core_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import h5py
6 | import yaml
7 |
8 | import keras
9 | import keras.backend as K
10 | from keras.models import Model
11 | from keras.layers import Lambda
12 | import tensorflow as tf
13 |
14 | import core
15 | from core import layers_utils
16 | from core import ctc_utils
17 | from core import metrics
18 |
19 | from utils.generic_utils import inspect_module
20 |
21 |
22 | def setup_gpu(gpu, allow_growth=False, log_device_placement=False):
23 | # Choosing gpu
24 | if gpu == '-1':
25 | config = tf.ConfigProto(device_count={'GPU': 0},
26 | log_device_placement=log_device_placement)
27 | else:
28 | if gpu == 'all':
29 | gpu = ''
30 | config = tf.ConfigProto(log_device_placement=log_device_placement)
31 | config.gpu_options.visible_device_list = gpu
32 | if allow_growth: # dynamic gpu memory allocation
33 | config.gpu_options.allow_growth = True
34 | session = tf.Session(config=config)
35 | K.set_session(session)
36 |
37 |
38 | def get_custom_objects():
39 | """ Verify all custom object that may be used to load a keras model
40 | """
41 | all_custom_objects = []
42 | for module in ['core.layers', 'core.layers_utils',
43 | 'core.metrics', 'core.ctc_utils',
44 | 'core.initializers']:
45 | all_custom_objects.extend(inspect_module(module, to_dict=False))
46 |
47 | return dict(all_custom_objects)
48 |
49 | def load_model(model_fname, return_meta=False, mode='train', **kwargs):
50 | """ Loading keras model with custom objects
51 |
52 | Args
53 | mode:
54 | if 'train', model will follow the definition in core.models
55 |             if 'predict', the beam search decoder will be used and the model
56 |             returns an np array with -1 filling the no-data area
57 |             if 'eval', the greedy decoder is replaced by a beam search
58 |             decoder over the predictions
59 | """
60 | if mode not in ('train', 'predict', 'eval'):
61 | raise ValueError('mode must be one of (train, predict, eval)')
62 |
63 | model = keras.models.load_model(model_fname,
64 | custom_objects=get_custom_objects())
65 |
66 | # Define the new decoder and the to_dense layer
67 | if kwargs.get('decoder', True):
68 | dec = Lambda(ctc_utils.decode,
69 | output_shape=ctc_utils.decode_output_shape,
70 | arguments={'is_greedy': kwargs.get('is_greedy', False),
71 | 'beam_width': kwargs.get('beam_width', 400)},
72 | name='beam_search')
73 | else:
74 | dec = Lambda(lambda x: x[0])
75 |
76 | if mode == 'predict':
77 | y_pred = (model.get_layer('y_pred') or
78 | model.get_layer('decoder').input[0])
79 |
80 | input_ = model.get_layer('inputs').input
81 | inputs_length = model.get_layer('inputs_length').input
82 |
83 | to_dense_layer = Lambda(
84 | layers_utils.to_dense,
85 | output_shape=layers_utils.to_dense_output_shape,
86 | name="to_dense")
87 |
88 | y_pred = dec([y_pred, inputs_length])
89 |
90 | y_pred = to_dense_layer(y_pred)
91 |
92 | model = Model(input=[input_, inputs_length],
93 | output=[y_pred])
94 | elif mode == 'eval':
95 | dec_layer = model.get_layer('decoder')
96 |
97 | y_pred_bs = dec(dec_layer.input)
98 |
99 | model = Model(input=model.inputs, output=[model.outputs[0], y_pred_bs])
100 |
101 | # Freezing layers
102 | for l in model.layers:
103 | l.trainable = False
104 |
105 | model.compile('sgd',
106 | loss={'ctc': ctc_utils.ctc_dummy_loss,
107 | 'beam_search': ctc_utils.decoder_dummy_loss},
108 | metrics={'beam_search': metrics.ler},
109 | loss_weights=[1, 0])
110 |
111 | if return_meta:
112 | meta = load_meta(model_fname)
113 | return model, meta
114 |
115 | return model
116 |
117 |
118 | def load_meta(model_fname):
119 | ''' Load meta configuration
120 | '''
121 | meta = {}
122 |
123 | with h5py.File(model_fname, 'r') as f:
124 | meta_group = f['meta']
125 |
126 | meta['training_args'] = yaml.load(
127 | meta_group.attrs['training_args'])
128 | for k in meta_group.keys():
129 | meta[k] = list(meta_group[k])
130 |
131 | return meta
132 |
--------------------------------------------------------------------------------
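A minimal sketch of loading a checkpoint for evaluation, assuming the .h5 file was written by MetaCheckpoint (so it carries the 'meta' group that load_meta reads; the path is a placeholder):

    from utils.core_utils import setup_gpu, load_model

    setup_gpu('0', allow_growth=True)

    # mode='eval' swaps the greedy decoder for a beam search decoder and
    # recompiles the model with the dummy CTC losses.
    model, meta = load_model('results/brsmv1/best.h5', return_meta=True,
                             mode='eval')
    print(meta['training_args'])  # the HParams dict saved at training time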
/utils/generic_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import h5py
6 | import sys
7 | import os
8 |
9 | import logging
10 | import logging.config
11 | import yaml
12 |
13 | import numpy as np
14 | from scipy import sparse
15 |
16 | import inspect
17 | import yaml
18 |
19 | from .hparams import HParams
20 |
21 | import re
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | def safe_mkdirs(path):
27 | ''' Safe makedirs
28 |     Directory is created with command `mkdir -p`.
29 | Returns:
30 | `path` if the directory already exists or is created
31 | Exception:
32 | OSError if something is wrong
33 | '''
34 | try:
35 | os.makedirs(path)
36 | except OSError, e:
37 | if e.errno != 17: # 17 = file exists
38 | raise
39 |
40 | return path
41 |
42 |
43 | def get_from_module(module, name, params=None, regex=False):
44 | """ Get a class or method from a module given its name
45 | """
46 | members = inspect_module(module, regex=regex)
47 |
48 | if name is None or name.lower() == 'none':
49 | return None
50 |
51 | members = {k.lower().strip(): v for k, v in members.items()}
52 |
53 | try:
54 | member = members[name.lower().strip()]
55 |         # member is a class and must be instantiated if params is not None
56 | if (member and params is not None) and inspect.isclass(member):
57 | return member(**HParams().parse(params).values())
58 |
59 | return member
60 | except KeyError, e:
61 | raise KeyError("%s not found in %s.\n Valid values are: %s" %
62 | (name, module, ', '.join(members.keys())))
63 |
64 |
65 | def inspect_module(module, to_dict=True, regex=False):
66 | modules = {}
67 | if regex:
68 | pattern = re.compile(module)
69 | for key, value in sys.modules.items():
70 | if pattern.match(key):
71 | modules[key] = value
72 | else:
73 | modules = {module: sys.modules[module]}
74 |
75 | members = []
76 | for key, value in modules.items():
77 | members.extend(inspect.getmembers(value, lambda member:
78 | hasattr(member, '__module__') and
79 | member.__module__ == key))
80 |
81 | if to_dict:
82 | return dict(members)
83 |
84 | return members
85 |
86 |
87 | def ld2dl(ld):
88 |     '''Transform a list of dictionaries into a dictionary of lists
89 |     # Note
90 |     All dictionaries must have the same keys
91 | '''
92 | return dict(zip(ld[0], zip(*[d.values() for d in ld])))
93 |
94 | def check_ext(fname, ext):
95 | # Adding dot
96 | ext = ext if ext[0] == '.' else '.' + ext
97 | fname, f_ext = os.path.splitext(fname)
98 |
99 | if f_ext == ext:
100 | return True
101 |
102 | return False
103 |
104 |
105 | def parse_nondefault_args(args, default_args):
106 | # removing default arguments
107 | args_default = {k: v for k, v in vars(default_args).items()
108 | if k not in [arg.split('-')[-1] for arg in sys.argv
109 | if arg.startswith('-')]}
110 | args_nondefault = {k: v for k, v in vars(args).items()
111 | if k not in args_default or args_default[k] != v}
112 |
113 | args_nondefault = HParams().parse(args_nondefault)
114 |
115 | return args_nondefault
116 |
117 |
118 | def setup_logging(default_path='logging.yaml', default_level=logging.INFO,
119 | env_key='LOG_CFG'):
120 | """Setup logging configuration
121 |
122 | """
123 | path = default_path
124 | value = os.getenv(env_key, None)
125 | if value:
126 | path = value
127 | if os.path.exists(path):
128 | with open(path, 'rt') as f:
129 | config = yaml.safe_load(f.read())
130 | logging.config.dictConfig(config)
131 | else:
132 | logging.basicConfig(level=default_level)
133 |
--------------------------------------------------------------------------------
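A sketch of get_from_module: it searches the members of an already-imported module (hence the explicit import below) and, because MFCC is a class and params is given, instantiates it with the parsed key/value list:

    from preprocessing import audio  # puts preprocessing.audio in sys.modules

    from utils import generic_utils as utils

    # 'mfcc' matches preprocessing.audio.MFCC after member names are lowercased.
    extractor = utils.get_from_module('preprocessing.audio', 'mfcc',
                                      params=['num_cep', '13'])
    print(extractor)  # mfcc (Feature subclasses define __str__)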
/utils/hparams.py:
--------------------------------------------------------------------------------
1 | import ast
2 |
3 | class HParams(object):
4 | """Creates an object for passing around hyperparameter values.
5 | Use the parse method to overwrite the default hyperparameters with values
6 | passed in as a string representation of a Python dictionary mapping
7 | hyperparameters to values.
8 |
9 | # Example
10 |         hparams = HParams(batch_size=128, hidden_size=256)
11 | hparams.parse('{"hidden_size":512}')
12 | assert hparams.batch_size == 128
13 | assert hparams.hidden_size == 512
14 |
15 |
16 |     Code adapted from Google Magenta
17 | """
18 |
19 | def __init__(self, **init_hparams):
20 | object.__setattr__(self, 'keyvals', init_hparams)
21 |
22 | def __getitem__(self, key):
23 | """Returns value of the given hyperameter, or None if does not
24 | exist."""
25 | return self.keyvals.get(key)
26 |
27 | def __getattribute__(self, attribute):
28 | if attribute == '__dict__':
29 | return self.keyvals
30 | else:
31 | return object.__getattribute__(self, attribute)
32 |
33 | def __getattr__(self, key):
34 | """Returns value of the given hyperameter, or None if does not
35 | exist."""
36 | return self.keyvals.get(key)
37 |
38 | def __setattr__(self, key, value):
39 | """Sets value for the hyperameter."""
40 | self.keyvals[key] = value
41 |
42 | def update(self, values_dict):
43 | """Merges in new hyperparameters, replacing existing with same key."""
44 | self.keyvals.update(values_dict)
45 |
46 | return self
47 |
48 | def parse(self, values):
49 | """Merges in new hyperparameters, replacing existing with same key."""
50 |
51 | if type(values) == dict:
52 | return self.update(values)
53 |
54 | if type(values) in (set, list):
55 | tmp = {}
56 | for k, v in zip(values[::2], values[1::2]):
57 | try:
58 | tmp[k] = ast.literal_eval(v)
59 | except ValueError:
60 | tmp[k] = v
61 | return self.update(tmp)
62 |
63 | return self.update(ast.literal_eval(values))
64 |
65 | def values(self):
66 | """Return the hyperparameter values as a Python dictionary."""
67 | return self.keyvals
68 |
69 | def __str__(self):
70 | return str(self.keyvals)
71 |
--------------------------------------------------------------------------------
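A short sketch of the parse/update flow: dicts and CLI-style key/value lists are merged in, with each value passed through ast.literal_eval when possible and kept as a raw string otherwise:

    from utils.hparams import HParams

    hparams = HParams(batch_size=32, lr=0.001)
    hparams.parse(['lr', '0.01', 'opt', 'adam'])  # '0.01' -> float, 'adam' -> str

    assert hparams.lr == 0.01
    assert hparams.batch_size == 32
    print(hparams.values())  # {'batch_size': 32, 'lr': 0.01, 'opt': 'adam'}
    # (key order in the printed dict may vary)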