├── .gitignore
├── LICENSE.md
├── README.md
├── core
│   ├── __init__.py
│   ├── callbacks.py
│   ├── ctc_utils.py
│   ├── initializers.py
│   ├── layers.py
│   ├── layers_utils.py
│   ├── metrics.py
│   └── models.py
├── data
│   ├── download_brsmv1.sh
│   └── download_datasets.sh
├── datasets
│   ├── __init__.py
│   ├── brsd.py
│   ├── cslu.py
│   ├── dataset_generator.py
│   ├── dataset_parser.py
│   ├── dummy.py
│   ├── lapsbm.py
│   ├── sid.py
│   └── voxforge.py
├── eval.py
├── extras
│   ├── __init__.py
│   ├── apis.py
│   ├── ctc_viz.py
│   ├── eval_apis.py
│   ├── make_dataset.py
│   ├── print_args.py
│   ├── recognizer.py
│   └── results2xlsx.py
├── imgs
│   ├── best_ler.jpg
│   ├── best_ler.pdf
│   ├── best_loss.jpg
│   └── best_loss.pdf
├── logging.yaml
├── msc.yaml
├── predict.py
├── preprocessing
│   ├── __init__.py
│   ├── audio.py
│   ├── audio_utils.py
│   └── text.py
├── requirements.txt
├── train.py
└── utils
    ├── __init__.py
    ├── core_utils.py
    ├── generic_utils.py
    └── hparams.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore automatically generated folders in data
# Created by https://www.gitignore.io/api/sublimetext,macos,linux,python,ipythonnotebook,windows

### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
bh_unicode_properties.cache

# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings


### macOS ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk


### Linux ###
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*


### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### Windows ###
# Windows image file caches
Thumbs.db
ehthumbs.db

# Folder config file
Desktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msm
*.msp

# Windows shortcuts
*.lnk

# My ignores
timit
*.h5
results/
.datasets/
.envrc
notebooks/
sims/
results*/
data/*/**
data/*
refs/
software
results.json
.vscode
*.tar.gz
*.xls*
*.json

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
# MIT License

Copyright (c) 2016 Igor Macedo Quintanilha

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# asr-study: a study of all-neural speech recognition models
This repository contains my efforts on developing an end-to-end ASR system using Keras and TensorFlow.

## Training a character-based all-neural Brazilian Portuguese speech recognition model

Our model was trained using four datasets: [CSLU Spoltech (LDC2006S16)](https://catalog.ldc.upenn.edu/LDC2006S16), Sid, [VoxForge](http://www.voxforge.org), and [LapsBM1.4](http://www.laps.ufpa.br/falabrasil/). Only the CSLU dataset is paid.

#### Set up the (partial) Brazilian Portuguese Speech Dataset (BRSD)

You can download the freely available datasets with the provided script (it may take a while):

```bash
$ cd data; sh download_datasets.sh
```

Next, you can preprocess it into an hdf5 file. Click [here](extras/make_dataset.py) for more information.

```bash
$ python -m extras.make_dataset --parser brsd --input_parser mfcc
```

#### Train the network

You can train the network with the `train.py` script. For more usage information see [this](train.py). To train with the default parameters:

```bash
$ python train.py --dataset .datasets/brsd/data.h5
```
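Checkpoints written with `MetaCheckpoint` (see [core/callbacks.py](core/callbacks.py)) also carry a `meta` group with the training history. A minimal sketch for reading it back, assuming the checkpoint path below is yours and that `val_loss` was among the logged metrics:

```python
import h5py
import yaml

with h5py.File('models/checkpoint.h5', 'r') as f:  # hypothetical path
    meta = f['meta']
    training_args = yaml.load(meta.attrs['training_args'])
    print(training_args)        # arguments the model was trained with
    print(meta['epochs'][:])    # epochs checkpointed so far
    print(meta['val_loss'][:])  # one value per epoch, if it was logged
```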
## Pre-trained model

You may download a [brsm v1.0 model](core/models.py) pre-trained on the full brsd dataset (including the CSLU dataset):

```bash
$ sh data/download_brsmv1.sh
```

Also, you can evaluate the model against the **brsd** test set:

```bash
$ python eval.py --model models/brsmv1.h5 --dataset .datasets/brsd/data.h5
```

#### brsmv1.h5 training
_Training curves: [best_loss](imgs/best_loss.jpg) and [best_ler](imgs/best_ler.jpg)._

Test set: LER **25.13%** (using beam search decoder with beam width of 100)
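The LER above comes from the beam search path of [core/ctc_utils.py](core/ctc_utils.py). A rough sketch of wiring the same decoder by hand (placeholders and shapes are illustrative, assuming the TensorFlow 1.x-era API this repo uses):

```python
import tensorflow as tf
from core import ctc_utils

y_pred = tf.placeholder(tf.float32, shape=(None, None, 28))  # (batch, time, classes)
seq_len = tf.placeholder(tf.int32, shape=(None, 1))          # true timesteps per sample
decoded = ctc_utils.decode([y_pred, seq_len], is_greedy=False, beam_width=100)
```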
## Predicting the outputs

To predict the outputs of a trained model using some dataset:

```bash
$ python predict.py --model MODEL --dataset DATASET
```

## Available dataset parsers
You can see all the available dataset parsers in [datasets/](datasets/).

#### Creating a custom dataset parser

You may create your own dataset parser. Here is an example:

```python
from datasets import DatasetParser


class CustomParser(DatasetParser):

    def __init__(self, dataset_dir, name='default name', **kwargs):
        super(CustomParser, self).__init__(dataset_dir, name, **kwargs)

    def _iter(self):
        for line in dataset:
            yield {'duration': line['duration'],
                   'input': line['input'],
                   'label': line['label'],
                   'non-optional-field': line['non-optional-field']}

    def _report(self, dl):
        args = extract_statistics(dl)  # your own helper computing the numbers below
        report = '''General information
                    Number of utterances: %d
                    Total size (in seconds) of utterances: %.f
                    Number of speakers: %d''' % (args)
        return report
```

## Available models
You can see all the available models in [core/models.py](core/models.py).
#### Creating a custom model

You may create your own model. Here is an example of a CTC-based model:

```python
from keras.layers import Input, Bidirectional, TimeDistributed, Dense
from core.layers import LSTM
from core.models import ctc_model


def custom_model(num_features=26, num_hiddens=100, num_classes=28):

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    o = Bidirectional(LSTM(num_hiddens,
                           return_sequences=True,
                           consume_less='gpu'))(o)
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)
```
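The returned model has two named outputs, `ctc` (the loss computed as a layer) and `decoder`. A hedged sketch of compiling it with the dummy losses from [core/ctc_utils.py](core/ctc_utils.py) and the LER metric (the optimizer and loss weights here are illustrative choices, not the project's exact training setup):

```python
from core import ctc_utils, metrics

model = custom_model()
model.compile(optimizer='adam',
              loss={'ctc': ctc_utils.ctc_dummy_loss,
                    'decoder': ctc_utils.decoder_dummy_loss},
              metrics={'decoder': metrics.ler},
              loss_weights=[1, 0])  # backprop only through the CTC loss
```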
## Contributing
There is plenty of work to be done. All contributions are welcome :).

#### asr-related work
* Add new layers
  * Batch normalized recurrent neural networks [arXiv](https://arxiv.org/abs/1510.01378)
  * Batch recurrent normalization [arXiv](https://arxiv.org/abs/1603.09025)
* Reproduce topologies and results
  * [EESEN](https://arxiv.org/abs/1507.08240)
  * [Deep Speech 2](https://arxiv.org/abs/1512.02595)
  * ConvNet-based architectures
* Add language model
  * [WFST](https://arxiv.org/abs/1507.08240)
  * [RNNLM](http://www.fit.vutbr.cz/~imikolov/rnnlm/)
  * Beam search decoder with LM or CLM
* Encoder-decoder models with attention mechanism
* ASR from raw speech
* Real-time ASR

#### brsd-related work
* Investigate the brsmv1 model with
  * Multiplicative integration [arXiv](https://arxiv.org/abs/1606.06630)
  * Layer normalization [arXiv](https://arxiv.org/abs/1607.06450)
  * Zoneout [arXiv](https://arxiv.org/abs/1606.01305)
* Increase the number of datasets (ideally with free datasets)
* Improve the LER
* Train a language model

#### code-related work
* Test coverage
* Examples
* Better documentation
* Improve the API
* More feature extractors, see [audio](preprocessing/audio.py) and [text](preprocessing/text.py)
* More dataset parsers
  * [LibriSpeech](http://www.openslr.org/12/)
  * [TED-LIUM](http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus)
  * WSJ
  * Switchboard
  * [TIMIT](https://catalog.ldc.upenn.edu/ldc93s1)
  * [VCTK](http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html)
* **Implement a nice wrapper for Kaldi in order to enjoy their feature extractors**
* Better way of storing the entire preprocessed dataset

#### Known bugs
* High memory and CPU consumption
* Predicting with batch size greater than 1 (Keras' bug)
* warp-ctc does not seem to speed up training
* [zoneout](core/layers.py) implementation


## Requirements

#### basic requirements
* Python 2.7
* Numpy
* Scipy
* Pyyaml
* HDF5
* Unidecode
* Librosa
* TensorFlow
* Keras

#### recommended
* [warp-ctc](https://github.com/baidu-research/warp-ctc) (for fast CTC loss calculation)

#### optional
* [SpeechRecognition](https://pypi.python.org/pypi/SpeechRecognition/) (to use the [eval apis](extras/eval_apis.py))
* [openpyxl](https://pypi.python.org/pypi/openpyxl) (to [save the results in an Excel file](extras/results2xlsx.py))

## Acknowledgements
* [python_speech_features](https://github.com/jameslyons/python_speech_features) for the [audio preprocessing](preprocessing/audio.py)
* [Google Magenta](https://github.com/tensorflow/magenta) for the [hparams](utils/hparams.py)
* @robertomest for helping me with everything

## License
See [LICENSE.md](LICENSE.md) for more information

--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
import layers
import layers_utils
import metrics
import ctc_utils
import models
import initializers
import callbacks

--------------------------------------------------------------------------------
/core/callbacks.py:
--------------------------------------------------------------------------------
import keras.callbacks as callbacks

import h5py
import numpy as np
import yaml


class MetaCheckpoint(callbacks.ModelCheckpoint):
    """
    Checkpoints some training information with the model. This should enable
    resuming training and having training information on every checkpoint.

    Thanks to Roberto Estevao @robertomest - robertomest@poli.ufrj.br
    """

    def __init__(self, filepath, monitor='val_loss', verbose=0,
                 save_best_only=False, save_weights_only=False,
                 mode='auto', period=1, training_args=None, meta=None):

        # Pass the constructor arguments through instead of hardcoding them,
        # so that monitor, save_best_only, etc. actually take effect
        super(MetaCheckpoint, self).__init__(filepath, monitor=monitor,
                                             verbose=verbose,
                                             save_best_only=save_best_only,
                                             save_weights_only=save_weights_only,
                                             mode=mode, period=period)

        self.filepath = filepath
        self.meta = meta or {'epochs': []}

        if training_args:
            training_args = vars(training_args)

            self.meta['training_args'] = training_args

    def on_train_begin(self, logs={}):
        super(MetaCheckpoint, self).on_train_begin(logs)

    def on_epoch_end(self, epoch, logs={}):
        super(MetaCheckpoint, self).on_epoch_end(epoch, logs)

        # Get statistics
        self.meta['epochs'].append(epoch)
        for k, v in logs.items():
            # setdefault gets the value or sets (and gets) the default value
            self.meta.setdefault(k, []).append(v)

        # Save to file
        filepath = self.filepath.format(epoch=epoch, **logs)

        if self.epochs_since_last_save == 0:
            with h5py.File(filepath, 'r+') as f:
                meta_group = f.create_group('meta')
                meta_group.attrs['training_args'] = yaml.dump(
                    self.meta.get('training_args', '{}'))
                meta_group.create_dataset('epochs',
                                          data=np.array(self.meta['epochs']))
                for k in logs:
                    meta_group.create_dataset(k, data=np.array(self.meta[k]))


class ProgbarLogger(callbacks.ProgbarLogger):

    def __init__(self, show_metrics=None):
        super(ProgbarLogger, self).__init__()

        self.show_metrics = show_metrics

    def on_train_begin(self, logs=None):
        super(ProgbarLogger, self).on_train_begin(logs)

        if self.show_metrics:
            self.params['metrics'] = self.show_metrics

--------------------------------------------------------------------------------
/core/ctc_utils.py:
--------------------------------------------------------------------------------
import keras
import keras.backend as K

import numpy as np
import tensorflow as tf


def decode(inputs, **kwargs):
    """ Decodes a sequence of probabilities choosing the path with the
    highest probability of occurring

    # Arguments
        is_greedy: if True (default) the greedy decoder will be used;
        otherwise the beam search decoder will be used

        if is_greedy is False:
            see the documentation of tf.nn.ctc_beam_search_decoder for more
            options

    # Inputs
        A tuple (y_pred, seq_len) where:
            y_pred is a tensor (N, T, C) where N is the batch size, T is the
            maximum timestep and C is the number of classes (including the
            blank label)
            seq_len is a tensor (N,) that indicates the real number of
            timesteps of each sequence

    # Outputs
        A sparse tensor with the top path decoded sequence

    """

    # Little hack for load_model
    import tensorflow as tf
    is_greedy = kwargs.get('is_greedy', True)
    y_pred, seq_len = inputs

    seq_len = tf.cast(seq_len[:, 0], tf.int32)
    y_pred = tf.transpose(y_pred, perm=[1, 0, 2])

    if is_greedy:
        decoded = tf.nn.ctc_greedy_decoder(y_pred, seq_len)[0][0]
    else:
        beam_width = kwargs.get('beam_width', 100)
        top_paths = kwargs.get('top_paths', 1)
        merge_repeated = kwargs.get('merge_repeated', True)

        decoded = tf.nn.ctc_beam_search_decoder(y_pred, seq_len, beam_width,
                                                top_paths,
                                                merge_repeated)[0][0]

    return decoded


def decode_output_shape(inputs_shape):
    y_pred_shape, seq_len_shape = inputs_shape
    return (y_pred_shape[:1], None)


def ctc_lambda_func(args):
    """ CTC cost function
    """
    y_pred, labels, inputs_length = args

    # Little hack for load_model
    import tensorflow as tf

    return tf.nn.ctc_loss(labels,
                          tf.transpose(y_pred, perm=[1, 0, 2]),
                          inputs_length[:, 0])


def ctc_dummy_loss(y_true, y_pred):
    """ Little hack to make CTC work with Keras
    """
    return y_pred


def decoder_dummy_loss(y_true, y_pred):
    """ Little hack to make CTC work with Keras
    """
    return K.zeros((1,))
--------------------------------------------------------------------------------
/core/initializers.py:
--------------------------------------------------------------------------------
import numpy as np

import keras.backend as K


def k_init(k):
    def init(shape, name=None):
        return K.variable(k*np.ones(shape), dtype='float32',
                          name=name)
    return init
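# Example (a usage sketch): k_init builds constant initializers; the
# multiplicative-integration and layer-normalization weights in
# core/layers.py are initialized with it:
#
#   from core.initializers import k_init
#
#   init = k_init(1.5)
#   w = init((4,), name='w')  # a Keras variable holding [1.5, 1.5, 1.5, 1.5]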
--------------------------------------------------------------------------------
/core/layers.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import copy
import numpy as np

import keras.backend as K
import tensorflow as tf

from keras import activations, initializations, regularizers
import keras.layers as keras_layers
from keras.layers.recurrent import Recurrent
from keras.engine import Layer, InputSpec

from .layers_utils import highway_bias_initializer
from .layers_utils import layer_normalization
from .layers_utils import LN
from .layers_utils import multiplicative_integration_init
from .layers_utils import multiplicative_integration
from .layers_utils import zoneout

from .initializers import k_init

import logging


class LayerNormalization(Layer):
    '''Normalize the activations using statistics computed from all of the
    summed inputs to the neurons in a layer on a single training case.
    Unlike batch normalization, layer normalization performs exactly the
    same computation at training and test time.

    # Arguments
        epsilon: small float > 0. Fuzz parameter
        weights: Initialization weights.
            List of 2 Numpy arrays, with shapes:
            `[(input_shape,), (input_shape,)]`
            Note that the order of this list is [gain, bias]
        gain_init: name of initialization function for gain parameter
            (see [initializations](../initializations.md)), or alternatively,
            Theano/TensorFlow function to use for weights initialization.
            This parameter is only relevant if you don't pass a `weights`
            argument.
        bias_init: name of initialization function for bias parameter
            (see [initializations](../initializations.md)), or alternatively,
            Theano/TensorFlow function to use for weights initialization.
            This parameter is only relevant if you don't pass a `weights`
            argument.

    # Input shape
        2D tensor with shape `(nb_samples, features)` (the statistics are
        computed over the last axis).

    # Output shape
        Same shape as input.

    # References
        - [Layer Normalization](https://arxiv.org/abs/1607.06450)
    '''
    def __init__(self, epsilon=1e-5, weights=None, gain_init='one',
                 bias_init='zero', **kwargs):
        self.epsilon = epsilon
        self.gain_init = initializations.get(gain_init)
        self.bias_init = initializations.get(bias_init)
        self.initial_weights = weights
        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        super(LayerNormalization, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        shape = (input_shape[-1],)

        self.g = self.gain_init(shape, name='{}_gain'.format(self.name))
        self.b = self.bias_init(shape, name='{}_bias'.format(self.name))

        self.trainable_weights = [self.g, self.b]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

        self.built = True

    def call(self, x, mask=None):
        return LN(x, self.g, self.b, epsilon=self.epsilon)

    def get_config(self):
        config = {'epsilon': self.epsilon,
                  'gain_init': self.gain_init.__name__,
                  'bias_init': self.bias_init.__name__}
        base_config = super(LayerNormalization, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

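# Example (a usage sketch; assumes the Keras 1 functional API used in this
# repository. The layer normalizes 2D (batch, features) activations, as
# inside the recurrent steps below):
#
#   from keras.layers import Input, Dense
#   from keras.models import Model
#   from core.layers import LayerNormalization
#
#   x = Input(shape=(26,))
#   o = Dense(100)(x)
#   o = LayerNormalization()(o)
#   model = Model(input=x, output=o)
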
class RHN(Recurrent):
    '''Recurrent Highway Network - Julian Georg Zilly, Rupesh Kumar
    Srivastava, Jan Koutník, Jürgen Schmidhuber - 2016.
    For a step-by-step description of the network, see
    [this paper](https://arxiv.org/abs/1607.03474).

    # Arguments
        output_dim: dimension of the internal projections and the final
            output.
        depth: recurrency depth size.
        init: weight initialization function.
            Can be the name of an existing function (str),
            or a Theano function (see:
            [initializations](../initializations.md)).
        inner_init: initialization function of the inner cells.
        bias_init: initialization function of the bias.
            (see [this
            post](http://people.idsia.ch/~rupesh/very_deep_learning/)
            for more information)
        activation: activation function.
            Can be the name of an existing function (str),
            or a Theano function (see: [activations](../activations.md)).
        inner_activation: activation function for the inner cells.
        coupling: if True, carry gate will be coupled to the transform gate,
            i.e., c = 1 - t
        layer_norm: if True, apply layer normalization to the recurrent
            pre-activations.
        ln_gain_init: initialization function for the layer normalization
            gains.
        ln_bias_init: initialization function for the layer normalization
            biases.
        mi: if True, use multiplicative integration to combine the
            input-to-hidden and hidden-to-hidden terms.
        W_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the input weights
            matrices.
        U_regularizer: instance of [WeightRegularizer](../regularizers.md)
            (eg. L1 or L2 regularization), applied to the recurrent weights
            matrices.
        b_regularizer: instance of [WeightRegularizer](../regularizers.md),
            applied to the bias.
        dropout_W: float between 0 and 1. Fraction of the input units to
            drop for input gates.
        dropout_U: float between 0 and 1. Fraction of the input units to
            drop for recurrent connections.

    # References
        - [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474)
          (original paper)
        - [Layer Normalization](https://arxiv.org/abs/1607.06450)
        - [A Theoretically Grounded Application of Dropout in Recurrent
          Neural Networks](http://arxiv.org/abs/1512.05287)

    # TODO: different dropout rates for each layer
    '''
    def __init__(self, output_dim, depth=1,
                 init='glorot_uniform', inner_init='orthogonal',
                 bias_init=highway_bias_initializer,
                 activation='tanh', inner_activation='hard_sigmoid',
                 coupling=True, layer_norm=False, ln_gain_init='one',
                 ln_bias_init='zero', mi=False,
                 W_regularizer=None, U_regularizer=None,
                 b_regularizer=None, dropout_W=0., dropout_U=0., **kwargs):
        self.output_dim = output_dim
        self.depth = depth
        self.init = initializations.get(init)
        self.inner_init = initializations.get(inner_init)
        self.bias_init = initializations.get(bias_init)
        self.activation = activations.get(activation)
        self.inner_activation = activations.get(inner_activation)
        self.coupling = coupling
        self.has_layer_norm = layer_norm
        self.ln_gain_init = initializations.get(ln_gain_init)
        self.ln_bias_init = initializations.get(ln_bias_init)
        self.mi = mi
        self.W_regularizer = regularizers.get(W_regularizer)
        self.U_regularizer = regularizers.get(U_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.dropout_W, self.dropout_U = dropout_W, dropout_U

        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        # build() restores these weights, so make sure the attribute exists
        self.initial_weights = kwargs.pop('weights', None)

        if self.dropout_W or self.dropout_U:
            self.uses_learning_phase = True

        super(RHN, self).__init__(**kwargs)

        if self.consume_less != 'gpu':
            self._logger.warning("Ignoring consume_less=%s. Setting to 'gpu'."
                                 % self.consume_less)
            self.consume_less = 'gpu'

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        self.input_dim = input_shape[2]

        if self.stateful:
            self.reset_states()
        else:
            self.states = [None]

        self.W = self.init((self.input_dim, (2 + (not self.coupling)) *
                            self.output_dim), name='{}_W'.format(self.name))
        self.Us = [self.inner_init(
            (self.output_dim, (2 + (not self.coupling)) * self.output_dim),
            name='%s_%d_U' % (self.name, i)) for i in xrange(self.depth)]

        bias_init_value = K.get_value(self.bias_init((self.output_dim,)))
        b = [np.zeros(self.output_dim),
             np.copy(bias_init_value)]

        if not self.coupling:
            b.append(np.copy(bias_init_value))

        self.bs = [K.variable(np.hstack(b),
                              name='%s_%d_b' % (self.name, i)) for i in
                   xrange(self.depth)]

        self.trainable_weights = [self.W] + self.Us + self.bs

        if self.mi:
            self.mi_params = [multiplicative_integration_init(
                ((2 + (not self.coupling)) * self.output_dim,),
                name='%s_%d' % (self.name, i),
                has_input=(i == 0)) for i in xrange(self.depth)]

            for p in self.mi_params:
                if type(p) in {list, tuple}:
                    self.trainable_weights += p
                else:
                    self.trainable_weights += [p]

        if self.has_layer_norm:
            self.ln_weights = []
            ln_names = ['h', 't', 'c']
            for l in xrange(self.depth):

                ln_gains = [self.ln_gain_init(
                    (self.output_dim,), name='%s_%d_ln_gain_%s' %
                    (self.name, l, ln_names[i])) for i in xrange(1)]

                ln_biases = [self.ln_bias_init(
                    (self.output_dim,), name='%s_%d_ln_bias_%s' %
                    (self.name, l, ln_names[i])) for i in xrange(1)]
                self.ln_weights.append([ln_gains, ln_biases])
                self.trainable_weights += ln_gains + ln_biases

        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)
        if self.U_regularizer:
            # a WeightRegularizer holds a single param, so use one copy per
            # recurrent weight matrix (the recurrent weights live in the
            # list self.Us, there is no single self.U)
            for U in self.Us:
                U_regularizer = copy.deepcopy(self.U_regularizer)
                U_regularizer.set_param(U)
                self.regularizers.append(U_regularizer)
        if self.b_regularizer:
            # same here: the biases live in the list self.bs
            for b_i in self.bs:
                b_regularizer = copy.deepcopy(self.b_regularizer)
                b_regularizer.set_param(b_i)
                self.regularizers.append(b_regularizer)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def reset_states(self):
        assert self.stateful, 'Layer must be stateful.'
        input_shape = self.input_spec[0].shape
        if not input_shape[0]:
            raise Exception('If a RNN is stateful, a complete '
                            'input_shape must be provided '
                            '(including batch size).')
        if hasattr(self, 'states'):
            K.set_value(self.states[0],
                        np.zeros((input_shape[0], self.output_dim)))
        else:
            self.states = [K.zeros((input_shape[0], self.output_dim))]

    def step(self, x, states):
        s_tm1 = states[0]

        for layer in xrange(self.depth):
            B_U = states[layer + 1][0]
            U, b = self.Us[layer], self.bs[layer]

            if layer == 0:
                B_W = states[layer + 1][1]
                Wx = K.dot(x * B_W, self.W)
            else:
                Wx = 0

            Us = K.dot(s_tm1 * B_U, U)

            if self.mi:
                a = multiplicative_integration(Wx, Us,
                                               self.mi_params[layer]) + b
            else:
                a = Wx + Us + b

            a0 = a[:, :self.output_dim]
            a1 = a[:, self.output_dim: 2 * self.output_dim]
            if not self.coupling:
                a2 = a[:, 2 * self.output_dim:]

            if self.has_layer_norm:
                ln_gains, ln_biases = self.ln_weights[layer]
                a0 = LN(a0, ln_gains[0], ln_biases[0])
                # a1 = LN(a1, ln_gains[1], ln_biases[1])
                # if not self.coupling:
                #     a2 = LN(a2, ln_gains[2], ln_biases[2])

            # Equation 7
            h = self.activation(a0)
            # Equation 8
            t = self.inner_activation(a1)
            # Equation 9
            if not self.coupling:
                c = self.inner_activation(a2)
            else:
                c = 1 - t  # carry gate was coupled to the transform gate

            s = h * t + s_tm1 * c
            s_tm1 = s

        return s, [s]

    def get_constants(self, x):
        constants = []

        for layer in xrange(self.depth):
            constant = []
            if 0 < self.dropout_U < 1:
                ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
                ones = K.tile(ones, (1, self.output_dim))
                B_U = K.in_train_phase(K.dropout(ones, self.dropout_U), ones)
                constant.append(B_U)
            else:
                constant.append(K.cast_to_floatx(1.))

            if layer == 0:
                if 0 < self.dropout_W < 1:
                    input_shape = self.input_spec[0].shape
                    input_dim = input_shape[-1]
                    ones = K.ones_like(K.reshape(x[:, 0, 0], (-1, 1)))
                    ones = K.tile(ones, (1, input_dim))
                    B_W = K.in_train_phase(K.dropout(ones,
                                                     self.dropout_W), ones)
                    constant.append(B_W)
                else:
                    constant.append(K.cast_to_floatx(1.))

            constants.append(constant)

        return constants

    def get_config(self):
        config = {'output_dim': self.output_dim,
                  'depth': self.depth,
                  'init': self.init.__name__,
                  'inner_init': self.inner_init.__name__,
                  'bias_init': self.bias_init.__name__,
                  'activation': self.activation.__name__,
                  'inner_activation': self.inner_activation.__name__,
                  'coupling': self.coupling,
                  'layer_norm': self.has_layer_norm,
                  'ln_gain_init': self.ln_gain_init.__name__,
                  'ln_bias_init': self.ln_bias_init.__name__,
                  'mi': self.mi,
                  'W_regularizer': self.W_regularizer.get_config() if
                  self.W_regularizer else None,
                  'U_regularizer': self.U_regularizer.get_config() if
                  self.U_regularizer else None,
                  'b_regularizer': self.b_regularizer.get_config() if
                  self.b_regularizer else None,
                  'dropout_W': self.dropout_W,
                  'dropout_U': self.dropout_U}
        base_config = super(RHN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

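# Example (a usage sketch, mirroring the __main__ block at the bottom of
# this file; the sizes are illustrative):
#
#   from keras.models import Sequential
#   from core.layers import RHN
#
#   model = Sequential()
#   model.add(RHN(100, input_dim=26, depth=2, layer_norm=True))
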
class LSTM(keras_layers.LSTM):
    """
    # Arguments
        layer_norm: None or a pair of floats `(ln_gain_init, ln_bias_init)`.
            If given, layer normalization is applied and every gain/bias is
            initialized with these constants; if None, no layer
            normalization is applied.
        mi: None or a triple of floats `(alpha_init, beta1_init,
            beta2_init)`. If given, multiplicative integration is active and
            initialized with these values.
        zoneout_h: float between 0 and 1. Fraction of the hidden/output
            units to maintain their previous values.
        zoneout_c: float between 0 and 1. Fraction of the cell units to
            maintain their previous values.

    # References
        - [Zoneout: Regularizing RNNs by Randomly Preserving Hidden
          Activations](https://arxiv.org/abs/1606.01305)
    """
    def __init__(self, output_dim, zoneout_h=0., zoneout_c=0.,
                 layer_norm=None, mi=None, **kwargs):

        super(LSTM, self).__init__(output_dim, **kwargs)

        self._logger = logging.getLogger('%s.%s' % (__name__,
                                                    self.__class__.__name__))

        self.layer_norm = layer_norm
        self.mi = mi

        self.zoneout_c = zoneout_c
        self.zoneout_h = zoneout_h

        if self.zoneout_h or self.zoneout_c:
            self.uses_learning_phase = True

        if self.consume_less != 'gpu':
            self._logger.warning("Invalid option for `consume_less`. "
                                 "Falling back to option `gpu`.")
            self.consume_less = 'gpu'

    def build(self, input_shape):
        super(LSTM, self).build(input_shape)

        if self.mi is not None:
            alpha_init, beta1_init, beta2_init = self.mi

            self.mi_alpha = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(alpha_init),
                name='{}_mi_alpha'.format(self.name))
            self.mi_beta1 = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(beta1_init),
                name='{}_mi_beta1'.format(self.name))
            self.mi_beta2 = self.add_weight(
                (4 * self.output_dim, ),
                initializer=k_init(beta2_init),
                name='{}_mi_beta2'.format(self.name))

        if self.layer_norm is not None:
            ln_gain_init, ln_bias_init = self.layer_norm

            self.layer_norm_params = {}
            for n, i in {'Uh': 4, 'Wx': 4, 'new_c': 1}.items():

                gain = self.add_weight(
                    (i*self.output_dim, ),
                    initializer=k_init(ln_gain_init),
                    name='%s_ln_gain_%s' % (self.name, n))
                bias = self.add_weight(
                    (i*self.output_dim, ),
                    initializer=k_init(ln_bias_init),
                    name='%s_ln_bias_%s' % (self.name, n))

                self.layer_norm_params[n] = [gain, bias]

    def _layer_norm(self, x, param_name):
        if self.layer_norm is None:
            return x

        gain, bias = self.layer_norm_params[param_name]

        return layer_normalization(x, gain, bias)

    def step(self, x, states):
        h_tm1 = states[0]
        c_tm1 = states[1]
        B_U = states[2]
        B_W = states[3]

        Uh = self._layer_norm(K.dot(h_tm1 * B_U[0], self.U), 'Uh')
        Wx = self._layer_norm(K.dot(x * B_W[0], self.W), 'Wx')

        if self.mi is not None:
            z = self.mi_alpha * Wx * Uh + self.mi_beta1 * Uh + \
                self.mi_beta2 * Wx + self.b
        else:
            z = Wx + Uh + self.b

        z_i = z[:, :self.output_dim]
        z_f = z[:, self.output_dim: 2 * self.output_dim]
        z_c = z[:, 2 * self.output_dim: 3 * self.output_dim]
        z_o = z[:, 3 * self.output_dim:]

        i = self.inner_activation(z_i)
        f = self.inner_activation(z_f)
        c = f * c_tm1 + i * self.activation(z_c)
        o = self.inner_activation(z_o)

        if 0 < self.zoneout_c < 1:
            c = zoneout(self.zoneout_c, c_tm1, c,
                        noise_shape=(self.output_dim,))

        # this is returning a lot of nan
        new_c = self._layer_norm(c, 'new_c')

        h = o * self.activation(new_c)
        if 0 < self.zoneout_h < 1:
            h = zoneout(self.zoneout_h, h_tm1, h,
                        noise_shape=(self.output_dim,))

        return h, [h, c]

    def get_config(self):
        config = {'layer_norm': self.layer_norm,
                  'mi': self.mi,
                  'zoneout_h': self.zoneout_h,
                  'zoneout_c': self.zoneout_c
                  }

        base_config = super(LSTM, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def recurrent(output_dim, model='keras_lstm', activation='tanh',
              regularizer=None, dropout=0., **kwargs):
    if model == 'rnn':
        return keras_layers.SimpleRNN(output_dim, activation=activation,
                                      W_regularizer=regularizer,
                                      U_regularizer=regularizer,
                                      dropout_W=dropout, dropout_U=dropout,
                                      consume_less='gpu', **kwargs)
    if model == 'gru':
        return keras_layers.GRU(output_dim, activation=activation,
                                W_regularizer=regularizer,
                                U_regularizer=regularizer, dropout_W=dropout,
                                dropout_U=dropout,
                                consume_less='gpu', **kwargs)
    if model == 'keras_lstm':
        return keras_layers.LSTM(output_dim, activation=activation,
                                 W_regularizer=regularizer,
                                 U_regularizer=regularizer,
                                 dropout_W=dropout, dropout_U=dropout,
                                 consume_less='gpu', **kwargs)
    if model == 'rhn':
        return RHN(output_dim, depth=1,
                   bias_init=highway_bias_initializer,
                   activation=activation, layer_norm=False,
                   ln_gain_init='one', ln_bias_init='zero', mi=False,
                   W_regularizer=regularizer, U_regularizer=regularizer,
                   dropout_W=dropout, dropout_U=dropout, consume_less='gpu',
                   **kwargs)

    if model == 'lstm':
        return LSTM(output_dim, activation=activation,
                    W_regularizer=regularizer, U_regularizer=regularizer,
                    dropout_W=dropout, dropout_U=dropout,
                    consume_less='gpu', **kwargs)
    raise ValueError('model %s was not recognized' % model)


if __name__ == "__main__":
    from keras.models import Sequential
    from keras.utils.visualize_util import plot

    model = Sequential()
    model.add(RHN(10, input_dim=2, depth=2, layer_norm=True))
    # plot(model)

--------------------------------------------------------------------------------
/core/layers_utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import absolute_import

import keras.backend as K
import tensorflow as tf

from keras import activations, initializations, regularizers
from keras.layers import GRU, SimpleRNN
from keras.layers import LSTM as keras_LSTM


def highway_bias_initializer(shape, name=None):
    return -2 * initializations.one(shape, name=name)


def layer_normalization(x, gain, bias, epsilon=1e-5):
    # tf.nn.moments returns (mean, variance), so take the square root of the
    # variance when normalizing
    mean, variance = tf.nn.moments(x, [1], keep_dims=True)
    x_normed = (x - mean) / K.sqrt(variance + epsilon) * gain + bias
    return x_normed


def multiplicative_integration_init(shape, alpha_init='one',
                                    beta1_init='one', beta2_init='one',
                                    name='mi', has_input=True):
    beta1 = initializations.get(beta1_init)(shape, name='%s_beta1' % name)
    if has_input:
        alpha = initializations.get(alpha_init)(shape, name='%s_alpha' % name)
        beta2 = initializations.get(beta2_init)(shape, name='%s_beta2' % name)
        return alpha, beta1, beta2

    return beta1


def zoneout(level, h_tm1, h, noise_shape):
    '''Apply a zoneout function to preserve a fraction of values from h_tm1
    in h.'''
    h_diff = h - h_tm1
    h = K.in_train_phase(K.dropout(h_diff,
                                   level,
                                   noise_shape=noise_shape), h_diff)
    h = h * (1. - level) + h_tm1
    return h


def multiplicative_integration(Wx, Uz, params, has_input=True):
    if has_input:
        alpha, beta1, beta2 = params
        return alpha * Wx * Uz + beta1 * Uz + beta2 * Wx

    beta1 = params
    return beta1 * Uz


def to_dense(x):
    if K.is_sparse(x):
        return tf.sparse_tensor_to_dense(x, default_value=-1)
    return x


def to_dense_output_shape(input_shape):
    return input_shape


LN = layer_normalization
mi = multiplicative_integration
mi_init = multiplicative_integration_init

--------------------------------------------------------------------------------
/core/metrics.py:
--------------------------------------------------------------------------------
import tensorflow as tf


def ler(y_true, y_pred, **kwargs):
    """
    Label Error Rate. For more information see 'tf.edit_distance'
    """
    return tf.reduce_mean(tf.edit_distance(y_pred, y_true, **kwargs))

--------------------------------------------------------------------------------
/core/models.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import core.ctc_utils as ctc_utils
from utils.hparams import HParams

import keras
import keras.backend as K
from keras.initializations import uniform
from keras.activations import relu

from keras.models import Model

from keras.layers import Input
from keras.layers import GaussianNoise
from keras.layers import TimeDistributed
from keras.layers import Dense
from .layers import LSTM
from keras.layers import Masking
from keras.layers import Bidirectional
from keras.layers import Lambda
from keras.layers import Dropout
from keras.layers import merge
# Activation and SimpleRNN are required by the maas and deep_speech models
from keras.layers import Activation
from keras.layers import SimpleRNN

from keras.regularizers import l1, l2, l1l2

from .layers import recurrent


def ctc_model(inputs, output, **kwargs):
    """ Given the input and output returns a model appending ctc_loss, the
    decoder, labels, and inputs_length

    # Arguments
        see core.ctc_utils.decode for more arguments
    """

    # Define placeholders
    labels = Input(name='labels', shape=(None,), dtype='int32', sparse=True)
    inputs_length = Input(name='inputs_length', shape=(None,), dtype='int32')

    # Define a decoder
    dec = Lambda(ctc_utils.decode, output_shape=ctc_utils.decode_output_shape,
                 arguments={'is_greedy': True}, name='decoder')
    y_pred = dec([output, inputs_length])

    ctc = Lambda(ctc_utils.ctc_lambda_func, output_shape=(1,), name="ctc")
    # Define loss as a layer
    loss = ctc([output, labels, inputs_length])

    return Model(input=[inputs, labels, inputs_length], output=[loss, y_pred])


def graves2006(num_features=26, num_hiddens=100, num_classes=28, std=.6):
    """ Implementation of Graves' model
    Reference:
        [1] Graves, Alex, et al. "Connectionist temporal classification:
        labelling unsegmented sequence data with recurrent neural networks."
        Proceedings of the 23rd international conference on Machine
        learning. ACM, 2006.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    o = GaussianNoise(std)(o)
    o = Bidirectional(LSTM(num_hiddens,
                           return_sequences=True,
                           consume_less='gpu'))(o)
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def eyben(num_features=39, num_hiddens=[78, 120, 27], num_classes=28):
    """ Implementation of Eyben's model
    Reference:
        [1] Eyben, Florian, et al. "From speech to letters-using a novel
        neural network architecture for grapheme based asr." Automatic
        Speech Recognition & Understanding, 2009. ASRU 2009. IEEE Workshop
        on. IEEE, 2009.
    """

    assert len(num_hiddens) == 3

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    if num_hiddens[0]:
        o = TimeDistributed(Dense(num_hiddens[0]))(o)
    if num_hiddens[1]:
        o = Bidirectional(LSTM(num_hiddens[1],
                               return_sequences=True,
                               consume_less='gpu'))(o)
    if num_hiddens[2]:
        o = Bidirectional(LSTM(num_hiddens[2],
                               return_sequences=True,
                               consume_less='gpu'))(o)

    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def maas(num_features=81, num_classes=29, num_hiddens=1824, dropout=0.1,
         max_value=20):
    """ Maas' model.
    Reference:
        [1] Maas, Andrew L., et al. "Lexicon-Free Conversational Speech
        Recognition with Neural Networks." HLT-NAACL. 2015.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    def clipped_relu(x):
        return relu(x, max_value=max_value)

    # First layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Second layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Third layer
    o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
                                dropout_W=dropout,
                                activation=clipped_relu,
                                init='he_normal'), merge_mode='sum')(o)

    # Fourth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Fifth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)

    # Output layer
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def deep_speech(num_features=81, num_classes=29, num_hiddens=2048,
                dropout=0.1, max_value=20):
    """ Deep Speech model.

    Contains five layers: 3 FC - BRNN - 1 FC
    Dropout only applied to fully connected layers (between 5% to 10%)

    Note:
        * We are not translating the raw audio files by 5 ms (Sec 2.1 in [1])
        * We are not striding the RNN to halve the timesteps (Sec 3.3 in [1])
        * We are not using frames of context
        * Their output contains {a, ..., z, space, apostrophe, blank}
    Experiment 5.1: Conversational speech: Switchboard Hub5'00 (full)
        * Input - 80 linearly spaced log filter banks and an energy term.
        The filter banks are computed over windows of 20ms strided by 10ms.
        * Speaker adaptation - spectral features are normalized on a per
        speaker basis.
        * Hidden units: {2304, 2048}
        * Ensemble of 4 networks
    Experiment 5.2: Noisy speech
        * Input - 160 linearly spaced log filter banks. The filter banks are
        computed over windows of 20ms strided by 10ms. Global mean and
        standard deviation over training set normalization
        * Speaker adaptation - none
        * Hidden units: 2560
        * Ensemble of 6 networks
    Reference:
        [1] HANNUN, A. Y. et al. Deep Speech: Scaling up end-to-end speech
        recognition. arXiv, 2014.
    """
    x = Input(name='inputs', shape=(None, num_features))
    o = x

    def clipped_relu(x):
        return relu(x, max_value=max_value)

    # First layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Second layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Third layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Fourth layer
    o = Bidirectional(SimpleRNN(num_hiddens, return_sequences=True,
                                dropout_W=dropout,
                                activation=clipped_relu,
                                init='he_normal'), merge_mode='sum')(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Fifth layer
    o = TimeDistributed(Dense(num_hiddens))(o)
    o = TimeDistributed(Activation(clipped_relu))(o)
    o = TimeDistributed(Dropout(dropout))(o)

    # Output layer
    o = TimeDistributed(Dense(num_classes))(o)

    return ctc_model(x, o)


def brsmv1(num_features=39, num_classes=28, num_hiddens=256, num_layers=5,
           dropout=0.2, zoneout=0., input_dropout=False,
           input_std_noise=.0, weight_decay=1e-4, residual=None,
           layer_norm=None, mi=None, activation='tanh'):
    """ BRSM v1.0
    Improved features:
        * Residual connection
        * Variational Dropout
        * Zoneout
        * Layer Normalization
        * Multiplicative Integration
    Note:
        Dropout, zoneout, and weight decay are tied across layers, in order
        to minimize the number of hyperparameters
    Reference:
        [1] Gal, Y, "A Theoretically Grounded Application of Dropout in
        Recurrent Neural Networks", 2015.
        [2] Graves, Alex, Abdel-rahman Mohamed, and Geoffrey Hinton. "Speech
        recognition with deep recurrent neural networks", 2013.
        [3] Krueger, David, et al. "Zoneout: Regularizing rnns by randomly
        preserving hidden activations", 2016.
        [4] Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer
        normalization.", 2016.
        [5] Wu, Yuhuai, et al. "On multiplicative integration with recurrent
        neural networks." Advances In Neural Information Processing Systems.
        2016.
        [6] Wu, Yonghui, et al. "Google's Neural Machine Translation System:
        Bridging the Gap between Human and Machine Translation.", 2016.
    """

    x = Input(name='inputs', shape=(None, num_features))
    o = x

    if input_std_noise is not None:
        o = GaussianNoise(input_std_noise)(o)

    if residual is not None:
        o = TimeDistributed(Dense(num_hiddens*2,
                                  W_regularizer=l2(weight_decay)))(o)

    if input_dropout:
        o = Dropout(dropout)(o)

    for _ in range(num_layers):
        new_o = Bidirectional(LSTM(num_hiddens,
                                   return_sequences=True,
                                   W_regularizer=l2(weight_decay),
                                   U_regularizer=l2(weight_decay),
                                   dropout_W=dropout,
                                   dropout_U=dropout,
                                   zoneout_c=zoneout,
                                   zoneout_h=zoneout,
                                   mi=mi,
                                   layer_norm=layer_norm,
                                   activation=activation))(o)

        if residual is not None:
            o = merge([new_o, o], mode=residual)
        else:
            o = new_o

    o = TimeDistributed(Dense(num_classes,
                              W_regularizer=l2(weight_decay)))(o)

    return ctc_model(x, o)
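# Example (a construction sketch; the keyword values below are illustrative
# choices, not the published brsmv1.h5 hyperparameters):
#
#   from core import models
#
#   model = models.brsmv1(num_features=39, num_classes=28,
#                         residual='sum',         # Keras 1 merge mode
#                         layer_norm=(1.0, 0.0),  # (gain init, bias init)
#                         mi=(1.0, 0.5, 0.5))     # (alpha, beta1, beta2)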
--------------------------------------------------------------------------------
/data/download_brsmv1.sh:
--------------------------------------------------------------------------------
echo "Downloading the brsmv1 pre-trained model:"
mkdir -p models/
wget -c -q --show-progress -O models/brsmv1.h5 https://www.dropbox.com/s/ink8zxhzysxvzxa/best_ptbr.h5?dl=0

--------------------------------------------------------------------------------
/data/download_datasets.sh:
--------------------------------------------------------------------------------
echo "Downloading pt-br datasets. This may take a while"
echo "Downloading Sid dataset:"
wget -c -q --show-progress -O ./sid.tar.gz https://www.dropbox.com/s/0wxlweatglrr7wl/sid.tar.gz?dl=0
echo "Downloading VoxForge dataset:"
wget -c -q --show-progress -O ./voxforge-ptbr.tar.gz https://www.dropbox.com/s/wrguetal6xmrgta/voxforge-ptbr.tar.gz?dl=0
echo "Downloading LapsBenchmark1.4 dataset:"
wget -c -q --show-progress -O ./lapsbm.tar.gz https://www.dropbox.com/s/8aqm9ktulmnry6d/lapsbm.tar.gz?dl=0

echo "Extracting Sid dataset..."
mkdir -p sid
cd sid; tar -xzf ../sid.tar.gz; cd ..

echo "Extracting VoxForge dataset..."
mkdir -p voxforge
cd voxforge; tar -xzf ../voxforge-ptbr.tar.gz; cd ..

echo "Extracting LapsBenchmark1.4 dataset..."
mkdir -p lapsbm
cd lapsbm; tar -xzf ../lapsbm.tar.gz; cd ..

echo "Finished."

--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

from utils.generic_utils import safe_mkdirs
import os

# '.datasets' directory at the repository root (one level above this package)
DT_ABSPATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.datasets')
safe_mkdirs(DT_ABSPATH)

from datasets.dataset_parser import DatasetParser
from datasets.sid import Sid
from datasets.lapsbm import LapsBM
from datasets.voxforge import VoxForge
from datasets.cslu import CSLU
from datasets.dummy import Dummy
from datasets.brsd import BRSD

--------------------------------------------------------------------------------
/datasets/brsd.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datasets import DatasetParser
from datasets import LapsBM

from utils.generic_utils import get_from_module


class BRSD(DatasetParser):
    """ Brazilian Portuguese Speech dataset reader and parser

    This dataset is a combination of four smaller datasets (voxforge,
    lapsbm, sid, and cslu spoltech port). The dataset was divided in the
    following way:
        * Train: voxforge, sid, and cslu spoltech port
        * Valid: 5 women and 15 men from LapsBM
        * Test: 5 women and 10 men from LapsBM (with no overlap with the
        valid set in either speaker or utterance spoken)

    After cleaning (removing labels with zero length or with numeric
    digits, e.g., 4 instead of four) the training set contains 11702
    utterances from 425 speakers.

    """

    def __init__(self, dataset_dir=None, name='brsd', **kwargs):

        dataset_dir = dataset_dir or {'lapsbm': None,
                                      'voxforge': None,
                                      'sid': None,
                                      'cslu': None}

        super(BRSD, self).__init__(dataset_dir, name, **kwargs)

    @property
    def dataset_dir(self):
        """Filepath to the dataset directory"""
        return self._dataset_dir

    @dataset_dir.setter
    def dataset_dir(self, value):
        """Filepath to the dataset directory"""

        if value is None:
            raise ValueError("You must set the variable dataset_dir"
                             " (the location of dataset) before continue")

        if not isinstance(value, dict):
            raise ValueError("dataset_dir must be a dictionary")

        for key in ('lapsbm', 'voxforge', 'sid'):
            if key not in value:
                raise KeyError("dataset_dir must have the key %s" % key)

        if 'cslu' not in value:
            self._logger.warning('CSLU not found. Ignoring it.')

        self._dataset_dir = value

    def _iter(self):

        for name, path in self.dataset_dir.items():

            if name == 'lapsbm':
                continue

            try:
                dataset_cls = get_from_module('datasets*', name, regex=True)
                dataset = dataset_cls(dataset_dir=path)

                for d in dataset._iter():
                    yield {'duration': d['duration'],
                           'input': d['input'],
                           'label': d['label'],
                           'speaker': '%s_%s' % (str(dataset), d['speaker']),
                           'dataset': 'train'}
            except ValueError, e:
                self._logger.warning('Skipping dataset %s: %s' %
                                     (name, e.message))

        # Test and valid set
        lapsbm = LapsBM(dataset_dir=self.dataset_dir['lapsbm'], split=True)
        for d in lapsbm._iter():
            yield {'duration': d['duration'],
                   'input': d['input'],
                   'label': d['label'],
                   # prefix with the LapsBM parser, not a stale `dataset`
                   # variable left over from the loop above
                   'speaker': '%s_%s' % (str(lapsbm), d['speaker']),
                   'dataset': d['dataset']}

    def _report(self, dl):
        report = '''General information:
            Number of utterances: %d
            Total size (in seconds) of utterances: %.f
            Number of speakers: %d''' % (len(dl['input']),
                                         sum(dl['duration']),
                                         len(set(dl['speaker'])))

        return report

--------------------------------------------------------------------------------
/datasets/cslu.py:
--------------------------------------------------------------------------------
from datasets import DatasetParser

import os
import re
import librosa
import codecs


class CSLU(DatasetParser):
    """ CSLU Spoltech Port dataset reader and parser

    More about the dataset: https://catalog.ldc.upenn.edu/LDC2006S16
    """

    def __init__(self, dataset_dir=None, name='cslu', **kwargs):

        dataset_dir = dataset_dir or 'data/cslu'

        super(CSLU, self).__init__(dataset_dir, name, **kwargs)

    def _iter(self):
        trans_directory = os.path.join(self.dataset_dir, 'trans')

        for speaker_path in os.listdir(trans_directory):

            root_path = os.path.join(os.path.abspath(trans_directory),
                                     speaker_path)

            if not os.path.isdir(os.path.join(root_path)):
                continue

            labels_files = os.listdir(root_path)

            for labels_file in labels_files:

                label = codecs.open(
                    os.path.join(root_path, labels_file), 'r',
                    'latin-1').read().strip().lower()

                audio_file = os.path.join(os.path.abspath(self.dataset_dir),
                                          'speech', speaker_path,
                                          labels_file[:-4])

                audio_file = audio_file + '.wav'
                speaker_id = speaker_path

                try:
                    duration = librosa.audio.get_duration(filename=audio_file)
                except IOError:
                    self._logger.error('File %s not found' % audio_file)
                    continue

                yield {'duration': duration,
                       'input': audio_file,
                       'label': label,
                       'speaker': speaker_id}

    def _report(self, dl):
        report = '''General information:
            Number of utterances: %d
            Total size (in seconds) of utterances: %.f
            Number of speakers: %d''' % (len(dl['input']),
                                         sum(dl['duration']),
                                         len(set(dl['speaker'])))

        return report

--------------------------------------------------------------------------------
/datasets/dataset_generator.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

from keras.preprocessing.image import Iterator
from keras.preprocessing.sequence import pad_sequences

import scipy
import librosa
8 | import h5py 9 | import numpy as np 10 | import codecs 11 | import json 12 | import os 13 | 14 | import time 15 | 16 | from preprocessing import audio, text 17 | from utils import generic_utils as utils 18 | 19 | import logging 20 | 21 | 22 | class DatasetGenerator(object): 23 | """ Dataset generator that handles several forms of input and return an 24 | iterator over it. Only works for a CTC model 25 | 26 | # Arguments 27 | input_parser: instance of Feature [preprocessing.audio.Feature] 28 | feature that is applied to each audio file (or audio data) 29 | label_parser: instance of Parser [preprocessing.text.Parser]. 30 | parser that is applied to each label data 31 | batch_size: number of samples per batch 32 | shuffle: reordering index per epoch. This avoid some bias in training 33 | seed: default None 34 | """ 35 | 36 | def __init__(self, input_parser=None, label_parser=None, batch_size=32, 37 | shuffle=True, seed=None, mode='train'): 38 | self._logger = logging.getLogger('%s.%s' % (__name__, 39 | self.__class__.__name__)) 40 | self.input_parser = input_parser 41 | self.label_parser = label_parser 42 | self.batch_size = batch_size 43 | self.shuffle = shuffle 44 | self.seed = seed 45 | self.mode = mode 46 | 47 | def flow_from_fname(self, fname, datasets=None): 48 | """ Returns an specific iterator given the filename 49 | 50 | # Arguments 51 | datasets: str or list. If str will return one iterator; otherwise 52 | will return len(dataset) iterators for each dataset 53 | 54 | # Inputs 55 | fname: path to a file. 56 | *.h5 (HDF5 format) 57 | *json (JSON format) 58 | 59 | # Outputs 60 | If fname is: 61 | HDF5 format: H5Iterator 62 | JSON format: JSONIterator 63 | """ 64 | out = None 65 | datasets = datasets or ['/'] 66 | if type(datasets) not in (set, list): 67 | datasets = [datasets] 68 | 69 | if h5py.is_hdf5(fname): 70 | h5_f = h5py.File(fname, 'r') 71 | out = [self.flow_from_h5_group(h5_f[dataset]) 72 | for dataset in datasets] 73 | 74 | ext = os.path.splitext(fname)[1] 75 | if ext == '.json': 76 | out = [self.flow_from_json(fname, dataset) for dataset in datasets] 77 | 78 | if out is None: 79 | raise ValueError("Extension not recognized") 80 | 81 | if len(out) == 1: 82 | return out[0] 83 | return out 84 | 85 | def flow_from_json(self, fname, dataset=None): 86 | """ Returns JSONIterator given the filename""" 87 | return JSONIterator( 88 | fname, dataset, batch_size=self.batch_size, 89 | shuffle=self.shuffle, seed=self.seed, 90 | input_parser=self.input_parser, 91 | label_parser=self.label_parser, 92 | mode=self.mode) 93 | 94 | def flow_from_dl(self, dl, dataset=None): 95 | """ Return DictListIterator given a list of dictionaries. 
Each 96 | dictionary must have the keys 'input' and 'label' 97 | """ 98 | return DictListIterator(dl, dataset, batch_size=self.batch_size, 99 | shuffle=self.shuffle, seed=self.seed, 100 | input_parser=self.input_parser, 101 | label_parser=self.label_parser, 102 | mode=self.mode) 103 | 104 | def flow_from_h5_group(self, h5_group=None): 105 | """ Returns H5Iterator given a h5group from a HDF5 data 106 | """ 107 | return H5Iterator(h5_group, batch_size=self.batch_size, 108 | shuffle=self.shuffle, seed=self.seed, 109 | input_parser=self.input_parser, 110 | label_parser=self.label_parser, 111 | mode=self.mode) 112 | 113 | def flow_from_h5_file(self, h5_file, dataset='/'): 114 | h5_f = h5py.File(h5_file, 'r') 115 | return H5Iterator(h5_f[dataset], batch_size=self.batch_size, 116 | shuffle=self.shuffle, seed=self.seed, 117 | input_parser=self.input_parser, 118 | label_parser=self.label_parser, 119 | mode=self.mode) 120 | 121 | def flow(self, inputs, labels): 122 | return DatasetIterator(inputs, labels, batch_size=self.batch_size, 123 | shuffle=self.shuffle, seed=self.seed, 124 | input_parser=self.input_parser, 125 | label_parser=self.label_parser, 126 | mode=self.mode) 127 | 128 | 129 | class DatasetIterator(Iterator): 130 | 131 | def __init__(self, inputs, labels=None, batch_size=32, shuffle=False, 132 | seed=None, input_parser=None, label_parser=None, 133 | standarize=None, mode='train'): 134 | """ DatasetIterator iterates in a batch over a dataset and do some 135 | preprocessing on inputs and labels 136 | 137 | # Arguments 138 | inputs: a list of ndarray 139 | labels: a list of str or ndarray 140 | batch_size: size of each batch 141 | shuffle: if True after each epoch the dataset will shuffle the 142 | indexes 143 | seed: seed the random generator 144 | input_parser: instance of Feature 145 | [preprocessing.audio.Feature] 146 | feature that is applied to each ndarray in batch 147 | label_parser: instance of Parser [preprocessing.text.Parser]. 148 | parser that is applied to each label in batch 149 | standarize: if a set (mean, std), the input will be 150 | normalized 151 | mode: if 'predict', only the inputs is generated 152 | """ 153 | 154 | if labels is not None and len(inputs) != len(labels): 155 | raise ValueError('inputs and labels ' 156 | 'should have the same length. ' 157 | 'Found: len(inputs) = %s, len(labels) = %s' % 158 | (len(inputs), len(labels))) 159 | self._logger = logging.getLogger('%s.%s' % (__name__, 160 | self.__class__.__name__)) 161 | self.inputs = inputs 162 | self.labels = labels 163 | 164 | self.input_parser = input_parser 165 | self.label_parser = label_parser 166 | 167 | self.standarize = standarize 168 | self.mode = mode 169 | 170 | if self.input_parser is not None: 171 | logging.warning('Feature extractor is not None. 
It may slow down' 172 | + ' training') 173 | 174 | super(DatasetIterator, self).__init__(len(inputs), batch_size, 175 | shuffle, seed) 176 | 177 | @property 178 | def len(self): 179 | """ Return the total size of dataset 180 | """ 181 | return len(self.inputs) 182 | 183 | def next(self): 184 | """ Iterates over batches 185 | 186 | # Outputs 187 | Returns a tuple (input, output) that can be fed a CTC model 188 | input: is a list containing the inputs, labels and sequence 189 | length for the current batch 190 | output: is a list containing a vector of zeros (fake data for 191 | the decoder) and the batch labels for the decoder of a CTC 192 | model 193 | """ 194 | 195 | # Copy from DirectoryIterator from keras 196 | with self.lock: 197 | index_array, current_index, current_batch_size = next( 198 | self.index_generator) 199 | 200 | index_array.sort() 201 | 202 | index_array_list = index_array.tolist() 203 | 204 | batch_inputs, batch_inputs_len = self._make_in( 205 | self.inputs[index_array_list], current_batch_size) 206 | 207 | if self.labels is not None: 208 | batch_labels = self._make_out(self.labels[index_array_list], 209 | current_batch_size) 210 | else: 211 | batch_labels = None 212 | 213 | return self._make_in_out(batch_inputs, batch_labels, batch_inputs_len) 214 | 215 | def _make_in_out(self, batch_inputs, batch_labels, batch_inputs_len=None): 216 | # if label is not provided output is not necessary 217 | if batch_labels is None: 218 | return [batch_inputs, batch_inputs_len] 219 | 220 | return ([batch_inputs, batch_labels, batch_inputs_len], 221 | [np.zeros((batch_inputs.shape[0],)), batch_labels]) 222 | 223 | def _make_in(self, inputs, batch_size=None): 224 | if self.input_parser is not None: 225 | inputs = np.asarray([self.input_parser(i) for i in inputs]) 226 | 227 | batch_inputs = pad_sequences(inputs, dtype='float32', padding='post') 228 | 229 | if self.standarize: 230 | mean, std = self.standarize 231 | batch_inputs -= mean 232 | batch_inputs /= (std + self.eps) 233 | 234 | batch_inputs_len = np.asarray([i.shape[0] for i in inputs]) 235 | return batch_inputs, batch_inputs_len 236 | 237 | def _make_out(self, labels, batch_size=None): 238 | if self.labels is None or self.mode == 'predict': 239 | return None 240 | 241 | if self.label_parser is not None: 242 | labels = [self.label_parser(l) for l in labels] 243 | 244 | rows, cols, data = [], [], [] 245 | 246 | for row, label in enumerate(labels): 247 | cols.extend(range(len(label))) 248 | rows.extend(len(label) * [row]) 249 | data.extend(label) 250 | 251 | return scipy.sparse.coo_matrix((data, (rows, cols)), dtype='int32') 252 | 253 | 254 | class H5Iterator(DatasetIterator): 255 | 256 | def __init__(self, h5group, **kwargs): 257 | 258 | inputs = h5group['inputs'] 259 | labels = h5group['labels'] 260 | 261 | if kwargs.get('label_parser') is None: 262 | raise ValueError("label_parser must be set") 263 | 264 | self.num_feats = None 265 | if 'num_feats' in inputs.attrs.keys(): 266 | self.num_feats = inputs.attrs['num_feats'] 267 | 268 | self.durations = h5group['durations'] 269 | 270 | super(H5Iterator, self).__init__(inputs, labels, **kwargs) 271 | 272 | def _make_in(self, inputs, batch_size=None): 273 | 274 | if self.num_feats is not None: 275 | inputs = [i.reshape((-1, self.num_feats)) for i in inputs] 276 | 277 | return super(H5Iterator, self)._make_in(inputs) 278 | 279 | 280 | class JSONIterator(DatasetIterator): 281 | 282 | def __init__(self, fname, dataset=None, **kwargs): 283 | 284 | self._logger = logging.getLogger('%s.%s' % 
(__name__, 285 | self.__class__.__name__)) 286 | 287 | kwargs.setdefault('input_parser', audio.raw) 288 | 289 | if kwargs.get('input_parser') is None: 290 | raise ValueError("input_parser must be set") 291 | 292 | if kwargs.get('label_parser') is None: 293 | raise ValueError("label_parser must be set") 294 | 295 | with codecs.open(fname, 'r', encoding='utf8') as f: 296 | ld = json.load(f) 297 | 298 | data = utils.ld2dl(ld) 299 | 300 | if dataset and 'dataset' not in data: 301 | self._logger.warning('No dataset key found. Falling back to None') 302 | dataset = None 303 | 304 | if dataset: 305 | inputs = np.array([i for i, d in zip( 306 | data['input'], data['dataset']) if d == dataset]) 307 | labels = np.array([l for l, d in zip( 308 | data['label'], data['dataset']) if d == dataset]) 309 | else: 310 | inputs = np.array(data['input']) 311 | labels = np.array(data['label']) 312 | 313 | super(JSONIterator, self).__init__(inputs, labels, **kwargs) 314 | 315 | self.durations = np.array(data['duration']) 316 | 317 | 318 | class DictListIterator(DatasetIterator): 319 | 320 | def __init__(self, dict_list, dataset=None, **kwargs): 321 | 322 | kwargs.setdefault('input_parser', audio.raw) 323 | 324 | if kwargs.get('input_parser') is None: 325 | raise ValueError("input_parser must be set") 326 | 327 | if kwargs.get('label_parser') is None: 328 | raise ValueError("label_parser must be set") 329 | 330 | if dataset: 331 | dict_list = self._get_by_dataset(dict_list, dataset) 332 | 333 | inputs = np.array(dict_list['audio']) 334 | labels = np.array(dict_list['label']) 335 | 336 | super(DictListIterator, self).__init__(inputs, labels, **kwargs) 337 | 338 | self.durations = np.array(dict_list['duration']) 339 | 340 | def _get_by_dataset(self, dl, dataset): 341 | mask = [i for i, d in enumerate(dl['dataset']) if d == dataset] 342 | return {k: np.array(v)[mask] for k, v in dl.iteritems() 343 | if k != 'dataset'} 344 | -------------------------------------------------------------------------------- /datasets/dataset_parser.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import codecs 5 | import json 6 | 7 | import logging 8 | import h5py 9 | 10 | import numpy as np 11 | 12 | from preprocessing import audio, text 13 | from datasets import DT_ABSPATH 14 | from utils.generic_utils import safe_mkdirs, ld2dl 15 | 16 | 17 | class DatasetParser(object): 18 | '''Read data from directory and parser in a proper format 19 | ''' 20 | 21 | def __init__(self, dataset_dir, name=None): 22 | self._logger = logging.getLogger('%s.%s' % (__name__, 23 | self.__class__.__name__)) 24 | self.dataset_dir = dataset_dir 25 | self._name = name 26 | 27 | self.default_output_dir = os.path.join(DT_ABSPATH, self.name) 28 | 29 | @property 30 | def dataset_dir(self): 31 | """Filepath to the dataset directory""" 32 | return self._dataset_dir 33 | 34 | @dataset_dir.setter 35 | def dataset_dir(self, value): 36 | if value is None: 37 | raise ValueError("You must set the variable dataset_dir (the location of dataset) before continue") 38 | 39 | if not os.path.isdir(value): 40 | raise ValueError("Dataset directory provided is not a directory") 41 | self._dataset_dir = value 42 | 43 | def _to_ld(self, label_parser=None): 44 | ''' Transform dataset in a list of dictionary 45 | ''' 46 | data = [] 47 | for d in self._iter(): 48 | if not isinstance(d, dict): 49 | raise TypeError("__loop must return a dict") 50 | 51 | for k in 
['input', 'label', 'duration']:
52 |                 if k not in d:
53 |                     raise KeyError("__loop must return a dict with %s key" % k)
54 |
55 |             if not self._is_valid_label(d['label'], label_parser=label_parser):
56 |                 self._logger.warning(u'File %s has a forbidden label: "%s". Skipping', d['input'], d['label'])
57 |                 continue
58 |
59 |             data.append(d)
60 |         return data
61 |
62 |     def to_json(self, fname=None, override=False):
63 |         ''' Parse the entire dataset to a list of dictionaries containing at
64 |         least three keys:
65 |             `input`: path to audio file
66 |             `duration`: length of the audio
67 |             `label`: transcription of the audio
68 |         '''
69 |         fname = fname or os.path.join(
70 |             self.default_output_dir, 'data.json')
71 |
72 |         if os.path.exists(fname) and override:
73 |             os.remove(fname)
74 |
75 |         if not os.path.isdir(os.path.split(fname)[0]):
76 |             safe_mkdirs(os.path.split(fname)[0])
77 |
78 |         data = self._to_ld()
79 |
80 |         with codecs.open(fname, 'w', encoding='utf8') as f:
81 |             json.dump(data, f)
82 |
83 |         self._logger.info(self._report(ld2dl(data)))
84 |
85 |         return fname
86 |
87 |     def to_h5(self, fname=None, input_parser=audio.raw, label_parser=None,
88 |               split_sets=True, override=False):
89 |         ''' Generates an HDF5 file for the dataset.
90 |         Note that this function will calculate the features rather than store
91 |         the path to the audio file
92 |
93 |         Args
94 |             split_sets: if True and the dataset is split in several sets (e.g.
95 |             train, valid, test) the h5 file will create the corresponding
96 |             datasets; otherwise no dataset is created
97 |         '''
98 |         if not issubclass(input_parser.__class__, audio.Feature):
99 |             raise TypeError("input_parser must be an instance of audio.Feature")
100 |
101 |         fname = fname or os.path.join(self.default_output_dir, 'data.h5')
102 |
103 |         if h5py.is_hdf5(fname) and override:
104 |             os.remove(fname)
105 |
106 |         if not os.path.isdir(os.path.split(fname)[0]):
107 |             safe_mkdirs(os.path.split(fname)[0])
108 |
109 |         feat_name = str(input_parser)
110 |
111 |         data = self._to_ld(label_parser=label_parser)
112 |
113 |         if len(data) == 0:
114 |             raise IndexError("Data is empty")
115 |
116 |         datasets = ['/']
117 |         if 'dataset' in data[0]:
118 |             datasets = list(set([d['dataset'] for d in data]))
119 |
120 |         self._logger.info('Opening %s', fname)
121 |         with h5py.File(fname) as f:
122 |
123 |             # create all datasets
124 |             for dataset in datasets:
125 |
126 |                 group = f['/']
127 |                 if dataset != '/':
128 |                     group = f.create_group(dataset)
129 |
130 |                 inputs = group.create_dataset(
131 |                     'inputs', (0,), maxshape=(None,),
132 |                     dtype=h5py.special_dtype(vlen=np.dtype('float32')))
133 |
134 |                 if input_parser.num_feats:
135 |                     inputs.attrs['num_feats'] = input_parser.num_feats
136 |
137 |                 group.create_dataset(
138 |                     'labels', (0,), maxshape=(None,),
139 |                     dtype=h5py.special_dtype(vlen=unicode))
140 |
141 |                 group.create_dataset(
142 |                     'durations', (0,), maxshape=(None,))
143 |
144 |             for i, d in enumerate(data):
145 |
146 |                 dataset = '/'
147 |                 if dataset not in datasets:
148 |                     dataset = d['dataset']
149 |
150 |                 # HDF5 pointers
151 |                 inputs = f[dataset]['inputs']
152 |                 labels = f[dataset]['labels']
153 |                 durations = f[dataset]['durations']
154 |
155 |                 # Data
156 |                 input_ = input_parser(d['input'])
157 |                 label = d['label']
158 |                 duration = d['duration']
159 |
160 |                 inputs.resize(inputs.shape[0] + 1, axis=0)
161 |                 inputs[inputs.shape[0] - 1] = input_.flatten().astype('float32')
162 |
163 |                 labels.resize(labels.shape[0] + 1, axis=0)
164 |                 labels[labels.shape[0] - 1] = label.encode('utf8')
165 |
166 |                 durations.resize(durations.shape[0] + 1,
axis=0) 167 | durations[durations.shape[0] - 1] = duration 168 | 169 | # Flush to disk only when it reaches 128 samples 170 | if i % 128 == 0: 171 | self._logger.info('%d/%d done.' % (i, len(data))) 172 | f.flush() 173 | 174 | f.flush() 175 | self._logger.info('%d/%d done.' % (len(data), len(data))) 176 | 177 | return fname 178 | 179 | def _iter(self): 180 | raise NotImplementedError("_iter must be implemented") 181 | 182 | def _report(self, dl): 183 | """ 184 | Args 185 | dl: dictionary of list, where the keys were defined in _iter() 186 | """ 187 | raise NotImplementedError("_report must be implemented") 188 | 189 | def _is_valid_label(self, label, label_parser=None): 190 | if len(label) == 0: 191 | return False 192 | 193 | if label_parser is not None: 194 | return label_parser.is_valid(label) 195 | 196 | return True 197 | 198 | @property 199 | def name(self): 200 | return self._name 201 | 202 | def __str__(self): 203 | return self.name 204 | -------------------------------------------------------------------------------- /datasets/dummy.py: -------------------------------------------------------------------------------- 1 | from datasets import DatasetParser 2 | 3 | import os 4 | import re 5 | import librosa 6 | import codecs 7 | import tempfile 8 | 9 | import numpy as np 10 | 11 | 12 | class Dummy(DatasetParser): 13 | """ Fake dataset reader and parser to do some tests 14 | 15 | # Arguments 16 | num_speakers: number of speakers 17 | num_utterances_per_speaker: number of utterances that each speaker will 18 | have 19 | max_duration: max duration in seconds of each fake audio 20 | min_duration: min duration in seconds of each fake audio 21 | max_label_length: max size of each fake label 22 | fs: sampling frequency of each fake audio 23 | split: list with two values. It will divide this dataset in three sets 24 | (train, valid and test) given the proportions 25 | """ 26 | 27 | def __init__(self, dataset_dir=None, num_speakers=10, 28 | num_utterances_per_speaker=10, 29 | max_duration=10.0, min_duration=1.0, max_label_length=50, 30 | fs=16e3, split=None, name='dummy', **kwargs): 31 | ''' 32 | Args: 33 | split: list or nparray of size 2 that splits the data between 34 | train, valid and test. 
example: split = [.8, .15] = 80% train, 15%
35 |             valid and 5% test
36 |         '''
37 |
38 |         super(Dummy, self).__init__(None, name, **kwargs)
39 |
40 |         self.num_speakers = num_speakers
41 |         self.num_utterances_per_speaker = num_utterances_per_speaker
42 |         self.max_duration = max_duration
43 |         self.min_duration = min_duration
44 |         self.fs = fs
45 |         self.max_label_length = max_label_length
46 |         self.split = split
47 |
48 |         if split is not None and (len(split) != 2 or np.sum(split) > 1.):
49 |             raise ValueError('Split must have len = 2 and must sum <= 1')
50 |
51 |     @property
52 |     def dataset_dir(self):
53 |         """Filepath to the dataset directory"""
54 |         return self._dataset_dir
55 |
56 |     @dataset_dir.setter
57 |     def dataset_dir(self, value):
58 |         self._dataset_dir = value
59 |
60 |     def _iter(self):
61 |
62 |         counter = 0
63 |         total = self.num_speakers * self.num_utterances_per_speaker
64 |
65 |         for speaker in range(self.num_speakers):
66 |             for utterance in range(self.num_utterances_per_speaker):
67 |
68 |                 duration = np.random.uniform(low=self.min_duration,
69 |                                              high=self.max_duration)
70 |
71 |                 samples = np.floor(duration * self.fs)
72 |                 audio = np.random.randn(int(samples))
73 |
74 |                 audio_file = tempfile.NamedTemporaryFile(delete=False)
75 |                 audio_fname = audio_file.name
76 |                 audio_file.close()
77 |
78 |                 librosa.output.write_wav(audio_fname, audio, int(self.fs))
79 |
80 |                 label = np.random.randint(
81 |                     low=ord('a'), high=ord('z'),
82 |                     size=(np.random.randint(2, self.max_label_length),))
83 |
84 |                 label = ''.join([chr(l) for l in label])
85 |
86 |                 data = {'duration': duration,
87 |                         'input': audio_fname,
88 |                         'label': label,
89 |                         'speaker': 'speaker_%d' % speaker}
90 |
91 |                 if self.split is not None:
92 |                     if counter < np.floor(self.split[0] * total):
93 |                         dataset = 'train'
94 |                     elif counter < np.floor(np.sum(self.split) * total):
95 |                         dataset = 'valid'
96 |                     else:
97 |                         dataset = 'test'
98 |
99 |                     data['dataset'] = dataset
100 |                 counter += 1
101 |
102 |                 yield data
103 |
104 |     def _report(self, dl):
105 |         report = '''General information
106 |         Number of utterances: %d
107 |         Total size (in seconds) of utterances: %.f
108 |         Number of speakers: %d''' % (len(dl['input']),
109 |                                      sum(dl['duration']),
110 |                                      len(set(dl['speaker'])))
111 |
112 |         return report
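The Dummy parser makes it cheap to smoke-test the full pipeline (parse, write HDF5, iterate batches) without downloading a corpus. A minimal sketch of that round trip; the scratch path and the choice of MFCC features (with default constructor arguments) are illustrative, not something the repo prescribes:

import tempfile

from datasets.dummy import Dummy
from datasets.dataset_generator import DatasetGenerator
from preprocessing import audio, text

# 2 speakers x 3 fake utterances; split = 80% train / 15% valid / 5% test
dummy = Dummy(num_speakers=2, num_utterances_per_speaker=3, split=[.8, .15])
h5_path = tempfile.mktemp(suffix='.h5')  # arbitrary scratch file
dummy.to_h5(fname=h5_path, input_parser=audio.MFCC(),
            label_parser=text.simple_char_parser, override=True)

gen = DatasetGenerator(input_parser=None,  # features already stored in the h5
                       label_parser=text.simple_char_parser, batch_size=2)
train_flow = gen.flow_from_fname(h5_path, datasets='train')
inputs, outputs = train_flow.next()  # CTC-style ([x, y, x_len], [zeros, y])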
-------------------------------------------------------------------------------- /datasets/lapsbm.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 |
9 | class LapsBM(DatasetParser):
10 |     """ Laps benchmark version 1.4 dataset reader and parser
11 |
12 |     More about this dataset: http://www.laps.ufpa.br/falabrasil/downloads.php
13 |     """
14 |
15 |     version = '1.4'
16 |
17 |     # Random separation of LAPSBM1.4 dataset in validation and test if required
18 |     # 5 women, 10 men
19 |     _test_speaker_id = [3, 11, 13, 17, 12,
20 |                         33, 5, 22, 16, 8,
21 |                         4, 0, 20, 10, 9]
22 |
23 |     # 5 women, 15 men
24 |     _valid_speaker_id = [29, 32, 14, 31, 25,
25 |                          23, 19, 26, 6, 2,
26 |                          24, 15, 1, 21, 28,
27 |                          30, 34, 27, 18, 7]
28 |
29 |     def __init__(self, dataset_dir=None, name='lapsbm', split=False, **kwargs):
30 |
31 |         dataset_dir = dataset_dir or 'data/lapsbm'
32 |
33 |         self._split = split
34 |
35 |         super(LapsBM, self).__init__(dataset_dir, name, **kwargs)
36 |
37 |     def _iter(self):
38 |         for speaker_path in os.listdir(self.dataset_dir):
39 |
40 |             root_path = os.path.join(os.path.abspath(self.dataset_dir),
41 |                                      speaker_path)
42 |
43 |             if not os.path.isdir(root_path):
44 |                 continue
45 |
46 |             label_files = [f for f in os.listdir(root_path)
47 |                            if '.txt' in f.lower()]
48 |
49 |             for label_file in label_files:
50 |
51 |                 label = ' '.join(
52 |                     codecs.open(
53 |                         os.path.join(root_path, label_file), 'r',
54 |                         encoding='utf8')
55 |                     .read().strip().split(' ')).lower()
56 |
57 |                 audio_file = os.path.join(root_path,
58 |                                           "%s.wav" % (label_file[:-4]))
59 |                 gender_speaker = speaker_path.split('-')[1]
60 |                 gender = gender_speaker[0].lower()
61 |                 speaker_id = gender_speaker[1:]
62 |
63 |                 try:
64 |                     duration = librosa.get_duration(filename=audio_file)
65 |                 except IOError:
66 |                     self._logger.error('File %s not found' % audio_file)
67 |                     continue
68 |
69 |                 dataset = 'valid'
70 |                 if int(speaker_id) in self._test_speaker_id:
71 |                     dataset = 'test'
72 |
73 |                 data = {'duration': duration,
74 |                         'input': audio_file,
75 |                         'label': label,
76 |                         'gender': gender,
77 |                         'speaker': speaker_id}
78 |
79 |                 if self._split:
80 |                     data['dataset'] = dataset
81 |
82 |                 yield data
83 |
84 |     def _report(self, dl):
85 |         report = '''General information:
86 |         Number of utterances: %d
87 |         Total size (in seconds) of utterances: %.f
88 |         Number of speakers: %d
89 |         %% of female speaker: %.2f%%''' \
90 |             % (len(dl['input']), sum(dl['duration']), len(set(dl['speaker'])),
91 |                100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 |                       (1.0 * len(dl['gender']))))
93 |
94 |         return report
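Because BRSD borrows its validation and test sets from LapsBM's frozen speaker lists above, it is worth sanity-checking the split after downloading the corpus. A quick sketch, assuming the data is already under data/lapsbm:

from collections import Counter

from datasets.lapsbm import LapsBM

laps = LapsBM(dataset_dir='data/lapsbm', split=True)
print(Counter(d['dataset'] for d in laps._iter()))
# Expected: every utterance tagged 'valid' or 'test', never 'train'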
-------------------------------------------------------------------------------- /datasets/sid.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import argparse
6 | import fnmatch
7 | import librosa
8 | import codecs
9 |
10 | import numpy as np
11 |
12 | from shutil import copyfile
13 |
14 | regex = r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<schooling>.*)[\n]+"
15 |
16 |
17 | class Sid(DatasetParser):
18 |     """ Sid dataset reader and parser
19 |     """
20 |
21 |     def __init__(self, dataset_dir=None, name='sid', **kwargs):
22 |
23 |         dataset_dir = dataset_dir or 'data/sid'
24 |
25 |         super(Sid, self).__init__(dataset_dir, name, **kwargs)
26 |
27 |     def _iter(self):
28 |         for speaker_path in os.listdir(self.dataset_dir):
29 |
30 |             root_path = os.path.join(os.path.abspath(self.dataset_dir),
31 |                                      speaker_path)
32 |
33 |             if not os.path.isdir(root_path):
34 |                 continue
35 |
36 |             labels_file = os.path.join(root_path, 'prompts.txt')
37 |
38 |             speaker_info_file = os.path.join(root_path, 'speaker.txt')
39 |
40 |             with open(speaker_info_file) as f:
41 |                 info_text = f.read()
42 |
43 |             pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
44 |
45 |             info = list(re.finditer(pattern, info_text))[0].groupdict()
46 |
47 |             gender = info['gender'][0].lower()
48 |             speaker_id = speaker_path.lower()
49 |
50 |             try:
51 |                 age = int(info['age'])
52 |             except ValueError:
53 |                 self._logger.error('age %s could not be converted to int.',
54 |                                    info['age'])
55 |                 age = 0
56 |
57 |             for line in codecs.open(labels_file, 'r', encoding='utf8'):
58 |
59 |                 split = line.strip().split('=')
60 |                 file_id = int(split[0])
61 |
62 |                 label = split[1].lower()
63 |
64 |                 audio_file = os.path.join(
65 |                     root_path, "%s%03d" % (speaker_path, file_id)) + '.wav'
66 |
67 |                 try:
68 |                     duration = librosa.get_duration(filename=audio_file)
69 |                 except IOError:
70 |                     self._logger.error('File %s not found' % audio_file)
71 |                     continue
72 |
73 |                 yield {'duration': duration,
74 |                        'input': audio_file,
75 |                        'label': label,
76 |                        'gender': gender,
77 |                        'speaker': speaker_id,
78 |                        'age': age}
79 |
80 |     def _report(self, dl):
81 |         args = (len(dl['input']), sum(dl['duration']),
82 |                 len(set(dl['speaker'])),
83 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
84 |                        (1.0 * len(dl['gender']))),
85 |                 min([a for a in dl['age'] if a != 0]),
86 |                 max(dl['age']), np.mean([a for a in dl['age'] if a != 0]))
87 |
88 |         report = '''General information
89 |         Number of utterances: %d
90 |         Total size (in seconds) of utterances: %.f
91 |         Number of speakers: %d
92 |         %% of female speaker: %.2f%%
93 |         age range: from %d to %d. Mean: %.f''' % args
94 |
95 |         return report
96 |
97 |
98 | if __name__ == '__main__':
99 |     """ Script to fix some errors in the sid dataset's folder naming
100 |     convention and in some transcriptions
101 |     """
102 |     parser = argparse.ArgumentParser()
103 |     parser.add_argument('data_directory', type=str,
104 |                         help='Path to data directory')
105 |     parser.add_argument('output_directory', type=str,
106 |                         help='Path to output directory')
107 |     args = parser.parse_args()
108 |
109 |     data_directory = args.data_directory
110 |     output_directory = args.output_directory
111 |
112 |     # fix wav filenames
113 |     matches = []
114 |     for root, dirnames, filenames in os.walk(data_directory):
115 |         for filename in fnmatch.filter(filenames, '*.[Ww][Aa][Vv]'):
116 |             filepath = os.path.join(root, filename)
117 |             number = "%03d" % int(filename[-7:-4])
118 |             prefix = filepath.split(os.path.sep)[-2]
119 |
120 |             new_filename = "%s%s" % (prefix, number) + '.wav'
121 |             new_filepath = os.path.join(output_directory, root, new_filename)
122 |
123 |             if not os.path.exists(os.path.join(output_directory, root)):
124 |                 os.makedirs(os.path.join(output_directory, root))
125 |
126 |             copyfile(filepath, new_filepath)
127 |
128 |     for root, dirnames, filenames in os.walk(data_directory):
129 |         for filename in fnmatch.filter(filenames, '*.[tT][xX][tT]'):
130 |             filepath = os.path.join(root, filename)
131 |
132 |             if filename.lower().startswith('texto'):
133 |                 filename = 'prompts.txt'
134 |
135 |             new_filepath = os.path.join(output_directory,
136 |                                         root, filename.lower())
137 |             copyfile(filepath, new_filepath)
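Both Sid above and VoxForge below recover speaker metadata from loosely formatted description files with a single named-group regex and groupdict(). The idiom in isolation, applied to a made-up speaker file (the sample text and group names here are illustrative):

import re

sample = u"Nome=Maria\nIdade=31\nCidade=Recife\nSexo=Feminino\nEscolaridade=Superior\n"
pattern = re.compile(
    r"Nome=(?P<name>.*)[\n]+Idade=(?P<age>.*)[\n]+.*[\n]+"
    r"Sexo=(?P<gender>.*)[\n]+Escolaridade=(?P<schooling>.*)[\n]+",
    re.MULTILINE | re.UNICODE)

# .* does not cross newlines, so the anonymous .*[\n]+ skips exactly one
# unwanted line (here Cidade=...), mirroring what the parsers above do.
info = list(re.finditer(pattern, sample))[0].groupdict()
print(info['gender'][0].lower(), int(info['age']))  # f 31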
-------------------------------------------------------------------------------- /datasets/voxforge.py: --------------------------------------------------------------------------------
1 | from datasets import DatasetParser
2 |
3 | import os
4 | import re
5 | import librosa
6 | import codecs
7 |
8 | regex = r"User\s+Name\:[\s]*(?P<speaker>.*)[\n]+.*[\n]+Gender\:[\s]*(?P<gender>[a-zA-Z]+)[\w\r\s\n:\/]+Pronunciation dialect\:\s+(?P<dialect>.*)"
9 |
10 |
11 | class VoxForge(DatasetParser):
12 |     """ VoxForge (only Brazilian Portuguese audio files) dataset reader and parser
13 |
14 |     More about the dataset: http://www.voxforge.org/
15 |     """
16 |
17 |     IGNORED_LIST = ['Marcelo-20131106-iqc',
18 |                     'anonymous-20140619-wcy',
19 |                     'ThiagoCastro-20131129-qpn',
20 |                     'anonymous-20131016-uzv']
21 |
22 |     def __init__(self, dataset_dir=None, name='voxforge', **kwargs):
23 |
24 |         dataset_dir = dataset_dir or 'data/voxforge'
25 |
26 |         super(VoxForge, self).__init__(dataset_dir, name, **kwargs)
27 |
28 |         if (self.dataset_dir is not None and
29 |                 os.path.isdir(os.path.join(self.dataset_dir, 'files'))):
30 |
31 |             self.dataset_dir = os.path.join(self.dataset_dir, 'files')
32 |
33 |     def _iter(self):
34 |         for speaker_path in os.listdir(self.dataset_dir):
35 |
36 |             if speaker_path in self.IGNORED_LIST:
37 |                 continue
38 |
39 |             root_path = os.path.join(
40 |                 os.path.abspath(self.dataset_dir), speaker_path)
41 |
42 |             if not os.path.isdir(root_path):
43 |                 continue
44 |
45 |             labels_file = os.path.join(root_path, 'etc', 'PROMPTS')
46 |
47 |             if not os.path.exists(labels_file):
48 |                 labels_file = os.path.join(root_path, 'PROMPTS')
49 |
50 |             speaker_info_file = os.path.join(root_path, 'etc', 'README')
51 |
52 |             if not os.path.exists(speaker_info_file):
53 |                 speaker_info_file = os.path.join(root_path, 'README')
54 |
55 |             with open(speaker_info_file) as f:
56 |                 info_text = f.read()
57 |
58 |             pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
59 |
60 |             info = list(re.finditer(pattern, info_text))[0].groupdict()
61 |
62 |             gender = info['gender'][0].lower()
63 |             speaker_id = info['speaker']
64 |
65 |             for line in codecs.open(labels_file, 'r', encoding='utf8'):
66 |                 split = line.strip().split()
67 |                 file_id = split[0].split('/')[-1]
68 |
69 |                 label = ' '.join(split[1:]).lower()
70 |
71 |                 audio_file = os.path.join(root_path, 'wav', file_id) + '.wav'
72 |
73 |                 if not os.path.exists(audio_file):
74 |                     audio_file = os.path.join(root_path, file_id) + '.wav'
75 |
76 |                 try:
77 |                     duration = librosa.get_duration(filename=audio_file)
78 |                 except IOError:
79 |                     self._logger.error('File %s not found' % audio_file)
80 |                     continue
81 |
82 |                 yield {'duration': duration,
83 |                        'input': audio_file,
84 |                        'label': label,
85 |                        'gender': gender,
86 |                        'speaker': speaker_id}
87 |
88 |     def _report(self, dl):
89 |         args = (len(dl['input']), sum(dl['duration']),
90 |                 len(set(dl['speaker'])),
91 |                 100 * (sum([1 for g in dl['gender'] if g == 'f']) /
92 |                        (1.0 * len(dl['gender']))),
93 |                 100 * (sum([1 for s in dl['speaker'] if s == 'anonymous']) /
94 |                        (1.0 * len(dl['speaker']))))
95 |
96 |         report = '''General information
97 |         Number of utterances: %d
98 |         Total size (in seconds) of utterances: %.f
99 |         Number of speakers: %d
100 |         %% of female speaker: %.2f%%
101 |         Anonymous speaker: %.2f%%''' % args
102 |
103 |         return report
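A detail shared by eval.py below and predict.py further down: the arguments used at training time are reloaded from the model's meta, and only flags the user explicitly passed on the command line override them (that is what parse_nondefault_args computes). The merge itself, sketched with plain dicts standing in for the real argparse output (the specific values are made up):

from utils.hparams import HParams

training_args = {'batch_size': 32, 'label_parser': 'simple_char_parser'}
nondefault = {'batch_size': 8}  # e.g. the user passed --batch_size 8

args = HParams(**training_args).update(nondefault)
# args.batch_size is now 8 while args.label_parser keeps its trained value,
# exactly the precedence the scripts below rely on.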
-------------------------------------------------------------------------------- /eval.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import codecs
7 | import json
8 | import numpy as np
9 | # Preventing pool_allocator message
10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
11 |
12 | import argparse
13 | import h5py
14 | import inspect
15 |
16 | from preprocessing import audio, text
17 |
18 | from utils import generic_utils as utils
19 | from utils.hparams import HParams
20 |
21 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
22 |
23 | from utils.core_utils import setup_gpu, load_model
24 |
25 | if __name__ == '__main__':
26 |     parser = argparse.ArgumentParser(description='Evaluating an ASR system.')
27 |
28 |     parser.add_argument('--model', required=True, type=str)
29 |     parser.add_argument('--dataset', required=True, type=str)
30 |     parser.add_argument('--subset', type=str, default='test')
31 |
32 |     parser.add_argument('--batch_size', default=32, type=int)
33 |
34 |     # Features generation (if necessary)
35 |     parser.add_argument('--input_parser', type=str, default=None)
36 |     parser.add_argument('--input_parser_params', nargs='+', default=[])
37 |
38 |     # Label generation (if necessary)
39 |     parser.add_argument('--label_parser', type=str,
40 |                         default='simple_char_parser')
41 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
42 |
43 |     # Other configs
44 |     parser.add_argument('--gpu', default='0', type=str)
45 |     parser.add_argument('--allow_growth', default=False, action='store_true')
46 |
47 |     parser.add_argument('--save_transcriptions', default=None, type=str)
48 |
49 |     args = parser.parse_args()
50 |     args_nondefault = utils.parse_nondefault_args(
51 |         args, parser.parse_args(
52 |             ['--model', args.model, '--dataset', args.dataset]))
53 |
54 |     # GPU configuration
55 |     setup_gpu(args.gpu, args.allow_growth)
56 |
57 |     # Loading model
58 |     model, meta = load_model(args.model, return_meta=True, mode='eval')
59 |
60 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
61 |
62 |     # Features extractor
63 |     input_parser = utils.get_from_module('preprocessing.audio',
64 |                                          args.input_parser,
65 |                                          params=args.input_parser_params)
66 |
67 |     # Recovering text parser
68 |     label_parser = utils.get_from_module('preprocessing.text',
69 |                                          args.label_parser,
70 |                                          params=args.label_parser_params)
71 |
72 |     data_gen = DatasetGenerator(input_parser, label_parser,
73 |                                 batch_size=args.batch_size, seed=0)
74 |     test_flow = data_gen.flow_from_fname(args.dataset, datasets=args.subset)
75 |
76 |     metrics = model.evaluate_generator(test_flow, test_flow.len,
77 |                                        max_q_size=10, nb_worker=1)
78 |
79 |     for m, v in zip(model.metrics_names, metrics):
80 |         print('%s: %4f' % (m, v))
81 |
82 |     from keras import backend as K; K.clear_session()
-------------------------------------------------------------------------------- /extras/__init__.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | from utils.generic_utils import setup_logging
4 | setup_logging()
-------------------------------------------------------------------------------- /extras/apis.py: --------------------------------------------------------------------------------
1 | import os
2 | import speech_recognition as sr
3 |
4 | r = sr.Recognizer()
5 |
6 |
7 | def recognize_from_api(audio, api, name='API', safe=True, **kwargs):
8 |     if not isinstance(audio, sr.AudioData):
9 |         with sr.AudioFile(audio) as source:
10 |             audio = r.record(source)
11 |     try:
12 |         return api(audio, **kwargs)
13 |     except sr.UnknownValueError as e:
14 |         if not safe:
15 |             raise e
16 |         return "\t%s could not understand audio" % name
17 |     except sr.RequestError as e:
18 |         if not safe:
19 |             raise e
20 |         return ("\tCould not request results from %s "
21 |                 "service; %s" % (name, e))
22 |
23 |
24 | def recognize_google(audio,
25 |                      credentials=os.environ['GOOGLE_CLOUD_API'],
26 |                      **kwargs):
27 |
28 |     return recognize_from_api(audio, r.recognize_google_cloud,
29 |                               name='Google Cloud Speech',
30 |                               credentials_json=credentials,
31 |                               **kwargs)
32 |
33 |
34 | def recognize_bing(audio, key=os.environ['BING_API'], **kwargs):
35 |     return recognize_from_api(audio, r.recognize_bing,
36 |                               name='Microsoft Bing Voice',
37 |                               key=key, **kwargs)
38 |
39 |
40 | def recognize_ibm(audio,
41 |                   username=os.environ['IBM_USERNAME'],
42 |                   password=os.environ['IBM_PASSWORD'], **kwargs):
43 |     return recognize_from_api(audio, r.recognize_ibm,
44 |                               name='IBM Speech to Text',
45 |                               username=username, password=password,
46 |                               **kwargs)
-------------------------------------------------------------------------------- /extras/ctc_viz.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import numpy as np
4 |
5 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator
6 |
7 | from utils.core_utils
import setup_gpu, load_model 8 | 9 | from utils.hparams import HParams 10 | from utils import generic_utils as utils 11 | 12 | from preprocessing import audio, text 13 | 14 | import matplotlib 15 | import matplotlib.pyplot as plt 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.') 20 | 21 | parser.add_argument('--model', required=True, type=str) 22 | parser.add_argument('--dataset', default=None, type=str) 23 | parser.add_argument('--files', default=[], type=str, nargs='+') 24 | parser.add_argument('--labels', default=[], nargs='+', type=str) 25 | parser.add_argument('--subset', type=str, default='test') 26 | 27 | # Features generation (if necessary) 28 | parser.add_argument('--input_parser', type=str, default=None) 29 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 30 | 31 | # Label generation (if necessary) 32 | parser.add_argument('--label_parser', type=str, 33 | default='simple_char_parser') 34 | parser.add_argument('--label_parser_params', nargs='+', default=[]) 35 | 36 | # Other configs 37 | parser.add_argument('--gpu', default='0', type=str) 38 | parser.add_argument('--allow_growth', default=False, action='store_true') 39 | 40 | 41 | parser.add_argument('--plt_backend', type=str, default="Qt5Agg") 42 | 43 | parser.add_argument('--save', default=None, type=str) 44 | 45 | args = parser.parse_args() 46 | args_nondefault = utils.parse_nondefault_args( 47 | args, parser.parse_args( 48 | ['--model', args.model, '--dataset', args.dataset])) 49 | 50 | matplotlib.use(args.plt_backend) 51 | 52 | if args.dataset is None and len(args.files) == 0: 53 | raise ValueError('dataset or file args must be set.') 54 | 55 | if args.dataset and args.files: 56 | print('Both dataset and file args was set. 
Ignoring file args.')
57 |
58 |     # GPU configuration
59 |     setup_gpu(args.gpu, args.allow_growth)
60 |
61 |     # Loading model
62 |     model, meta = load_model(args.model, return_meta=True, mode='eval')
63 |
64 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
65 |
66 |     # Features extractor
67 |     input_parser = utils.get_from_module('preprocessing.audio',
68 |                                          args.input_parser,
69 |                                          params=args.input_parser_params)
70 |
71 |     # Recovering text parser
72 |     label_parser = utils.get_from_module('preprocessing.text',
73 |                                          args.label_parser,
74 |                                          params=args.label_parser_params)
75 |
76 |     if args.dataset is not None:
77 |         data_gen = DatasetGenerator(input_parser, label_parser,
78 |                                     batch_size=1, seed=0, mode='predict')
79 |         test_flow = data_gen.flow_from_fname(args.dataset,
80 |                                              datasets=args.subset)
81 |     else:
82 |         if len(args.files) == 0:
83 |             raise ValueError("files arg must be > 0")
84 |
85 |         test_flow = DatasetIterator(np.array(args.files), None,
86 |                                     input_parser=input_parser,
87 |                                     label_parser=label_parser, mode='predict')
88 |         test_flow.labels = np.array([u''] * len(args.files))
89 |
90 |         model = load_model(args.model, mode='predict', decoder=False)
91 |
92 |     results = []
93 |
94 |     plt.figure()
95 |     for index in range(test_flow.len):
96 |         prediction = model.predict(test_flow.next())
97 |
98 |         truth = label_parser._sanitize(test_flow.labels[0])
99 |
100 |         plt.plot(prediction[0, ...])
101 |         plt.show()
102 |
103 |     from keras import backend as K; K.clear_session()
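ctc_viz.py plots the raw per-frame class posteriors but never collapses them into a transcription. For reference, a minimal greedy (best-path) CTC decoder that does; this helper is not part of the repo, and it assumes the blank symbol is the last class, as in TensorFlow's CTC loss:

import itertools

import numpy as np


def greedy_ctc_decode(posteriors, blank=None):
    """ posteriors: (time, num_classes) array, one softmax row per frame.
    Best path: argmax per frame, collapse repeated symbols, drop blanks. """
    blank = posteriors.shape[1] - 1 if blank is None else blank
    best_path = np.argmax(posteriors, axis=1)
    collapsed = (k for k, _ in itertools.groupby(best_path))
    return [k for k in collapsed if k != blank]

# e.g. label_parser.imap(greedy_ctc_decode(prediction[0])) would map the
# surviving indices back to characters.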
-------------------------------------------------------------------------------- /extras/eval_apis.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import argparse
7 | import codecs
8 | import json
9 | import time
10 |
11 | from preprocessing import audio, text
12 | from utils import generic_utils as utils
13 |
14 | import apis
15 | import speech_recognition as sr
16 |
17 | if __name__ == '__main__':
18 |     parser = argparse.ArgumentParser(description='Evaluating an ASR system '
19 |                                      'over an API.')
20 |
21 |     parser.add_argument('--dataset', required=True, type=str)
22 |     parser.add_argument('--language', default='pt-BR', type=str)
23 |     parser.add_argument('--all', action='store_true', help='Evaluate over the '
24 |                         'whole dataset, not only entries whose dt key equals test.')
25 |
26 |     # Label generation (if necessary)
27 |     parser.add_argument('--label_parser', type=str,
28 |                         default='simple_char_parser')
29 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
30 |
31 |     # Other configs
32 |     parser.add_argument('--save_every', default=10, type=int)
33 |     parser.add_argument('--resume', action='store_true')
34 |     parser.add_argument('--save', default=None, type=str)
35 |     parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'],
36 |                         nargs='+')
37 |
38 |     args = parser.parse_args()
39 |
40 |     # If save is not defined, it will use the folder name of dataset location
41 |     save = args.save
42 |     if args.save is None:
43 |         save = '%s_eval_apis.json' % args.dataset.split(os.path.sep)[-2]
44 |
45 |     # Recovering text parser
46 |     label_parser = utils.get_from_module('preprocessing.text',
47 |                                          args.label_parser,
48 |                                          params=args.label_parser_params)
49 |
50 |     if not utils.check_ext(args.dataset, 'json'):
51 |         raise ValueError('dataset must be a json file')
52 |
53 |     dataset = json.load(codecs.open(args.dataset, 'r', encoding='utf8'))
54 |
55 |     if not args.all and 'dt' in dataset[0]:
56 |         dataset = [d for d in dataset if d['dt'] == 'test']
57 |
58 |     apis = {'google': apis.recognize_google,
59 |             'ibm': apis.recognize_ibm,
60 |             'microsoft': apis.recognize_bing}
61 |
62 |     eval_apis = []
63 |     if args.resume:
64 |         with codecs.open(save, 'r', encoding='utf8') as f:
65 |             eval_apis = json.load(f)
66 |
67 |     for i, data in enumerate(dataset):
68 |
69 |         if len(eval_apis) > i:
70 |             result = eval_apis[i]
71 |         else:
72 |             result = {}
73 |             result['label'] = data['label']
74 |             result['audio'] = data['audio']
75 |
76 |             if args.all and 'dt' in data:
77 |                 result['dt'] = data['dt']
78 |
79 |         for api_name in args.apis:
80 |             if api_name in result and result[api_name] != '':
81 |                 continue
82 |             try:
83 |                 result[api_name] = apis[api_name](data['audio'], safe=False,
84 |                                                   language=args.language)
85 |             except Exception as e:
86 |                 result[api_name] = ''
87 |                 print(e)
88 |
89 |         if len(eval_apis) > i:
90 |             eval_apis[i] = result
91 |         else:
92 |             eval_apis.append(result)
93 |
94 |         if ((i + 1) % args.save_every) == 0:
95 |             with codecs.open(save, 'w', encoding='utf8') as f:
96 |                 json.dump(eval_apis, f)
97 |
98 |         print('Done %d/%d' % (i + 1, len(dataset)))
99 |         time.sleep(.1)
100 |
101 |     with codecs.open(save, 'w', encoding='utf8') as f:
102 |         json.dump(eval_apis, f)
-------------------------------------------------------------------------------- /extras/make_dataset.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import argparse
4 |
5 | from utils import generic_utils as utils
6 | from utils.hparams import HParams
7 |
8 | import preprocessing
9 | import datasets
10 |
11 | if __name__ == '__main__':
12 |     parser = argparse.ArgumentParser(description='Generates a preprocessed '
13 |                                      'dataset (hdf5 file) given the path to '
14 |                                      'the dataset and the correct parser.')
15 |
16 |     parser.add_argument('--dataset_dir', type=str, default=None)
17 |     parser.add_argument('--parser', type=str, required=True)
18 |     parser.add_argument('--parser_params', nargs='+', default=[])
19 |
20 |     parser.add_argument('--output_file', type=str, default=None)
21 |
22 |     parser.add_argument('--input_parser', type=str, default=None)
23 |     parser.add_argument('--input_parser_params', nargs='+', default=[])
24 |
25 |     parser.add_argument('--label_parser', type=str,
26 |                         default=None)
27 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
28 |
29 |     parser.add_argument('--override', action='store_true')
30 |
31 |     args = parser.parse_args()
32 |
33 |     parser_cls = utils.get_from_module('datasets*',
34 |                                        args.parser,
35 |                                        regex=True)
36 |
37 |     input_parser = utils.get_from_module('preprocessing.audio',
38 |                                          args.input_parser,
39 |                                          params=args.input_parser_params)
40 |     label_parser = utils.get_from_module('preprocessing.text',
41 |                                          args.label_parser,
42 |                                          params=args.label_parser_params)
43 |
44 |     dataset = parser_cls(args.dataset_dir,
45 |                          **HParams().parse(args.parser_params).values())
46 |
47 |     output_file = dataset.to_h5(fname=args.output_file,
48 |                                 input_parser=input_parser,
49 |                                 label_parser=label_parser,
50 |                                 override=args.override)
51 |
52 |     print('Dataset %s saved at %s' % (dataset.name, output_file))
-------------------------------------------------------------------------------- /extras/print_args.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import
argparse 4 | 5 | from utils.core_utils import load_meta 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Print training arguments') 9 | parser.add_argument('--model', required=True, type=str) 10 | args = parser.parse_args() 11 | 12 | meta = load_meta(args.model) 13 | 14 | for k, v in meta['training_args'].items(): 15 | print('%s: %s' % (k, v)) 16 | -------------------------------------------------------------------------------- /extras/recognizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | # NOTE: this example requires PyAudio because it uses the Microphone class 4 | 5 | import sys 6 | import os 7 | import json 8 | import argparse 9 | import preprocessing 10 | import inspect 11 | import numpy as np 12 | 13 | import speech_recognition as sr 14 | 15 | import utils.generic_utils as utils 16 | 17 | from core.dataset_generator import DatasetIterator 18 | from utils.core_utils import setup_gpu 19 | 20 | import keras.backend as K 21 | from keras.models import Model 22 | from keras.layers import Lambda 23 | 24 | import tensorflow as tf 25 | 26 | if __name__ == "__main__": 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('source', type=str, nargs='+', default=['mic']) 30 | parser.add_argument('--language', default='pt-BR', type=str) 31 | 32 | # Custom asr 33 | parser.add_argument('--model', default=None, type=str) 34 | parser.add_argument('--gpu', default='0', type=str) 35 | parser.add_argument('--allow_growth', default=False, action='store_true') 36 | 37 | parser.add_argument('--apis', default=['google', 'ibm', 'microsoft'], nargs='+') 38 | 39 | args = parser.parse_args() 40 | 41 | r = sr.Recognizer() 42 | 43 | audios = [] 44 | if len(args.source) == 1 and args.source[0] == 'mic': 45 | # obtain audio from the microphone 46 | with sr.Microphone() as source: 47 | print("Say something! 
(language %s)" % args.language) 48 | mic_audio = r.listen(source) 49 | 50 | with tempfile.NamedTemporaryFile(delete=False) as f: 51 | f.write(mic_audio.get_wav_data()) 52 | audios.append((f.name, 'microphone')) 53 | else: 54 | for audio_fname in args.source: 55 | with sr.AudioFile(audio_fname) as source: 56 | audios.append((r.record(source), audio_fname)) 57 | # read the entire audio file 58 | 59 | if args.model is not None: 60 | setup_gpu(args.gpu, args.allow_growth) 61 | 62 | model, meta = utils.load_model(args.model, 63 | return_meta=True, 64 | mode='predict') 65 | training_args = meta['training_args'] 66 | 67 | # Features extractor 68 | input_parser = utils.get_from_module('preprocessing.audio', 69 | training_args['feats'], 70 | params=training_args['feats_params']) 71 | 72 | # Recovering text parser 73 | label_parser = utils.get_from_module('preprocessing.text', 74 | training_args['label_parser'], 75 | params=training_args['label_parser_params'] 76 | ) 77 | 78 | data_it = DatasetIterator(np.array([f for a, f in audios]), 79 | label_parser=input_parser, 80 | label_parser=label_parser) 81 | 82 | model_predictions = model.predict_generator( 83 | data_it, val_samples=len(audios)) 84 | 85 | model_predictions = [label_parser.imap(p[:(np.argmax(p == -1) or len(p))]) for p in model_predictions] 86 | 87 | for i, (audio, name) in enumerate(audios): 88 | 89 | print('Recognizing from: %s' % name) 90 | 91 | if 'google' in args.apis: 92 | rec = apis.recognize_google(audio, language=args.language) 93 | print("\tGoogle Cloud Speech:\n\t\t'%s'" % rec) 94 | 95 | if 'microsoft' in args.apis: 96 | rec = apis.recognize_bing(audio, language=args.language) 97 | print("\tMicrosoft Bing:\n\t\t'%s'" % rec) 98 | 99 | if 'ibm' in args.apis: 100 | rec = apis.recognize_ibm(audio, language=args.language) 101 | print("\tIBM Speech to Text:\n\t\t'%s'" % rec) 102 | 103 | if args.model is not None: 104 | print("\tTrained model:\n\t\t'%s'" % model_predictions[i]) 105 | -------------------------------------------------------------------------------- /extras/results2xlsx.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import argparse 5 | import yaml 6 | import numpy as np 7 | 8 | import openpyxl 9 | from openpyxl import Workbook 10 | 11 | from utils.core_utils import load_meta 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser() 15 | 16 | parser.add_argument('--folder', default='results', type=str) 17 | parser.add_argument('--del_empty_dir', action='store_true') 18 | 19 | args = parser.parse_args() 20 | 21 | metas = {} 22 | 23 | for subdir, dirs, files in os.walk(args.folder): 24 | if len(dirs): 25 | continue 26 | 27 | if not len(files): 28 | if args.del_empty_dir: 29 | print('deleting folder %s' % subdir) 30 | os.rmdir(os.path.abspath(subdir)) 31 | 32 | if 'model.h5' not in files: 33 | print('model.h5 not found in %s' % subdir) 34 | continue 35 | 36 | try: 37 | meta = load_meta(os.path.join(subdir, 'model.h5')) 38 | metas[subdir.split(os.sep)[-1]] = meta 39 | except KeyError: 40 | print('meta not found in %s' % os.path.join(subdir, 'model.h5')) 41 | 42 | training_args = list(set([arg for model in metas for arg in 43 | metas[model]['training_args']])) 44 | 45 | datasets = {} 46 | for model in metas: 47 | args = metas[model]['training_args'] 48 | meta = metas[model] 49 | 50 | try: 51 | key = args['dataset'] 52 | if type(key) in (list, set): 53 | key = key[0] 54 | key = 
-------------------------------------------------------------------------------- /extras/results2xlsx.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 |
3 | import os
4 | import argparse
5 | import yaml
6 | import numpy as np
7 |
8 | import openpyxl
9 | from openpyxl import Workbook
10 |
11 | from utils.core_utils import load_meta
12 |
13 | if __name__ == "__main__":
14 |     parser = argparse.ArgumentParser()
15 |
16 |     parser.add_argument('--folder', default='results', type=str)
17 |     parser.add_argument('--del_empty_dir', action='store_true')
18 |
19 |     args = parser.parse_args()
20 |
21 |     metas = {}
22 |
23 |     for subdir, dirs, files in os.walk(args.folder):
24 |         if len(dirs):
25 |             continue
26 |
27 |         if not len(files):
28 |             if args.del_empty_dir:
29 |                 print('deleting folder %s' % subdir)
30 |                 os.rmdir(os.path.abspath(subdir))
31 |             continue
32 |
33 |         if 'model.h5' not in files:
34 |             print('model.h5 not found in %s' % subdir)
35 |             continue
36 |
37 |         try:
38 |             meta = load_meta(os.path.join(subdir, 'model.h5'))
39 |             metas[subdir.split(os.sep)[-1]] = meta
40 |         except KeyError:
41 |             print('meta not found in %s' % os.path.join(subdir, 'model.h5'))
42 |
43 |     training_args = list(set([arg for model in metas for arg in
44 |                               metas[model]['training_args']]))
45 |
46 |     datasets = {}
47 |     for model in metas:
48 |         model_args = metas[model]['training_args']
49 |         meta = metas[model]
50 |
51 |         try:
52 |             key = model_args['dataset']
53 |             if type(key) in (list, set):
54 |                 key = key[0]
55 |             key = key.split(os.sep)[-2]
56 |         except KeyError:
57 |             key = 'unknown'
58 |
59 |         if key not in datasets:
60 |             datasets[key] = {}
61 |
62 |         datasets[key][model] = meta
63 |
64 |     wb = Workbook()
65 |
66 |     columns = ['path'] + ['epoch', 'best_val_ler'] + training_args
67 |
68 |     for name in datasets:
69 |         ws = wb.create_sheet(name)
70 |
71 |         cell_range = ws['A1':'%s1'
72 |                         % openpyxl.utils.get_column_letter(len(columns))][0]
73 |
74 |         for i, cell in zip(range(len(cell_range)), cell_range):
75 |             cell.value = columns[i]
76 |
77 |         for row, (model, meta) in enumerate(datasets[name].items(), start=2):
78 |
79 |             ws['A%d' % row] = model
80 |             for key in ('epoch', 'epochs'):
81 |                 if key in meta:
82 |                     ws['B%d' % row] = meta[key][np.argmin(meta['val_decoder_ler'])]
83 |                     break
84 |             ws['C%d' % row] = np.min(meta['val_decoder_ler'])
85 |
86 |             for arg, val in meta['training_args'].items():
87 |                 col = openpyxl.utils.get_column_letter(
88 |                     training_args.index(arg) + 4)
89 |
90 |                 if type(val) in (list, set):
91 |                     val = ', '.join(val)
92 |
93 |                 ws['%s%d' % (col, row)] = val
94 |
95 |     wb.save('results.xlsx')
-------------------------------------------------------------------------------- /imgs/best_ler.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.jpg
-------------------------------------------------------------------------------- /imgs/best_ler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_ler.pdf
-------------------------------------------------------------------------------- /imgs/best_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.jpg
-------------------------------------------------------------------------------- /imgs/best_loss.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igormq/asr-study/302fa3087cc71aec4853360638dbe2f4a59b5726/imgs/best_loss.pdf
-------------------------------------------------------------------------------- /logging.yaml: --------------------------------------------------------------------------------
1 | version: 1
2 | disable_existing_loggers: False
3 | formatters:
4 |   complete:
5 |     format: "%(asctime)s %(name)-12s %(levelname)-8s %(message)s"
6 |     datefmt: "%m-%d %H:%M"
7 |   simple:
8 |     format: "%(name)-12s: %(levelname)-8s %(message)s"
9 | handlers:
10 |   console:
11 |     class: logging.StreamHandler
12 |     level: WARNING
13 |     formatter: simple
14 |     stream: ext://sys.stdout
15 |   file_handler:
16 |     class: logging.handlers.RotatingFileHandler
17 |     level: INFO
18 |     formatter: complete
19 |     filename: info.log
20 |     maxBytes: 10485760 # 10MB
21 |     backupCount: 20
22 |     encoding: utf8
23 | root:
24 |   level: INFO
25 |   handlers: [console, file_handler]
-------------------------------------------------------------------------------- /msc.yaml: --------------------------------------------------------------------------------
1 | name: msc
2 | channels:
3 | - !!python/unicode
4 |   'defaults'
5 | dependencies:
6 | - !!python/unicode
7 |   'certifi=2016.2.28=py27_0'
8 | - !!python/unicode
9 |   'cycler=0.10.0=py27_0'
10 | - !!python/unicode
11 |   'freetype=2.5.5=2'
12 | - !!python/unicode
13 | 'functools32=3.2.3.2=py27_0' 14 | - !!python/unicode 15 | 'h5py=2.7.0=np113py27_0' 16 | - !!python/unicode 17 | 'hdf5=1.8.17=2' 18 | - !!python/unicode 19 | 'icu=54.1=0' 20 | - !!python/unicode 21 | 'jbig=2.1=0' 22 | - !!python/unicode 23 | 'jpeg=9b=0' 24 | - !!python/unicode 25 | 'libpng=1.6.30=1' 26 | - !!python/unicode 27 | 'libtiff=4.0.6=3' 28 | - !!python/unicode 29 | 'matplotlib=2.0.2=np113py27_0' 30 | - !!python/unicode 31 | 'mkl=2017.0.3=0' 32 | - !!python/unicode 33 | 'numpy=1.13.1=py27_0' 34 | - !!python/unicode 35 | 'olefile=0.44=py27_0' 36 | - !!python/unicode 37 | 'openssl=1.0.2l=0' 38 | - !!python/unicode 39 | 'pillow=4.2.1=py27_0' 40 | - !!python/unicode 41 | 'pip=9.0.1=py27_1' 42 | - !!python/unicode 43 | 'pyparsing=2.2.0=py27_0' 44 | - !!python/unicode 45 | 'pyqt=5.6.0=py27_2' 46 | - !!python/unicode 47 | 'python=2.7.13=0' 48 | - !!python/unicode 49 | 'python-dateutil=2.6.1=py27_0' 50 | - !!python/unicode 51 | 'pytz=2017.2=py27_0' 52 | - !!python/unicode 53 | 'pyyaml=3.12=py27_0' 54 | - !!python/unicode 55 | 'qt=5.6.2=2' 56 | - !!python/unicode 57 | 'readline=6.2=2' 58 | - !!python/unicode 59 | 'scipy=0.19.1=np113py27_0' 60 | - !!python/unicode 61 | 'setuptools=36.4.0=py27_1' 62 | - !!python/unicode 63 | 'sip=4.18=py27_0' 64 | - !!python/unicode 65 | 'six=1.10.0=py27_0' 66 | - !!python/unicode 67 | 'sqlite=3.13.0=0' 68 | - !!python/unicode 69 | 'subprocess32=3.2.7=py27_0' 70 | - !!python/unicode 71 | 'tk=8.5.18=0' 72 | - !!python/unicode 73 | 'wheel=0.29.0=py27_0' 74 | - !!python/unicode 75 | 'xz=5.2.3=0' 76 | - !!python/unicode 77 | 'yaml=0.1.6=0' 78 | - !!python/unicode 79 | 'zlib=1.2.11=0' 80 | - pip: 81 | - audioread==2.1.5 82 | - backports.weakref==1.0.post1 83 | - bleach==1.5.0 84 | - decorator==4.1.2 85 | - enum34==1.1.6 86 | - funcsigs==1.0.2 87 | - html5lib==0.9999999 88 | - joblib==0.11 89 | - keras==1.2.2 90 | - librosa==0.5.1 91 | - llvmlite==0.20.0 92 | - markdown==2.6.9 93 | - mock==2.0.0 94 | - numba==0.35.0 95 | - pbr==3.1.1 96 | - protobuf==3.4.0 97 | - resampy==0.2.0 98 | - scikit-learn==0.19.0 99 | - singledispatch==3.4.0.3 100 | - tensorflow==1.3.0 101 | - tensorflow-tensorboard==0.1.8 102 | - theano==0.9.0 103 | - unidecode==0.4.21 104 | - werkzeug==0.12.2 105 | prefix: !!python/unicode '/Users/igormq/miniconda2/envs/msc' 106 | 107 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import h5py 4 | import os 5 | import numpy as np 6 | import codecs 7 | 8 | from datasets.dataset_generator import DatasetGenerator, DatasetIterator 9 | 10 | from utils.core_utils import setup_gpu, load_model 11 | 12 | from utils.hparams import HParams 13 | from utils import generic_utils as utils 14 | 15 | from preprocessing import audio, text 16 | 17 | if __name__ == '__main__': 18 | 19 | parser = argparse.ArgumentParser(description='Evaluating an ASR system.') 20 | 21 | parser.add_argument('--model', required=True, type=str) 22 | parser.add_argument('--dataset', default=None, type=str) 23 | parser.add_argument('--file', default=None, type=str) 24 | parser.add_argument('--subset', type=str, default='test') 25 | 26 | # Features generation (if necessary) 27 | parser.add_argument('--input_parser', type=str, default=None) 28 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 29 | 30 | # Label generation (if necessary) 31 | parser.add_argument('--label_parser', type=str, 32 | 
default='simple_char_parser')
33 |     parser.add_argument('--label_parser_params', nargs='+', default=[])
34 |     parser.add_argument('--no_decoder', action='store_true', default=False)
35 | 
36 |     # Other configs
37 |     parser.add_argument('--gpu', default='0', type=str)
38 |     parser.add_argument('--allow_growth', default=False, action='store_true')
39 | 
40 |     parser.add_argument('--save', default=None, type=str)
41 |     parser.add_argument('--override', default=False, action='store_true')
42 | 
43 |     args = parser.parse_args()
44 |     args_nondefault = utils.parse_nondefault_args(
45 |         args, parser.parse_args(
46 |             ['--model', args.model, '--dataset', args.dataset]))
47 | 
48 |     if args.dataset is None and args.file is None:
49 |         raise ValueError('dataset or file args must be set.')
50 | 
51 |     if args.dataset and args.file:
52 |         print('Both dataset and file args were set. Ignoring the file arg.')
53 | 
54 |     # GPU configuration
55 |     setup_gpu(args.gpu, args.allow_growth)
56 | 
57 |     # Loading model
58 |     model, meta = load_model(args.model, return_meta=True,
59 |                              mode='predict', decoder=(not args.no_decoder))
60 | 
61 |     args = HParams(**meta['training_args']).update(vars(args_nondefault))
62 | 
63 |     # Features extractor
64 |     input_parser = utils.get_from_module('preprocessing.audio',
65 |                                          args.input_parser,
66 |                                          params=args.input_parser_params)
67 | 
68 |     # Recovering text parser
69 |     label_parser = utils.get_from_module('preprocessing.text',
70 |                                          args.label_parser,
71 |                                          params=args.label_parser_params)
72 | 
73 |     if args.dataset is not None:
74 |         data_gen = DatasetGenerator(input_parser, label_parser,
75 |                                     batch_size=1, seed=0, mode='predict',
76 |                                     shuffle=False)
77 |         test_flow = data_gen.flow_from_fname(args.dataset,
78 |                                              datasets=args.subset)
79 |     else:
80 |         test_flow = DatasetIterator(np.array([args.file]), None,
81 |                                     input_parser=input_parser,
82 |                                     label_parser=label_parser, mode='predict',
83 |                                     shuffle=False)
84 |         test_flow.labels = np.array([u''])
85 | 
86 |     results = []
87 |     for index in range(test_flow.len):
88 |         prediction = model.predict(test_flow.next())
89 |         if not args.no_decoder:
90 |             prediction = label_parser.imap(prediction[0])
91 |         results.append({'input': test_flow.inputs[0].tolist(), 'label': test_flow.labels[0], 'prediction': prediction})
92 |         print('Ground Truth: %s' % (label_parser._sanitize(test_flow.labels[0])))
93 |         print(' Predicted: %s\n\n' % prediction)
94 | 
95 |     if args.no_decoder and args.save is None:
96 |         raise ValueError('save param must be set if no_decoder is True')
97 | 
98 |     if args.save is not None:
99 |         if os.path.exists(args.save):
100 |             if not args.override:
101 |                 raise IOError('File %s already exists. Set --override to replace it.' % args.save)
102 |             os.remove(args.save)
103 | 
104 |         if args.no_decoder:
105 |             with h5py.File(args.save) as f:
106 |                 predictions = f.create_dataset(
107 |                     'predictions', (0,), maxshape=(None,),
108 |                     dtype=h5py.special_dtype(vlen=np.dtype('float32')))
109 |                 predictions.attrs['num_labels'] = results[0]['prediction'].shape[-1]
110 | 
111 |                 labels = f.create_dataset(
112 |                     'labels', (0,), maxshape=(None,),
113 |                     dtype=h5py.special_dtype(vlen=unicode))
114 | 
115 |                 inputs = f.create_dataset(
116 |                     'inputs', (0,), maxshape=(None,),
117 |                     dtype=h5py.special_dtype(vlen=unicode))
118 | 
119 |                 for index, result in enumerate(results):
120 | 
121 |                     label = result['label']
122 |                     prediction = result['prediction']
123 |                     input_ = result['input']
124 | 
125 |                     inputs.resize(inputs.shape[0] + 1, axis=0)
126 |                     inputs[inputs.shape[0] - 1] = input_
127 | 
128 |                     labels.resize(labels.shape[0] + 1, axis=0)
129 |                     labels[labels.shape[0] - 1] = label.encode('utf8')
130 | 
131 |                     predictions.resize(predictions.shape[0] + 1, axis=0)
132 |                     predictions[predictions.shape[0] - 1] = prediction.flatten().astype('float32')
133 | 
134 |                     # Flush to disk every 128 samples
135 |                     if index % 128 == 0:
136 |                         print('%d/%d done.' % (index, len(results)))
137 |                         f.flush()
138 | 
139 |                 f.flush()
140 |                 print('%d/%d done.' % (len(results), len(results)))
141 |         else:
142 |             with codecs.open(args.save, 'w', encoding='utf8') as f:
143 |                 json.dump(results, f)
144 | 
145 |     from keras import backend as K
146 |     K.clear_session()
147 | 
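The --no_decoder branch above appends one flattened float32 vector per utterance to resizable HDF5 datasets. A minimal sketch of reading such a file back (the path 'preds.h5' is hypothetical; each stored row was flattened from a (timesteps, num_labels) array, and num_labels is recovered from the dataset attribute written above):

    from __future__ import print_function

    import h5py
    import numpy as np

    # Sketch: read a file written by predict.py --no_decoder --save preds.h5
    with h5py.File('preds.h5', 'r') as f:
        num_labels = f['predictions'].attrs['num_labels']
        for flat, label in zip(f['predictions'], f['labels']):
            # undo the flatten done at save time: back to (timesteps, num_labels)
            probs = np.asarray(flat, dtype='float32').reshape(-1, num_labels)
            print(label, probs.shape)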
--------------------------------------------------------------------------------
/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | from .audio import MFCC, FBank, LogFbank, Raw
3 | from .text import CharParser, simple_char_parser, complex_char_parser
4 | 
--------------------------------------------------------------------------------
/preprocessing/audio.py:
--------------------------------------------------------------------------------
1 | ''' Code partially copied from python_speech_features package
2 | '''
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 | 
7 | from . import audio_utils as sigproc
8 | 
9 | import os
10 | import numpy as np
11 | import logging
12 | 
13 | from scipy import signal
14 | from scipy.fftpack import dct
15 | import librosa
16 | 
17 | 
18 | class Feature(object):
19 |     """ Base class for feature calculation.
20 |     All child classes must implement the __str__ and _call functions.
21 | 
22 |     # Arguments
23 |         fs: sampling frequency of the audio signal. If the audio does not
24 |             have this fs, it will be resampled
25 |         eps: small constant added to denominators to avoid division by zero
26 |     """
27 | 
28 |     def __init__(self, fs=16e3, eps=1e-8, stride=1, num_context=0,
29 |                  mean_norm=True, var_norm=True):
30 |         self.fs = fs
31 |         self.eps = eps
32 | 
33 |         self.mean_norm = mean_norm
34 |         self.var_norm = var_norm
35 | 
36 |         self.stride = stride
37 |         self.num_context = num_context
38 |         self._logger = logging.getLogger('%s.%s' % (__name__,
39 |                                          self.__class__.__name__))
40 | 
41 |     def __call__(self, audio):
42 |         """ Loads the audio and applies the feature transformation
43 | 
44 |         # Inputs
45 |             audio:
46 |                 if audio is a string and the file exists, the wave file will
47 |                 be loaded and resampled (if necessary) to fs
48 |                 if audio is a ndarray or list and is not empty, it will make
49 |                 the transformation without any resampling
50 | 
51 |         # Exception
52 |             TypeError if the audio type is not recognized
53 | 
54 |         """
55 |         if ((isinstance(audio, str) or isinstance(audio, unicode))
56 |                 and os.path.isfile(audio)):
57 |             audio, current_fs = librosa.load(audio, sr=None)
58 |             audio = librosa.core.resample(audio, current_fs, self.fs)
59 |             feats = self._call(audio)
60 |         elif type(audio) in (np.ndarray, list) and len(audio) > 1:
61 |             feats = self._call(audio)
62 |         else:
63 |             raise TypeError("audio type is not supported")
64 | 
65 |         return self._standarize(self._postprocessing(feats))
66 | 
67 |     def _call(self, data):
68 |         raise NotImplementedError("_call must be overridden")
69 | 
70 |     def _standarize(self, feats):
71 |         if self.mean_norm:
72 |             feats -= np.mean(feats, axis=0, keepdims=True)
73 |         if self.var_norm:
74 |             feats /= (np.std(feats, axis=0, keepdims=True) + self.eps)
75 |         return feats
76 | 
77 |     def _postprocessing(self, feats):
78 |         # Code adapted from
79 |         # https://github.com/mozilla/DeepSpeech/blob/master/util/audio.py
80 | 
81 |         # We only keep every second feature
(BiRNN stride = 2) 82 | feats = feats[::self.stride] 83 | 84 | if self.num_context == 0: 85 | return feats 86 | num_feats = feats.shape[1] 87 | 88 | train_inputs = np.array([], np.float32) 89 | train_inputs.resize((feats.shape[0], 90 | num_feats + 2*num_feats*self.num_context)) 91 | 92 | # Prepare pre-fix post fix context 93 | # (TODO: Fill empty_mfcc with MCFF of silence) 94 | empty_mfcc = np.array([]) 95 | empty_mfcc.resize((num_feats)) 96 | 97 | # Prepare train_inputs with past and future contexts 98 | time_slices = range(train_inputs.shape[0]) 99 | context_past_min = time_slices[0] + self.num_context 100 | context_future_max = time_slices[-1] - self.num_context 101 | for time_slice in time_slices: 102 | # Reminder: array[start:stop:step] 103 | # slices from indice |start| up to |stop| (not included), every 104 | # |step| 105 | # Pick up to self.num_context time slices in the past, and complete 106 | # with empty 107 | # mfcc features 108 | need_empty_past = max(0, (context_past_min - time_slice)) 109 | empty_source_past = list(empty_mfcc for empty_slots 110 | in range(need_empty_past)) 111 | data_source_past = feats[max(0, time_slice - 112 | self.num_context):time_slice] 113 | assert(len(empty_source_past) + 114 | len(data_source_past) == self.num_context) 115 | 116 | # Pick up to self.num_context time slices in the future, and 117 | # complete with empty 118 | # mfcc features 119 | need_empty_future = max(0, (time_slice - context_future_max)) 120 | empty_source_future = list(empty_mfcc 121 | for empty_slots in 122 | range(need_empty_future)) 123 | data_source_future = feats[time_slice + 1:time_slice + 124 | self.num_context + 1] 125 | 126 | assert(len(empty_source_future) + 127 | len(data_source_future) == self.num_context) 128 | 129 | if need_empty_past: 130 | past = np.concatenate((empty_source_past, data_source_past)) 131 | else: 132 | past = data_source_past 133 | 134 | if need_empty_future: 135 | future = np.concatenate((data_source_future, 136 | empty_source_future)) 137 | else: 138 | future = data_source_future 139 | 140 | past = np.reshape(past, self.num_context*num_feats) 141 | now = feats[time_slice] 142 | future = np.reshape(future, self.num_context*num_feats) 143 | 144 | train_inputs[time_slice] = np.concatenate((past, now, future)) 145 | assert(len(train_inputs[time_slice]) 146 | == num_feats + 2*num_feats*self.num_context) 147 | 148 | self._num_feats = num_feats + 2*num_feats*self.num_context 149 | 150 | return train_inputs 151 | 152 | def __str__(self): 153 | raise NotImplementedError("__str__ must be overrided") 154 | 155 | @property 156 | def num_feats(self): 157 | return self._num_feats 158 | 159 | 160 | class FBank(Feature): 161 | """Compute Mel-filterbank energy features from an audio signal. 162 | 163 | # Arguments 164 | win_len: the length of the analysis window in seconds. 165 | Default is 0.025s (25 milliseconds) 166 | win_step: the step between successive windows in seconds. 167 | Default is 0.01s (10 milliseconds) 168 | num_filt: the number of filters in the filterbank, default 40. 169 | nfft: the FFT size. Default is 512. 170 | low_freq: lowest band edge of mel filters in Hz. 171 | Default is 20. 172 | high_freq: highest band edge of mel filters in Hz. 173 | Default is 7800 174 | pre_emph: apply preemphasis filter with preemph as coefficient. 175 | 0 is no filter. Default is 0.97. 176 | win_func: the analysis window to apply to each frame. 177 | By default hamming window is applied. 
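
    # Example (illustrative)
        With the defaults (fs=16e3, win_len=0.025, win_step=0.01,
        num_filt=40), one second of 16 kHz audio is framed into
        400-sample windows every 160 samples and mapped to a
        (99, 40) matrix of filterbank energies.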
178 | """ 179 | 180 | def __init__(self, win_len=0.025, win_step=0.01, 181 | num_filt=40, nfft=512, low_freq=20, high_freq=7800, 182 | pre_emph=0.97, win_fun=signal.hamming, **kwargs): 183 | 184 | super(FBank, self).__init__(**kwargs) 185 | 186 | if high_freq > self.fs / 2: 187 | raise ValueError("high_freq must be less or equal than fs/2") 188 | 189 | self.win_len = win_len 190 | self.win_step = win_step 191 | self.num_filt = num_filt 192 | self.nfft = nfft 193 | self.low_freq = low_freq 194 | self.high_freq = high_freq or self.fs / 2 195 | self.pre_emph = pre_emph 196 | self.win_fun = win_fun 197 | self._filterbanks = self._get_filterbanks() 198 | 199 | self._num_feats = self.num_filt 200 | 201 | @property 202 | def mel_points(self): 203 | return np.linspace(self._low_mel, self._high_mel, self.num_filt + 2) 204 | 205 | @property 206 | def low_freq(self): 207 | return self._low_freq 208 | 209 | @low_freq.setter 210 | def low_freq(self, value): 211 | self._low_mel = self._hz2mel(value) 212 | self._low_freq = value 213 | 214 | @property 215 | def high_freq(self): 216 | return self._high_freq 217 | 218 | @high_freq.setter 219 | def high_freq(self, value): 220 | self._high_mel = self._hz2mel(value) 221 | self._high_freq = value 222 | 223 | def _call(self, signal): 224 | """Compute Mel-filterbank energy features from an audio signal. 225 | :param signal: the audio signal from which to compute features. Should 226 | be an N*1 array 227 | 228 | Returns: 229 | 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) 230 | containing features. Each row holds 1 feature vector. The 231 | second return value is the energy in each frame (total energy, 232 | unwindowed) 233 | """ 234 | 235 | signal = sigproc.preemphasis(signal, self.pre_emph) 236 | 237 | frames = sigproc.framesig(signal, 238 | self.win_len * self.fs, 239 | self.win_step * self.fs, 240 | self.win_fun) 241 | 242 | pspec = sigproc.powspec(frames, self.nfft) 243 | # this stores the total energy in each frame 244 | energy = np.sum(pspec, 1) 245 | # if energy is zero, we get problems with log 246 | energy = np.where(energy == 0, np.finfo(float).eps, energy) 247 | 248 | # compute the filterbank energies 249 | feat = np.dot(pspec, self._filterbanks.T) 250 | # if feat is zero, we get problems with log 251 | feat = np.where(feat == 0, np.finfo(float).eps, feat) 252 | 253 | return feat, energy 254 | 255 | def _get_filterbanks(self): 256 | """Compute a Mel-filterbank. The filters are stored in the rows, the 257 | columns correspond 258 | to fft bins. The filters are returned as an array of size nfilt * 259 | (nfft / 2 + 1) 260 | 261 | Returns: 262 | A numpy array of size num_filt * (nfft/2 + 1) containing 263 | filterbank. Each row holds 1 filter. 264 | """ 265 | 266 | # our points are in Hz, but we use fft bins, so we have to convert 267 | # from Hz to fft bin number 268 | bin = np.floor((self.nfft + 1) * self._mel2hz(self.mel_points) / 269 | self.fs) 270 | 271 | fbank = np.zeros([self.num_filt, int(self.nfft / 2 + 1)]) 272 | for j in xrange(0, self.num_filt): 273 | for i in xrange(int(bin[j]), int(bin[j + 1])): 274 | fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j]) 275 | for i in xrange(int(bin[j + 1]), int(bin[j + 2])): 276 | fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1]) 277 | return fbank 278 | 279 | def _hz2mel(self, hz): 280 | """Convert a value in Hertz to Mels 281 | 282 | Args: 283 | hz: a value in Hz. This can also be a numpy array, conversion 284 | proceeds element-wise. 285 | 286 | Returns: 287 | A value in Mels. 
If an array was passed in, an identical sized 288 | array is returned. 289 | """ 290 | return 2595 * np.log10(1 + hz / 700.0) 291 | 292 | def _mel2hz(self, mel): 293 | """Convert a value in Mels to Hertz 294 | 295 | Args: 296 | mel: a value in Mels. This can also be a numpy array, conversion 297 | proceeds element-wise. 298 | 299 | Returns: 300 | A value in Hertz. If an array was passed in, an identical sized 301 | array is returned. 302 | """ 303 | return 700 * (10**(mel / 2595.0) - 1) 304 | 305 | def __str__(self): 306 | return "fbank" 307 | 308 | 309 | class MFCC(FBank): 310 | """Compute MFCC features from an audio signal. 311 | 312 | # Arguments 313 | num_cep: the number of cepstrum to return. Default 13. 314 | cep_lifter: apply a lifter to final cepstral coefficients. 0 is 315 | no lifter. Default is 22. 316 | append_energy: if this is true, the zeroth cepstral coefficient 317 | is replaced with the log of the total frame energy. 318 | d: if True add deltas coeficients. Default True 319 | dd: if True add delta-deltas coeficients. Default True 320 | norm: if 'cmn' performs the cepstral mean normalization. elif 'cmvn' 321 | performs the cepstral mean and variance normalizastion. Default 'cmn' 322 | """ 323 | 324 | def __init__(self, num_cep=13, cep_lifter=22, append_energy=True, 325 | d=True, dd=True, **kwargs): 326 | 327 | super(MFCC, self).__init__(**kwargs) 328 | 329 | self.num_cep = num_cep 330 | self.cep_lifter = cep_lifter 331 | self.append_energy = append_energy 332 | self.d = d 333 | self.dd = dd 334 | self._num_feats = (1 + self.d + self.dd) * self.num_cep 335 | 336 | self._logger = logging.getLogger('%s.%s' % (__name__, 337 | self.__class__.__name__)) 338 | 339 | def _call(self, signal): 340 | """Compute MFCC features from an audio signal. 341 | 342 | Args: 343 | signal: the audio signal from which to compute features. Should be 344 | an N*1 array 345 | 346 | Returns: 347 | A numpy array of size (NUMFRAMES by numcep) containing features. 348 | Each row holds 1 feature vector. 349 | """ 350 | feat, energy = super(MFCC, self)._call(signal) 351 | 352 | feat = np.log(feat) 353 | feat = dct(feat, type=2, axis=1, norm='ortho')[:, :self.num_cep] 354 | feat = self._lifter(feat, self.cep_lifter) 355 | 356 | if self.append_energy: 357 | # replace first cepstral coefficient with log of frame energy 358 | feat[:, 0] = np.log(energy + self.eps) 359 | 360 | if self.d: 361 | d = sigproc.delta(feat, 2) 362 | feat = np.hstack([feat, d]) 363 | 364 | if self.dd: 365 | feat = np.hstack([feat, sigproc.delta(d, 2)]) 366 | 367 | return feat 368 | 369 | def _lifter(self, cepstra, L=22): 370 | """Apply a cepstral lifter the the matrix of cepstra. 371 | 372 | This has the effect of increasing the magnitude of the high frequency 373 | DCT coeffs. 374 | 375 | Args: 376 | cepstra: the matrix of mel-cepstra, will be numframes * numcep in 377 | size. 378 | L: the liftering coefficient to use. Default is 22. L <= 0 disables 379 | lifter. 380 | """ 381 | if L > 0: 382 | nframes, ncoeff = np.shape(cepstra) 383 | n = np.arange(ncoeff) 384 | lift = 1 + (L / 2) * np.sin(np.pi * n / L) 385 | return lift * cepstra 386 | else: 387 | # values of L <= 0, do nothing 388 | return cepstra 389 | 390 | def __str__(self): 391 | return "mfcc" 392 | 393 | 394 | class LogFbank(FBank): 395 | """Compute Mel-filterbank energy features from an audio signal. 396 | 397 | # Arguments 398 | append_energy: if this is true, log of the total frame energy is 399 | append to the features vector. 
Default False 400 | d: if True add deltas coeficients. Default False 401 | dd: if True add delta-deltas coeficients. Default False 402 | """ 403 | 404 | def __init__(self, d=False, dd=False, append_energy=False, **kwargs): 405 | """Constructor 406 | """ 407 | 408 | super(LogFbank, self).__init__(**kwargs) 409 | 410 | self.d = d 411 | self.dd = dd 412 | self.append_energy = append_energy 413 | self._num_feats = ((1 + self.d + self.dd) 414 | * (self.num_filt + self.append_energy)) 415 | 416 | self._logger = logging.getLogger('%s.%s' % (__name__, 417 | self.__class__.__name__)) 418 | 419 | def _call(self, signal): 420 | """Compute log Mel-filterbank energy features from an audio signal. 421 | :param signal: the audio signal from which to compute features. Should 422 | be an N*1 array 423 | 424 | Returns: 425 | A numpy array of size (NUMFRAMES by nfilt) containing features. 426 | Each row holds 1 feature vector. 427 | """ 428 | feat, energy = super(LogFbank, self)._call(signal) 429 | 430 | feat = np.log(feat) 431 | 432 | if self.append_energy: 433 | feat = np.hstack([feat, np.log(energy + self.eps)[:, np.newaxis]]) 434 | 435 | if self.d: 436 | d = sigproc.delta(feat, 2) 437 | feat = np.hstack([feat, d]) 438 | 439 | if self.dd: 440 | feat = np.hstack([feat, sigproc.delta(d, 2)]) 441 | 442 | return feat 443 | 444 | def __str__(self): 445 | return "logfbank" 446 | 447 | 448 | class Raw(Feature): 449 | """ Raw features extractor 450 | """ 451 | def __init__(self, **kwargs): 452 | super(Raw, self).__init__(**kwargs) 453 | self._num_feats = None 454 | 455 | def _call(self, x): 456 | return x 457 | 458 | def _postprocessing(self, x): 459 | return x 460 | 461 | def __str__(self): 462 | return "raw" 463 | 464 | 465 | raw = Raw() 466 | -------------------------------------------------------------------------------- /preprocessing/audio_utils.py: -------------------------------------------------------------------------------- 1 | """ Code based on package python_speech_features 2 | 3 | Author: James Lyons 2012 4 | """ 5 | import decimal 6 | 7 | import numpy 8 | import math 9 | 10 | 11 | def round_half_up(number): 12 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), 13 | rounding=decimal.ROUND_HALF_UP 14 | )) 15 | 16 | 17 | def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): 18 | """Frame a signal into overlapping frames. 19 | :param sig: the audio signal to frame. 20 | :param frame_len: length of each frame measured in samples. 21 | :param frame_step: number of samples after the start of the previous frame 22 | that the next frame should begin. 23 | :param winfunc: the analysis window to apply to each frame. By default no 24 | window is applied. 25 | :returns: an array of frames. Size is NUMFRAMES by frame_len. 
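
    Illustrative example: a 16000-sample signal framed with
    frame_len=400 and frame_step=160 is zero-padded and sliced into
    1 + ceil((16000 - 400) / 160) = 99 frames of 400 samples each.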
26 | """ 27 | slen = len(sig) 28 | frame_len = int(round_half_up(frame_len)) 29 | frame_step = int(round_half_up(frame_step)) 30 | if slen <= frame_len: 31 | numframes = 1 32 | else: 33 | numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) 34 | 35 | padlen = int((numframes - 1) * frame_step + frame_len) 36 | 37 | zeros = numpy.zeros((padlen - slen,)) 38 | padsignal = numpy.concatenate((sig, zeros)) 39 | 40 | indices = numpy.tile( 41 | numpy.arange( 42 | 0, frame_len), 43 | (numframes, 1)) + numpy.tile( 44 | numpy.arange( 45 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T 46 | 47 | indices = numpy.array(indices, dtype=numpy.int32) 48 | frames = padsignal[indices] 49 | win = numpy.tile(winfunc(frame_len), (numframes, 1)) 50 | return frames * win 51 | 52 | 53 | def deframesig(frames, siglen, frame_len, frame_step, 54 | winfunc=lambda x: numpy.ones((x,))): 55 | """Does overlap-add procedure to undo the action of framesig. 56 | :param frames: the array of frames. 57 | :param siglen: the length of the desired signal, use 0 if unknown. Output 58 | will be truncated to siglen samples. 59 | :param frame_len: length of each frame measured in samples. 60 | :param frame_step: number of samples after the start of the previous frame 61 | that the next frame should begin. 62 | :param winfunc: the analysis window to apply to each frame. By default no 63 | window is applied. 64 | :returns: a 1-D signal. 65 | """ 66 | frame_len = round_half_up(frame_len) 67 | frame_step = round_half_up(frame_step) 68 | numframes = numpy.shape(frames)[0] 69 | assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong\ 70 | size, 2nd dim is not equal to frame_len' 71 | 72 | indices = numpy.tile( 73 | numpy.arange( 74 | 0, frame_len), (numframes, 1)) + numpy.tile( 75 | numpy.arange( 76 | 0, numframes * frame_step, frame_step), (frame_len, 1)).T 77 | 78 | indices = numpy.array(indices, dtype=numpy.int32) 79 | padlen = (numframes - 1) * frame_step + frame_len 80 | 81 | if siglen <= 0: 82 | siglen = padlen 83 | 84 | rec_signal = numpy.zeros((padlen,)) 85 | window_correction = numpy.zeros((padlen,)) 86 | win = winfunc(frame_len) 87 | 88 | for i in range(0, numframes): 89 | # add a little bit so it is never zero 90 | window_correction[indices[i, :]] = window_correction[indices[i, :]] + \ 91 | win + 1e-15 92 | rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] 93 | 94 | rec_signal = rec_signal / window_correction 95 | return rec_signal[0:siglen] 96 | 97 | 98 | def magspec(frames, NFFT): 99 | """Compute the magnitude spectrum of each frame in frames. If frames is an 100 | NxD matrix, output will be NxNFFT. 101 | :param frames: the array of frames. Each row is a frame. 102 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 103 | zero-padded. 104 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 105 | be the magnitude spectrum of the corresponding frame. 106 | """ 107 | complex_spec = numpy.fft.rfft(frames, NFFT) 108 | return numpy.absolute(complex_spec) 109 | 110 | 111 | def powspec(frames, NFFT): 112 | """Compute the power spectrum of each frame in frames. If frames is an NxD 113 | matrix, output will be NxNFFT. 114 | :param frames: the array of frames. Each row is a frame. 115 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 116 | zero-padded. 117 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 118 | be the power spectrum of the corresponding frame. 
119 | """ 120 | return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) 121 | 122 | 123 | def logpowspec(frames, NFFT, norm=1): 124 | """Compute the log power spectrum of each frame in frames. If frames is an 125 | NxD matrix, output will be NxNFFT. 126 | :param frames: the array of frames. Each row is a frame. 127 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are 128 | zero-padded. 129 | :param norm: If norm=1, the log power spectrum is normalised so that the 130 | max value (across all frames) is 1. 131 | :returns: If frames is an NxD matrix, output will be NxNFFT. Each row will 132 | be the log power spectrum of the corresponding frame. 133 | """ 134 | ps = powspec(frames, NFFT) 135 | ps[ps <= 1e-30] = 1e-30 136 | lps = 10 * numpy.log10(ps) 137 | if norm: 138 | return lps - numpy.max(lps) 139 | else: 140 | return lps 141 | 142 | 143 | def preemphasis(signal, coeff=0.95): 144 | """perform preemphasis on the input signal. 145 | 146 | :param signal: The signal to filter. 147 | :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. 148 | :returns: the filtered signal. 149 | """ 150 | return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) 151 | 152 | 153 | def delta(feat, N): 154 | """Compute delta features from a feature vector sequence. 155 | 156 | :param feat: A numpy array of size (NUMFRAMES by number of features) 157 | containing features. Each row holds 1 feature vector. 158 | :param N: For each frame, calculate delta features based on preceding and 159 | following N frames 160 | :returns: A numpy array of size (NUMFRAMES by number of features) 161 | containing delta features. Each row holds 1 delta feature vector. 162 | """ 163 | NUMFRAMES = len(feat) 164 | feat = numpy.concatenate(([feat[0] for i in range(N)], feat, [feat[-1] for 165 | i in 166 | range(N)])) 167 | denom = sum([2 * i * i for i in range(1, N + 1)]) 168 | dfeat = [] 169 | for j in range(NUMFRAMES): 170 | dfeat.append(numpy.sum([n * feat[N + j + n] 171 | for n in range(-1 * N, N + 1)], axis=0) / 172 | denom) 173 | return dfeat 174 | -------------------------------------------------------------------------------- /preprocessing/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import string 5 | from unidecode import unidecode 6 | import logging 7 | import numpy as np 8 | 9 | PUNCTUATIONS = "'""-,.!?:;" 10 | ACCENTS = u'ãõçâêôáíóúàüóé' 11 | 12 | 13 | class BaseParser(object): 14 | """ Interface class for all parsers 15 | """ 16 | 17 | def __init__(self): 18 | self._logger = logging.getLogger('%s.%s' % (__name__, 19 | self.__class__.__name__)) 20 | 21 | def __call__(self, _input): 22 | return self.map(_input) 23 | 24 | def map(self, _input): 25 | pass 26 | 27 | def imap(self, _input): 28 | pass 29 | 30 | def is_valid(self, _input): 31 | pass 32 | 33 | 34 | class CharParser(BaseParser): 35 | """ Class responsible to map any text in a certain character vocabulary 36 | 37 | # Arguments 38 | mode: Which type of vacabulary will be generated. 
Modes can be 39 | concatenated by using pipeline '|' 40 | 'space' or 's': accepts space character 41 | 'accents' or 'a': accepts pt-br accents 42 | 'punctuation' or 'p': accepts punctuation defined in 43 | string.punctuation 44 | 'digits': accepts all digits 45 | 'sensitive' or 'S': characters will be case sensitive 46 | 'all': shortcut that enables all modes 47 | """ 48 | 49 | def __init__(self, mode='space'): 50 | self._permitted_modes = {'sensitive': 'S', 'space': 's', 'accents': 51 | 'a', 'punctuation': 'p', 'digits': 'd'} 52 | 53 | if mode == 'all': 54 | self.mode = self._permitted_modes.values() 55 | else: 56 | self.mode = [] 57 | for m in mode.split('|'): 58 | try: 59 | self.mode.append(self._permitted_modes[m]) 60 | except KeyError: 61 | if m not in self._permitted_modes.values(): 62 | raise ValueError('Unknown mode %s' % m) 63 | 64 | self.mode.append(m) 65 | 66 | self._vocab, self._inv_vocab = self._gen_vocab() 67 | 68 | def map(self, txt, sanitize=True): 69 | if sanitize: 70 | label = np.array([self._vocab[c] for c in self._sanitize(txt)], 71 | dtype='int32') 72 | else: 73 | label = np.array([self._vocab[c] for c in txt], dtype='int32') 74 | 75 | return label 76 | 77 | def imap(self, labels): 78 | txt = ''.join([self._inv_vocab[l] for l in labels]) 79 | 80 | return txt 81 | 82 | def _sanitize(self, text): 83 | # removing duplicated spaces 84 | text = ' '.join(text.split()) 85 | 86 | if not('d' in self.mode): 87 | text = ''.join([c for c in text if not c.isdigit()]) 88 | 89 | if not('a' in self.mode): 90 | text = unidecode(text) 91 | 92 | if not('p' in self.mode): 93 | text = text.translate( 94 | string.maketrans("-'", ' ')).translate(None, 95 | string.punctuation) 96 | 97 | if not ('s' in self.mode): 98 | text = text.replace(' ', '') 99 | 100 | if not('S' in self.mode): 101 | text = text.lower() 102 | 103 | return text 104 | 105 | def is_valid(self, text): 106 | # verify if the text is valid without sanitization 107 | try: 108 | _ = self.map(text, sanitize=False) 109 | return True 110 | except KeyError: 111 | return False 112 | 113 | def _gen_vocab(self): 114 | 115 | vocab = {chr(value + ord('a')): (value) 116 | for value in xrange(ord('z') - ord('a') + 1)} 117 | 118 | if 'a' in self.mode: 119 | for a in ACCENTS: 120 | vocab[a] = len(vocab) 121 | 122 | if 'S' in self.mode: 123 | for char in vocab.keys(): 124 | vocab[char.upper()] = len(vocab) 125 | 126 | if 's' in self.mode: 127 | # Inserts space label 128 | vocab[' '] = len(vocab) 129 | 130 | if 'p' in self.mode: 131 | for p in PUNCTUATIONS: 132 | vocab[p] = len(vocab) 133 | 134 | if 'd' in self.mode: 135 | for num in range(10): 136 | vocab[str(num)] = len(vocab) 137 | 138 | inv_vocab = {v: k for (k, v) in vocab.iteritems()} 139 | 140 | # Add blank label 141 | inv_vocab[len(inv_vocab)] = '' 142 | 143 | return vocab, inv_vocab 144 | 145 | 146 | simple_char_parser = CharParser() 147 | complex_char_parser = CharParser(mode='s|p|a|d') 148 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: osx-64 4 | certifi=2016.2.28=py27_0 5 | cycler=0.10.0=py27_0 6 | freetype=2.5.5=2 7 | functools32=3.2.3.2=py27_0 8 | h5py=2.7.0=np113py27_0 9 | hdf5=1.8.17=2 10 | icu=54.1=0 11 | jbig=2.1=0 12 | jpeg=9b=0 13 | libpng=1.6.30=1 14 | libtiff=4.0.6=3 15 | matplotlib=2.0.2=np113py27_0 16 | mkl=2017.0.3=0 17 | 
numpy=1.13.1=py27_0 18 | olefile=0.44=py27_0 19 | openssl=1.0.2l=0 20 | pillow=4.2.1=py27_0 21 | pip=9.0.1=py27_1 22 | pyparsing=2.2.0=py27_0 23 | pyqt=5.6.0=py27_2 24 | python=2.7.13=0 25 | python-dateutil=2.6.1=py27_0 26 | pytz=2017.2=py27_0 27 | pyyaml=3.12=py27_0 28 | qt=5.6.2=2 29 | readline=6.2=2 30 | scipy=0.19.1=np113py27_0 31 | setuptools=36.4.0=py27_1 32 | sip=4.18=py27_0 33 | six=1.10.0=py27_0 34 | sqlite=3.13.0=0 35 | subprocess32=3.2.7=py27_0 36 | tk=8.5.18=0 37 | wheel=0.29.0=py27_0 38 | xz=5.2.3=0 39 | yaml=0.1.6=0 40 | zlib=1.2.11=0 41 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | # Preventing pool_allocator message 7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 8 | 9 | import argparse 10 | import uuid 11 | import sys 12 | import json 13 | import datetime 14 | import inspect 15 | import codecs 16 | 17 | import logging 18 | try: 19 | import warpctc_tensorflow 20 | except ImportError: 21 | logging.warning('warpctc binding for tensorflow not found. :(') 22 | import tensorflow as tf 23 | 24 | import keras 25 | 26 | import keras.backend as K 27 | from keras.optimizers import SGD, Adam 28 | from keras.callbacks import ReduceLROnPlateau 29 | 30 | from core import metrics 31 | from core.ctc_utils import ctc_dummy_loss, decoder_dummy_loss 32 | from core.callbacks import MetaCheckpoint, ProgbarLogger 33 | from utils.core_utils import setup_gpu 34 | 35 | from preprocessing import audio, text 36 | 37 | from datasets.dataset_generator import DatasetGenerator 38 | from utils.hparams import HParams 39 | 40 | import utils.generic_utils as utils 41 | 42 | from utils.core_utils import load_model 43 | 44 | if __name__ == '__main__': 45 | 46 | parser = argparse.ArgumentParser(description='Training an ASR system.') 47 | 48 | # Resume training 49 | parser.add_argument('--load', default=None, type=str) 50 | 51 | # Model settings 52 | parser.add_argument('--model', default='brsmv1', type=str) 53 | parser.add_argument('--model_params', nargs='+', default=[]) 54 | 55 | # Hyper parameters 56 | parser.add_argument('--num_epochs', default=100, type=int) 57 | parser.add_argument('--lr', default=0.001, type=float) 58 | parser.add_argument('--momentum', default=0.9, type=float) 59 | parser.add_argument('--clipnorm', default=400, type=float) 60 | parser.add_argument('--batch_size', default=32, type=int) 61 | parser.add_argument('--opt', default='adam', type=str, 62 | choices=['sgd', 'adam']) 63 | # End of hyper parameters 64 | 65 | # Dataset definitions 66 | parser.add_argument('--dataset', default=None, type=str, nargs='+') 67 | 68 | # Features generation (if necessary) 69 | parser.add_argument('--input_parser', type=str, default=None) 70 | parser.add_argument('--input_parser_params', nargs='+', default=[]) 71 | 72 | # Label generation (if necessary) 73 | parser.add_argument('--label_parser', type=str, 74 | default='simple_char_parser') 75 | parser.add_argument('--label_parser_params', nargs='+', default=[]) 76 | 77 | # Callbacks 78 | parser.add_argument('--lr_schedule', default=None) 79 | parser.add_argument('--lr_params', nargs='+', default=[]) 80 | 81 | # Other configs 82 | parser.add_argument('--save', default=None, type=str) 83 | parser.add_argument('--gpu', default='0', type=str) 84 | parser.add_argument('--allow_growth', 
default=False, action='store_true') 85 | parser.add_argument('--verbose', default=0, type=int) 86 | parser.add_argument('--seed', default=None, type=float) 87 | 88 | args = parser.parse_args() 89 | 90 | # Setup logging 91 | utils.setup_logging() 92 | logger = logging.getLogger(__name__) 93 | tf.logging.set_verbosity(tf.logging.ERROR) 94 | 95 | # hack in ProgbarLogger: avoid logger.infoing the dummy losses 96 | keras.callbacks.ProgbarLogger = lambda: ProgbarLogger( 97 | show_metrics=['loss', 'decoder_ler', 'val_loss', 'val_decoder_ler']) 98 | 99 | # GPU configuration 100 | setup_gpu(args.gpu, args.allow_growth, 101 | log_device_placement=args.verbose > 1) 102 | 103 | # Initial configuration 104 | epoch_offset = 0 105 | meta = None 106 | 107 | if args.load: 108 | args_nondefault = utils.parse_nondefault_args(args, 109 | parser.parse_args([])) 110 | 111 | logger.info('Loading model...') 112 | model, meta = load_model(args.load, return_meta=True) 113 | 114 | logger.info('Loading parameters...') 115 | args = HParams(**meta['training_args']).update(vars(args_nondefault)) 116 | 117 | epoch_offset = len(meta['epochs']) 118 | logger.info('Current epoch: %d' % epoch_offset) 119 | 120 | if args_nondefault.lr: 121 | logger.info('Setting current learning rate to %f...' % args.lr) 122 | K.set_value(model.optimizer.lr, args.lr) 123 | 124 | else: 125 | logger.info('Creating model...') 126 | # Recovering all valid models 127 | model_fn = utils.get_from_module('core.models', args.model) 128 | # Loading model 129 | model = model_fn(**(HParams().parse(args.model_params).values())) 130 | 131 | logger.info('Setting the optimizer...') 132 | # Optimization 133 | if args.opt.strip().lower() == 'sgd': 134 | opt = SGD(lr=args.lr, momentum=args.momentum, 135 | clipnorm=args.clipnorm) 136 | elif args.opt.strip().lower() == 'adam': 137 | opt = Adam(lr=args.lr, clipnorm=args.clipnorm) 138 | 139 | # Compile with dummy loss 140 | model.compile(loss={'ctc': ctc_dummy_loss, 141 | 'decoder': decoder_dummy_loss}, 142 | optimizer=opt, metrics={'decoder': metrics.ler}, 143 | loss_weights=[1, 0]) 144 | 145 | logger.info('Creating results folder...') 146 | # Creating the results folder 147 | output_dir = args.save 148 | if output_dir is None: 149 | output_dir = os.path.join('results', 150 | '%s_%s' % (args.model, 151 | datetime.datetime.now())) 152 | if not os.path.isdir(output_dir): 153 | os.makedirs(output_dir) 154 | 155 | logger.info('Adding callbacks') 156 | # Callbacks 157 | model_ckpt = MetaCheckpoint(os.path.join(output_dir, 'model.h5'), 158 | training_args=args, meta=meta) 159 | best_ckpt = MetaCheckpoint( 160 | os.path.join(output_dir, 'best.h5'), monitor='val_decoder_ler', 161 | save_best_only=True, mode='min', training_args=args, meta=meta) 162 | callback_list = [model_ckpt, best_ckpt] 163 | 164 | # LR schedules 165 | if args.lr_schedule: 166 | lr_schedule_fn = utils.get_from_module('keras.callbacks', 167 | args.lr_schedule) 168 | if lr_schedule_fn: 169 | lr_schedule = lr_schedule_fn(**HParams().parse(args.lr_params).values()) 170 | callback_list.append(lr_schedule) 171 | else: 172 | raise ValueError('Learning rate schedule unrecognized') 173 | 174 | logger.info('Getting the feature extractor...') 175 | # Features extractor 176 | input_parser = utils.get_from_module('preprocessing.audio', 177 | args.input_parser, 178 | params=args.input_parser_params) 179 | 180 | logger.info('Getting the text parser...') 181 | # Recovering text parser 182 | label_parser = utils.get_from_module('preprocessing.text', 183 | 
args.label_parser, 184 | params=args.label_parser_params) 185 | 186 | logger.info('Getting the data generator...') 187 | # Data generator 188 | data_gen = DatasetGenerator(input_parser, label_parser, 189 | batch_size=args.batch_size, 190 | seed=args.seed) 191 | # iterators over datasets 192 | train_flow, valid_flow, test_flow = None, None, None 193 | num_val_samples = num_test_samples = 0 194 | 195 | logger.info('Generating flow...') 196 | if len(args.dataset) == 1: 197 | train_flow, valid_flow, test_flow = data_gen.flow_from_fname( 198 | args.dataset[0], datasets=['train', 'valid', 'test']) 199 | num_val_samples = valid_flow.len 200 | else: 201 | train_flow = data_gen.flow_from_fname(args.dataset[0]) 202 | valid_flow = data_gen.flow_from_fname(args.dataset[1]) 203 | 204 | num_val_samples = valid_flow.len 205 | if len(args.dataset) == 3: 206 | test_flow = data_gen.flow_from_fname(args.dataset[2]) 207 | num_test_samples = test_flow.len 208 | 209 | logger.info(str(vars(args))) 210 | print(str(vars(args))) 211 | logger.info('Initialzing training...') 212 | # Fit the model 213 | model.fit_generator(train_flow, samples_per_epoch=train_flow.len, 214 | nb_epoch=args.num_epochs, validation_data=valid_flow, 215 | nb_val_samples=num_val_samples, max_q_size=10, 216 | nb_worker=1, callbacks=callback_list, verbose=1, 217 | initial_epoch=epoch_offset) 218 | 219 | if test_flow: 220 | del model 221 | model = load_model(os.path.join(output_dir, 'best.h5'), mode='eval') 222 | logger.info('Evaluating best model on test set') 223 | metrics = model.evaluate_generator(test_flow, test_flow.len, 224 | max_q_size=10, nb_worker=1) 225 | 226 | msg = 'Total loss: %.4f\n\ 227 | CTC Loss: %.4f\nLER: %.2f%%' % (metrics[0], metrics[1], metrics[3]*100) 228 | logger.info(msg) 229 | 230 | with open(os.path.join(output_dir, 'results.txt'), 'w') as f: 231 | f.write(msg) 232 | 233 | print(msg) 234 | 235 | K.clear_session() 236 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | from .hparams import HParams 4 | -------------------------------------------------------------------------------- /utils/core_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import h5py 6 | import yaml 7 | 8 | import keras 9 | import keras.backend as K 10 | from keras.models import Model 11 | from keras.layers import Lambda 12 | import tensorflow as tf 13 | 14 | import core 15 | from core import layers_utils 16 | from core import ctc_utils 17 | from core import metrics 18 | 19 | from utils.generic_utils import inspect_module 20 | 21 | 22 | def setup_gpu(gpu, allow_growth=False, log_device_placement=False): 23 | # Choosing gpu 24 | if gpu == '-1': 25 | config = tf.ConfigProto(device_count={'GPU': 0}, 26 | log_device_placement=log_device_placement) 27 | else: 28 | if gpu == 'all': 29 | gpu = '' 30 | config = tf.ConfigProto(log_device_placement=log_device_placement) 31 | config.gpu_options.visible_device_list = gpu 32 | if allow_growth: # dynamic gpu memory allocation 33 | config.gpu_options.allow_growth = True 34 | session = tf.Session(config=config) 35 | K.set_session(session) 36 | 37 | 38 | def get_custom_objects(): 39 | """ Verify all custom object that may be used to load a 
keras model 40 | """ 41 | all_custom_objects = [] 42 | for module in ['core.layers', 'core.layers_utils', 43 | 'core.metrics', 'core.ctc_utils', 44 | 'core.initializers']: 45 | all_custom_objects.extend(inspect_module(module, to_dict=False)) 46 | 47 | return dict(all_custom_objects) 48 | 49 | def load_model(model_fname, return_meta=False, mode='train', **kwargs): 50 | """ Loading keras model with custom objects 51 | 52 | Args 53 | mode: 54 | if 'train', model will follow the definition in core.models 55 | if 'predict', beamsearch decoder will be used and the model return 56 | a np array with -1 filled in no data area 57 | if 'eval', greedy decoder will be replaced by beam search decoder 58 | of predictions 59 | """ 60 | if mode not in ('train', 'predict', 'eval'): 61 | raise ValueError('mode must be one of (train, predict, eval)') 62 | 63 | model = keras.models.load_model(model_fname, 64 | custom_objects=get_custom_objects()) 65 | 66 | # Define the new decoder and the to_dense layer 67 | if kwargs.get('decoder', True): 68 | dec = Lambda(ctc_utils.decode, 69 | output_shape=ctc_utils.decode_output_shape, 70 | arguments={'is_greedy': kwargs.get('is_greedy', False), 71 | 'beam_width': kwargs.get('beam_width', 400)}, 72 | name='beam_search') 73 | else: 74 | dec = Lambda(lambda x: x[0]) 75 | 76 | if mode == 'predict': 77 | y_pred = (model.get_layer('y_pred') or 78 | model.get_layer('decoder').input[0]) 79 | 80 | input_ = model.get_layer('inputs').input 81 | inputs_length = model.get_layer('inputs_length').input 82 | 83 | to_dense_layer = Lambda( 84 | layers_utils.to_dense, 85 | output_shape=layers_utils.to_dense_output_shape, 86 | name="to_dense") 87 | 88 | y_pred = dec([y_pred, inputs_length]) 89 | 90 | y_pred = to_dense_layer(y_pred) 91 | 92 | model = Model(input=[input_, inputs_length], 93 | output=[y_pred]) 94 | elif mode == 'eval': 95 | dec_layer = model.get_layer('decoder') 96 | 97 | y_pred_bs = dec(dec_layer.input) 98 | 99 | model = Model(input=model.inputs, output=[model.outputs[0], y_pred_bs]) 100 | 101 | # Freezing layers 102 | for l in model.layers: 103 | l.trainable = False 104 | 105 | model.compile('sgd', 106 | loss={'ctc': ctc_utils.ctc_dummy_loss, 107 | 'beam_search': ctc_utils.decoder_dummy_loss}, 108 | metrics={'beam_search': metrics.ler}, 109 | loss_weights=[1, 0]) 110 | 111 | if return_meta: 112 | meta = load_meta(model_fname) 113 | return model, meta 114 | 115 | return model 116 | 117 | 118 | def load_meta(model_fname): 119 | ''' Load meta configuration 120 | ''' 121 | meta = {} 122 | 123 | with h5py.File(model_fname, 'r') as f: 124 | meta_group = f['meta'] 125 | 126 | meta['training_args'] = yaml.load( 127 | meta_group.attrs['training_args']) 128 | for k in meta_group.keys(): 129 | meta[k] = list(meta_group[k]) 130 | 131 | return meta 132 | -------------------------------------------------------------------------------- /utils/generic_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import h5py 6 | import sys 7 | import os 8 | 9 | import logging 10 | import logging.config 11 | import yaml 12 | 13 | import numpy as np 14 | from scipy import sparse 15 | 16 | import inspect 17 | import yaml 18 | 19 | from .hparams import HParams 20 | 21 | import re 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def safe_mkdirs(path): 27 | ''' Safe makedirs 28 | Directory is created with command `makedir -p`. 
29 | Returns: 30 | `path` if the directory already exists or is created 31 | Exception: 32 | OSError if something is wrong 33 | ''' 34 | try: 35 | os.makedirs(path) 36 | except OSError, e: 37 | if e.errno != 17: # 17 = file exists 38 | raise 39 | 40 | return path 41 | 42 | 43 | def get_from_module(module, name, params=None, regex=False): 44 | """ Get a class or method from a module given its name 45 | """ 46 | members = inspect_module(module, regex=regex) 47 | 48 | if name is None or name.lower() == 'none': 49 | return None 50 | 51 | members = {k.lower().strip(): v for k, v in members.items()} 52 | 53 | try: 54 | member = members[name.lower().strip()] 55 | # is a class and must be instantiate if params is not none 56 | if (member and params is not None) and inspect.isclass(member): 57 | return member(**HParams().parse(params).values()) 58 | 59 | return member 60 | except KeyError, e: 61 | raise KeyError("%s not found in %s.\n Valid values are: %s" % 62 | (name, module, ', '.join(members.keys()))) 63 | 64 | 65 | def inspect_module(module, to_dict=True, regex=False): 66 | modules = {} 67 | if regex: 68 | pattern = re.compile(module) 69 | for key, value in sys.modules.items(): 70 | if pattern.match(key): 71 | modules[key] = value 72 | else: 73 | modules = {module: sys.modules[module]} 74 | 75 | members = [] 76 | for key, value in modules.items(): 77 | members.extend(inspect.getmembers(value, lambda member: 78 | hasattr(member, '__module__') and 79 | member.__module__ == key)) 80 | 81 | if to_dict: 82 | return dict(members) 83 | 84 | return members 85 | 86 | 87 | def ld2dl(ld): 88 | '''Transform a list of dictionaries in a dictionaries with lists 89 | # Note 90 | All dictionaries have the same keys 91 | ''' 92 | return dict(zip(ld[0], zip(*[d.values() for d in ld]))) 93 | 94 | def check_ext(fname, ext): 95 | # Adding dot 96 | ext = ext if ext[0] == '.' else '.' + ext 97 | fname, f_ext = os.path.splitext(fname) 98 | 99 | if f_ext == ext: 100 | return True 101 | 102 | return False 103 | 104 | 105 | def parse_nondefault_args(args, default_args): 106 | # removing default arguments 107 | args_default = {k: v for k, v in vars(default_args).items() 108 | if k not in [arg.split('-')[-1] for arg in sys.argv 109 | if arg.startswith('-')]} 110 | args_nondefault = {k: v for k, v in vars(args).items() 111 | if k not in args_default or args_default[k] != v} 112 | 113 | args_nondefault = HParams().parse(args_nondefault) 114 | 115 | return args_nondefault 116 | 117 | 118 | def setup_logging(default_path='logging.yaml', default_level=logging.INFO, 119 | env_key='LOG_CFG'): 120 | """Setup logging configuration 121 | 122 | """ 123 | path = default_path 124 | value = os.getenv(env_key, None) 125 | if value: 126 | path = value 127 | if os.path.exists(path): 128 | with open(path, 'rt') as f: 129 | config = yaml.safe_load(f.read()) 130 | logging.config.dictConfig(config) 131 | else: 132 | logging.basicConfig(level=default_level) 133 | -------------------------------------------------------------------------------- /utils/hparams.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | class HParams(object): 4 | """Creates an object for passing around hyperparameter values. 5 | Use the parse method to overwrite the default hyperparameters with values 6 | passed in as a string representation of a Python dictionary mapping 7 | hyperparameters to values. 
8 | 
9 |     # Example
10 |         hparams = HParams(batch_size=128, hidden_size=256)
11 |         hparams.parse('{"hidden_size":512}')
12 |         assert hparams.batch_size == 128
13 |         assert hparams.hidden_size == 512
14 | 
15 | 
16 |     Code adapted from Google Magenta
17 |     """
18 | 
19 |     def __init__(self, **init_hparams):
20 |         object.__setattr__(self, 'keyvals', init_hparams)
21 | 
22 |     def __getitem__(self, key):
23 |         """Returns the value of the given hyperparameter, or None if it
24 |         does not exist."""
25 |         return self.keyvals.get(key)
26 | 
27 |     def __getattribute__(self, attribute):
28 |         if attribute == '__dict__':
29 |             return self.keyvals
30 |         else:
31 |             return object.__getattribute__(self, attribute)
32 | 
33 |     def __getattr__(self, key):
34 |         """Returns the value of the given hyperparameter, or None if it
35 |         does not exist."""
36 |         return self.keyvals.get(key)
37 | 
38 |     def __setattr__(self, key, value):
39 |         """Sets the value for the hyperparameter."""
40 |         self.keyvals[key] = value
41 | 
42 |     def update(self, values_dict):
43 |         """Merges in new hyperparameters, replacing existing with same key."""
44 |         self.keyvals.update(values_dict)
45 | 
46 |         return self
47 | 
48 |     def parse(self, values):
49 |         """Merges in new hyperparameters, replacing existing with same key."""
50 | 
51 |         if type(values) == dict:
52 |             return self.update(values)
53 | 
54 |         if type(values) in (tuple, list):
55 |             tmp = {}
56 |             for k, v in zip(values[::2], values[1::2]):
57 |                 try:
58 |                     tmp[k] = ast.literal_eval(v)
59 |                 except ValueError:
60 |                     tmp[k] = v
61 |             return self.update(tmp)
62 | 
63 |         return self.update(ast.literal_eval(values))
64 | 
65 |     def values(self):
66 |         """Return the hyperparameter values as a Python dictionary."""
67 |         return self.keyvals
68 | 
69 |     def __str__(self):
70 |         return str(self.keyvals)
71 | 
--------------------------------------------------------------------------------
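Taken together, the helpers above compose as follows — a minimal sketch, assuming the Python 2 environment pinned in msc.yaml and a hypothetical 16 kHz mono file sample.wav:

    from __future__ import print_function

    from preprocessing.audio import MFCC
    from preprocessing.text import CharParser
    from utils.hparams import HParams

    # HParams.parse accepts a dict, a flat key/value list (as produced by
    # argparse with nargs='+'), or a dict literal given as a string
    hparams = HParams(win_len=0.025, win_step=0.01)
    hparams.parse(['num_cep', '13'])

    # 13 cepstra plus deltas and delta-deltas -> 39 features per frame
    mfcc = MFCC(num_cep=hparams.num_cep, win_len=hparams.win_len,
                win_step=hparams.win_step)
    feats = mfcc('sample.wav')  # (num_frames, 39), mean/variance normalized

    parser = CharParser(mode='space')
    labels = parser.map(u'ola mundo')  # int32 label ids
    print(feats.shape, parser.imap(labels))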