├── LICENSE ├── README.md ├── data.py ├── data ├── eval.pkl └── train.pkl ├── data_load.py ├── eval.py ├── fig ├── asr2.png ├── ori2.png └── training_curve.png ├── hyperparams.py ├── modules.py ├── networks.py ├── prepro.py ├── samples └── model_gs_19860_0.19.txt ├── train.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speech Recognition Using Tacotron 2 | 3 | 4 | ## Motivation 5 | Tacotron is an end-to-end speech generation model which was first introduced in [Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135). It takes as input text at the character level, and targets mel filterbanks and the linear spectrogram. 
Although it is a generation model, I was curious to see how well it could be applied to the speech recognition task. 6 | 7 | ## Requirements 8 | * NumPy >= 1.11.1 9 | * TensorFlow == 1.1 10 | * librosa 11 | 12 | ## Model description 13 | 14 | 15 | 16 | Tacotron: Speech Synthesis Model (from Figure 1 in [Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)) 17 | 18 | 19 | Modified architecture for speech recognition 20 | 21 | ## Data 22 | 23 | [The World English Bible](https://en.wikipedia.org/wiki/World_English_Bible) is a public domain update of the American Standard Version of 1901 into modern English. Its text and audio recordings are freely available [here](http://www.audiotreasure.com/webindex.htm). Unfortunately, however, each audio file corresponds to a chapter, not a verse, so it is too long for many machine learning tasks. I had someone slice them by verse manually. You can download [the audio data](https://www.dropbox.com/s/nde56czgda8q77e/WEB.zip?dl=0) and its [text](https://www.dropbox.com/s/lcfhs1kk9shvypj/text.csv?dl=0) from my dropbox. 24 | 25 | ## File description 26 | * `hyperparams.py` includes all hyperparameters. 27 | * `prepro.py` creates training and evaluation data in the `data/` folder. 28 | * `data_load.py` loads the data and puts it in queues so that multiple mini-batches are generated in parallel. 29 | * `utils.py` contains utility functions for signal processing and frame reduction. 30 | * `modules.py` contains building blocks for the encoding and decoding networks. 31 | * `networks.py` defines the encoding and decoding networks. 32 | * `train.py` executes training. 33 | * `eval.py` executes evaluation. 34 | 35 | ## Training 36 | * STEP 1. Adjust hyperparameters in `hyperparams.py` if necessary. 37 | * STEP 2. Download and extract [the audio data](https://dl.dropboxusercontent.com/u/42868014/WEB.zip) and its [text](https://dl.dropboxusercontent.com/u/42868014/text.csv). 38 | * STEP 3. Run `train.py`. Or you can download my [pretrained file](https://www.dropbox.com/s/n55aqjx6mge96pb/logdir.zip?dl=0). 39 | 40 | ## Evaluation 41 | * Run `eval.py` to get speech recognition results for the test set. 42 | 43 | ## Results 44 | 45 | The **training curve** looks like 46 | 47 | 48 | 49 | **Sample results** are 50 | 51 | Expected: the third poured out his bowl into the rivers and springs of water and they became blood
52 | Got : the first will lie down to the rivers and springs of waters and it became blood 53 | 54 | Expected: i heard the altar saying yes lord god the almighty true and righteous are your judgments
55 | Got : i heard the altar saying yes were like your own like you tree in righteousness for your judgments 56 | 57 | Expected: the fourth poured out his bowl on the sun and it was given to him to scorch men with fire
58 | Got : the foolish very armed were on the sun and was given to him to spoke to him with fire 59 | 60 | Expected: he gathered them together into the place which is called in hebrew megiddo
61 | Got : he gathered them together into the place which is called and he weep and at every 62 | 63 | Expected: every island fled away and the mountains were not found
64 | Got : hadad and kedemoth aroen and another and spread out them 65 | 66 | Expected: here is the mind that has wisdom the seven heads are seven mountains on which the woman sits
67 | Got : he is the mighty have wisdom the seven heads of seven rountains are with the wind sixter 68 | 69 | Expected: these have one mind and they give their power and authority to the beast
70 | Got : these are those who are mine and they give holl of a fool in the deeps 71 | 72 | Expected: the woman whom you saw is the great city which reigns over the kings of the earth
73 | Got : the woman whom he saw it his degrection which ran and to advening to be ear 74 | 75 | Expected: for her sins have reached to the sky and god has remembered her iniquities
76 | Got : for he sends a least in the sky and god has remembered her iniquities 77 | 78 | Expected: the merchants of the earth weep and mourn over her for no one buys their merchandise any more
79 | Got : the mittites of the earth weeps in your own are before from knowing babylon busine backsliding all t 80 | 81 | Expected: and cried out as they looked at the smoke of her burning saying 'what is like the great city'
82 | Got : and cried all the wicked beside of a good one and saying when is like the great sight 83 | 84 | Expected: in her was found the blood of prophets and of saints and of all who have been slain on the earth
85 | Got : and her with stones a dwellified confidence and all who have been slain on the earth 86 | 87 | Expected: a second said hallelujah her smoke goes up forever and ever
88 | Got : as set him said how many men utter for smoke go down for every male it 89 | 90 | Expected: he is clothed in a garment sprinkled with blood his name is called the word of god
91 | Got : he is close in a garment speaking in the blood his name is called 'the word of god' 92 | 93 | Expected: the armies which are in heaven followed him on white horses clothed in white pure fine linen
94 | Got : the army which are in heaven falls on the mighty one horses clothes driven on the affliction 95 | 96 | Expected: he has on his garment and on his thigh a name written king of kings and lord of lords
97 | Got : he has understandings on his folly among widow the king of kings and yahweh of armies 98 | 99 | Expected: i saw an angel coming down out of heaven having the key of the abyss and a great chain in his hand
100 | Got : i saw an even become young lion having you trust of the ages and a great chamber is hand 101 | 102 | Expected: and after the thousand years satan will be released from his prison
103 | Got : and after the palace and mizpah and eleven eleenth were the twentieth 104 | 105 | Expected: death and hades were thrown into the lake of fire this is the second death the lake of fire
106 | Got : let them hate with one and to wait for fire this is the second death and lead a time 107 | 108 | Expected: if anyone was not found written in the book of life he was cast into the lake of fire
109 | Got : the ten man will not think within your demon as with a blood he will cast him to ram for fire 110 | 111 | Expected: he who overcomes i will give him these things i will be his god and he will be my son
112 | Got : he who recompenses i will give him be stings i will be his god and he will be my son 113 | 114 | Expected: its wall is one hundred fortyfour cubits by the measure of a man that is of an angel
115 | Got : is through all his womb home before you for accusation that we may know him by these are in egypt 116 | 117 | Expected: the construction of its wall was jasper the city was pure gold like pure glass
118 | Got : if he struck him of his wallor is not speaking with torment hold on her grass 119 | 120 | Expected: i saw no temple in it for the lord god the almighty and the lamb are its temple
121 | Got : i saw in a tenth wind for we will dry up you among the linen ox skillful 122 | 123 | Expected: its gates will in no way be shut by day for there will be no night there
124 | Got : his greech wind more redeems shameful the redeemer man don't know 125 | 126 | Expected: and they shall bring the glory and the honor of the nations into it so that they may enter
127 | Got : and they shall bring the glory in the high mountains and the egyptian into the midst of the needy 128 | 129 | Expected: they will see his face and his name will be on their foreheads
130 | Got : they will see his face and his name on their follows 131 | 132 | Expected: behold i come quickly blessed is he who keeps the words of the prophecy of this book
133 | Got : behold i happened with me when i could see me to still it is a prophet his bueld 134 | 135 | Expected: he said to me don't seal up the words of the prophecy of this book for the time is at hand
136 | Got : he said to him why sil with the words of the prophets it is book for the times and her 137 | 138 | Expected: behold i come quickly my reward is with me to repay to each man according to his work
139 | Got : behold i come perfect i yahweh is with me to repent to be shamed according to his work 140 | 141 | Expected: i am the alpha and the omega the first and the last the beginning and the end
142 | Got : i have you hope from you and you and the first from aloes of the dew and the enemy 143 | 144 | Expected: he who testifies these things says yes i come quickly amen yes come lord jesus
145 | Got : he who testifies these things says yes i come proclaim i man listen will jesus 146 | 147 | 148 | 149 | ## Related projects 150 | * [A TensorFlow Implementation of Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://github.com/Kyubyong/tacotron) 151 | * [Speech-to-Text-WaveNet : End-to-end sentence level English speech recognition based on DeepMind's WaveNet and tensorflow](https://github.com/buriburisuri/speech-to-text-wavenet) 152 | 153 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | 8 | from __future__ import print_function 9 | import os 10 | import re 11 | import numpy as np 12 | import pickle 13 | from hyperparams import Hyperparams as hp 14 | 15 | 16 | def load_vocab(): 17 | vocab = "ES abcdefghijklmnopqrstuvwxyz'" # E: Empty 18 | char2idx = {char:idx for idx, char in enumerate(vocab)} 19 | idx2char = {idx:char for idx, char in enumerate(vocab)} 20 | return char2idx, idx2char 21 | 22 | def text2idx(text): 23 | # Load vocabulary 24 | char2idx, idx2char = load_vocab() 25 | 26 | # Convert 27 | text = re.sub(r"[^ a-z']", "", text.lower()).strip() + "S" 28 | converted = [char2idx[char] for char in text] 29 | return text, converted 30 | 31 | def load_train_data(): 32 | """We train on the whole data but the last mini-batch.""" 33 | 34 | sound_fpaths, converteds = pickle.load(open('data/train.pkl', 'rb')) 35 | return sound_fpaths, converteds 36 | 37 | def load_eval_data(): 38 | from utils import get_spectrogram, reduce_frames 39 | """We evaluate on the last mini-batch.""" 40 | sound_fpaths, texts = pickle.load(open('data/eval.pkl', 'rb')) 41 | 42 | # Extract spectrogram from sound_fpaths 43 | char2idx, idx2char = load_vocab() 44 | 45 | xs, maxlen = [], 0 46 | for sound_fpath in sound_fpaths: 47 | spectrogram = get_spectrogram(sound_fpath) 48 | x = reduce_frames(spectrogram, hp.r) 49 | maxlen = max(maxlen, len(x)) 50 | xs.append(x) 51 | 52 | # Set the length of samples in X to the maximum among them. 53 | X = np.zeros(shape=(len(xs), maxlen, hp.n_mels*hp.r), dtype=np.float32) 54 | for i, x in enumerate(xs): 55 | X[i, :len(x), :] = x 56 | 57 | return X, texts # 3d array, list of str 58 | 59 | 60 | -------------------------------------------------------------------------------- /data/eval.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/data/eval.pkl -------------------------------------------------------------------------------- /data/train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/data/train.pkl -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from functools import wraps 9 | import threading 10 | 11 | from tensorflow.python.platform import tf_logging as logging 12 | 13 | from hyperparams import Hyperparams as hp 14 | import numpy as np 15 | import tensorflow as tf 16 | from utils import get_spectrogram, reduce_frames 17 | from data import load_train_data 18 | 19 | # Adapted from the `sugartensor` code. 20 | # https://github.com/buriburisuri/sugartensor/blob/master/sugartensor/sg_queue.py 21 | def producer_func(func): 22 | r"""Decorates a function `func` as producer_func. 23 | 24 | Args: 25 | func: A function to decorate. 26 | """ 27 | @wraps(func) 28 | def wrapper(inputs, dtypes, capacity, num_threads): 29 | r""" 30 | Args: 31 | inputs: A inputs queue list to enqueue 32 | dtypes: Data types of each tensor 33 | capacity: Queue capacity. Default is 32. 34 | num_threads: Number of threads. Default is 1. 35 | """ 36 | # enqueue function 37 | def enqueue_func(sess, op): 38 | # read data from source queue 39 | data = func(sess.run(inputs)) 40 | # create feeder dict 41 | feed_dict = {} 42 | for ph, col in zip(placeholders, data): 43 | feed_dict[ph] = col 44 | # run session 45 | sess.run(op, feed_dict=feed_dict) 46 | 47 | # create place holder list 48 | placeholders = [] 49 | for dtype in dtypes: 50 | placeholders.append(tf.placeholder(dtype=dtype)) 51 | 52 | # create FIFO queue 53 | queue = tf.FIFOQueue(capacity, dtypes=dtypes) 54 | 55 | # enqueue operation 56 | enqueue_op = queue.enqueue(placeholders) 57 | 58 | # create queue runner 59 | runner = _FuncQueueRunner(enqueue_func, queue, [enqueue_op] * num_threads) 60 | 61 | # register to global collection 62 | tf.train.add_queue_runner(runner) 63 | 64 | # return de-queue operation 65 | return queue.dequeue() 66 | 67 | return wrapper 68 | 69 | 70 | class _FuncQueueRunner(tf.train.QueueRunner): 71 | 72 | def __init__(self, func, queue=None, enqueue_ops=None, close_op=None, 73 | cancel_op=None, queue_closed_exception_types=None, 74 | queue_runner_def=None): 75 | # save ad-hoc function 76 | self.func = func 77 | # call super() 78 | super(_FuncQueueRunner, self).__init__(queue, enqueue_ops, close_op, cancel_op, 79 | queue_closed_exception_types, queue_runner_def) 80 | 81 | # pylint: disable=broad-except 82 | def _run(self, sess, enqueue_op, coord=None): 83 | 84 | if coord: 85 | coord.register_thread(threading.current_thread()) 86 | decremented = False 87 | try: 88 | while True: 89 | if coord and coord.should_stop(): 90 | break 91 | try: 92 | self.func(sess, enqueue_op) # call enqueue function 93 | except self._queue_closed_exception_types: # pylint: disable=catching-non-exception 94 | # This exception indicates that a queue was closed. 95 | with self._lock: 96 | self._runs_per_session[sess] -= 1 97 | decremented = True 98 | if self._runs_per_session[sess] == 0: 99 | try: 100 | sess.run(self._close_op) 101 | except Exception as e: 102 | # Intentionally ignore errors from close_op. 103 | logging.vlog(1, "Ignored exception: %s", str(e)) 104 | return 105 | except Exception as e: 106 | # This catches all other exceptions. 107 | if coord: 108 | coord.request_stop(e) 109 | else: 110 | logging.error("Exception in QueueRunner: %s", str(e)) 111 | with self._lock: 112 | self._exceptions_raised.append(e) 113 | raise 114 | finally: 115 | # Make sure we account for all terminations: normal or errors. 
116 | if not decremented: 117 | with self._lock: 118 | self._runs_per_session[sess] -= 1 119 | 120 | @producer_func 121 | def get_spectrogram_and_text(_inputs): 122 | '''From `_inputs`, which has been fetched from slice queues, 123 | makes text, spectrogram, and magnitude, 124 | then enqueue them again. 125 | ''' 126 | sound_fpath, text = _inputs 127 | spectrogram = get_spectrogram(sound_fpath) 128 | spectrogram = reduce_frames(spectrogram, hp.r) 129 | 130 | text = np.fromstring(text, np.int32) 131 | return spectrogram, text 132 | 133 | def get_batch(): 134 | """Loads training data and put them in queues""" 135 | with tf.device('/cpu:0'): 136 | # Load data 137 | sound_fpaths, texts = load_train_data() # string, bytes 138 | 139 | 140 | # calc total batch count 141 | num_batch = len(sound_fpaths) // hp.batch_size 142 | 143 | # Convert to tensor 144 | sound_fpaths = tf.convert_to_tensor(sound_fpaths) 145 | texts = tf.convert_to_tensor(texts) 146 | 147 | # Create Queues 148 | q = tf.train.slice_input_producer([sound_fpaths, texts], shuffle=True) 149 | 150 | # Decode sound file 151 | x, y = get_spectrogram_and_text(inputs=q, 152 | dtypes=[tf.float32, tf.int32], 153 | capacity=128, 154 | num_threads=32) 155 | # create batch queues 156 | x, y = tf.train.batch([x, y], 157 | shapes=[(None, hp.n_mels*hp.r), (None,)], 158 | num_threads=32, 159 | batch_size=hp.batch_size, 160 | capacity=hp.batch_size*32, 161 | dynamic_pad=True) 162 | return x, y, num_batch 163 | 164 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | import codecs 11 | import os 12 | 13 | from data import load_vocab, load_eval_data, load_train_data 14 | from hyperparams import Hyperparams as hp 15 | import numpy as np 16 | import tensorflow as tf 17 | from train import Graph 18 | 19 | 20 | def eval(): 21 | # Load graph 22 | g = Graph(is_training=False); print("Graph loaded") 23 | 24 | # Load data 25 | x, y = load_eval_data() 26 | char2idx, idx2char = load_vocab() 27 | 28 | with g.graph.as_default(): 29 | sv = tf.train.Supervisor() 30 | with sv.managed_session() as sess: 31 | # Restore parameters 32 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) 33 | print("Restored!") 34 | # Get model name 35 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] 36 | 37 | # Speech to Text 38 | if not os.path.exists('samples'): os.mkdir('samples') 39 | with codecs.open('samples/{}.txt'.format(mname), 'w', 'utf-8') as fout: 40 | preds = np.zeros((hp.batch_size, hp.max_len), np.int32) 41 | for j in range(hp.max_len): 42 | _preds = sess.run(g.preds, {g.x: x, g.y: preds}) 43 | preds[:, j] = _preds[:, j] 44 | 45 | # Write to file 46 | for i, (expected, got) in enumerate(zip(y, preds)): # ground truth vs. 
prediction 47 | fout.write("Expected: {}\n".format(expected.split("S")[0])) 48 | fout.write("Got : {}\n\n".format(("".join(idx2char[idx] for idx in np.fromstring(got, np.int32))).split("S")[0])) 49 | fout.flush() 50 | 51 | if __name__ == '__main__': 52 | eval() 53 | print("Done") 54 | 55 | 56 | -------------------------------------------------------------------------------- /fig/asr2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/asr2.png -------------------------------------------------------------------------------- /fig/ori2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/ori2.png -------------------------------------------------------------------------------- /fig/training_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/training_curve.png -------------------------------------------------------------------------------- /hyperparams.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | 8 | class Hyperparams: 9 | '''Hyper parameters''' 10 | # data 11 | web = 'WEB' 12 | max_len = 100 # maximum length of text 13 | 14 | # signal processing 15 | sr = 22050 # Sampling rate. 16 | n_fft = 2048 # fft points (samples) 17 | frame_shift = 0.0125 # seconds 18 | frame_length = 0.05 # seconds 19 | hop_length = int(sr*frame_shift) # samples This is dependent on the frame_shift. 20 | win_length = int(sr*frame_length) # samples This is dependent on the frame_length. 21 | n_mels = 80 # Number of Mel banks to generate 22 | 23 | # model 24 | embed_size = 256 # alias = E 25 | encoder_num_banks = 16 26 | decoder_num_banks = 8 27 | num_highwaynet_blocks = 4 28 | r = 5 # Reduction factor. Paper => 2, 3, 5 29 | 30 | # training scheme 31 | lr = 0.0001 32 | logdir = "logdir" 33 | batch_size = 32 34 | num_epochs = 20 35 | 36 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | 8 | from __future__ import print_function 9 | import tensorflow as tf 10 | from hyperparams import Hyperparams as hp 11 | 12 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None): 13 | '''Embeds a given tensor. 14 | 15 | Args: 16 | inputs: A `Tensor` with type `int32` or `int64` containing the ids 17 | to be looked up in `lookup table`. 18 | vocab_size: An int. Vocabulary size. 19 | num_units: An int. Number of embedding hidden units. 20 | zero_pad: A boolean. If True, all the values of the fist row (id 0) 21 | should be constant zeros. 22 | scope: Optional scope for `variable_scope`. 23 | reuse: Boolean, whether to reuse the weights of a previous layer 24 | by the same name. 25 | 26 | Returns: 27 | A `Tensor` with one more rank than inputs's. The last dimesionality 28 | should be `num_units`. 
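        For example, an `inputs` tensor of shape (N, T) with `num_units`=256 yields an output of shape (N, T, 256).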
29 | ''' 30 | with tf.variable_scope(scope, reuse=reuse): 31 | lookup_table = tf.get_variable('lookup_table', 32 | dtype=tf.float32, 33 | shape=[vocab_size, num_units], 34 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) 35 | if zero_pad: 36 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), 37 | lookup_table[1:, :]), 0) 38 | return tf.nn.embedding_lookup(lookup_table, inputs) 39 | 40 | def normalize(inputs, 41 | type="bn", 42 | decay=.99, 43 | is_training=True, 44 | activation_fn=None, 45 | scope="normalize"): 46 | '''Applies {batch|layer} normalization. 47 | 48 | Args: 49 | inputs: A tensor with 2 or more dimensions, where the first dimension has 50 | `batch_size`. If type is `bn`, the normalization is over all but 51 | the last dimension. Or if type is `ln`, the normalization is over 52 | the last dimension. Note that this is different from the native 53 | `tf.contrib.layers.batch_norm`. For this I recommend you change 54 | a line in ``tensorflow/contrib/layers/python/layers/layer.py` 55 | as follows. 56 | Before: mean, variance = nn.moments(inputs, axis, keep_dims=True) 57 | After: mean, variance = nn.moments(inputs, [-1], keep_dims=True) 58 | type: A string. Either "bn" or "ln". 59 | decay: Decay for the moving average. Reasonable values for `decay` are close 60 | to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. 61 | Lower `decay` value (recommend trying `decay`=0.9) if model experiences 62 | reasonably good training performance but poor validation and/or test 63 | performance. 64 | is_training: Whether or not the layer is in training mode. W 65 | activation_fn: Activation function. 66 | scope: Optional scope for `variable_scope`. 67 | 68 | Returns: 69 | A tensor with the same shape and data dtype as `inputs`. 70 | ''' 71 | if type=="bn": 72 | inputs_shape = inputs.get_shape() 73 | inputs_rank = inputs_shape.ndims 74 | 75 | # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster. 76 | # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC. 
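        # For example, a rank-3 input of shape (N, T, C) is expanded to (N, 1, T, C) below so that the
        # fused batch-norm kernel sees an NHWC tensor; the extra axis is squeezed away again after normalization.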
77 | if inputs_rank in [2, 3, 4]: 78 | if inputs_rank==2: 79 | inputs = tf.expand_dims(inputs, axis=1) 80 | inputs = tf.expand_dims(inputs, axis=2) 81 | elif inputs_rank==3: 82 | inputs = tf.expand_dims(inputs, axis=1) 83 | 84 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 85 | decay=decay, 86 | center=True, 87 | scale=True, 88 | activation_fn=activation_fn, 89 | updates_collections=None, 90 | is_training=is_training, 91 | scope=scope, 92 | zero_debias_moving_mean=True, 93 | fused=True) 94 | # restore original shape 95 | if inputs_rank==2: 96 | outputs = tf.squeeze(outputs, axis=[1, 2]) 97 | elif inputs_rank==3: 98 | outputs = tf.squeeze(outputs, axis=1) 99 | else: # fallback to naive batch norm 100 | outputs = tf.contrib.layers.batch_norm(inputs=inputs, 101 | decay=decay, 102 | center=True, 103 | scale=True, 104 | activation_fn=activation_fn, 105 | updates_collections=None, 106 | is_training=is_training, 107 | scope=scope, 108 | fused=False) 109 | elif type=="ln": 110 | outputs = tf.contrib.layers.layer_norm(inputs=inputs, 111 | center=True, 112 | scale=True, 113 | activation_fn=activation_fn, 114 | scope=scope) 115 | elif type == "in": 116 | with tf.variable_scope(scope): 117 | batch, steps, channels = inputs.get_shape().as_list() 118 | var_shape = [channels] 119 | mu, sigma_sq = tf.nn.moments(inputs, [1], keep_dims=True) 120 | shift = tf.Variable(tf.zeros(var_shape)) 121 | scale = tf.Variable(tf.ones(var_shape)) 122 | epsilon = 1e-8 123 | normalized = (inputs - mu) / (sigma_sq + epsilon) ** (.5) 124 | outputs = scale * normalized + shift 125 | if activation_fn: 126 | outputs = activation_fn(outputs) 127 | else: 128 | raise ValueError("Currently we support `bn` or `ln` only.") 129 | 130 | return outputs 131 | 132 | def conv1d(inputs, 133 | filters=None, 134 | size=1, 135 | rate=1, 136 | padding="SAME", 137 | use_bias=False, 138 | activation_fn=None, 139 | scope="conv1d", 140 | reuse=None): 141 | ''' 142 | Args: 143 | inputs: A 3-D tensor with shape of [batch, time, depth]. 144 | filters: An int. Number of outputs (=activation maps) 145 | size: An int. Filter size. 146 | rate: An int. Dilation rate. 147 | padding: Either `same` or `valid` or `causal` (case-insensitive). 148 | use_bias: A boolean. 149 | scope: Optional scope for `variable_scope`. 150 | reuse: Boolean, whether to reuse the weights of a previous layer 151 | by the same name. 152 | 153 | Returns: 154 | A masked tensor of the same shape and dtypes as `inputs`. 155 | ''' 156 | 157 | with tf.variable_scope(scope): 158 | if padding.lower()=="causal": 159 | # pre-padding for causality 160 | pad_len = (size - 1) * rate # padding size 161 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]]) 162 | padding = "valid" 163 | 164 | if filters is None: 165 | filters = inputs.get_shape().as_list[-1] 166 | 167 | params = {"inputs":inputs, "filters":filters, "kernel_size":size, 168 | "dilation_rate":rate, "padding":padding, "activation":activation_fn, 169 | "use_bias":use_bias, "reuse":reuse} 170 | 171 | outputs = tf.layers.conv1d(**params) 172 | return outputs 173 | 174 | def conv1d_banks(inputs, K=16, is_training=True, scope="conv1d_banks", reuse=None): 175 | '''Applies a series of conv1d separately. 176 | 177 | Args: 178 | inputs: A 3d tensor with shape of [N, T, C] 179 | K: An int. The size of conv1d banks. That is, 180 | The `inputs` are convolved with K filters: 1, 2, ..., K. 181 | is_training: A boolean. This is passed to an argument of `batch_normalize`. 
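        Each bank k (k = 1, ..., K) convolves the same input with filters of width k and hp.embed_size//2
        output channels, applies instance normalization and a ReLU, and the K results are concatenated
        along the last axis.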
182 | 183 | Returns: 184 | A 3d tensor with shape of [N, T, K*Hp.embed_size//2]. 185 | ''' 186 | with tf.variable_scope(scope, reuse=reuse): 187 | outputs = conv1d(inputs, hp.embed_size//2, 1) # k=1 188 | outputs = normalize(outputs, type="in", is_training=is_training, 189 | activation_fn=tf.nn.relu) 190 | for k in range(2, K+1): # k = 2...K 191 | with tf.variable_scope("num_{}".format(k)): 192 | output = conv1d(inputs, hp.embed_size//2, k) 193 | output = normalize(output, type="in", is_training=is_training, 194 | activation_fn=tf.nn.relu) 195 | outputs = tf.concat((outputs, output), -1) 196 | return outputs # (N, T, Hp.embed_size//2*K) 197 | 198 | def gru(inputs, num_units=None, bidirection=False, scope="gru", reuse=None): 199 | '''Applies a GRU. 200 | 201 | Args: 202 | inputs: A 3d tensor with shape of [N, T, C]. 203 | num_units: An int. The number of hidden units. 204 | bidirection: A boolean. If True, bidirectional results 205 | are concatenated. 206 | scope: Optional scope for `variable_scope`. 207 | reuse: Boolean, whether to reuse the weights of a previous layer 208 | by the same name. 209 | 210 | Returns: 211 | If bidirection is True, a 3d tensor with shape of [N, T, 2*num_units], 212 | otherwise [N, T, num_units]. 213 | ''' 214 | with tf.variable_scope(scope, reuse=reuse): 215 | if num_units is None: 216 | num_units = inputs.get_shape().as_list[-1] 217 | 218 | cell = tf.contrib.rnn.GRUCell(num_units) 219 | if bidirection: 220 | cell_bw = tf.contrib.rnn.GRUCell(num_units) 221 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, dtype=tf.float32) 222 | return tf.concat(outputs, 2) 223 | else: 224 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32) 225 | return outputs 226 | 227 | def attention_decoder(inputs, memory, num_units=None, scope="attention_decoder", reuse=None): 228 | '''Applies a GRU to `inputs`, while attending `memory`. 229 | Args: 230 | inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs. 231 | num_units: An int. Attention size. 232 | memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network. 233 | scope: Optional scope for `variable_scope`. 234 | reuse: Boolean, whether to reuse the weights of a previous layer 235 | by the same name. 236 | 237 | Returns: 238 | A 3d tensor with shape of [N, T, num_units]. 239 | ''' 240 | with tf.variable_scope(scope, reuse=reuse): 241 | if num_units is None: 242 | num_units = inputs.get_shape().as_list[-1] 243 | 244 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, memory) 245 | decoder_cell = tf.contrib.rnn.GRUCell(num_units) 246 | cell_with_attetion = tf.contrib.seq2seq.DynamicAttentionWrapper(decoder_cell, attention_mechanism, num_units) 247 | outputs, _ = tf.nn.dynamic_rnn(cell_with_attetion, inputs, dtype=tf.float32) #( 1, 6, 16) 248 | return outputs 249 | 250 | def prenet(inputs, is_training=True, scope="prenet", reuse=None): 251 | '''Prenet for Encoder and Decoder. 252 | Args: 253 | inputs: A 3D tensor of shape [N, T, hp.embed_size]. 254 | scope: Optional scope for `variable_scope`. 255 | reuse: Boolean, whether to reuse the weights of a previous layer 256 | by the same name. 257 | 258 | Returns: 259 | A 3D tensor of shape [N, T, num_units/2]. 
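        (With the dense layers below, the output dimensionality is hp.embed_size//2, i.e. E/2.)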
260 | ''' 261 | with tf.variable_scope(scope, reuse=reuse): 262 | outputs = tf.layers.dense(inputs, units=hp.embed_size, activation=tf.nn.relu, name="dense1") 263 | outputs = tf.nn.dropout(outputs, keep_prob=.5 if is_training==True else 1., name="dropout1") 264 | outputs = tf.layers.dense(outputs, units=hp.embed_size//2, activation=tf.nn.relu, name="dense2") 265 | outputs = tf.nn.dropout(outputs, keep_prob=.5 if is_training==True else 1., name="dropout2") 266 | return outputs # (N, T, num_units/2) 267 | 268 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None): 269 | '''Highway networks, see https://arxiv.org/abs/1505.00387 270 | 271 | Args: 272 | inputs: A 3D tensor of shape [N, T, W]. 273 | num_units: An int or `None`. Specifies the number of units in the highway layer 274 | or uses the input size if `None`. 275 | scope: Optional scope for `variable_scope`. 276 | reuse: Boolean, whether to reuse the weights of a previous layer 277 | by the same name. 278 | 279 | Returns: 280 | A 3D tensor of shape [N, T, W]. 281 | ''' 282 | if not num_units: 283 | num_units = inputs.get_shape()[-1] 284 | 285 | with tf.variable_scope(scope, reuse=reuse): 286 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1") 287 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, name="dense2") 288 | C = 1. - T 289 | outputs = H * T + inputs * C 290 | return outputs 291 | -------------------------------------------------------------------------------- /networks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | from hyperparams import Hyperparams as hp 11 | from modules import * 12 | from data import load_vocab 13 | import tensorflow as tf 14 | 15 | def encode(inputs, is_training=True, scope="encoder", reuse=None): 16 | ''' 17 | Args: 18 | inputs: A 2d tensor with shape of [N, T], dtype of int32. 19 | is_training: Whether or not the layer is in training mode. 20 | scope: Optional scope for `variable_scope` 21 | reuse: Boolean, whether to reuse the weights of a previous layer 22 | by the same name. 23 | 24 | Returns: 25 | A collection of Hidden vectors, whose shape is (N, T, E). 
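        Note: in this ASR setup the character-embedding call below is commented out, so `inputs` is actually
        the reduced mel-spectrogram, a float32 tensor of shape (N, T, hp.n_mels*hp.r) produced by
        `get_batch()` or `load_eval_data()`.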
26 | ''' 27 | with tf.variable_scope(scope, reuse=reuse): 28 | # Load vocabulary 29 | char2idx, idx2char = load_vocab() 30 | 31 | # # Character Embedding 32 | # inputs = embed(inputs, len(char2idx), hp.embed_size) # (N, T, E) 33 | 34 | # Encoder pre-net 35 | prenet_out = prenet(inputs, scope="prenet", is_training=is_training) # (N, T, E/2) 36 | 37 | # Encoder CBHG 38 | ## Conv1D bank 39 | enc = conv1d_banks(prenet_out, K=hp.encoder_num_banks, is_training=is_training) # (N, T, K * E / 2) 40 | 41 | ### Max pooling 42 | enc = tf.layers.max_pooling1d(enc, 2, 1, padding="same") # (N, T, K * E / 2) 43 | 44 | ### Conv1D projections 45 | enc = conv1d(enc, hp.embed_size//2, 3, scope="conv1d_1") # (N, T, E/2) 46 | enc = normalize(enc, type="in", is_training=is_training, 47 | activation_fn=tf.nn.relu) 48 | enc = conv1d(enc, hp.embed_size//2, 3, scope="conv1d_2") # (N, T, E/2) 49 | enc = normalize(enc, type="in", is_training=is_training, 50 | activation_fn=None) 51 | enc += prenet_out # (N, T, E/2) # residual connections 52 | 53 | ### Highway Nets 54 | for i in range(hp.num_highwaynet_blocks): 55 | enc = highwaynet(enc, num_units=hp.embed_size//2, 56 | scope='highwaynet_{}'.format(i)) # (N, T, E/2) 57 | 58 | ### Bidirectional GRU 59 | memory = gru(enc, hp.embed_size//2, True) # (N, T, E) 60 | 61 | return memory 62 | 63 | def decode(decoder_inputs, memory, is_training=True, scope="decoder1", reuse=None): 64 | ''' 65 | Args: 66 | decoder_inputs: A 3d tensor with shape of [N, T', C'], where C'=hp.n_mels*hp.r, 67 | dtype of float32. Shifted melspectrogram of sound files. 68 | memory: A 3d tensor with shape of [N, T, C], where C=hp.embed_size. 69 | scope: Optional scope for `variable_scope` 70 | reuse: Boolean, whether to reuse the weights of a previous layer 71 | by the same name. 72 | 73 | Returns 74 | Predicted melspectrogram tensor with shape of [N, T', C']. 75 | ''' 76 | with tf.variable_scope(scope, reuse=reuse): 77 | # Decoder pre-net 78 | dec = prenet(decoder_inputs, is_training=is_training) # (N, T', E/2) 79 | 80 | # Attention RNN 81 | dec = attention_decoder(dec, memory, hp.embed_size) # (N, T', E) 82 | 83 | # decoder rnn 84 | dec += gru(dec, hp.embed_size, False, scope="gru1") # (N, T', E) 85 | dec += gru(dec, hp.embed_size, False, scope="gru2") # (N, T', E) 86 | 87 | # Outputs => (N, T', V) 88 | char2idx, idx2char = load_vocab() 89 | outputs = tf.layers.dense(dec, len(char2idx)) 90 | 91 | return outputs 92 | -------------------------------------------------------------------------------- /prepro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 
5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | from __future__ import print_function 8 | 9 | from collections import Counter 10 | import glob 11 | import os 12 | import pickle 13 | import re 14 | 15 | import librosa 16 | from tqdm import tqdm 17 | 18 | from hyperparams import Hyperparams as hp 19 | import numpy as np 20 | import csv 21 | import codecs 22 | from data import text2idx 23 | 24 | def make_train_data(): 25 | import csv 26 | 27 | sound_fpaths, converteds, texts = [], [], [] 28 | reader = csv.reader(codecs.open(hp.web + "/text.csv", 'rb', 'utf-8')) 29 | for row in reader: 30 | sound_fname, text, duration = row 31 | sound_fpath = hp.web + "/" + sound_fname + ".wav" 32 | cleaned, converted = text2idx(text) 33 | if (len(text) <= hp.max_len): 34 | sound_fpaths.append(sound_fpath) 35 | converteds.append(np.array(converted, np.int32).tostring()) 36 | texts.append(cleaned) 37 | 38 | # Split into train and eval. We reserve the last mini-batch for evaluation 39 | X_train, Y_train = sound_fpaths[:-hp.batch_size], converteds[:-hp.batch_size] 40 | X_eval, Y_eval = sound_fpaths[-hp.batch_size:], texts[-hp.batch_size:] 41 | 42 | # Save 43 | pickle.dump((X_train, Y_train), open('data/train.pkl', 'wb')) 44 | pickle.dump((X_eval, Y_eval), open('data/eval.pkl', 'wb')) 45 | 46 | if __name__ == "__main__": 47 | make_train_data() 48 | print("Done!") 49 | -------------------------------------------------------------------------------- /samples/model_gs_19860_0.19.txt: -------------------------------------------------------------------------------- 1 | Expected: the third poured out his bowl into the rivers and springs of water and they became blood 2 | Got : the first will lie down to the rivers and springs of waters and it became blood 3 | 4 | Expected: i heard the altar saying yes lord god the almighty true and righteous are your judgments 5 | Got : i heard the altar saying yes were like your own like you tree in righteousness for your judgments 6 | 7 | Expected: the fourth poured out his bowl on the sun and it was given to him to scorch men with fire 8 | Got : the foolish very armed were on the sun and was given to him to spoke to him with fire 9 | 10 | Expected: he gathered them together into the place which is called in hebrew megiddo 11 | Got : he gathered them together into the place which is called and he weep and at every 12 | 13 | Expected: every island fled away and the mountains were not found 14 | Got : hadad and kedemoth aroen and another and spread out them 15 | 16 | Expected: here is the mind that has wisdom the seven heads are seven mountains on which the woman sits 17 | Got : he is the mighty have wisdom the seven heads of seven rountains are with the wind sixter 18 | 19 | Expected: these have one mind and they give their power and authority to the beast 20 | Got : these are those who are mine and they give holl of a fool in the deeps 21 | 22 | Expected: the woman whom you saw is the great city which reigns over the kings of the earth 23 | Got : the woman whom he saw it his degrection which ran and to advening to be ear 24 | 25 | Expected: for her sins have reached to the sky and god has remembered her iniquities 26 | Got : for he sends a least in the sky and god has remembered her iniquities 27 | 28 | Expected: the merchants of the earth weep and mourn over her for no one buys their merchandise any more 29 | Got : the mittites of the earth weeps in your own are before from knowing babylon busine backsliding all t 30 | 31 | Expected: and cried out as they looked at the 
smoke of her burning saying 'what is like the great city' 32 | Got : and cried all the wicked beside of a good one and saying when is like the great sight 33 | 34 | Expected: in her was found the blood of prophets and of saints and of all who have been slain on the earth 35 | Got : and her with stones a dwellified confidence and all who have been slain on the earth 36 | 37 | Expected: a second said hallelujah her smoke goes up forever and ever 38 | Got : as set him said how many men utter for smoke go down for every male it 39 | 40 | Expected: he is clothed in a garment sprinkled with blood his name is called the word of god 41 | Got : he is close in a garment speaking in the blood his name is called 'the word of god' 42 | 43 | Expected: the armies which are in heaven followed him on white horses clothed in white pure fine linen 44 | Got : the army which are in heaven falls on the mighty one horses clothes driven on the affliction 45 | 46 | Expected: he has on his garment and on his thigh a name written king of kings and lord of lords 47 | Got : he has understandings on his folly among widow the king of kings and yahweh of armies 48 | 49 | Expected: i saw an angel coming down out of heaven having the key of the abyss and a great chain in his hand 50 | Got : i saw an even become young lion having you trust of the ages and a great chamber is hand 51 | 52 | Expected: and after the thousand years satan will be released from his prison 53 | Got : and after the palace and mizpah and eleven eleenth were the twentieth 54 | 55 | Expected: death and hades were thrown into the lake of fire this is the second death the lake of fire 56 | Got : let them hate with one and to wait for fire this is the second death and lead a time 57 | 58 | Expected: if anyone was not found written in the book of life he was cast into the lake of fire 59 | Got : the ten man will not think within your demon as with a blood he will cast him to ram for fire 60 | 61 | Expected: he who overcomes i will give him these things i will be his god and he will be my son 62 | Got : he who recompenses i will give him be stings i will be his god and he will be my son 63 | 64 | Expected: its wall is one hundred fortyfour cubits by the measure of a man that is of an angel 65 | Got : is through all his womb home before you for accusation that we may know him by these are in egypt 66 | 67 | Expected: the construction of its wall was jasper the city was pure gold like pure glass 68 | Got : if he struck him of his wallor is not speaking with torment hold on her grass 69 | 70 | Expected: i saw no temple in it for the lord god the almighty and the lamb are its temple 71 | Got : i saw in a tenth wind for we will dry up you among the linen ox skillful 72 | 73 | Expected: its gates will in no way be shut by day for there will be no night there 74 | Got : his greech wind more redeems shameful the redeemer man don't know 75 | 76 | Expected: and they shall bring the glory and the honor of the nations into it so that they may enter 77 | Got : and they shall bring the glory in the high mountains and the egyptian into the midst of the needy 78 | 79 | Expected: they will see his face and his name will be on their foreheads 80 | Got : they will see his face and his name on their follows 81 | 82 | Expected: behold i come quickly blessed is he who keeps the words of the prophecy of this book 83 | Got : behold i happened with me when i could see me to still it is a prophet his bueld 84 | 85 | Expected: he said to me don't seal up the words of the prophecy of 
this book for the time is at hand 86 | Got : he said to him why sil with the words of the prophets it is book for the times and her 87 | 88 | Expected: behold i come quickly my reward is with me to repay to each man according to his work 89 | Got : behold i come perfect i yahweh is with me to repent to be shamed according to his work 90 | 91 | Expected: i am the alpha and the omega the first and the last the beginning and the end 92 | Got : i have you hope from you and you and the first from aloes of the dew and the enemy 93 | 94 | Expected: he who testifies these things says yes i come quickly amen yes come lord jesus 95 | Got : he who testifies these things says yes i come proclaim i man listen will jesus 96 | 97 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | 8 | from __future__ import print_function 9 | 10 | import os 11 | 12 | import librosa 13 | from tqdm import tqdm 14 | 15 | from data import load_vocab, load_train_data 16 | from data_load import get_batch 17 | from hyperparams import Hyperparams as hp 18 | from modules import * 19 | from networks import encode, decode 20 | import numpy as np 21 | import tensorflow as tf 22 | from utils import shift_by_one 23 | 24 | char2idx, idx2char = load_vocab() 25 | 26 | class Graph: 27 | def __init__(self, is_training=True): 28 | self.graph = tf.Graph() 29 | self.is_training=is_training 30 | with self.graph.as_default(): 31 | if is_training: 32 | self.x, self.y, self.num_batch = get_batch() 33 | else: # Evaluation 34 | self.x = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels*hp.r)) 35 | self.y = tf.placeholder(tf.int32, shape=(None, hp.max_len)) 36 | 37 | self.decoder_inputs = embed(shift_by_one(self.y), len(char2idx), hp.embed_size) # (N, T', E) 38 | 39 | with tf.variable_scope('net'): 40 | # Encoder 41 | self.memory = encode(self.x, is_training=is_training) # (N, T, hp.n_mels*hp.r) 42 | 43 | # Decoder 44 | self.outputs = decode(self.decoder_inputs, self.memory, is_training=is_training) # (N, T', E) 45 | self.logprobs = tf.log(tf.nn.softmax(self.outputs)+1e-10) 46 | self.preds = tf.arg_max(self.outputs, dimension=-1) 47 | 48 | if is_training: 49 | # Loss 50 | self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.outputs) 51 | 52 | # Target masking 53 | self.istarget = tf.to_float(tf.not_equal(self.y, 0)) 54 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget) + 1e-7) 55 | 56 | # Training Scheme 57 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 58 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr) 59 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step) 60 | 61 | # Summary 62 | tf.summary.scalar('mean_loss', self.mean_loss) 63 | self.merged = tf.summary.merge_all() 64 | 65 | def main(): 66 | g = Graph(); print("Training Graph loaded") 67 | 68 | with g.graph.as_default(): 69 | # Training 70 | sv = tf.train.Supervisor(logdir=hp.logdir, 71 | save_model_secs=0) 72 | with sv.managed_session() as sess: 73 | for epoch in range(1, hp.num_epochs+1): 74 | if sv.should_stop(): break 75 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'): 76 | sess.run(g.train_op) 77 | 78 | # Write 
checkpoint files at every epoch 79 | l, gs = sess.run([g.mean_loss, g.global_step]) 80 | sv.saver.save(sess, hp.logdir + '/model_gs_%d_%.2f' % (gs, l)) 81 | 82 | if __name__ == '__main__': 83 | main() 84 | print("Done") 85 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #/usr/bin/python2 3 | ''' 4 | By kyubyong park. kbpark.linguist@gmail.com. 5 | https://www.github.com/kyubyong/tacotron_asr 6 | ''' 7 | from __future__ import print_function 8 | 9 | import codecs 10 | import copy 11 | import re 12 | 13 | import librosa 14 | 15 | from hyperparams import Hyperparams as hp 16 | import numpy as np 17 | import tensorflow as tf 18 | 19 | def get_spectrogram(sound_fpath): 20 | '''Extracts melspectrogram and magnitude from given `sound_file`. 21 | Args: 22 | sound_fpath: A string. Full path of a sound file. 23 | 24 | Returns: 25 | Transposed S: A 2d array. A transposed melspectrogram with shape of (T, n_mels) 26 | Transposed magnitude: A 2d array. A transposed magnitude spectrogram 27 | with shape of (T, 1+hp.n_fft//2) 28 | ''' 29 | # Loading sound file 30 | y, sr = librosa.load(sound_fpath, sr=None) # or set sr to hp.sr. 31 | 32 | # stft. D: (1+n_fft//2, T) 33 | D = librosa.stft(y=y, 34 | n_fft=hp.n_fft, 35 | hop_length=hp.hop_length, 36 | win_length=hp.win_length) 37 | 38 | # magnitude spectrogram 39 | magnitude = np.abs(D) #(1+n_fft/2, T) 40 | 41 | # power spectrogram 42 | power = magnitude**2 43 | 44 | # mel spectrogram 45 | S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels) #(n_mels, T) 46 | 47 | return np.transpose(S.astype(np.float32)) 48 | 49 | def shift_by_one(inputs): 50 | '''Shifts the content of `inputs` to the right by one 51 | so that it becomes the decoder inputs. 52 | 53 | Args: 54 | inputs: A 3d tensor with shape of [N, T, C] 55 | 56 | Returns: 57 | A 3d tensor with the same shape and dtype as `inputs`. 58 | ''' 59 | return tf.concat((tf.zeros_like(inputs[:, :1]), inputs[:, :-1]), 1) 60 | 61 | def reduce_frames(arry, r): 62 | '''Reduces and adjust the shape and content of `arry` according to r. 63 | 64 | Args: 65 | arry: A 2d array with shape of [T, C] 66 | r: Reduction factor 67 | 68 | Returns: 69 | A 2d array with shape of [-1, C*r] 70 | ''' 71 | T, C = arry.shape 72 | num_paddings = hp.r - (T % r) if T % r != 0 else 0 73 | 74 | padded = np.pad(arry, [[0, num_paddings], [0, 0]], 'constant') 75 | output = np.reshape(padded, (-1, C*r)) 76 | return output 77 | 78 | def restore_shape(arry, r): 79 | '''Restore and adjust the shape and content of `inputs` according to r. 80 | Args: 81 | arry: A 3d array with shape of [N, T, C] 82 | r: Reduction factor 83 | 84 | Returns: 85 | A 3d tensor with shape of [-1, C*r] 86 | ''' 87 | N, T, C = arry.shape 88 | return arry.reshape((N, -1, C//r)) --------------------------------------------------------------------------------
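# Usage sketch: how the utilities in utils.py produce the encoder input consumed by data_load.py and train.py.
# The wav path below is hypothetical; shapes follow the hyperparameters above (n_mels=80, r=5).
#
#   spectrogram = get_spectrogram("WEB/some_verse.wav")  # (T, n_mels) mel spectrogram
#   x = reduce_frames(spectrogram, hp.r)                 # (ceil(T/r), n_mels*r)
#
# For example, with T = 203 frames and r = 5, the array is zero-padded to 205 rows and reshaped to
# (41, 400), matching the (None, hp.n_mels*hp.r) shape declared in get_batch().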