├── LICENSE
├── README.md
├── data.py
├── data
│   ├── eval.pkl
│   └── train.pkl
├── data_load.py
├── eval.py
├── fig
│   ├── asr2.png
│   ├── ori2.png
│   └── training_curve.png
├── hyperparams.py
├── modules.py
├── networks.py
├── prepro.py
├── samples
│   └── model_gs_19860_0.19.txt
├── train.py
└── utils.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Speech Recognition Using Tacotron
2 |
3 |
4 | ## Motivation
5 | Tacotron is an end-to-end speech synthesis model, first introduced in [Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135). It takes character-level text as input and predicts mel filterbanks and a linear spectrogram as targets. Although it is a generative model, I felt like testing how well it can be applied to the speech recognition task.
6 |
7 | ## Requirements
8 | * NumPy >= 1.11.1
9 | * TensorFlow == 1.1
10 | * librosa
11 |
12 | ## Model description
13 |
14 |
15 |
16 | Tacotron—Speech Synthesis Model (From Figure 1 in [Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135))
17 |
18 |
19 | Modified architecture for speech recognition
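
In short, the original Tacotron's inputs and outputs are swapped for recognition: the encoder CBHG consumes reduced mel-spectrogram frames, and the decoder attends over them and emits character logits. The sketch below is only a condensed restatement of the graph that `train.py` and `networks.py` build, with the tensor shapes annotated:

```python
import tensorflow as tf
from hyperparams import Hyperparams as hp
from modules import embed
from networks import encode, decode
from utils import shift_by_one
from data import load_vocab

char2idx, _ = load_vocab()

# Encoder input: reduced mel-spectrogram frames. Decoder targets: character ids.
x = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels * hp.r))   # (N, T, n_mels*r)
y = tf.placeholder(tf.int32, shape=(None, hp.max_len))                 # (N, T')

memory = encode(x, is_training=False)                                   # (N, T, E)
decoder_inputs = embed(shift_by_one(y), len(char2idx), hp.embed_size)   # (N, T', E)
logits = decode(decoder_inputs, memory, is_training=False)              # (N, T', V)
preds = tf.arg_max(logits, dimension=-1)                                # greedy characters
```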
20 |
21 | ## Data
22 |
23 | [The World English Bible](https://en.wikipedia.org/wiki/World_English_Bible) is a public domain update of the American Standard Version of 1901 into modern English. Its text and audio recordings are freely available [here](http://www.audiotreasure.com/webindex.htm). Unfortunately, each audio file corresponds to a chapter rather than a verse, so it is too long for many machine learning tasks. I had someone slice the files by verse manually. You can download [the audio data](https://www.dropbox.com/s/nde56czgda8q77e/WEB.zip?dl=0) and its [text](https://www.dropbox.com/s/lcfhs1kk9shvypj/text.csv?dl=0) from my dropbox.
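
For reference, `prepro.py` expects each row of `WEB/text.csv` to hold a sound file name, the verse text, and a duration, and the transcripts are indexed with the small character vocabulary defined in `data.py`. A minimal sketch of that normalization, mirroring `text2idx` in `data.py`:

```python
import re

vocab = "ES abcdefghijklmnopqrstuvwxyz'"  # E: empty (padding), S: end of text
char2idx = {char: idx for idx, char in enumerate(vocab)}

def text2idx(text):
    # Lowercase, drop everything outside [a-z ' ], and append the end token "S".
    text = re.sub(r"[^ a-z']", "", text.lower()).strip() + "S"
    return text, [char2idx[char] for char in text]

print(text2idx("Every island fled away"))
# ('every island fled awayS', [7, 24, 7, 20, 27, 2, 11, ...])
```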
24 |
25 | ## File description
26 | * `hyperparams.py` contains all hyperparameters.
27 | * `prepro.py` creates training and evaluation data and saves them to the `data/` folder.
28 | * `data_load.py` loads the data and puts it into queues so that multiple mini-batches are generated in parallel.
29 | * `utils.py` contains utility functions for signal processing and shape handling.
30 | * `modules.py` contains building blocks for encoding and decoding networks.
31 | * `networks.py` defines encoding and decoding networks.
32 | * `train.py` executes training.
33 | * `eval.py` executes evaluation (see the sketch after this list).
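
A minimal sketch of how these pieces chain together in a full run (it assumes the `WEB` data has already been extracted into the working directory, as described in the Training section below):

```python
# Each step can equally be run from the shell as `python <script>.py`.
from prepro import make_train_data
import train
import eval as evaluation  # eval.py

make_train_data()   # reads WEB/text.csv and writes data/train.pkl and data/eval.pkl
train.main()        # trains the model and writes checkpoints to hp.logdir ("logdir")
evaluation.eval()   # restores the latest checkpoint and writes samples/<model name>.txt
```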
34 |
35 | ## Training
36 | * STEP 1. Adjust hyperparameters in `hyperparams.py` if necessary.
37 | * STEP 2. Download and extract [the audio data](https://dl.dropboxusercontent.com/u/42868014/WEB.zip) and its [text](https://dl.dropboxusercontent.com/u/42868014/text.csv).
38 | * STEP 3. Run `train.py`, or download my [pretrained file](https://www.dropbox.com/s/n55aqjx6mge96pb/logdir.zip?dl=0).
39 |
40 | ## Evaluation
41 | * Run `eval.py` to get speech recognition results for the held-out evaluation mini-batch.
42 |
43 | ## Results
44 |
45 | The **training curve** (`fig/training_curve.png`) looks like this:
46 |
47 |
48 |
49 | **Sample results** are
50 |
51 | Expected: the third poured out his bowl into the rivers and springs of water and they became blood
52 | Got : the first will lie down to the rivers and springs of waters and it became blood
53 |
54 | Expected: i heard the altar saying yes lord god the almighty true and righteous are your judgments
55 | Got : i heard the altar saying yes were like your own like you tree in righteousness for your judgments
56 |
57 | Expected: the fourth poured out his bowl on the sun and it was given to him to scorch men with fire
58 | Got : the foolish very armed were on the sun and was given to him to spoke to him with fire
59 |
60 | Expected: he gathered them together into the place which is called in hebrew megiddo
61 | Got : he gathered them together into the place which is called and he weep and at every
62 |
63 | Expected: every island fled away and the mountains were not found
64 | Got : hadad and kedemoth aroen and another and spread out them
65 |
66 | Expected: here is the mind that has wisdom the seven heads are seven mountains on which the woman sits
67 | Got : he is the mighty have wisdom the seven heads of seven rountains are with the wind sixter
68 |
69 | Expected: these have one mind and they give their power and authority to the beast
70 | Got : these are those who are mine and they give holl of a fool in the deeps
71 |
72 | Expected: the woman whom you saw is the great city which reigns over the kings of the earth
73 | Got : the woman whom he saw it his degrection which ran and to advening to be ear
74 |
75 | Expected: for her sins have reached to the sky and god has remembered her iniquities
76 | Got : for he sends a least in the sky and god has remembered her iniquities
77 |
78 | Expected: the merchants of the earth weep and mourn over her for no one buys their merchandise any more
79 | Got : the mittites of the earth weeps in your own are before from knowing babylon busine backsliding all t
80 |
81 | Expected: and cried out as they looked at the smoke of her burning saying 'what is like the great city'
82 | Got : and cried all the wicked beside of a good one and saying when is like the great sight
83 |
84 | Expected: in her was found the blood of prophets and of saints and of all who have been slain on the earth
85 | Got : and her with stones a dwellified confidence and all who have been slain on the earth
86 |
87 | Expected: a second said hallelujah her smoke goes up forever and ever
88 | Got : as set him said how many men utter for smoke go down for every male it
89 |
90 | Expected: he is clothed in a garment sprinkled with blood his name is called the word of god
91 | Got : he is close in a garment speaking in the blood his name is called 'the word of god'
92 |
93 | Expected: the armies which are in heaven followed him on white horses clothed in white pure fine linen
94 | Got : the army which are in heaven falls on the mighty one horses clothes driven on the affliction
95 |
96 | Expected: he has on his garment and on his thigh a name written king of kings and lord of lords
97 | Got : he has understandings on his folly among widow the king of kings and yahweh of armies
98 |
99 | Expected: i saw an angel coming down out of heaven having the key of the abyss and a great chain in his hand
100 | Got : i saw an even become young lion having you trust of the ages and a great chamber is hand
101 |
102 | Expected: and after the thousand years satan will be released from his prison
103 | Got : and after the palace and mizpah and eleven eleenth were the twentieth
104 |
105 | Expected: death and hades were thrown into the lake of fire this is the second death the lake of fire
106 | Got : let them hate with one and to wait for fire this is the second death and lead a time
107 |
108 | Expected: if anyone was not found written in the book of life he was cast into the lake of fire
109 | Got : the ten man will not think within your demon as with a blood he will cast him to ram for fire
110 |
111 | Expected: he who overcomes i will give him these things i will be his god and he will be my son
112 | Got : he who recompenses i will give him be stings i will be his god and he will be my son
113 |
114 | Expected: its wall is one hundred fortyfour cubits by the measure of a man that is of an angel
115 | Got : is through all his womb home before you for accusation that we may know him by these are in egypt
116 |
117 | Expected: the construction of its wall was jasper the city was pure gold like pure glass
118 | Got : if he struck him of his wallor is not speaking with torment hold on her grass
119 |
120 | Expected: i saw no temple in it for the lord god the almighty and the lamb are its temple
121 | Got : i saw in a tenth wind for we will dry up you among the linen ox skillful
122 |
123 | Expected: its gates will in no way be shut by day for there will be no night there
124 | Got : his greech wind more redeems shameful the redeemer man don't know
125 |
126 | Expected: and they shall bring the glory and the honor of the nations into it so that they may enter
127 | Got : and they shall bring the glory in the high mountains and the egyptian into the midst of the needy
128 |
129 | Expected: they will see his face and his name will be on their foreheads
130 | Got : they will see his face and his name on their follows
131 |
132 | Expected: behold i come quickly blessed is he who keeps the words of the prophecy of this book
133 | Got : behold i happened with me when i could see me to still it is a prophet his bueld
134 |
135 | Expected: he said to me don't seal up the words of the prophecy of this book for the time is at hand
136 | Got : he said to him why sil with the words of the prophets it is book for the times and her
137 |
138 | Expected: behold i come quickly my reward is with me to repay to each man according to his work
139 | Got : behold i come perfect i yahweh is with me to repent to be shamed according to his work
140 |
141 | Expected: i am the alpha and the omega the first and the last the beginning and the end
142 | Got : i have you hope from you and you and the first from aloes of the dew and the enemy
143 |
144 | Expected: he who testifies these things says yes i come quickly amen yes come lord jesus
145 | Got : he who testifies these things says yes i come proclaim i man listen will jesus
146 |
147 |
148 |
149 | ## Related projects
150 | * [A TensorFlow Implementation of Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://github.com/Kyubyong/tacotron)
151 | * [Speech-to-Text-WaveNet : End-to-end sentence level English speech recognition based on DeepMind's WaveNet and tensorflow](https://github.com/buriburisuri/speech-to-text-wavenet)
152 |
153 |
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 |
8 | from __future__ import print_function
9 | import os
10 | import re
11 | import numpy as np
12 | import pickle
13 | from hyperparams import Hyperparams as hp
14 |
15 |
16 | def load_vocab():
17 |     vocab = "ES abcdefghijklmnopqrstuvwxyz'" # E: empty (padding), S: end of text
18 | char2idx = {char:idx for idx, char in enumerate(vocab)}
19 | idx2char = {idx:char for idx, char in enumerate(vocab)}
20 | return char2idx, idx2char
21 |
22 | def text2idx(text):
23 | # Load vocabulary
24 | char2idx, idx2char = load_vocab()
25 |
26 | # Convert
27 | text = re.sub(r"[^ a-z']", "", text.lower()).strip() + "S"
28 | converted = [char2idx[char] for char in text]
29 | return text, converted
30 |
31 | def load_train_data():
32 |     """We train on all of the data except the last mini-batch."""
33 |
34 | sound_fpaths, converteds = pickle.load(open('data/train.pkl', 'rb'))
35 | return sound_fpaths, converteds
36 |
37 | def load_eval_data():
38 |     """We evaluate on the last mini-batch."""
39 |     from utils import get_spectrogram, reduce_frames
40 | sound_fpaths, texts = pickle.load(open('data/eval.pkl', 'rb'))
41 |
42 | # Extract spectrogram from sound_fpaths
43 | char2idx, idx2char = load_vocab()
44 |
45 | xs, maxlen = [], 0
46 | for sound_fpath in sound_fpaths:
47 | spectrogram = get_spectrogram(sound_fpath)
48 | x = reduce_frames(spectrogram, hp.r)
49 | maxlen = max(maxlen, len(x))
50 | xs.append(x)
51 |
52 |     # Zero-pad all samples in X to the maximum length among them.
53 | X = np.zeros(shape=(len(xs), maxlen, hp.n_mels*hp.r), dtype=np.float32)
54 | for i, x in enumerate(xs):
55 | X[i, :len(x), :] = x
56 |
57 | return X, texts # 3d array, list of str
58 |
59 |
60 |
--------------------------------------------------------------------------------
/data/eval.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/data/eval.pkl
--------------------------------------------------------------------------------
/data/train.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/data/train.pkl
--------------------------------------------------------------------------------
/data_load.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron
6 | '''
7 |
8 | from functools import wraps
9 | import threading
10 |
11 | from tensorflow.python.platform import tf_logging as logging
12 |
13 | from hyperparams import Hyperparams as hp
14 | import numpy as np
15 | import tensorflow as tf
16 | from utils import get_spectrogram, reduce_frames
17 | from data import load_train_data
18 |
19 | # Adapted from the `sugartensor` code.
20 | # https://github.com/buriburisuri/sugartensor/blob/master/sugartensor/sg_queue.py
21 | def producer_func(func):
22 | r"""Decorates a function `func` as producer_func.
23 |
24 | Args:
25 | func: A function to decorate.
26 | """
27 | @wraps(func)
28 | def wrapper(inputs, dtypes, capacity, num_threads):
29 | r"""
30 | Args:
31 |             inputs: An input queue list to enqueue
32 |             dtypes: Data types of each tensor
33 |             capacity: Queue capacity.
34 |             num_threads: Number of threads.
35 | """
36 | # enqueue function
37 | def enqueue_func(sess, op):
38 | # read data from source queue
39 | data = func(sess.run(inputs))
40 | # create feeder dict
41 | feed_dict = {}
42 | for ph, col in zip(placeholders, data):
43 | feed_dict[ph] = col
44 | # run session
45 | sess.run(op, feed_dict=feed_dict)
46 |
47 | # create place holder list
48 | placeholders = []
49 | for dtype in dtypes:
50 | placeholders.append(tf.placeholder(dtype=dtype))
51 |
52 | # create FIFO queue
53 | queue = tf.FIFOQueue(capacity, dtypes=dtypes)
54 |
55 | # enqueue operation
56 | enqueue_op = queue.enqueue(placeholders)
57 |
58 | # create queue runner
59 | runner = _FuncQueueRunner(enqueue_func, queue, [enqueue_op] * num_threads)
60 |
61 | # register to global collection
62 | tf.train.add_queue_runner(runner)
63 |
64 | # return de-queue operation
65 | return queue.dequeue()
66 |
67 | return wrapper
68 |
69 |
70 | class _FuncQueueRunner(tf.train.QueueRunner):
71 |
72 | def __init__(self, func, queue=None, enqueue_ops=None, close_op=None,
73 | cancel_op=None, queue_closed_exception_types=None,
74 | queue_runner_def=None):
75 | # save ad-hoc function
76 | self.func = func
77 | # call super()
78 | super(_FuncQueueRunner, self).__init__(queue, enqueue_ops, close_op, cancel_op,
79 | queue_closed_exception_types, queue_runner_def)
80 |
81 | # pylint: disable=broad-except
82 | def _run(self, sess, enqueue_op, coord=None):
83 |
84 | if coord:
85 | coord.register_thread(threading.current_thread())
86 | decremented = False
87 | try:
88 | while True:
89 | if coord and coord.should_stop():
90 | break
91 | try:
92 | self.func(sess, enqueue_op) # call enqueue function
93 | except self._queue_closed_exception_types: # pylint: disable=catching-non-exception
94 | # This exception indicates that a queue was closed.
95 | with self._lock:
96 | self._runs_per_session[sess] -= 1
97 | decremented = True
98 | if self._runs_per_session[sess] == 0:
99 | try:
100 | sess.run(self._close_op)
101 | except Exception as e:
102 | # Intentionally ignore errors from close_op.
103 | logging.vlog(1, "Ignored exception: %s", str(e))
104 | return
105 | except Exception as e:
106 | # This catches all other exceptions.
107 | if coord:
108 | coord.request_stop(e)
109 | else:
110 | logging.error("Exception in QueueRunner: %s", str(e))
111 | with self._lock:
112 | self._exceptions_raised.append(e)
113 | raise
114 | finally:
115 | # Make sure we account for all terminations: normal or errors.
116 | if not decremented:
117 | with self._lock:
118 | self._runs_per_session[sess] -= 1
119 |
120 | @producer_func
121 | def get_spectrogram_and_text(_inputs):
122 | '''From `_inputs`, which has been fetched from slice queues,
123 | makes text, spectrogram, and magnitude,
124 | then enqueue them again.
125 | '''
126 | sound_fpath, text = _inputs
127 | spectrogram = get_spectrogram(sound_fpath)
128 | spectrogram = reduce_frames(spectrogram, hp.r)
129 |
130 | text = np.fromstring(text, np.int32)
131 | return spectrogram, text
132 |
133 | def get_batch():
134 |     """Loads training data and puts it into queues."""
135 | with tf.device('/cpu:0'):
136 | # Load data
137 | sound_fpaths, texts = load_train_data() # string, bytes
138 |
139 |
140 | # calc total batch count
141 | num_batch = len(sound_fpaths) // hp.batch_size
142 |
143 | # Convert to tensor
144 | sound_fpaths = tf.convert_to_tensor(sound_fpaths)
145 | texts = tf.convert_to_tensor(texts)
146 |
147 | # Create Queues
148 | q = tf.train.slice_input_producer([sound_fpaths, texts], shuffle=True)
149 |
150 | # Decode sound file
151 | x, y = get_spectrogram_and_text(inputs=q,
152 | dtypes=[tf.float32, tf.int32],
153 | capacity=128,
154 | num_threads=32)
155 | # create batch queues
156 | x, y = tf.train.batch([x, y],
157 | shapes=[(None, hp.n_mels*hp.r), (None,)],
158 | num_threads=32,
159 | batch_size=hp.batch_size,
160 | capacity=hp.batch_size*32,
161 | dynamic_pad=True)
162 | return x, y, num_batch
163 |
164 |
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 |
8 | from __future__ import print_function
9 |
10 | import codecs
11 | import os
12 |
13 | from data import load_vocab, load_eval_data, load_train_data
14 | from hyperparams import Hyperparams as hp
15 | import numpy as np
16 | import tensorflow as tf
17 | from train import Graph
18 |
19 |
20 | def eval():
21 | # Load graph
22 | g = Graph(is_training=False); print("Graph loaded")
23 |
24 | # Load data
25 | x, y = load_eval_data()
26 | char2idx, idx2char = load_vocab()
27 |
28 | with g.graph.as_default():
29 | sv = tf.train.Supervisor()
30 | with sv.managed_session() as sess:
31 | # Restore parameters
32 | sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
33 | print("Restored!")
34 | # Get model name
35 | mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]
36 |
37 | # Speech to Text
38 | if not os.path.exists('samples'): os.mkdir('samples')
39 | with codecs.open('samples/{}.txt'.format(mname), 'w', 'utf-8') as fout:
40 | preds = np.zeros((hp.batch_size, hp.max_len), np.int32)
41 | for j in range(hp.max_len):
42 | _preds = sess.run(g.preds, {g.x: x, g.y: preds})
43 | preds[:, j] = _preds[:, j]
44 |
45 | # Write to file
46 | for i, (expected, got) in enumerate(zip(y, preds)): # ground truth vs. prediction
47 | fout.write("Expected: {}\n".format(expected.split("S")[0]))
48 |                     fout.write("Got : {}\n\n".format("".join(idx2char[idx] for idx in got).split("S")[0]))
49 | fout.flush()
50 |
51 | if __name__ == '__main__':
52 | eval()
53 | print("Done")
54 |
55 |
56 |
--------------------------------------------------------------------------------
/fig/asr2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/asr2.png
--------------------------------------------------------------------------------
/fig/ori2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/ori2.png
--------------------------------------------------------------------------------
/fig/training_curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Kyubyong/tacotron_asr/fdc0d9a3ff56405e90e2cb6ff8d540e735b01b7d/fig/training_curve.png
--------------------------------------------------------------------------------
/hyperparams.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 |
8 | class Hyperparams:
9 | '''Hyper parameters'''
10 | # data
11 | web = 'WEB'
12 | max_len = 100 # maximum length of text
13 |
14 | # signal processing
15 | sr = 22050 # Sampling rate.
16 | n_fft = 2048 # fft points (samples)
17 | frame_shift = 0.0125 # seconds
18 | frame_length = 0.05 # seconds
19 |     hop_length = int(sr*frame_shift) # samples (= 275). This is dependent on frame_shift.
20 |     win_length = int(sr*frame_length) # samples (= 1102). This is dependent on frame_length.
21 | n_mels = 80 # Number of Mel banks to generate
22 |
23 | # model
24 | embed_size = 256 # alias = E
25 | encoder_num_banks = 16
26 | decoder_num_banks = 8
27 | num_highwaynet_blocks = 4
28 | r = 5 # Reduction factor. Paper => 2, 3, 5
29 |
30 | # training scheme
31 | lr = 0.0001
32 | logdir = "logdir"
33 | batch_size = 32
34 | num_epochs = 20
35 |
36 |
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 |
8 | from __future__ import print_function
9 | import tensorflow as tf
10 | from hyperparams import Hyperparams as hp
11 |
12 | def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None):
13 | '''Embeds a given tensor.
14 |
15 | Args:
16 | inputs: A `Tensor` with type `int32` or `int64` containing the ids
17 | to be looked up in `lookup table`.
18 | vocab_size: An int. Vocabulary size.
19 | num_units: An int. Number of embedding hidden units.
20 |       zero_pad: A boolean. If True, all the values of the first row (id 0)
21 | should be constant zeros.
22 | scope: Optional scope for `variable_scope`.
23 | reuse: Boolean, whether to reuse the weights of a previous layer
24 | by the same name.
25 |
26 | Returns:
27 |       A `Tensor` with one more rank than `inputs`. The last dimensionality
28 | should be `num_units`.
29 | '''
30 | with tf.variable_scope(scope, reuse=reuse):
31 | lookup_table = tf.get_variable('lookup_table',
32 | dtype=tf.float32,
33 | shape=[vocab_size, num_units],
34 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01))
35 | if zero_pad:
36 | lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
37 | lookup_table[1:, :]), 0)
38 | return tf.nn.embedding_lookup(lookup_table, inputs)
39 |
40 | def normalize(inputs,
41 | type="bn",
42 | decay=.99,
43 | is_training=True,
44 | activation_fn=None,
45 | scope="normalize"):
46 | '''Applies {batch|layer} normalization.
47 |
48 | Args:
49 | inputs: A tensor with 2 or more dimensions, where the first dimension has
50 | `batch_size`. If type is `bn`, the normalization is over all but
51 | the last dimension. Or if type is `ln`, the normalization is over
52 | the last dimension. Note that this is different from the native
53 | `tf.contrib.layers.batch_norm`. For this I recommend you change
54 |         a line in `tensorflow/contrib/layers/python/layers/layer.py`
55 | as follows.
56 | Before: mean, variance = nn.moments(inputs, axis, keep_dims=True)
57 | After: mean, variance = nn.moments(inputs, [-1], keep_dims=True)
58 |       type: A string. Either "bn", "ln", or "in".
59 | decay: Decay for the moving average. Reasonable values for `decay` are close
60 | to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
61 | Lower `decay` value (recommend trying `decay`=0.9) if model experiences
62 | reasonably good training performance but poor validation and/or test
63 | performance.
64 |       is_training: Whether or not the layer is in training mode.
65 | activation_fn: Activation function.
66 | scope: Optional scope for `variable_scope`.
67 |
68 | Returns:
69 | A tensor with the same shape and data dtype as `inputs`.
70 | '''
71 | if type=="bn":
72 | inputs_shape = inputs.get_shape()
73 | inputs_rank = inputs_shape.ndims
74 |
75 | # use fused batch norm if inputs_rank in [2, 3, 4] as it is much faster.
76 | # pay attention to the fact that fused_batch_norm requires shape to be rank 4 of NHWC.
77 | if inputs_rank in [2, 3, 4]:
78 | if inputs_rank==2:
79 | inputs = tf.expand_dims(inputs, axis=1)
80 | inputs = tf.expand_dims(inputs, axis=2)
81 | elif inputs_rank==3:
82 | inputs = tf.expand_dims(inputs, axis=1)
83 |
84 | outputs = tf.contrib.layers.batch_norm(inputs=inputs,
85 | decay=decay,
86 | center=True,
87 | scale=True,
88 | activation_fn=activation_fn,
89 | updates_collections=None,
90 | is_training=is_training,
91 | scope=scope,
92 | zero_debias_moving_mean=True,
93 | fused=True)
94 | # restore original shape
95 | if inputs_rank==2:
96 | outputs = tf.squeeze(outputs, axis=[1, 2])
97 | elif inputs_rank==3:
98 | outputs = tf.squeeze(outputs, axis=1)
99 | else: # fallback to naive batch norm
100 | outputs = tf.contrib.layers.batch_norm(inputs=inputs,
101 | decay=decay,
102 | center=True,
103 | scale=True,
104 | activation_fn=activation_fn,
105 | updates_collections=None,
106 | is_training=is_training,
107 | scope=scope,
108 | fused=False)
109 | elif type=="ln":
110 | outputs = tf.contrib.layers.layer_norm(inputs=inputs,
111 | center=True,
112 | scale=True,
113 | activation_fn=activation_fn,
114 | scope=scope)
115 | elif type == "in":
116 | with tf.variable_scope(scope):
117 | batch, steps, channels = inputs.get_shape().as_list()
118 | var_shape = [channels]
119 | mu, sigma_sq = tf.nn.moments(inputs, [1], keep_dims=True)
120 | shift = tf.Variable(tf.zeros(var_shape))
121 | scale = tf.Variable(tf.ones(var_shape))
122 | epsilon = 1e-8
123 | normalized = (inputs - mu) / (sigma_sq + epsilon) ** (.5)
124 | outputs = scale * normalized + shift
125 | if activation_fn:
126 | outputs = activation_fn(outputs)
127 | else:
128 |         raise ValueError("Currently we support `bn`, `ln`, or `in` only.")
129 |
130 | return outputs
131 |
132 | def conv1d(inputs,
133 | filters=None,
134 | size=1,
135 | rate=1,
136 | padding="SAME",
137 | use_bias=False,
138 | activation_fn=None,
139 | scope="conv1d",
140 | reuse=None):
141 | '''
142 | Args:
143 | inputs: A 3-D tensor with shape of [batch, time, depth].
144 | filters: An int. Number of outputs (=activation maps)
145 | size: An int. Filter size.
146 | rate: An int. Dilation rate.
147 | padding: Either `same` or `valid` or `causal` (case-insensitive).
148 | use_bias: A boolean.
149 | scope: Optional scope for `variable_scope`.
150 | reuse: Boolean, whether to reuse the weights of a previous layer
151 | by the same name.
152 |
153 | Returns:
154 |       A tensor of shape [batch, time, filters].
155 | '''
156 |
157 | with tf.variable_scope(scope):
158 | if padding.lower()=="causal":
159 | # pre-padding for causality
160 | pad_len = (size - 1) * rate # padding size
161 | inputs = tf.pad(inputs, [[0, 0], [pad_len, 0], [0, 0]])
162 | padding = "valid"
163 |
164 | if filters is None:
165 |             filters = inputs.get_shape().as_list()[-1]
166 |
167 | params = {"inputs":inputs, "filters":filters, "kernel_size":size,
168 | "dilation_rate":rate, "padding":padding, "activation":activation_fn,
169 | "use_bias":use_bias, "reuse":reuse}
170 |
171 | outputs = tf.layers.conv1d(**params)
172 | return outputs
173 |
174 | def conv1d_banks(inputs, K=16, is_training=True, scope="conv1d_banks", reuse=None):
175 | '''Applies a series of conv1d separately.
176 |
177 | Args:
178 | inputs: A 3d tensor with shape of [N, T, C]
179 |       K: An int. The number of conv1d banks. That is, the `inputs` are convolved
180 |         with filters of width 1, 2, ..., K.
181 |       is_training: A boolean. This is passed to `normalize`.
182 |
183 | Returns:
184 |       A 3d tensor with shape of [N, T, K*hp.embed_size//2].
185 | '''
186 | with tf.variable_scope(scope, reuse=reuse):
187 | outputs = conv1d(inputs, hp.embed_size//2, 1) # k=1
188 | outputs = normalize(outputs, type="in", is_training=is_training,
189 | activation_fn=tf.nn.relu)
190 | for k in range(2, K+1): # k = 2...K
191 | with tf.variable_scope("num_{}".format(k)):
192 | output = conv1d(inputs, hp.embed_size//2, k)
193 | output = normalize(output, type="in", is_training=is_training,
194 | activation_fn=tf.nn.relu)
195 | outputs = tf.concat((outputs, output), -1)
196 |         return outputs # (N, T, hp.embed_size//2*K)
197 |
198 | def gru(inputs, num_units=None, bidirection=False, scope="gru", reuse=None):
199 | '''Applies a GRU.
200 |
201 | Args:
202 | inputs: A 3d tensor with shape of [N, T, C].
203 | num_units: An int. The number of hidden units.
204 | bidirection: A boolean. If True, bidirectional results
205 | are concatenated.
206 | scope: Optional scope for `variable_scope`.
207 | reuse: Boolean, whether to reuse the weights of a previous layer
208 | by the same name.
209 |
210 | Returns:
211 | If bidirection is True, a 3d tensor with shape of [N, T, 2*num_units],
212 | otherwise [N, T, num_units].
213 | '''
214 | with tf.variable_scope(scope, reuse=reuse):
215 | if num_units is None:
216 |             num_units = inputs.get_shape().as_list()[-1]
217 |
218 | cell = tf.contrib.rnn.GRUCell(num_units)
219 | if bidirection:
220 | cell_bw = tf.contrib.rnn.GRUCell(num_units)
221 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell_bw, inputs, dtype=tf.float32)
222 | return tf.concat(outputs, 2)
223 | else:
224 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
225 | return outputs
226 |
227 | def attention_decoder(inputs, memory, num_units=None, scope="attention_decoder", reuse=None):
228 | '''Applies a GRU to `inputs`, while attending `memory`.
229 | Args:
230 | inputs: A 3d tensor with shape of [N, T', C']. Decoder inputs.
231 | num_units: An int. Attention size.
232 | memory: A 3d tensor with shape of [N, T, C]. Outputs of encoder network.
233 | scope: Optional scope for `variable_scope`.
234 | reuse: Boolean, whether to reuse the weights of a previous layer
235 | by the same name.
236 |
237 | Returns:
238 | A 3d tensor with shape of [N, T, num_units].
239 | '''
240 | with tf.variable_scope(scope, reuse=reuse):
241 | if num_units is None:
242 |             num_units = inputs.get_shape().as_list()[-1]
243 |
244 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units, memory)
245 | decoder_cell = tf.contrib.rnn.GRUCell(num_units)
246 |         cell_with_attention = tf.contrib.seq2seq.DynamicAttentionWrapper(decoder_cell, attention_mechanism, num_units)
247 |         outputs, _ = tf.nn.dynamic_rnn(cell_with_attention, inputs, dtype=tf.float32)
248 | return outputs
249 |
250 | def prenet(inputs, is_training=True, scope="prenet", reuse=None):
251 | '''Prenet for Encoder and Decoder.
252 | Args:
253 |       inputs: A 3D tensor of shape [N, T, C].
254 | scope: Optional scope for `variable_scope`.
255 | reuse: Boolean, whether to reuse the weights of a previous layer
256 | by the same name.
257 |
258 | Returns:
259 |       A 3D tensor of shape [N, T, hp.embed_size//2].
260 | '''
261 | with tf.variable_scope(scope, reuse=reuse):
262 | outputs = tf.layers.dense(inputs, units=hp.embed_size, activation=tf.nn.relu, name="dense1")
263 | outputs = tf.nn.dropout(outputs, keep_prob=.5 if is_training==True else 1., name="dropout1")
264 | outputs = tf.layers.dense(outputs, units=hp.embed_size//2, activation=tf.nn.relu, name="dense2")
265 | outputs = tf.nn.dropout(outputs, keep_prob=.5 if is_training==True else 1., name="dropout2")
266 |         return outputs # (N, T, hp.embed_size//2)
267 |
268 | def highwaynet(inputs, num_units=None, scope="highwaynet", reuse=None):
269 | '''Highway networks, see https://arxiv.org/abs/1505.00387
270 |
271 | Args:
272 | inputs: A 3D tensor of shape [N, T, W].
273 | num_units: An int or `None`. Specifies the number of units in the highway layer
274 | or uses the input size if `None`.
275 | scope: Optional scope for `variable_scope`.
276 | reuse: Boolean, whether to reuse the weights of a previous layer
277 | by the same name.
278 |
279 | Returns:
280 | A 3D tensor of shape [N, T, W].
281 | '''
282 | if not num_units:
283 | num_units = inputs.get_shape()[-1]
284 |
285 | with tf.variable_scope(scope, reuse=reuse):
286 | H = tf.layers.dense(inputs, units=num_units, activation=tf.nn.relu, name="dense1")
287 | T = tf.layers.dense(inputs, units=num_units, activation=tf.nn.sigmoid, name="dense2")
288 | C = 1. - T
289 | outputs = H * T + inputs * C
290 | return outputs
291 |
--------------------------------------------------------------------------------
/networks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron
6 | '''
7 |
8 | from __future__ import print_function
9 |
10 | from hyperparams import Hyperparams as hp
11 | from modules import *
12 | from data import load_vocab
13 | import tensorflow as tf
14 |
15 | def encode(inputs, is_training=True, scope="encoder", reuse=None):
16 | '''
17 | Args:
18 |       inputs: A 3d tensor with shape of [N, T, hp.n_mels*hp.r], dtype of float32. Reduced melspectrogram frames.
19 | is_training: Whether or not the layer is in training mode.
20 | scope: Optional scope for `variable_scope`
21 | reuse: Boolean, whether to reuse the weights of a previous layer
22 | by the same name.
23 |
24 | Returns:
25 |       Encoder hidden states (memory) with shape of (N, T, E).
26 | '''
27 | with tf.variable_scope(scope, reuse=reuse):
28 | # Load vocabulary
29 | char2idx, idx2char = load_vocab()
30 |
31 | # # Character Embedding
32 | # inputs = embed(inputs, len(char2idx), hp.embed_size) # (N, T, E)
33 |
34 | # Encoder pre-net
35 | prenet_out = prenet(inputs, scope="prenet", is_training=is_training) # (N, T, E/2)
36 |
37 | # Encoder CBHG
38 | ## Conv1D bank
39 | enc = conv1d_banks(prenet_out, K=hp.encoder_num_banks, is_training=is_training) # (N, T, K * E / 2)
40 |
41 | ### Max pooling
42 | enc = tf.layers.max_pooling1d(enc, 2, 1, padding="same") # (N, T, K * E / 2)
43 |
44 | ### Conv1D projections
45 | enc = conv1d(enc, hp.embed_size//2, 3, scope="conv1d_1") # (N, T, E/2)
46 | enc = normalize(enc, type="in", is_training=is_training,
47 | activation_fn=tf.nn.relu)
48 | enc = conv1d(enc, hp.embed_size//2, 3, scope="conv1d_2") # (N, T, E/2)
49 | enc = normalize(enc, type="in", is_training=is_training,
50 | activation_fn=None)
51 | enc += prenet_out # (N, T, E/2) # residual connections
52 |
53 | ### Highway Nets
54 | for i in range(hp.num_highwaynet_blocks):
55 | enc = highwaynet(enc, num_units=hp.embed_size//2,
56 | scope='highwaynet_{}'.format(i)) # (N, T, E/2)
57 |
58 | ### Bidirectional GRU
59 | memory = gru(enc, hp.embed_size//2, True) # (N, T, E)
60 |
61 | return memory
62 |
63 | def decode(decoder_inputs, memory, is_training=True, scope="decoder1", reuse=None):
64 | '''
65 | Args:
66 |       decoder_inputs: A 3d tensor with shape of [N, T', E], where E=hp.embed_size,
67 |         dtype of float32. Embedded, right-shifted character ids.
68 | memory: A 3d tensor with shape of [N, T, C], where C=hp.embed_size.
69 | scope: Optional scope for `variable_scope`
70 | reuse: Boolean, whether to reuse the weights of a previous layer
71 | by the same name.
72 |
73 |     Returns:
74 |       Character logits with shape of [N, T', V], where V is the vocabulary size.
75 | '''
76 | with tf.variable_scope(scope, reuse=reuse):
77 | # Decoder pre-net
78 | dec = prenet(decoder_inputs, is_training=is_training) # (N, T', E/2)
79 |
80 | # Attention RNN
81 | dec = attention_decoder(dec, memory, hp.embed_size) # (N, T', E)
82 |
83 | # decoder rnn
84 | dec += gru(dec, hp.embed_size, False, scope="gru1") # (N, T', E)
85 | dec += gru(dec, hp.embed_size, False, scope="gru2") # (N, T', E)
86 |
87 | # Outputs => (N, T', V)
88 | char2idx, idx2char = load_vocab()
89 | outputs = tf.layers.dense(dec, len(char2idx))
90 |
91 | return outputs
92 |
--------------------------------------------------------------------------------
/prepro.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 | from __future__ import print_function
8 |
9 | from collections import Counter
10 | import glob
11 | import os
12 | import pickle
13 | import re
14 |
15 | import librosa
16 | from tqdm import tqdm
17 |
18 | from hyperparams import Hyperparams as hp
19 | import numpy as np
20 | import csv
21 | import codecs
22 | from data import text2idx
23 |
24 | def make_train_data():
25 | import csv
26 |
27 | sound_fpaths, converteds, texts = [], [], []
28 | reader = csv.reader(codecs.open(hp.web + "/text.csv", 'rb', 'utf-8'))
29 | for row in reader:
30 | sound_fname, text, duration = row
31 | sound_fpath = hp.web + "/" + sound_fname + ".wav"
32 | cleaned, converted = text2idx(text)
33 | if (len(text) <= hp.max_len):
34 | sound_fpaths.append(sound_fpath)
35 | converteds.append(np.array(converted, np.int32).tostring())
36 | texts.append(cleaned)
37 |
38 | # Split into train and eval. We reserve the last mini-batch for evaluation
39 | X_train, Y_train = sound_fpaths[:-hp.batch_size], converteds[:-hp.batch_size]
40 | X_eval, Y_eval = sound_fpaths[-hp.batch_size:], texts[-hp.batch_size:]
41 |
42 | # Save
43 | pickle.dump((X_train, Y_train), open('data/train.pkl', 'wb'))
44 | pickle.dump((X_eval, Y_eval), open('data/eval.pkl', 'wb'))
45 |
46 | if __name__ == "__main__":
47 | make_train_data()
48 | print("Done!")
49 |
--------------------------------------------------------------------------------
/samples/model_gs_19860_0.19.txt:
--------------------------------------------------------------------------------
1 | Expected: the third poured out his bowl into the rivers and springs of water and they became blood
2 | Got : the first will lie down to the rivers and springs of waters and it became blood
3 |
4 | Expected: i heard the altar saying yes lord god the almighty true and righteous are your judgments
5 | Got : i heard the altar saying yes were like your own like you tree in righteousness for your judgments
6 |
7 | Expected: the fourth poured out his bowl on the sun and it was given to him to scorch men with fire
8 | Got : the foolish very armed were on the sun and was given to him to spoke to him with fire
9 |
10 | Expected: he gathered them together into the place which is called in hebrew megiddo
11 | Got : he gathered them together into the place which is called and he weep and at every
12 |
13 | Expected: every island fled away and the mountains were not found
14 | Got : hadad and kedemoth aroen and another and spread out them
15 |
16 | Expected: here is the mind that has wisdom the seven heads are seven mountains on which the woman sits
17 | Got : he is the mighty have wisdom the seven heads of seven rountains are with the wind sixter
18 |
19 | Expected: these have one mind and they give their power and authority to the beast
20 | Got : these are those who are mine and they give holl of a fool in the deeps
21 |
22 | Expected: the woman whom you saw is the great city which reigns over the kings of the earth
23 | Got : the woman whom he saw it his degrection which ran and to advening to be ear
24 |
25 | Expected: for her sins have reached to the sky and god has remembered her iniquities
26 | Got : for he sends a least in the sky and god has remembered her iniquities
27 |
28 | Expected: the merchants of the earth weep and mourn over her for no one buys their merchandise any more
29 | Got : the mittites of the earth weeps in your own are before from knowing babylon busine backsliding all t
30 |
31 | Expected: and cried out as they looked at the smoke of her burning saying 'what is like the great city'
32 | Got : and cried all the wicked beside of a good one and saying when is like the great sight
33 |
34 | Expected: in her was found the blood of prophets and of saints and of all who have been slain on the earth
35 | Got : and her with stones a dwellified confidence and all who have been slain on the earth
36 |
37 | Expected: a second said hallelujah her smoke goes up forever and ever
38 | Got : as set him said how many men utter for smoke go down for every male it
39 |
40 | Expected: he is clothed in a garment sprinkled with blood his name is called the word of god
41 | Got : he is close in a garment speaking in the blood his name is called 'the word of god'
42 |
43 | Expected: the armies which are in heaven followed him on white horses clothed in white pure fine linen
44 | Got : the army which are in heaven falls on the mighty one horses clothes driven on the affliction
45 |
46 | Expected: he has on his garment and on his thigh a name written king of kings and lord of lords
47 | Got : he has understandings on his folly among widow the king of kings and yahweh of armies
48 |
49 | Expected: i saw an angel coming down out of heaven having the key of the abyss and a great chain in his hand
50 | Got : i saw an even become young lion having you trust of the ages and a great chamber is hand
51 |
52 | Expected: and after the thousand years satan will be released from his prison
53 | Got : and after the palace and mizpah and eleven eleenth were the twentieth
54 |
55 | Expected: death and hades were thrown into the lake of fire this is the second death the lake of fire
56 | Got : let them hate with one and to wait for fire this is the second death and lead a time
57 |
58 | Expected: if anyone was not found written in the book of life he was cast into the lake of fire
59 | Got : the ten man will not think within your demon as with a blood he will cast him to ram for fire
60 |
61 | Expected: he who overcomes i will give him these things i will be his god and he will be my son
62 | Got : he who recompenses i will give him be stings i will be his god and he will be my son
63 |
64 | Expected: its wall is one hundred fortyfour cubits by the measure of a man that is of an angel
65 | Got : is through all his womb home before you for accusation that we may know him by these are in egypt
66 |
67 | Expected: the construction of its wall was jasper the city was pure gold like pure glass
68 | Got : if he struck him of his wallor is not speaking with torment hold on her grass
69 |
70 | Expected: i saw no temple in it for the lord god the almighty and the lamb are its temple
71 | Got : i saw in a tenth wind for we will dry up you among the linen ox skillful
72 |
73 | Expected: its gates will in no way be shut by day for there will be no night there
74 | Got : his greech wind more redeems shameful the redeemer man don't know
75 |
76 | Expected: and they shall bring the glory and the honor of the nations into it so that they may enter
77 | Got : and they shall bring the glory in the high mountains and the egyptian into the midst of the needy
78 |
79 | Expected: they will see his face and his name will be on their foreheads
80 | Got : they will see his face and his name on their follows
81 |
82 | Expected: behold i come quickly blessed is he who keeps the words of the prophecy of this book
83 | Got : behold i happened with me when i could see me to still it is a prophet his bueld
84 |
85 | Expected: he said to me don't seal up the words of the prophecy of this book for the time is at hand
86 | Got : he said to him why sil with the words of the prophets it is book for the times and her
87 |
88 | Expected: behold i come quickly my reward is with me to repay to each man according to his work
89 | Got : behold i come perfect i yahweh is with me to repent to be shamed according to his work
90 |
91 | Expected: i am the alpha and the omega the first and the last the beginning and the end
92 | Got : i have you hope from you and you and the first from aloes of the dew and the enemy
93 |
94 | Expected: he who testifies these things says yes i come quickly amen yes come lord jesus
95 | Got : he who testifies these things says yes i come proclaim i man listen will jesus
96 |
97 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 |
8 | from __future__ import print_function
9 |
10 | import os
11 |
12 | import librosa
13 | from tqdm import tqdm
14 |
15 | from data import load_vocab, load_train_data
16 | from data_load import get_batch
17 | from hyperparams import Hyperparams as hp
18 | from modules import *
19 | from networks import encode, decode
20 | import numpy as np
21 | import tensorflow as tf
22 | from utils import shift_by_one
23 |
24 | char2idx, idx2char = load_vocab()
25 |
26 | class Graph:
27 | def __init__(self, is_training=True):
28 | self.graph = tf.Graph()
29 | self.is_training=is_training
30 | with self.graph.as_default():
31 | if is_training:
32 | self.x, self.y, self.num_batch = get_batch()
33 | else: # Evaluation
34 | self.x = tf.placeholder(tf.float32, shape=(None, None, hp.n_mels*hp.r))
35 | self.y = tf.placeholder(tf.int32, shape=(None, hp.max_len))
36 |
37 | self.decoder_inputs = embed(shift_by_one(self.y), len(char2idx), hp.embed_size) # (N, T', E)
38 |
39 | with tf.variable_scope('net'):
40 | # Encoder
41 |                 self.memory = encode(self.x, is_training=is_training) # (N, T, hp.embed_size)
42 |
43 | # Decoder
44 |                 self.outputs = decode(self.decoder_inputs, self.memory, is_training=is_training) # (N, T', V)
45 | self.logprobs = tf.log(tf.nn.softmax(self.outputs)+1e-10)
46 | self.preds = tf.arg_max(self.outputs, dimension=-1)
47 |
48 | if is_training:
49 | # Loss
50 | self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.outputs)
51 |
52 | # Target masking
53 | self.istarget = tf.to_float(tf.not_equal(self.y, 0))
54 | self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget) + 1e-7)
55 |
56 | # Training Scheme
57 | self.global_step = tf.Variable(0, name='global_step', trainable=False)
58 | self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
59 | self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
60 |
61 | # Summary
62 | tf.summary.scalar('mean_loss', self.mean_loss)
63 | self.merged = tf.summary.merge_all()
64 |
65 | def main():
66 | g = Graph(); print("Training Graph loaded")
67 |
68 | with g.graph.as_default():
69 | # Training
70 | sv = tf.train.Supervisor(logdir=hp.logdir,
71 | save_model_secs=0)
72 | with sv.managed_session() as sess:
73 | for epoch in range(1, hp.num_epochs+1):
74 | if sv.should_stop(): break
75 | for step in tqdm(range(g.num_batch), total=g.num_batch, ncols=70, leave=False, unit='b'):
76 | sess.run(g.train_op)
77 |
78 | # Write checkpoint files at every epoch
79 | l, gs = sess.run([g.mean_loss, g.global_step])
80 | sv.saver.save(sess, hp.logdir + '/model_gs_%d_%.2f' % (gs, l))
81 |
82 | if __name__ == '__main__':
83 | main()
84 | print("Done")
85 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #/usr/bin/python2
3 | '''
4 | By kyubyong park. kbpark.linguist@gmail.com.
5 | https://www.github.com/kyubyong/tacotron_asr
6 | '''
7 | from __future__ import print_function
8 |
9 | import codecs
10 | import copy
11 | import re
12 |
13 | import librosa
14 |
15 | from hyperparams import Hyperparams as hp
16 | import numpy as np
17 | import tensorflow as tf
18 |
19 | def get_spectrogram(sound_fpath):
20 |     '''Extracts a melspectrogram from the given `sound_fpath`.
21 |
22 |     Args:
23 |       sound_fpath: A string. Full path of a sound file.
24 |
25 |     Returns:
26 |       Transposed S: A 2d array. A transposed melspectrogram
27 |         with shape of (T, n_mels).
28 |     '''
29 | # Loading sound file
30 | y, sr = librosa.load(sound_fpath, sr=None) # or set sr to hp.sr.
31 |
32 | # stft. D: (1+n_fft//2, T)
33 | D = librosa.stft(y=y,
34 | n_fft=hp.n_fft,
35 | hop_length=hp.hop_length,
36 | win_length=hp.win_length)
37 |
38 | # magnitude spectrogram
39 | magnitude = np.abs(D) #(1+n_fft/2, T)
40 |
41 | # power spectrogram
42 | power = magnitude**2
43 |
44 | # mel spectrogram
45 | S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels) #(n_mels, T)
46 |
47 | return np.transpose(S.astype(np.float32))
48 |
49 | def shift_by_one(inputs):
50 | '''Shifts the content of `inputs` to the right by one
51 | so that it becomes the decoder inputs.
52 |
53 | Args:
54 | inputs: A 3d tensor with shape of [N, T, C]
55 |
56 | Returns:
57 | A 3d tensor with the same shape and dtype as `inputs`.
58 | '''
59 | return tf.concat((tf.zeros_like(inputs[:, :1]), inputs[:, :-1]), 1)
60 |
61 | def reduce_frames(arry, r):
62 |     '''Reduces and adjusts the shape and content of `arry` according to r.
63 |
64 | Args:
65 | arry: A 2d array with shape of [T, C]
66 | r: Reduction factor
67 |
68 | Returns:
69 | A 2d array with shape of [-1, C*r]
70 | '''
71 | T, C = arry.shape
72 |     num_paddings = r - (T % r) if T % r != 0 else 0 # e.g. T=7, r=5 -> pad 3 frames to reach 10
73 |
74 | padded = np.pad(arry, [[0, num_paddings], [0, 0]], 'constant')
75 | output = np.reshape(padded, (-1, C*r))
76 | return output
77 |
78 | def restore_shape(arry, r):
79 |     '''Restores and adjusts the shape and content of `arry` according to r.
80 | Args:
81 | arry: A 3d array with shape of [N, T, C]
82 | r: Reduction factor
83 |
84 | Returns:
85 |       A 3d array with shape of [N, T*r, C//r]
86 | '''
87 | N, T, C = arry.shape
88 | return arry.reshape((N, -1, C//r))
--------------------------------------------------------------------------------