├── .gitignore ├── DISCLAIMER ├── LICENSE ├── README.md ├── app.py ├── assets ├── attention_multi_speaker.gif ├── attention_single_speaker.gif └── model.png ├── audio ├── __init__.py ├── audio_range.py ├── get_duration.py ├── google_speech.py └── silence.py ├── datasets ├── LJSpeech_1_0 │ ├── README │ └── prepare.py ├── __init__.py ├── datafeeder.py ├── generate_data.py ├── kim_anchor │ └── download.py ├── son │ └── download.py └── yuinna │ └── download.py ├── download.py ├── eval.py ├── hparams.py ├── models ├── __init__.py ├── helpers.py ├── modules.py ├── rnn_wrappers.py └── tacotron.py ├── nohup.out ├── recognition ├── alignment.py └── google.py ├── requirements.txt ├── run.sh ├── scripts └── prepare_son.sh ├── synthesizer.py ├── text ├── __init__.py ├── cleaners.py ├── en_numbers.py ├── english.py ├── ko_dictionary.py ├── korean.py └── symbols.py ├── train.py ├── utils ├── NanumBarunGothic.ttf ├── __init__.py ├── infolog.py └── plot.py ├── web ├── static │ ├── css │ │ └── main.css │ └── js │ │ ├── main.js │ │ └── siriwave.js └── templates │ └── index.html └── 김앵커한마디_음성받아오기.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Text 2 | *.png 3 | *.txt 4 | *.json 5 | *.csv 6 | 7 | # Data 8 | logs 9 | *.npy 10 | *.npz 11 | *.tar 12 | *.tar.gz 13 | 14 | # Media 15 | *.mp4 16 | *.mp3 17 | *.flac 18 | *.wav 19 | *.ts 20 | *.avi 21 | 22 | .DS_Store 23 | 24 | # Created by https://www.gitignore.io/api/python,vim 25 | 26 | ### Python ### 27 | # Byte-compiled / optimized / DLL files 28 | __pycache__/ 29 | *.py[cod] 30 | *$py.class 31 | 32 | # C extensions 33 | *.so 34 | 35 | # Distribution / packaging 36 | .Python 37 | env/ 38 | build/ 39 | develop-eggs/ 40 | dist/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib64/ 45 | parts/ 46 | sdist/ 47 | var/ 48 | wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *,cover 72 | .hypothesis/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # celery beat schedule file 102 | celerybeat-schedule 103 | 104 | # dotenv 105 | .env 106 | 107 | # virtualenv 108 | .venv/ 109 | venv/ 110 | ENV/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | 119 | ### Vim ### 120 | # swap 121 | [._]*.s[a-v][a-z] 122 | [._]*.sw[a-p] 123 | [._]s[a-v][a-z] 124 | [._]sw[a-p] 125 | # session 126 | Session.vim 127 | # temporary 128 | .netrwhist 129 | *~ 130 | # auto-generated tag files 131 | tags 132 | 133 | # End of https://www.gitignore.io/api/python,vim 134 | -------------------------------------------------------------------------------- /DISCLAIMER: -------------------------------------------------------------------------------- 1 | This is not an official [DEVSISTERS](http://devsisters.com/) product and is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk. 2 | 3 | 이것은 [데브시스터즈](http://devsisters.com/)의 공식적인 제품이 아닙니다. [데브시스터즈](http://devsisters.com )는 이 코드를 잘못 사용했을 시 발생한 문제나 이슈에 대한 책임을 지지 않으며 이 소프트웨어의 사용은 사용자 자신에>게 전적으로 책임이 있습니다. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Devsisters 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | 
21 | 
22 | Copyright (c) 2017 Keith Ito
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in
32 | all copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
40 | THE SOFTWARE.
41 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is source code based on the material presented by Taehoon Kim at DEVIEW 2017, "책 읽어주는 딥러닝: 배우 유인나가 해리포터를 읽어준다면" (Deep learning that reads books aloud: if actress Yoo In-na were to read Harry Potter), modified with a small amount of additional source code. For details of the implementation, please refer to the blog posts below :)
2 | 
3 | - Carrying out a project using DeepVoice
4 | - [Implementing TTS (speech synthesis) with the deep voice AI _ anchor Sohn Suk-hee](http://melonicedlatte.com/machinelearning/2018/07/02/215933.html)
5 | - [Implementing TTS (Text-To-Speech) with deep voice _ training on "Kim Anchor's One Word"](http://melonicedlatte.com/machinelearning/2018/07/09/141346.html)
6 | - Original forked repository: [carpedm20/multi-speaker-tacotron-tensorflow](https://github.com/carpedm20/multi-speaker-tacotron-tensorflow)
7 | ---
8 | 
9 | # Multi-Speaker Tacotron in TensorFlow
10 | 
11 | TensorFlow implementation of:
12 | 
13 | - [Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947)
14 | - [Listening while Speaking: Speech Chain by Deep Learning](https://arxiv.org/abs/1707.04879)
15 | - [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)
16 | 
17 | Sample audio clips (in Korean) can be found [here](http://carpedm20.github.io/tacotron/en.html).
18 | 
19 | ![model](./assets/model.png)
20 | 
21 | 
22 | ## Prerequisites
23 | 
24 | - Python 3.6+
25 | - FFmpeg
26 | - [Tensorflow 1.3](https://www.tensorflow.org/install/)
27 | 
28 | 
29 | ## Usage
30 | 
31 | ### 1. Install prerequisites
32 | 
33 | After preparing [Tensorflow](https://www.tensorflow.org/install/), install prerequisites with:
34 | 
35 |     pip3 install -r requirements.txt
36 |     python -c "import nltk; nltk.download('punkt')"
37 | 
38 | If you want to synthesize speech in Korean directly, follow [2-3. Download pre-trained models](#2-3-download-pre-trained-models).
39 | 
40 | 
41 | ### 2-1. Generate custom datasets
42 | 
43 | The `datasets` directory should look like:
44 | 
45 |     datasets
46 |     ├── son
47 |     │   ├── alignment.json
48 |     │   └── audio
49 |     │       ├── 1.mp3
50 |     │       ├── 2.mp3
51 |     │       ├── 3.mp3
52 |     │       └── ...
53 |     └── YOUR_DATASET
54 |         ├── alignment.json
55 |         └── audio
56 |             ├── 1.mp3
57 |             ├── 2.mp3
58 |             ├── 3.mp3
59 |             └── ...
60 | 
61 | and `YOUR_DATASET/alignment.json` should look like:
62 | 
63 |     {
64 |         "./datasets/YOUR_DATASET/audio/001.mp3": "My name is Taehoon Kim.",
65 |         "./datasets/YOUR_DATASET/audio/002.mp3": "The buses aren't the problem.",
66 |         "./datasets/YOUR_DATASET/audio/003.mp3": "They have discovered a new particle."
67 |     }
68 | 
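If you already have a transcript for every clip, `alignment.json` can be written with a few lines of Python. The sketch below is only illustrative — the `transcripts` dictionary is a placeholder for wherever your own text actually lives:

    import json

    # Hypothetical audio-path -> transcript mapping; replace with your own data.
    transcripts = {
        "./datasets/YOUR_DATASET/audio/001.mp3": "My name is Taehoon Kim.",
        "./datasets/YOUR_DATASET/audio/002.mp3": "The buses aren't the problem.",
    }

    with open("./datasets/YOUR_DATASET/alignment.json", "w", encoding="utf-8") as f:
        json.dump(transcripts, f, ensure_ascii=False, indent=2)
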
69 | After you prepare as described, you should generate preprocessed data with:
70 | 
71 |     python3 -m datasets.generate_data ./datasets/YOUR_DATASET/alignment.json
72 | 
73 | 
74 | ### 2-2. Generate Korean datasets
75 | 
76 | Follow the commands below (the `son` dataset is used as the example).
77 | 
78 | 0. To automate the alignment between sounds and texts, prepare `GOOGLE_APPLICATION_CREDENTIALS` to use the [Google Speech Recognition API](https://cloud.google.com/speech/). To get credentials, read [this](https://developers.google.com/identity/protocols/application-default-credentials).
79 | 
80 |        export GOOGLE_APPLICATION_CREDENTIALS="YOUR-GOOGLE.CREDENTIALS.json"
81 | 
82 | 1. Download speech (or video) and text.
83 | 
84 |        python3 -m datasets.son.download
85 | 
86 | 2. Segment all audio files on silence.
87 | 
88 |        python3 -m audio.silence --audio_pattern "./datasets/son/audio/*.wav" --method=pydub
89 | 
90 | 3. Using the [Google Speech Recognition API](https://cloud.google.com/speech/), predict a sentence for each segmented audio clip.
91 | 
92 |        python3 -m recognition.google --audio_pattern "./datasets/son/audio/*.*.wav"
93 | 
94 | 4. By comparing the original text with the recognised text, save `audio<->text` pair information into `./datasets/son/alignment.json`.
95 | 
96 |        python3 -m recognition.alignment --recognition_path "./datasets/son/recognition.json" --score_threshold=0.5
97 | 
98 | 5. Finally, generate the numpy files which will be used in training.
99 | 
100 |        python3 -m datasets.generate_data ./datasets/son/alignment.json
101 | 
102 | Because the automatic generation is extremely naive, the dataset is noisy. However, if you have enough data (20+ hours when training from random initialization, or 5+ hours when initializing from a pretrained model), you can expect acceptable audio synthesis quality.
103 | 
104 | ### 2-3. Generate English datasets
105 | 
106 | 1. Download the speech dataset from https://keithito.com/LJ-Speech-Dataset/
107 | 
108 | 2. Convert the metadata CSV file to a json file (command-line arguments are available for changing preferences).
109 | 
110 |        python3 -m datasets.LJSpeech_1_0.prepare
111 | 
112 | 3. Finally, generate the numpy files which will be used in training.
113 | 
114 |        python3 -m datasets.generate_data ./datasets/LJSpeech_1_0
115 | 
116 | 
117 | ### 3. Train a model
118 | 
119 | The important hyperparameters for a model are defined in `hparams.py`.
120 | 
121 | (**Change `cleaners` in `hparams.py` from `korean_cleaners` to `english_cleaners` to train with an English dataset.**)
122 | 
123 | To train a single-speaker model:
124 | 
125 |     python3 train.py --data_path=datasets/son
126 |     python3 train.py --data_path=datasets/son --initialize_path=PATH_TO_CHECKPOINT
127 | 
128 | To train a multi-speaker model:
129 | 
130 |     # after changing `model_type` in `hparams.py` to `deepvoice` or `simple`
131 |     python3 train.py --data_path=datasets/son1,datasets/son2
132 | 
133 | To restart training from a previous experiment such as `logs/son-20171015`:
134 | 
135 |     python3 train.py --data_path=datasets/son --load_path logs/son-20171015
136 | 
137 | If you don't have a good and large enough dataset (10+ hours), it is better to use `--initialize_path` so that a well-trained model provides the initial parameters.
138 | 
139 | 
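For orientation, the `cleaners` and `model_type` options mentioned in this section are ordinary fields of the shared `hparams` object defined in `hparams.py`. A minimal sketch of the relevant entries is shown below; the defaults here are assumptions, so check the file itself:

    cleaners='korean_cleaners',   # set to 'english_cleaners' for an English (e.g. LJSpeech) dataset
    model_type='single',          # set to 'deepvoice' or 'simple' to train a multi-speaker model
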
140 | ### 4. Synthesize audio
141 | 
142 | You can run a web demo that synthesizes audio from a trained model with:
143 | 
144 |     python3 app.py --load_path logs/son-20171015 --num_speakers=1
145 | 
146 | or generate audio directly with:
147 | 
148 |     python3 synthesizer.py --load_path logs/son-20171015 --text "이거 실화냐?"
149 | 
150 | ### 4-1. Synthesizing non-Korean (English) audio
151 | 
152 | To generate non-Korean audio, you must set the argument `--is_korean=False`.
153 | 
154 |     python3 app.py --load_path logs/LJSpeech_1_0-20180108 --num_speakers=1 --is_korean=False
155 |     python3 synthesizer.py --load_path logs/LJSpeech_1_0-20180108 --text="Winter is coming." --is_korean=False
156 | 
157 | ## Results
158 | 
159 | Training attention on a single-speaker model:
160 | 
161 | ![model](./assets/attention_single_speaker.gif)
162 | 
163 | Training attention on a multi-speaker model:
164 | 
165 | ![model](./assets/attention_multi_speaker.gif)
166 | 
167 | 
168 | ## Disclaimer
169 | 
170 | This is not an official [DEVSISTERS](http://devsisters.com/) product. This project is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk.
171 | 
172 | 
173 | ## References
174 | 
175 | - [Keith Ito](https://github.com/keithito)'s [tacotron](https://github.com/keithito/tacotron)
176 | - [DEVIEW 2017 presentation](https://www.slideshare.net/carpedm20/deview-2017-80824162)
177 | 
178 | 
179 | ## Author
180 | 
181 | Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)
182 | 
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/app.py
--------------------------------------------------------------------------------
/assets/attention_multi_speaker.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/attention_multi_speaker.gif
--------------------------------------------------------------------------------
/assets/attention_single_speaker.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/attention_single_speaker.gif
--------------------------------------------------------------------------------
/assets/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/model.png
--------------------------------------------------------------------------------
/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # Code based on https://github.com/keithito/tacotron/blob/master/util/audio.py
2 | import math
3 | import numpy as np
4 | import tensorflow as tf
5 | from scipy import signal
6 | from hparams import hparams
7 | 
8 | import librosa
9 | import librosa.filters
10 | 
11 | 
12 | def load_audio(path, pre_silence_length=0, post_silence_length=0):
13 |     audio = librosa.core.load(path, sr=hparams.sample_rate)[0]
14 |     if pre_silence_length > 0 or post_silence_length > 0:
15 |         audio = np.concatenate([
16 |             get_silence(pre_silence_length),
17 |             audio,
18 | 
get_silence(post_silence_length), 19 | ]) 20 | return audio 21 | 22 | def save_audio(audio, path, sample_rate=None): 23 | audio *= 32767 / max(0.01, np.max(np.abs(audio))) 24 | librosa.output.write_wav(path, audio.astype(np.int16), 25 | hparams.sample_rate if sample_rate is None else sample_rate) 26 | 27 | print(" [*] Audio saved: {}".format(path)) 28 | 29 | 30 | def resample_audio(audio, target_sample_rate): 31 | return librosa.core.resample( 32 | audio, hparams.sample_rate, target_sample_rate) 33 | 34 | 35 | def get_duration(audio): 36 | return librosa.core.get_duration(audio, sr=hparams.sample_rate) 37 | 38 | 39 | def frames_to_hours(n_frames): 40 | return sum((n_frame for n_frame in n_frames)) * \ 41 | hparams.frame_shift_ms / (3600 * 1000) 42 | 43 | 44 | def get_silence(sec): 45 | return np.zeros(hparams.sample_rate * sec) 46 | 47 | 48 | def spectrogram(y): 49 | D = _stft(_preemphasis(y)) 50 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 51 | return _normalize(S) 52 | 53 | 54 | def inv_spectrogram(spectrogram): 55 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 56 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 57 | 58 | 59 | def inv_spectrogram_tensorflow(spectrogram): 60 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 61 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 62 | 63 | 64 | def melspectrogram(y): 65 | D = _stft(_preemphasis(y)) 66 | S = _amp_to_db(_linear_to_mel(np.abs(D))) 67 | return _normalize(S) 68 | 69 | 70 | def inv_melspectrogram(melspectrogram): 71 | S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram))) # Convert back to linear 72 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 73 | 74 | 75 | # Based on https://github.com/librosa/librosa/issues/434 76 | def _griffin_lim(S): 77 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 78 | S_complex = np.abs(S).astype(np.complex) 79 | 80 | y = _istft(S_complex * angles) 81 | for i in range(hparams.griffin_lim_iters): 82 | angles = np.exp(1j * np.angle(_stft(y))) 83 | y = _istft(S_complex * angles) 84 | return y 85 | 86 | 87 | def _griffin_lim_tensorflow(S): 88 | with tf.variable_scope('griffinlim'): 89 | S = tf.expand_dims(S, 0) 90 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 91 | y = _istft_tensorflow(S_complex) 92 | for i in range(hparams.griffin_lim_iters): 93 | est = _stft_tensorflow(y) 94 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 95 | y = _istft_tensorflow(S_complex * angles) 96 | return tf.squeeze(y, 0) 97 | 98 | 99 | def _stft(y): 100 | n_fft, hop_length, win_length = _stft_parameters() 101 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 102 | 103 | 104 | def _istft(y): 105 | _, hop_length, win_length = _stft_parameters() 106 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 107 | 108 | 109 | def _stft_tensorflow(signals): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 112 | 113 | 114 | def _istft_tensorflow(stfts): 115 | n_fft, hop_length, win_length = _stft_parameters() 116 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 117 | 118 | def _stft_parameters(): 119 | n_fft = (hparams.num_freq - 1) * 2 120 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 121 | win_length = 
int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 122 | return n_fft, hop_length, win_length 123 | 124 | 125 | # Conversions: 126 | 127 | _mel_basis = None 128 | _inv_mel_basis = None 129 | 130 | def _linear_to_mel(spectrogram): 131 | global _mel_basis 132 | if _mel_basis is None: 133 | _mel_basis = _build_mel_basis() 134 | return np.dot(_mel_basis, spectrogram) 135 | 136 | def _mel_to_linear(mel_spectrogram): 137 | global _inv_mel_basis 138 | if _inv_mel_basis is None: 139 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 140 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 141 | 142 | def _build_mel_basis(): 143 | n_fft = (hparams.num_freq - 1) * 2 144 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 145 | 146 | def _amp_to_db(x): 147 | return 20 * np.log10(np.maximum(1e-5, x)) 148 | 149 | def _db_to_amp(x): 150 | return np.power(10.0, x * 0.05) 151 | 152 | def _db_to_amp_tensorflow(x): 153 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 154 | 155 | def _preemphasis(x): 156 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 157 | 158 | def inv_preemphasis(x): 159 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 160 | 161 | def _normalize(S): 162 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 163 | 164 | def _denormalize(S): 165 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 166 | 167 | def _denormalize_tensorflow(S): 168 | return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 169 | -------------------------------------------------------------------------------- /audio/audio_range.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import tqdm 4 | 5 | def search(dirname): 6 | try: 7 | filenames = os.listdir(dirname) 8 | for filename in filenames: 9 | full_filename = os.path.join(dirname, filename) 10 | if os.path.isdir(full_filename): 11 | search(full_filename) 12 | else: 13 | ext = os.path.splitext(full_filename)[-1] 14 | if ext == '.py': 15 | print(full_filename) 16 | except PermissionError: 17 | pass 18 | 19 | 20 | def audio_range(_load_path, _min, _max): 21 | base_dir = _load_path 22 | 23 | for (path, dir, files) in os.walk(base_dir): 24 | for filename in tqdm.tqdm(files): 25 | print(filename) 26 | each_size = os.path.getsize(path + '/' + filename) 27 | print(filename, ' / size is == ', each_size) 28 | 29 | ext = os.path.splitext(filename)[-1] 30 | if not ext == '.wav': 31 | print('This folder contains not audio file!! In audio folder, they must have only wav file!!') 32 | return 33 | 34 | print(os.getcwd()) 35 | 36 | # 규정 사이즈 이상은 제거 37 | if not (_min <= each_size and each_size <= _max) : 38 | print(path + '/' + filename, ' is removed!!') 39 | os.remove( path + '/' + filename ) 40 | 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--load_path', required=True) 46 | parser.add_argument('--min', default= 300000) 47 | parser.add_argument('--max', default=1600000) 48 | config = parser.parse_args() 49 | 50 | if not os.path.exists(config.load_path): 51 | print("wrong path!!") 52 | 53 | print (config.load_path) 54 | 55 | if config.load_path in 'kim_anchor': 56 | print("wrong path!! 
path must have kim_anchor") 57 | 58 | else : 59 | audio_range(config.load_path, config.min, config.max) 60 | 61 | # 텍스트에 아무 것도 없는 내용 제거 -------------------------------------------------------------------------------- /audio/get_duration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from glob import glob 4 | from tqdm import tqdm 5 | from tinytag import TinyTag 6 | from collections import defaultdict 7 | from multiprocessing.dummy import Pool 8 | 9 | from utils import load_json 10 | 11 | def second_to_hour(sec): 12 | return str(datetime.timedelta(seconds=int(sec))) 13 | 14 | def get_duration(path): 15 | filename = os.path.basename(path) 16 | candidates = filename.split('.')[0].split('_') 17 | dataset = candidates[0] 18 | 19 | if not os.path.exists(path): 20 | print(" [!] {} not found".format(path)) 21 | return dataset, 0 22 | 23 | if True: # tinytag 24 | tag = TinyTag.get(path) 25 | duration = tag.duration 26 | else: # librosa 27 | y, sr = librosa.load(path) 28 | duration = librosa.get_duration(y=y, sr=sr) 29 | 30 | return dataset, duration 31 | 32 | def get_durations(paths, print_detail=True): 33 | duration_all = 0 34 | duration_book = defaultdict(list) 35 | 36 | pool = Pool() 37 | iterator = pool.imap_unordered(get_duration, paths) 38 | for dataset, duration in tqdm(iterator, total=len(paths)): 39 | duration_all += duration 40 | duration_book[dataset].append(duration) 41 | 42 | total_count = 0 43 | for book, duration in duration_book.items(): 44 | if book: 45 | time = second_to_hour(sum(duration)) 46 | file_count = len(duration) 47 | total_count += file_count 48 | 49 | if print_detail: 50 | print(" [*] Duration of {}: {} (file #: {})". \ 51 | format(book, time, file_count)) 52 | 53 | print(" [*] Total Duration : {} (file #: {})". 
\ 54 | format(second_to_hour(duration_all), total_count)) 55 | print() 56 | return duration_all 57 | 58 | 59 | if __name__ == '__main__': 60 | import argparse 61 | 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--audio-pattern', default=None) # datasets/krbook/audio/*.wav 64 | parser.add_argument('--data-path', default=None) # datasets/jtbc/alignment.json 65 | config, unparsed = parser.parse_known_args() 66 | 67 | if config.audio_pattern is not None: 68 | duration = get_durations(get_paths_by_pattern(config.data_dir)) 69 | elif config.data_path is not None: 70 | paths = load_json(config.data_path).keys() 71 | duration = get_durations(paths) 72 | -------------------------------------------------------------------------------- /audio/silence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import json 5 | import librosa 6 | import argparse 7 | import numpy as np 8 | from tqdm import tqdm 9 | from glob import glob 10 | from pydub import silence 11 | from pydub import AudioSegment 12 | from functools import partial 13 | 14 | from hparams import hparams 15 | from utils import parallel_run, add_postfix 16 | from audio import load_audio, save_audio, get_duration, get_silence 17 | 18 | def abs_mean(x): 19 | return abs(x).mean() 20 | 21 | def remove_breath(audio): 22 | edges = librosa.effects.split( 23 | audio, top_db=40, frame_length=128, hop_length=32) 24 | 25 | for idx in range(len(edges)): 26 | start_idx, end_idx = edges[idx][0], edges[idx][1] 27 | if start_idx < len(audio): 28 | if abs_mean(audio[start_idx:end_idx]) < abs_mean(audio) - 0.05: 29 | audio[start_idx:end_idx] = 0 30 | 31 | return audio 32 | 33 | def split_on_silence_with_librosa( 34 | audio_path, top_db=40, frame_length=1024, hop_length=256, 35 | skip_idx=0, out_ext="wav", 36 | min_segment_length=3, max_segment_length=8, 37 | pre_silence_length=0, post_silence_length=0): 38 | 39 | filename = os.path.basename(audio_path).split('.', 1)[0] 40 | in_ext = audio_path.rsplit(".")[1] 41 | 42 | audio = load_audio(audio_path) 43 | 44 | edges = librosa.effects.split(audio, 45 | top_db=top_db, frame_length=frame_length, hop_length=hop_length) 46 | 47 | new_audio = np.zeros_like(audio) 48 | for idx, (start, end) in enumerate(edges[skip_idx:]): 49 | new_audio[start:end] = remove_breath(audio[start:end]) 50 | 51 | save_audio(new_audio, add_postfix(audio_path, "no_breath")) 52 | audio = new_audio 53 | edges = librosa.effects.split(audio, 54 | top_db=top_db, frame_length=frame_length, hop_length=hop_length) 55 | 56 | audio_paths = [] 57 | for idx, (start, end) in enumerate(edges[skip_idx:]): 58 | segment = audio[start:end] 59 | duration = get_duration(segment) 60 | 61 | if duration <= min_segment_length or duration >= max_segment_length: 62 | continue 63 | 64 | output_path = "{}/{}.{:04d}.{}".format( 65 | os.path.dirname(audio_path), filename, idx, out_ext) 66 | 67 | padded_segment = np.concatenate([ 68 | get_silence(pre_silence_length), 69 | segment, 70 | get_silence(post_silence_length), 71 | ]) 72 | 73 | 74 | 75 | save_audio(padded_segment, output_path) 76 | audio_paths.append(output_path) 77 | 78 | return audio_paths 79 | 80 | def read_audio(audio_path): 81 | return AudioSegment.from_file(audio_path) 82 | 83 | def split_on_silence_with_pydub( 84 | audio_path, skip_idx=0, out_ext="wav", 85 | silence_thresh=-40, min_silence_len=400, 86 | silence_chunk_len=100, keep_silence=100): 87 | 88 | filename = os.path.basename(audio_path).split('.', 1)[0] 
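    # Note: pydub works in milliseconds and dBFS.  detect_nonsilent() below finds chunks
    # louder than `silence_thresh`, scanning with `silence_chunk_len` ms windows; adjacent
    # chunks separated by less than `min_silence_len` ms are then merged, and `keep_silence`
    # ms of padding is kept on both sides of every exported segment.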
89 | in_ext = audio_path.rsplit(".")[1] 90 | 91 | audio = read_audio(audio_path) 92 | not_silence_ranges = silence.detect_nonsilent( 93 | audio, min_silence_len=silence_chunk_len, 94 | silence_thresh=silence_thresh) 95 | 96 | edges = [not_silence_ranges[0]] 97 | 98 | for idx in range(1, len(not_silence_ranges)-1): 99 | cur_start = not_silence_ranges[idx][0] 100 | prev_end = edges[-1][1] 101 | 102 | if cur_start - prev_end < min_silence_len: 103 | edges[-1][1] = not_silence_ranges[idx][1] 104 | else: 105 | edges.append(not_silence_ranges[idx]) 106 | 107 | audio_paths = [] 108 | for idx, (start_idx, end_idx) in enumerate(edges[skip_idx:]): 109 | start_idx = max(0, start_idx - keep_silence) 110 | end_idx += keep_silence 111 | 112 | target_audio_path = "{}/{}.{:04d}.{}".format( 113 | os.path.dirname(audio_path), filename, idx, out_ext) 114 | 115 | segment=audio[start_idx:end_idx] 116 | 117 | segment.export(target_audio_path, out_ext) # for soundsegment 118 | 119 | audio_paths.append(target_audio_path) 120 | 121 | return audio_paths 122 | 123 | def split_on_silence_batch(audio_paths, method, **kargv): 124 | audio_paths.sort() 125 | method = method.lower() 126 | 127 | if method == "librosa": 128 | fn = partial(split_on_silence_with_librosa, **kargv) 129 | elif method == "pydub": 130 | fn = partial(split_on_silence_with_pydub, **kargv) 131 | 132 | parallel_run(fn, audio_paths, 133 | desc="Split on silence", parallel=False) 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument('--audio_pattern', required=True) 138 | parser.add_argument('--out_ext', default='wav') 139 | parser.add_argument('--method', choices=['librosa', 'pydub'], required=True) 140 | config = parser.parse_args() 141 | 142 | audio_paths = glob(config.audio_pattern) 143 | 144 | split_on_silence_batch( 145 | audio_paths, config.method, 146 | out_ext=config.out_ext, 147 | ) 148 | -------------------------------------------------------------------------------- /datasets/LJSpeech_1_0/README: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | The LJ Speech Dataset 3 | 4 | Version 1.0 5 | July 5, 2017 6 | https://keithito.com/LJ-Speech-Dataset 7 | ----------------------------------------------------------------------------- 8 | 9 | 10 | OVERVIEW 11 | 12 | This is a public domain speech dataset consisting of 13,100 short audio clips 13 | of a single speaker reading passages from 7 non-fiction books. A transcription 14 | is provided for each clip. Clips vary in length from 1 to 10 seconds and have 15 | a total length of approximately 24 hours. 16 | 17 | The texts were published between 1884 and 1964, and are in the public domain. 18 | The audio was recorded in 2016-17 by the LibriVox project and is also in the 19 | public domain. 20 | 21 | 22 | 23 | FILE FORMAT 24 | 25 | Metadata is provided in metadata.csv. This file consists of one record per 26 | line, delimited by the pipe character (0x7c). The fields are: 27 | 28 | 1. ID: this is the name of the corresponding .wav file 29 | 2. Transcription: words spoken by the reader (UTF-8) 30 | 3. Normalized Transcription: transcription with numbers, ordinals, and 31 | monetary units expanded into full words (UTF-8). 32 | 33 | Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 34 | 22050 Hz. 
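For reference, a record in this format can be split in Python essentially the way
datasets/LJSpeech_1_0/prepare.py (included later in this repository) does it. The
record shown below is a made-up example, not an actual line from metadata.csv:

    # Illustrative only: the record is invented, not taken from the corpus.
    record = "LJ000-0000|Some raw transcription.|Some normalized transcription."
    file_id, transcription, normalized_transcription = record.split("|")
    wav_path = "wavs/{}.wav".format(file_id)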
35 | 36 | 37 | 38 | STATISTICS 39 | 40 | Total Clips 13,100 41 | Total Words 225,715 42 | Total Characters 1,308,674 43 | Total Duration 23:55:17 44 | Mean Clip Duration 6.57 sec 45 | Min Clip Duration 1.11 sec 46 | Max Clip Duration 10.10 sec 47 | Mean Words per Clip 17.23 48 | Distinct Words 13,821 49 | 50 | 51 | 52 | MISCELLANEOUS 53 | 54 | The audio clips range in length from approximately 1 second to 10 seconds. 55 | They were segmented automatically based on silences in the recording. Clip 56 | boundaries generally align with sentence or clause boundaries, but not always. 57 | 58 | The text was matched to the audio manually, and a QA pass was done to ensure 59 | that the text accurately matched the words spoken in the audio. 60 | 61 | The original LibriVox recordings were distributed as 128 kbps MP3 files. As a 62 | result, they may contain artifacts introduced by the MP3 encoding. 63 | 64 | The following abbreviations appear in the text. They may be expanded as 65 | follows: 66 | 67 | Abbreviation Expansion 68 | -------------------------- 69 | Mr. Mister 70 | Mrs. Misess (*) 71 | Dr. Doctor 72 | No. Number 73 | St. Saint 74 | Co. Company 75 | Jr. Junior 76 | Maj. Major 77 | Gen. General 78 | Drs. Doctors 79 | Rev. Reverend 80 | Lt. Lieutenant 81 | Hon. Honorable 82 | Sgt. Sergeant 83 | Capt. Captain 84 | Esq. Esquire 85 | Ltd. Limited 86 | Col. Colonel 87 | Ft. Fort 88 | 89 | * there's no standard expansion of "Mrs." 90 | 91 | 92 | 19 of the transcriptions contain non-ASCII characters (for example, LJ016-0257 93 | contains "raison d'être"). 94 | 95 | For more information or to report errors, please email kito@kito.us. 96 | 97 | 98 | 99 | LICENSE 100 | 101 | This dataset is in the public domain in the USA (and likely other countries as 102 | well). There are no restrictions on its use. For more information, please see: 103 | https://librivox.org/pages/public-domain. 104 | 105 | 106 | 107 | CREDITS 108 | 109 | This dataset consists of excerpts from the following works: 110 | 111 | * Morris, William, et al. Arts and Crafts Essays. 1893. 112 | * Griffiths, Arthur. The Chronicles of Newgate, Vol. 2. 1884. 113 | * Roosevelt, Franklin D. The Fireside Chats of Franklin Delano Roosevelt. 114 | 1933-42. 115 | * Harland, Marion. Marion Harland's Cookery for Beginners. 1893. 116 | * Rolt-Wheeler, Francis. The Science - History of the Universe, Vol. 5: 117 | Biology. 1910. 118 | * Banks, Edgar J. The Seven Wonders of the Ancient World. 1916. 119 | * President's Commission on the Assassination of President Kennedy. Report 120 | of the President's Commission on the Assassination of President Kennedy. 121 | 1964. 122 | 123 | Recordings by Linda Johnson. Alignment and annotation by Keith Ito. All text, 124 | audio, and annotations are in the public domain. 125 | 126 | If you would like to cite this work, please do so by linking to: 127 | https://keithito.com/LJ-Speech-Dataset 128 | 129 | or by using the citation: 130 | Ito, Keith. The LJ Speech Dataset. 2017. https://keithito.com/LJ-Speech-Dataset. 
131 | -------------------------------------------------------------------------------- /datasets/LJSpeech_1_0/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 4 10:50:07 2018 4 | # Conversion tool for https://github.com/carpedm20/multi-speaker-tacotron-tensorflow 5 | This prepares LJ-Dataset (available at https://keithito.com/LJ-Speech-Dataset/) to json and wav format 6 | that can be processed into .npz file using datasets.generate_data. 7 | 8 | @author: engiecat (github) 9 | """ 10 | import os 11 | from utils import load_json, write_json, backup_file, str2bool 12 | import argparse 13 | 14 | base_dir = os.path.dirname(os.path.realpath(__file__)) 15 | work_dir = os.getcwd() 16 | class Data(object): 17 | def __init__( 18 | self, audio_name, audio_transcript,audio_normalized_transcript,audio_path='ERR'): 19 | self.audio_name = audio_name 20 | self.audio_transcript = audio_transcript 21 | self.audio_normalized_transcript=audio_normalized_transcript 22 | self.audio_path = audio_path 23 | 24 | def read_csv(path,fn_encoding='UTF8'): 25 | # reads csv file into audio snippet name and its transcript 26 | with open(path, encoding=fn_encoding) as f: 27 | data = [] 28 | temp='' # for storing non-normalized 29 | for line in f: 30 | audio_name, audio_transcript,audio_normalized_transcript = line.split('|') 31 | audio_transcript=audio_transcript.strip() 32 | audio_normalized_transcript=audio_normalized_transcript.strip() 33 | data.append(Data(audio_name, audio_transcript,audio_normalized_transcript)) 34 | return data 35 | 36 | def convert_name_to_path(name, audio_dir, audio_format): 37 | # converts audio snippet name to audio snippet path 38 | abs_audio_dir=os.path.abspath(os.path.join(base_dir,audio_dir)) 39 | # the audio directory is respective to dataset folder(base_dir) 40 | # while the working directory is at the root directory (work_dir) 41 | result= os.path.join('./',os.path.relpath(abs_audio_dir,work_dir), name+'.'+audio_format ) 42 | return result 43 | 44 | def convert_to_json_format(data, is_normalized): 45 | # converts into json format 46 | if is_normalized: 47 | result={data.audio_path:[data.audio_normalized_transcript]} 48 | else: 49 | result={data.audio_path:[data.audio_transcript]} 50 | return result 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--metadata', default="metadata.csv") 55 | parser.add_argument('--metadata_enconding', default="UTF8") 56 | parser.add_argument('--audio_dir', default="wavs") 57 | parser.add_argument('--audio_format', default='wav') 58 | parser.add_argument('--alignment_filename', default="alignment.json") 59 | parser.add_argument('--use_normalize', default=True, type=str2bool) 60 | config = parser.parse_args() 61 | 62 | print(' [*] Reading metadata file - '+config.metadata) 63 | data = read_csv(os.path.join(base_dir, config.metadata)) 64 | print(' [*] Converting to audio_path...') 65 | results={} 66 | for d in data: 67 | d.audio_path=convert_name_to_path(d.audio_name,config.audio_dir,config.audio_format) 68 | results.update(convert_to_json_format(d, config.use_normalize)) 69 | print(' [*] Saving to json...') 70 | alignment_path = \ 71 | os.path.join(base_dir, config.alignment_filename) 72 | if os.path.exists(alignment_path): 73 | backup_file(alignment_path) 74 | write_json(alignment_path, results) 75 | print(' [!] 
All Done!') 76 | print(work_dir) 77 | 78 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pprint 4 | import random 5 | import threading 6 | import traceback 7 | import numpy as np 8 | from glob import glob 9 | import tensorflow as tf 10 | from collections import defaultdict 11 | 12 | import text 13 | from utils.infolog import log 14 | from utils import parallel_run, remove_file 15 | from audio import frames_to_hours 16 | from audio.get_duration import get_durations 17 | 18 | 19 | _pad = 0 20 | 21 | def get_frame(path): 22 | data = np.load(path) 23 | n_frame = data["linear"].shape[0] 24 | n_token = len(data["tokens"]) 25 | return (path, n_frame, n_token) 26 | 27 | def get_path_dict( 28 | data_dirs, hparams, config, 29 | data_type, n_test=None, 30 | rng=np.random.RandomState(123)): 31 | 32 | # Load metadata: 33 | path_dict = {} 34 | for data_dir in data_dirs: 35 | paths = glob("{}/*.npz".format(data_dir)) 36 | 37 | if data_type == 'train': 38 | rng.shuffle(paths) 39 | 40 | if not config.skip_path_filter: 41 | items = parallel_run( 42 | get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True) 43 | 44 | min_n_frame = hparams.reduction_factor * hparams.min_iters 45 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 46 | 47 | new_items = [(path, n) for path, n, n_tokens in items \ 48 | if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens] 49 | 50 | if any(check in data_dir for check in ["son", "yuinna"]): 51 | blacklists = [".0000.", ".0001.", "NB11479580.0001"] 52 | new_items = [item for item in new_items \ 53 | if any(check not in item[0] for check in blacklists)] 54 | 55 | new_paths = [path for path, n in new_items] 56 | new_n_frames = [n for path, n in new_items] 57 | 58 | hours = frames_to_hours(new_n_frames) 59 | 60 | log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'. \ 61 | format(data_dir, len(new_n_frames), hours)) 62 | log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames))) 63 | log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames))) 64 | else: 65 | new_paths = paths 66 | 67 | if data_type == 'train': 68 | new_paths = new_paths[:-n_test] 69 | elif data_type == 'test': 70 | new_paths = new_paths[-n_test:] 71 | else: 72 | raise Exception(" [!] 
Unkown data_type: {}".format(data_type)) 73 | 74 | path_dict[data_dir] = new_paths 75 | 76 | return path_dict 77 | 78 | class DataFeeder(threading.Thread): 79 | '''Feeds batches of data into a queue on a background thread.''' 80 | 81 | def __init__(self, coordinator, data_dirs, 82 | hparams, config, batches_per_group, data_type, batch_size): 83 | super(DataFeeder, self).__init__() 84 | 85 | self._coord = coordinator 86 | self._hp = hparams 87 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 88 | self._step = 0 89 | self._offset = defaultdict(lambda: 2) 90 | self._batches_per_group = batches_per_group 91 | 92 | self.rng = np.random.RandomState(config.random_seed) 93 | self.data_type = data_type 94 | self.batch_size = batch_size 95 | 96 | self.min_tokens = hparams.min_tokens 97 | self.min_n_frame = hparams.reduction_factor * hparams.min_iters 98 | self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 99 | self.skip_path_filter = config.skip_path_filter 100 | 101 | # Load metadata: 102 | self.path_dict = get_path_dict( 103 | data_dirs, self._hp, config, self.data_type, 104 | n_test=self.batch_size, rng=self.rng) 105 | 106 | self.data_dirs = list(self.path_dict.keys()) 107 | self.data_dir_to_id = { 108 | data_dir: idx for idx, data_dir in enumerate(self.data_dirs)} 109 | 110 | data_weight = { 111 | data_dir: 1. for data_dir in self.data_dirs 112 | } 113 | 114 | if self._hp.main_data_greedy_factor > 0 and \ 115 | any(main_data in data_dir for data_dir in self.data_dirs \ 116 | for main_data in self._hp.main_data): 117 | for main_data in self._hp.main_data: 118 | for data_dir in self.data_dirs: 119 | if main_data in data_dir: 120 | data_weight[data_dir] += self._hp.main_data_greedy_factor 121 | 122 | weight_Z = sum(data_weight.values()) 123 | self.data_ratio = { 124 | data_dir: weight / weight_Z for data_dir, weight in data_weight.items() 125 | } 126 | 127 | log("="*40) 128 | log(pprint.pformat(self.data_ratio, indent=4)) 129 | log("="*40) 130 | 131 | #audio_paths = [path.replace("/data/", "/audio/"). \ 132 | # replace(".npz", ".wav") for path in self.data_paths] 133 | #duration = get_durations(audio_paths, print_detail=False) 134 | 135 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 136 | # be able to feed different sized batches at eval time. 
137 | 138 | self._placeholders = [ 139 | tf.placeholder(tf.int32, [None, None], 'inputs'), 140 | tf.placeholder(tf.int32, [None], 'input_lengths'), 141 | tf.placeholder(tf.float32, [None], 'loss_coeff'), 142 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 143 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 144 | ] 145 | 146 | # Create queue for buffering data: 147 | dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32] 148 | 149 | self.is_multi_speaker = len(self.data_dirs) > 1 150 | 151 | if self.is_multi_speaker: 152 | self._placeholders.append( 153 | tf.placeholder(tf.int32, [None], 'inputs'), 154 | ) 155 | dtypes.append(tf.int32) 156 | 157 | num_worker = 8 if self.data_type == 'train' else 1 158 | queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue') 159 | 160 | self._enqueue_op = queue.enqueue(self._placeholders) 161 | 162 | if self.is_multi_speaker: 163 | self.inputs, self.input_lengths, self.loss_coeff, \ 164 | self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue() 165 | else: 166 | self.inputs, self.input_lengths, self.loss_coeff, \ 167 | self.mel_targets, self.linear_targets = queue.dequeue() 168 | 169 | self.inputs.set_shape(self._placeholders[0].shape) 170 | self.input_lengths.set_shape(self._placeholders[1].shape) 171 | self.loss_coeff.set_shape(self._placeholders[2].shape) 172 | self.mel_targets.set_shape(self._placeholders[3].shape) 173 | self.linear_targets.set_shape(self._placeholders[4].shape) 174 | 175 | if self.is_multi_speaker: 176 | self.speaker_id.set_shape(self._placeholders[5].shape) 177 | else: 178 | self.speaker_id = None 179 | 180 | if self.data_type == 'test': 181 | examples = [] 182 | while True: 183 | for data_dir in self.data_dirs: 184 | examples.append(self._get_next_example(data_dir)) 185 | #print(data_dir, text.sequence_to_text(examples[-1][0], False, True)) 186 | if len(examples) >= self.batch_size: 187 | break 188 | if len(examples) >= self.batch_size: 189 | break 190 | self.static_batches = [examples for _ in range(self._batches_per_group)] 191 | 192 | else: 193 | self.static_batches = None 194 | 195 | def start_in_session(self, session, start_step): 196 | self._step = start_step 197 | self._session = session 198 | self.start() 199 | 200 | 201 | def run(self): 202 | try: 203 | while not self._coord.should_stop(): 204 | self._enqueue_next_group() 205 | except Exception as e: 206 | traceback.print_exc() 207 | self._coord.request_stop(e) 208 | 209 | 210 | def _enqueue_next_group(self): 211 | start = time.time() 212 | 213 | # Read a group of examples: 214 | n = self.batch_size 215 | r = self._hp.reduction_factor 216 | 217 | if self.static_batches is not None: 218 | batches = self.static_batches 219 | else: 220 | examples = [] 221 | for data_dir in self.data_dirs: 222 | if self._hp.initial_data_greedy: 223 | if self._step < self._hp.initial_phase_step and \ 224 | any("krbook" in data_dir for data_dir in self.data_dirs): 225 | data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0] 226 | 227 | if self._step < self._hp.initial_phase_step: 228 | example = [self._get_next_example(data_dir) \ 229 | for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))] 230 | else: 231 | example = [self._get_next_example(data_dir) \ 232 | for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))] 233 | examples.extend(example) 234 | examples.sort(key=lambda x: x[-1]) 235 | 236 | batches = [examples[i:i+n] for i in range(0, 
len(examples), n)] 237 | self.rng.shuffle(batches) 238 | 239 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 240 | for batch in batches: 241 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type))) 242 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 243 | self._step += 1 244 | 245 | 246 | def _get_next_example(self, data_dir): 247 | '''Loads a single example (input, mel_target, linear_target, cost) from disk''' 248 | data_paths = self.path_dict[data_dir] 249 | 250 | while True: 251 | if self._offset[data_dir] >= len(data_paths): 252 | self._offset[data_dir] = 0 253 | 254 | if self.data_type == 'train': 255 | self.rng.shuffle(data_paths) 256 | 257 | data_path = data_paths[self._offset[data_dir]] 258 | self._offset[data_dir] += 1 259 | 260 | try: 261 | if os.path.exists(data_path): 262 | data = np.load(data_path) 263 | else: 264 | continue 265 | except: 266 | remove_file(data_path) 267 | continue 268 | 269 | if not self.skip_path_filter: 270 | break 271 | 272 | if self.min_n_frame <= data["linear"].shape[0] <= self.max_n_frame and \ 273 | len(data["tokens"]) > self.min_tokens: 274 | break 275 | 276 | input_data = data['tokens'] 277 | mel_target = data['mel'] 278 | 279 | if 'loss_coeff' in data: 280 | loss_coeff = data['loss_coeff'] 281 | else: 282 | loss_coeff = 1 283 | linear_target = data['linear'] 284 | 285 | return (input_data, loss_coeff, mel_target, linear_target, 286 | self.data_dir_to_id[data_dir], len(linear_target)) 287 | 288 | 289 | def _prepare_batch(batch, reduction_factor, rng, data_type=None): 290 | if data_type == 'train': 291 | rng.shuffle(batch) 292 | 293 | inputs = _prepare_inputs([x[0] for x in batch]) 294 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 295 | loss_coeff = np.asarray([x[1] for x in batch], dtype=np.float32) 296 | 297 | mel_targets = _prepare_targets([x[2] for x in batch], reduction_factor) 298 | linear_targets = _prepare_targets([x[3] for x in batch], reduction_factor) 299 | 300 | if len(batch[0]) == 6: 301 | speaker_id = np.asarray([x[4] for x in batch], dtype=np.int32) 302 | return (inputs, input_lengths, loss_coeff, 303 | mel_targets, linear_targets, speaker_id) 304 | else: 305 | return (inputs, input_lengths, loss_coeff, mel_targets, linear_targets) 306 | 307 | 308 | def _prepare_inputs(inputs): 309 | max_len = max((len(x) for x in inputs)) 310 | return np.stack([_pad_input(x, max_len) for x in inputs]) 311 | 312 | 313 | def _prepare_targets(targets, alignment): 314 | max_len = max((len(t) for t in targets)) + 1 315 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 316 | 317 | 318 | def _pad_input(x, length): 319 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 320 | 321 | 322 | def _pad_target(t, length): 323 | return np.pad(t, [(0, length - t.shape[0]), (0,0)], mode='constant', constant_values=_pad) 324 | 325 | 326 | def _round_up(x, multiple): 327 | remainder = x % multiple 328 | return x if remainder == 0 else x + multiple - remainder 329 | -------------------------------------------------------------------------------- /datasets/generate_data.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/datasets/ljspeech.py 2 | import os 3 | import re 4 | import sys 5 | import json 6 | import argparse 7 | import numpy as np 8 | from tqdm import tqdm 9 | from glob import 
glob 10 | from functools import partial 11 | 12 | from collections import Counter, defaultdict 13 | from concurrent.futures import ProcessPoolExecutor 14 | 15 | import matplotlib 16 | matplotlib.use('agg') 17 | import matplotlib.pyplot as plt 18 | 19 | from hparams import hparams 20 | from text import text_to_sequence 21 | from utils import makedirs, remove_file, warning 22 | from audio import load_audio, spectrogram, melspectrogram, frames_to_hours 23 | 24 | def one(x=None): 25 | return 1 26 | 27 | def build_from_path(config): 28 | warning("Sampling rate: {}".format(hparams.sample_rate)) 29 | 30 | executor = ProcessPoolExecutor(max_workers=config.num_workers) 31 | futures = [] 32 | index = 1 33 | 34 | base_dir = os.path.dirname(config.metadata_path) 35 | data_dir = os.path.join(base_dir, config.data_dirname) 36 | makedirs(data_dir) 37 | 38 | loss_coeff = defaultdict(one) 39 | if config.metadata_path.endswith("json"): 40 | with open(config.metadata_path) as f: 41 | content = f.read() 42 | info = json.loads(content) 43 | elif config.metadata_path.endswith("csv"): 44 | with open(config.metadata_path) as f: 45 | info = {} 46 | for line in f: 47 | path, text = line.strip().split('|') 48 | info[path] = text 49 | else: 50 | raise Exception(" [!] Unkown metadata format: {}".format(config.metadata_path)) 51 | 52 | new_info = {} 53 | for path in info.keys(): 54 | if not os.path.exists(path): 55 | new_path = os.path.join(base_dir, path) 56 | if not os.path.exists(new_path): 57 | print(" [!] Audio not found: {}".format([path, new_path])) 58 | continue 59 | else: 60 | new_path = path 61 | 62 | new_info[new_path] = info[path] 63 | 64 | info = new_info 65 | 66 | for path in info.keys(): 67 | if type(info[path]) == list: 68 | if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \ 69 | hparams.ignore_recognition_level == 2: 70 | loss_coeff[path] = hparams.recognition_loss_coeff 71 | 72 | info[path] = info[path][0] 73 | 74 | ignore_description = { 75 | 0: "use all", 76 | 1: "ignore only unmatched_alignment", 77 | 2: "fully ignore recognitio", 78 | } 79 | 80 | print(" [!] Skip recognition level: {} ({})". 
\ 81 | format(hparams.ignore_recognition_level, 82 | ignore_description[hparams.ignore_recognition_level])) 83 | 84 | for audio_path, text in info.items(): 85 | if hparams.ignore_recognition_level > 0 and loss_coeff[audio_path] != 1: 86 | continue 87 | 88 | if base_dir not in audio_path: 89 | audio_path = os.path.join(base_dir, audio_path) 90 | 91 | try: 92 | tokens = text_to_sequence(text) 93 | except: 94 | continue 95 | 96 | fn = partial( 97 | _process_utterance, 98 | audio_path, data_dir, tokens, loss_coeff[audio_path]) 99 | futures.append(executor.submit(fn)) 100 | 101 | n_frames = [future.result() for future in tqdm(futures)] 102 | n_frames = [n_frame for n_frame in n_frames if n_frame is not None] 103 | 104 | hours = frames_to_hours(n_frames) 105 | 106 | print(' [*] Loaded metadata for {} examples ({:.2f} hours)'.format(len(n_frames), hours)) 107 | print(' [*] Max length: {}'.format(max(n_frames))) 108 | print(' [*] Min length: {}'.format(min(n_frames))) 109 | 110 | plot_n_frames(n_frames, os.path.join( 111 | base_dir, "n_frames_before_filter.png")) 112 | 113 | min_n_frame = hparams.reduction_factor * hparams.min_iters 114 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 115 | 116 | n_frames = [n for n in n_frames if min_n_frame <= n <= max_n_frame] 117 | hours = frames_to_hours(n_frames) 118 | 119 | print(' [*] After filtered: {} examples ({:.2f} hours)'.format(len(n_frames), hours)) 120 | print(' [*] Max length: {}'.format(max(n_frames))) 121 | print(' [*] Min length: {}'.format(min(n_frames))) 122 | 123 | plot_n_frames(n_frames, os.path.join( 124 | base_dir, "n_frames_after_filter.png")) 125 | 126 | def plot_n_frames(n_frames, path): 127 | labels, values = list(zip(*Counter(n_frames).most_common())) 128 | 129 | values = [v for _, v in sorted(zip(labels, values))] 130 | labels = sorted(labels) 131 | 132 | indexes = np.arange(len(labels)) 133 | width = 1 134 | 135 | fig, ax = plt.subplots(figsize=(len(labels) / 2, 5)) 136 | 137 | plt.bar(indexes, values, width) 138 | plt.xticks(indexes + width * 0.5, labels) 139 | 140 | plt.tight_layout() 141 | plt.savefig(path) 142 | 143 | 144 | def _process_utterance(audio_path, data_dir, tokens, loss_coeff): 145 | audio_name = os.path.basename(audio_path) 146 | 147 | filename = audio_name.rsplit('.', 1)[0] + ".npz" 148 | numpy_path = os.path.join(data_dir, filename) 149 | 150 | if not os.path.exists(numpy_path): 151 | wav = load_audio(audio_path) 152 | 153 | linear_spectrogram = spectrogram(wav).astype(np.float32) 154 | mel_spectrogram = melspectrogram(wav).astype(np.float32) 155 | 156 | data = { 157 | "linear": linear_spectrogram.T, 158 | "mel": mel_spectrogram.T, 159 | "tokens": tokens, 160 | "loss_coeff": loss_coeff, 161 | } 162 | 163 | n_frame = linear_spectrogram.shape[1] 164 | 165 | if hparams.skip_inadequate: 166 | min_n_frame = hparams.reduction_factor * hparams.min_iters 167 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 168 | 169 | if min_n_frame <= n_frame <= max_n_frame and len(tokens) >= hparams.min_tokens: 170 | return None 171 | 172 | np.savez(numpy_path, **data, allow_pickle=False) 173 | else: 174 | try: 175 | data = np.load(numpy_path) 176 | n_frame = data["linear"].shape[0] 177 | except: 178 | remove_file(numpy_path) 179 | return _process_utterance(audio_path, data_dir, tokens, loss_coeff) 180 | 181 | return n_frame 182 | 183 | if __name__ == '__main__': 184 | parser = argparse.ArgumentParser(description='spectrogram') 185 | 186 | 
parser.add_argument('metadata_path', type=str) 187 | parser.add_argument('--data_dirname', type=str, default="data") 188 | parser.add_argument('--num_workers', type=int, default=None) 189 | 190 | config = parser.parse_args() 191 | build_from_path(config) 192 | -------------------------------------------------------------------------------- /datasets/kim_anchor/download.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import m3u8 5 | import json 6 | import requests 7 | import subprocess 8 | from functools import partial 9 | from bs4 import BeautifulSoup 10 | from utils import get_encoder_name, parallel_run, makedirs 11 | 12 | BASE_URL = 'http://nsvc.jtbc.joins.com/API/News/Newapp/Default.aspx' 13 | 14 | def soupify(text): 15 | return BeautifulSoup(text, "html.parser") 16 | 17 | def get_news_ids(page_id): 18 | params = { 19 | 'NJC': 'NJC300', 20 | 'CAID': 'NC10011174', 21 | 'PGI': page_id, 22 | } 23 | 24 | response = requests.request( 25 | method='GET', url=BASE_URL, params=params, 26 | ) 27 | soup = soupify(response.text) 28 | 29 | return [item.text for item in soup.find_all('news_id')] 30 | 31 | def download_news_video_and_content( 32 | news_id, base_dir, chunk_size=32*1024, 33 | video_dir="video", asset_dir="assets", audio_dir="audio"): 34 | 35 | video_dir = os.path.join(base_dir, video_dir) 36 | asset_dir = os.path.join(base_dir, asset_dir) 37 | audio_dir = os.path.join(base_dir, audio_dir) 38 | 39 | makedirs(video_dir) 40 | makedirs(asset_dir) 41 | makedirs(audio_dir) 42 | 43 | text_path = os.path.join(asset_dir, "{}.txt".format(news_id)) 44 | original_text_path = os.path.join(asset_dir, "original-{}.txt".format(news_id)) 45 | 46 | video_path = os.path.join(video_dir, "{}.ts".format(news_id)) 47 | audio_path = os.path.join(audio_dir, "{}.wav".format(news_id)) 48 | audio_raw_path = os.path.join(audio_dir + "/../audio_raw/", "{}_raw.wav".format(news_id)) 49 | # prof_path = os.path.join(audio_dir + "/../audio_raw/", "{}.prof".format(news_id)) 50 | 51 | params = { 52 | 'NJC': 'NJC400', 53 | 'NID': news_id, # NB11515152 54 | 'CD': 'A0100', 55 | } 56 | 57 | response = requests.request( 58 | method='GET', url=BASE_URL, params=params, 59 | ) 60 | 61 | soup = soupify(response.text) 62 | 63 | try: 64 | article_title = soup.find_all('article_title') 65 | print(news_id) 66 | print(article_title) 67 | 68 | article_contents = soup.find_all('article_contents') 69 | 70 | assert len(article_contents) == 1, \ 71 | "# of of {} should be 1: {}".format(news_id, response.text) 72 | 73 | text = soupify(article_contents[0].text).get_text() # remove
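        # The scraped article text still contains bracketed reporter quotes, promotional
        # "☞ ... http ..." lines, and raw URLs.  The block below first writes the original
        # text to disk, then strips those patterns with regular expressions, splits the
        # result into sentences with NLTK's sent_tokenize, and drops parenthesized asides
        # before saving one sentence per line.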
74 | 75 | with open(original_text_path, "w") as f: 76 | f.write(text) 77 | 78 | with open(text_path, "w") as f: 79 | from nltk import sent_tokenize 80 | 81 | text = re.sub(r'\[.{0,80} :\s.+]', '', text) # remove quote 82 | text = re.sub(r'☞.+http.+\)', '', text) # remove quote 83 | text = re.sub(r'\(https?:\/\/.*[\r\n]*\)', '', text) # remove url 84 | 85 | sentences = sent_tokenize(text) 86 | sentences = [sent for sentence in sentences for sent in sentence.split('\n') if sent] 87 | 88 | new_texts = [] 89 | for sent in sentences: 90 | sent = sent.strip() 91 | sent = re.sub(r'\([^)]*\)', '', sent) 92 | #sent = re.sub(r'\<.{0,80}\>', '', sent) 93 | sent = sent.replace('…', '.') 94 | new_texts.append(sent) 95 | 96 | f.write("\n".join([sent for sent in new_texts if sent])) 97 | 98 | vod_paths = soup.find_all('vod_path') 99 | 100 | assert len(vod_paths) == 1, \ 101 | "# of of {} should be 1: {}".format(news_id, response.text) 102 | 103 | if not os.path.exists(video_path): 104 | redirect_url = soup.find_all('vod_path')[0].text 105 | 106 | list_url = m3u8.load(redirect_url).playlists[0].absolute_uri 107 | video_urls = [segment.absolute_uri for segment in m3u8.load(list_url).segments] 108 | 109 | with open(video_path, "wb") as f: 110 | for url in video_urls: 111 | response = requests.get(url, stream=True) 112 | total_size = int(response.headers.get('content-length', 0)) 113 | 114 | for chunk in response.iter_content(chunk_size): 115 | if chunk: # filter out keep-alive new chunks 116 | f.write(chunk) 117 | 118 | if not os.path.exists(audio_path): 119 | encoder = get_encoder_name() 120 | # 영상의 오디오 추출 121 | command = '{} -y -loglevel panic -i {} -ab 160k {}'.\ 122 | format(encoder, video_path, audio_path) 123 | subprocess.call(command, shell=True) 124 | 125 | 126 | # 잡음 제거 127 | command = 'noiseclean/noiseclean.sh {} {} '.format(audio_raw_path, audio_path) 128 | subprocess.call(command, shell=True) 129 | 130 | ################### 131 | 132 | # # prof 파일 생성 133 | # command = 'sox {} -n noiseprof {} '.\ 134 | # format(audio_raw_path, prof_path) 135 | # subprocess.call(command, shell=True) 136 | 137 | # # 잡음 제거 138 | # command = 'sox -S --multi-threaded --buffer 131072 {} {} noisered {} 0.21 norm;'.\ 139 | # format(audio_raw_path, audio_path , prof_path) 140 | # subprocess.call(command, shell=True) 141 | 142 | except Exception as ex: 143 | print (ex) 144 | 145 | return True 146 | 147 | if __name__ == '__main__': 148 | news_ids = [] 149 | page_idx = 1 150 | 151 | base_dir = os.path.dirname(os.path.realpath(__file__)) 152 | news_id_path = os.path.join(base_dir, "news_ids.json") 153 | 154 | if not os.path.exists(news_id_path): 155 | while True: 156 | tmp_ids = get_news_ids(page_idx) 157 | if len(tmp_ids) == 0: 158 | break 159 | 160 | news_ids.extend(tmp_ids) 161 | print(" [*] Download page {}: {}/{}".format(page_idx, len(tmp_ids), len(news_ids))) 162 | 163 | page_idx += 1 164 | 165 | with open(news_id_path, "w") as f: 166 | json.dump(news_ids, f, indent=2, ensure_ascii=False) 167 | else: 168 | with open(news_id_path) as f: 169 | news_ids = json.loads(f.read()) 170 | 171 | exceptions = ["NB10830162"] 172 | news_ids = list(set(news_ids) - set(exceptions)) 173 | 174 | fn = partial(download_news_video_and_content, base_dir=base_dir) 175 | 176 | results = parallel_run( 177 | fn, news_ids, desc="Download news video+text", parallel=True) 178 | -------------------------------------------------------------------------------- /datasets/son/download.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import m3u8 5 | import json 6 | import requests 7 | import subprocess 8 | from functools import partial 9 | from bs4 import BeautifulSoup 10 | 11 | from utils import get_encoder_name, parallel_run, makedirs 12 | 13 | API_URL = 'http://api.jtbc.joins.com/ad/pre/NV10173083' 14 | BASE_URL = 'http://nsvc.jtbc.joins.com/API/News/Newapp/Default.aspx' 15 | 16 | def soupify(text): 17 | return BeautifulSoup(text, "html.parser") 18 | 19 | def get_news_ids(page_id): 20 | params = { 21 | 'NJC': 'NJC300', 22 | 'CAID': 'NC10011174', 23 | 'PGI': page_id, 24 | } 25 | 26 | response = requests.request( 27 | method='GET', url=BASE_URL, params=params, 28 | ) 29 | soup = soupify(response.text) 30 | 31 | return [item.text for item in soup.find_all('news_id')] 32 | 33 | def download_news_video_and_content( 34 | news_id, base_dir, chunk_size=32*1024, 35 | video_dir="video", asset_dir="assets", audio_dir="audio"): 36 | 37 | video_dir = os.path.join(base_dir, video_dir) 38 | asset_dir = os.path.join(base_dir, asset_dir) 39 | audio_dir = os.path.join(base_dir, audio_dir) 40 | 41 | makedirs(video_dir) 42 | makedirs(asset_dir) 43 | makedirs(audio_dir) 44 | 45 | text_path = os.path.join(asset_dir, "{}.txt".format(news_id)) 46 | original_text_path = os.path.join(asset_dir, "original-{}.txt".format(news_id)) 47 | 48 | video_path = os.path.join(video_dir, "{}.ts".format(news_id)) 49 | audio_path = os.path.join(audio_dir, "{}.wav".format(news_id)) 50 | 51 | params = { 52 | 'NJC': 'NJC400', 53 | 'NID': news_id, # NB11515152 54 | 'CD': 'A0100', 55 | } 56 | 57 | response = requests.request( 58 | method='GET', url=BASE_URL, params=params, 59 | ) 60 | soup = soupify(response.text) 61 | 62 | article_contents = soup.find_all('article_contents') 63 | 64 | assert len(article_contents) == 1, \ 65 | "# of of {} should be 1: {}".format(news_id, response.text) 66 | 67 | text = soupify(article_contents[0].text).get_text() # remove
68 | 69 | with open(original_text_path, "w") as f: 70 | f.write(text) 71 | 72 | with open(text_path, "w") as f: 73 | from nltk import sent_tokenize 74 | 75 | text = re.sub(r'\[.{0,80} :\s.+]', '', text) # remove quote 76 | text = re.sub(r'☞.+http.+\)', '', text) # remove quote 77 | text = re.sub(r'\(https?:\/\/.*[\r\n]*\)', '', text) # remove url 78 | 79 | sentences = sent_tokenize(text) 80 | sentences = [sent for sentence in sentences for sent in sentence.split('\n') if sent] 81 | 82 | new_texts = [] 83 | for sent in sentences: 84 | sent = sent.strip() 85 | sent = re.sub(r'\([^)]*\)', '', sent) 86 | #sent = re.sub(r'\<.{0,80}\>', '', sent) 87 | sent = sent.replace('…', '.') 88 | new_texts.append(sent) 89 | 90 | f.write("\n".join([sent for sent in new_texts if sent])) 91 | 92 | vod_paths = soup.find_all('vod_path') 93 | 94 | assert len(vod_paths) == 1, \ 95 | "# of of {} should be 1: {}".format(news_id, response.text) 96 | 97 | if not os.path.exists(video_path): 98 | redirect_url = soup.find_all('vod_path')[0].text 99 | 100 | list_url = m3u8.load(redirect_url).playlists[0].absolute_uri 101 | video_urls = [segment.absolute_uri for segment in m3u8.load(list_url).segments] 102 | 103 | with open(video_path, "wb") as f: 104 | for url in video_urls: 105 | response = requests.get(url, stream=True) 106 | total_size = int(response.headers.get('content-length', 0)) 107 | 108 | for chunk in response.iter_content(chunk_size): 109 | if chunk: # filter out keep-alive new chunks 110 | f.write(chunk) 111 | 112 | if not os.path.exists(audio_path): 113 | encoder = get_encoder_name() 114 | command = "{} -y -loglevel panic -i {} -ab 160k -ac 2 -ar 44100 -vn {}".\ 115 | format(encoder, video_path, audio_path) 116 | subprocess.call(command, shell=True) 117 | 118 | return True 119 | 120 | if __name__ == '__main__': 121 | news_ids = [] 122 | page_idx = 1 123 | 124 | base_dir = os.path.dirname(os.path.realpath(__file__)) 125 | news_id_path = os.path.join(base_dir, "news_ids.json") 126 | 127 | if not os.path.exists(news_id_path): 128 | while True: 129 | tmp_ids = get_news_ids(page_idx) 130 | if len(tmp_ids) == 0: 131 | break 132 | 133 | news_ids.extend(tmp_ids) 134 | print(" [*] Download page {}: {}/{}".format(page_idx, len(tmp_ids), len(news_ids))) 135 | 136 | page_idx += 1 137 | 138 | with open(news_id_path, "w") as f: 139 | json.dump(news_ids, f, indent=2, ensure_ascii=False) 140 | else: 141 | with open(news_id_path) as f: 142 | news_ids = json.loads(f.read()) 143 | 144 | exceptions = ["NB10830162"] 145 | news_ids = list(set(news_ids) - set(exceptions)) 146 | 147 | fn = partial(download_news_video_and_content, base_dir=base_dir) 148 | 149 | results = parallel_run( 150 | fn, news_ids, desc="Download news video+text", parallel=True) 151 | -------------------------------------------------------------------------------- /datasets/yuinna/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from functools import partial 8 | 9 | from utils import download_with_url, makedirs, parallel_run 10 | 11 | base_path = os.path.dirname(os.path.realpath(__file__)) 12 | RSS_URL = "http://enabler.kbs.co.kr/api/podcast_channel/feed.xml?channel_id=R2010-0440" 13 | 14 | def itunes_download(item): 15 | audio_dir = os.path.join(base_path, "audio") 16 | 17 | date, url = item 18 | path = os.path.join(audio_dir, "{}.mp4".format(date)) 19 | 20 | if not os.path.exists(path): 21 | 
download_with_url(url, path) 22 | 23 | def download_all(config): 24 | audio_dir = os.path.join(base_path, "audio") 25 | makedirs(audio_dir) 26 | 27 | soup = BeautifulSoup(requests.get(RSS_URL).text, "html5lib") 28 | 29 | items = [item for item in soup.find_all('item')] 30 | 31 | titles = [item.find('title').text[9:-3] for item in items] 32 | guids = [item.find('guid').text for item in items] 33 | 34 | accept_list = ['친절한 인나씨', '반납예정일', '귀욤열매 드세요'] 35 | 36 | new_guids = [guid for title, guid in zip(titles, guids) \ 37 | if any(accept in title for accept in accept_list) and '-' not in title] 38 | new_titles = [title for title, _ in zip(titles, guids) \ 39 | if any(accept in title for accept in accept_list) and '-' not in title] 40 | 41 | for idx, title in enumerate(new_titles): 42 | print(" [{:3d}] {}, {}".format(idx + 1, title, 43 | os.path.basename(new_guids[idx]).split('_')[2])) 44 | if idx == config.max_num: print("="*30) 45 | 46 | urls = { 47 | os.path.basename(guid).split('_')[2]: guid \ 48 | for guid in new_guids[:config.max_num] 49 | } 50 | 51 | parallel_run(itunes_download, urls.items(), 52 | desc=" [*] Itunes download", parallel=True) 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--max_num', default=100, type=int) 57 | config, unparsed = parser.parse_known_args() 58 | 59 | download_all(config) 60 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/carpedm20/DCGAN-tensorflow/blob/master/download.py 2 | 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import gzip 7 | import json 8 | import tarfile 9 | import zipfile 10 | import argparse 11 | import requests 12 | from tqdm import tqdm 13 | from six.moves import urllib 14 | 15 | from utils import query_yes_no 16 | 17 | parser = argparse.ArgumentParser(description='Download model checkpoints.') 18 | parser.add_argument('checkpoints', metavar='N', type=str, nargs='+', choices=['son', 'park'], 19 | help='name of checkpoints to download [son, park]') 20 | 21 | def download(url, dirpath): 22 | filename = url.split('/')[-1] 23 | filepath = os.path.join(dirpath, filename) 24 | u = urllib.request.urlopen(url) 25 | f = open(filepath, 'wb') 26 | filesize = int(u.headers["Content-Length"]) 27 | print("Downloading: %s Bytes: %s" % (filename, filesize)) 28 | 29 | downloaded = 0 30 | block_sz = 8192 31 | status_width = 70 32 | while True: 33 | buf = u.read(block_sz) 34 | if not buf: 35 | print('') 36 | break 37 | else: 38 | print('', end='\r') 39 | downloaded += len(buf) 40 | f.write(buf) 41 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") % 42 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. 
/ filesize)) 43 | print(status, end='') 44 | sys.stdout.flush() 45 | f.close() 46 | return filepath 47 | 48 | def download_file_from_google_drive(id, destination): 49 | URL = "https://docs.google.com/uc?export=download" 50 | session = requests.Session() 51 | 52 | response = session.get(URL, params={ 'id': id }, stream=True) 53 | token = get_confirm_token(response) 54 | 55 | if token: 56 | params = { 'id' : id, 'confirm' : token } 57 | response = session.get(URL, params=params, stream=True) 58 | 59 | save_response_content(response, destination) 60 | 61 | def get_confirm_token(response): 62 | for key, value in response.cookies.items(): 63 | if key.startswith('download_warning'): 64 | return value 65 | return None 66 | 67 | def save_response_content(response, destination, chunk_size=32*1024): 68 | total_size = int(response.headers.get('content-length', 0)) 69 | with open(destination, "wb") as f: 70 | for chunk in tqdm(response.iter_content(chunk_size), total=total_size, 71 | unit='B', unit_scale=True, desc=destination): 72 | if chunk: # filter out keep-alive new chunks 73 | f.write(chunk) 74 | 75 | def unzip(filepath): 76 | print("Extracting: " + filepath) 77 | dirpath = os.path.dirname(filepath) 78 | with zipfile.ZipFile(filepath) as zf: 79 | zf.extractall(dirpath) 80 | os.remove(filepath) 81 | 82 | def download_checkpoint(checkpoint): 83 | if checkpoint == "son": 84 | save_path, drive_id = "son-20171015.tar.gz", "0B_7wC-DuR6ORcmpaY1A5V1AzZUU" 85 | elif checkpoint == "park": 86 | save_path, drive_id = "park-20171015.tar.gz", "0B_7wC-DuR6ORYjhlekl5bVlkQ2c" 87 | else: 88 | raise Exception(" [!] Unknown checkpoint: {}".format(checkpoint)) 89 | 90 | if os.path.exists(save_path): 91 | print('[*] {} already exists'.format(save_path)) 92 | else: 93 | download_file_from_google_drive(drive_id, save_path) 94 | 95 | if save_path.endswith(".zip"): 96 | zip_dir = '' 97 | with zipfile.ZipFile(save_path) as zf: 98 | zip_dir = zf.namelist()[0] 99 | zf.extractall(dirpath) 100 | os.remove(save_path) 101 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, data_dir)) 102 | elif save_path.endswith("tar.gz"): 103 | tar = tarfile.open(save_path, "r:gz") 104 | tar.extractall() 105 | tar.close() 106 | elif save_path.endswith("tar"): 107 | tar = tarfile.open(save_path, "r:") 108 | tar.extractall() 109 | tar.close() 110 | 111 | if __name__ == '__main__': 112 | args = parser.parse_args() 113 | 114 | print(" [!] The pre-trained models are being made available for research purpose only") 115 | print(" [!] 학습된 모델을 연구 이외의 목적으로 사용하는 것을 금지합니다.") 116 | print() 117 | 118 | if query_yes_no(" [?] Are you agree on this? 
이에 동의하십니까?"): 119 | if 'park' in args.checkpoints: 120 | download_checkpoint('park') 121 | if 'son' in args.checkpoints: 122 | download_checkpoint('son') 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import math 4 | import argparse 5 | from glob import glob 6 | 7 | from synthesizer import Synthesizer 8 | from train import create_batch_inputs_from_texts 9 | from utils import makedirs, str2bool, backup_file 10 | from hparams import hparams, hparams_debug_string 11 | 12 | 13 | texts = [ 14 | '텍스트를 음성으로 읽어주는 "음성합성" 기술은 시각 장애인을 위한 오디오북, 음성 안내 시스템, 대화 인공지능 등 많은 분야에 활용할 수 있습니다.', 15 | "하지만 개인이 원하는 목소리로 음성합성 엔진을 만들기에는 어려움이 많았고 소수의 기업만이 기술을 보유하고 있었습니다.", 16 | "최근 딥러닝 기술의 발전은 음성합성 기술의 진입 장벽을 많이 낮췄고 이제는 누구나 손쉽게 음성합성 엔진을 만들 수 있게 되었습니다.", 17 | 18 | "본 세션에서는 딥러닝을 활용한 음성합성 기술을 소개하고 개발 경험과 그 과정에서 얻었던 팁을 공유하고자 합니다.", 19 | "음성합성 엔진을 구현하는데 사용한 세 가지 연구를 소개하고 각각의 기술이 얼마나 자연스러운 목소리를 만들 수 있는지를 공유합니다.", 20 | 21 | # Harry Potter 22 | "그리고 헤르미온느는 겁에 질려 마룻바닥에 쓰러져 있었다.", 23 | "그러자 론은 요술지팡이를 꺼냈다. 무엇을 할지도 모르면서 그는 머리에 처음으로 떠오른 주문을 외치고 있었다.", 24 | "윙가르디움 레비오우사.... 하지만, 그렇게 소리쳤다.", 25 | "그러자 그 방망이가 갑자기 트롤의 손에서 벗어나, 저 위로 올라가더니 탁하며 그 주인의 머리 위에 떨어졌다.", 26 | "그러자 트롤이 그 자리에서 비틀거리더니 방 전체를 흔들어버릴 것 같은 커다란 소리를 내며 쿵 하고 넘어졌다. ", 27 | "그러자 조그맣게 펑 하는 소리가 나면서 가장 가까이 있는 가로등이 꺼졌다.", 28 | "그리고 그가 다시 찰깍하자 그 다음 가로등이 깜박거리며 나가 버렸다.", 29 | 30 | #"그가 그렇게 가로등 끄기를 열두번 하자, 이제 그 거리에 남아 있는 불빛이라곤, ", 31 | #"바늘로 꼭 질러둔 것처럼 작게 보이는 멀리서 그를 지켜보고 있는 고양이의 두 눈뿐이었다.", 32 | #"프리벳가 4번지에 살고 있는 더즐리 부부는 자신들이 정상적이라는 것을 아주 자랑스럽게 여기는 사람들이었다. ", 33 | #"그들은 기이하거나 신비스런 일과는 전혀 무관해 보였다.", 34 | #"아니, 그런 터무니없는 것은 도저히 참아내지 못했다.", 35 | #"더즐리 씨는 그루닝스라는 드릴제작회사의 중역이었다.", 36 | #"그는 목이 거의 없을 정도로 살이 뒤룩뒤룩 찐 몸집이 큰 사내로, 코밑에는 커다란 콧수염을 기르고 있었다.", 37 | #"더즐리 부인은 마른 체구의 금발이었고, 목이 보통사람보다 두 배는 길어서, 담 너머로 고개를 쭉 배고 이웃 사람들을 몰래 훔쳐보는 그녀의 취미에는 더없이 제격이었다.", 38 | 39 | # From Yoo Inna's Audiobook (http://campaign.happybean.naver.com/yooinna_audiobook): 40 | #'16세기 중엽 어느 가을날 옛 런던 시의 가난한 캔티 집안에 사내아이 하나가 태어났다.', 41 | #'그런데 그 집안에서는 그 사내아이를 별로 반기지 않았다.', 42 | #'바로 같은 날 또 한 명의 사내아이가 영국의 부유한 튜터 가문에서 태어났다.', 43 | #'그런데 그 가문에서는 그 아이를 무척이나 반겼다.', 44 | #'온 영국이 다 함께 그 아이를 반겼다.', 45 | 46 | ## From NAVER's Audiobook (http://campaign.happybean.naver.com/yooinna_audiobook): 47 | #'부랑자 패거리는 이른 새벽에 일찍 출발하여 길을 떠났다.', 48 | #'하늘은 찌푸렸고, 발밑의 땅은 질퍽거렸으며, 겨울의 냉기가 공기 중에 감돌았다.', 49 | #'지난밤의 흥겨움은 온데간데없이 사라졌다.', 50 | #'시무룩하게 말이 없는 사람들도 있었고, 안달복달하며 조바심을 내는 사람들도 있었지만, 기분이 좋은 사람은 하나도 없었다.', 51 | 52 | ## From NAVER's nVoice example (https://www.facebook.com/naverlabs/videos/422780217913446): 53 | #'감사합니다. 
Devsisters 김태훈 님의 발표였습니다.', 54 | #'이것으로 금일 마련된 track 2의 모든 세션이 종료되었습니다.', 55 | #'장시간 끝까지 참석해주신 개발자 여러분들께 진심으로 감사의 말씀을 드리며,', 56 | #'잠시 후 5시 15분부터 특정 주제에 관심 있는 사람들이 모여 자유롭게 이야기하는 오프미팅이 진행될 예정이므로', 57 | #'참여신청을 해주신 분들은 진행 요원의 안내에 따라 이동해주시기 바랍니다.', 58 | 59 | ## From Kakao's Son Seok hee example (https://www.youtube.com/watch?v=ScfdAH2otrY): 60 | #'소설가 마크 트웨인이 말했습니다.', 61 | #'인생에 가장 중요한 이틀이 있는데, 하나는 세상에 태어난 날이고 다른 하나는 왜 이 세상에 왔는가를 깨닫는 날이다.', 62 | #'그런데 그 첫번째 날은 누구나 다 알지만 두번째 날은 참 어려운 것 같습니다.', 63 | #'누구나 그 두번째 날을 만나기 위해 애쓰는게 삶인지도 모르겠습니다.', 64 | #'뉴스룸도 그런 면에서 똑같습니다.', 65 | #'저희들도 그 두번째의 날을 만나고 기억하기 위해 매일 매일 최선을 다하겠습니다.', 66 | ] 67 | 68 | 69 | def get_output_base_path(load_path, eval_dirname="eval"): 70 | if not os.path.isdir(load_path): 71 | base_dir = os.path.dirname(load_path) 72 | else: 73 | base_dir = load_path 74 | 75 | base_dir = os.path.join(base_dir, eval_dirname) 76 | if os.path.exists(base_dir): 77 | backup_file(base_dir) 78 | makedirs(base_dir) 79 | 80 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(load_path) 81 | base_path = os.path.join(base_dir, 82 | 'eval-%d' % int(m.group(1)) if m else 'eval') 83 | return base_path 84 | 85 | 86 | def run_eval(args): 87 | print(hparams_debug_string()) 88 | 89 | load_paths = glob(args.load_path_pattern) 90 | 91 | for load_path in load_paths: 92 | if not os.path.exists(os.path.join(load_path, "checkpoint")): 93 | print(" [!] Skip non model directory: {}".format(load_path)) 94 | continue 95 | 96 | synth = Synthesizer() 97 | synth.load(load_path) 98 | 99 | for speaker_id in range(synth.num_speakers): 100 | base_path = get_output_base_path(load_path, "eval-{}".format(speaker_id)) 101 | 102 | inputs, input_lengths = create_batch_inputs_from_texts(texts) 103 | 104 | for idx in range(math.ceil(len(inputs) / args.batch_size)): 105 | start_idx, end_idx = idx*args.batch_size, (idx+1)*args.batch_size 106 | 107 | cur_texts = texts[start_idx:end_idx] 108 | cur_inputs = inputs[start_idx:end_idx] 109 | 110 | synth.synthesize( 111 | texts=cur_texts, 112 | speaker_ids=[speaker_id] * len(cur_texts), 113 | tokens=cur_inputs, 114 | base_path="{}-{}".format(base_path, idx), 115 | manual_attention_mode=args.manual_attention_mode, 116 | base_alignment_path=args.base_alignment_path, 117 | ) 118 | 119 | synth.close() 120 | 121 | def main(): 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--batch_size', default=16) 124 | parser.add_argument('--load_path_pattern', required=True) 125 | parser.add_argument('--base_alignment_path', default=None) 126 | parser.add_argument('--manual_attention_mode', default=0, type=int, 127 | help="0: None, 1: Argmax, 2: Sharpening, 3. Pruning") 128 | parser.add_argument('--hparams', default='', 129 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 130 | args = parser.parse_args() 131 | 132 | #hparams.max_iters = 100 133 | #hparams.parse(args.hparams) 134 | run_eval(args) 135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | SCALE_FACTOR = 1 4 | 5 | def f(num): 6 | return num // SCALE_FACTOR 7 | 8 | basic_params = { 9 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 10 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 
11 | 'cleaners': 'korean_cleaners' #originally korean_cleaners 12 | } 13 | 14 | basic_params.update({ 15 | # Audio 16 | 'num_mels': 80, 17 | 'num_freq': 1025, 18 | 'sample_rate': 24000, # trained as 20000 but need to be 24000 19 | 'frame_length_ms': 50, 20 | 'frame_shift_ms': 12.5, 21 | 'preemphasis': 0.97, 22 | 'min_level_db': -100, 23 | 'ref_level_db': 20, 24 | }) 25 | 26 | if True: 27 | basic_params.update({ 28 | 'sample_rate': 22050, #originally 24000 (krbook), 22050(lj-data), 20000(others) 29 | }) 30 | 31 | basic_params.update({ 32 | # Model 33 | 'model_type': 'single', # [single, simple, deepvoice] 34 | 'speaker_embedding_size': f(16), 35 | 36 | 'embedding_size': f(256), 37 | 'dropout_prob': 0.5, 38 | 39 | # Encoder 40 | 'enc_prenet_sizes': [f(256), f(128)], 41 | 'enc_bank_size': 16, 42 | 'enc_bank_channel_size': f(128), 43 | 'enc_maxpool_width': 2, 44 | 'enc_highway_depth': 4, 45 | 'enc_rnn_size': f(128), 46 | 'enc_proj_sizes': [f(128), f(128)], 47 | 'enc_proj_width': 3, 48 | 49 | # Attention 50 | 'attention_type': 'bah_mon', # ntm2-5 51 | 'attention_size': f(256), 52 | 'attention_state_size': f(256), 53 | 54 | # Decoder recurrent network 55 | 'dec_layer_num': 2, 56 | 'dec_rnn_size': f(256), 57 | 58 | # Decoder 59 | 'dec_prenet_sizes': [f(256), f(128)], 60 | 'post_bank_size': 8, 61 | 'post_bank_channel_size': f(256), 62 | 'post_maxpool_width': 2, 63 | 'post_highway_depth': 4, 64 | 'post_rnn_size': f(128), 65 | 'post_proj_sizes': [f(256), 80], # num_mels=80 66 | 'post_proj_width': 3, 67 | 68 | 'reduction_factor': 4, 69 | }) 70 | 71 | if False: # Deep Voice 2 AudioBook Dataset 72 | basic_params.update({ 73 | 'dropout_prob': 0.8, 74 | 75 | 'attention_size': f(512), 76 | 77 | 'dec_prenet_sizes': [f(256), f(128), f(64)], 78 | 'post_bank_channel_size': f(512), 79 | 'post_rnn_size': f(256), 80 | 81 | 'reduction_factor': 5, # changed from 4 82 | }) 83 | elif False: # Deep Voice 2 VCTK dataset 84 | basic_params.update({ 85 | 'dropout_prob': 0.8, 86 | 87 | #'attention_size': f(512), 88 | 89 | #'dec_prenet_sizes': [f(256), f(128)], 90 | #'post_bank_channel_size': f(512), 91 | 'post_rnn_size': f(256), 92 | 93 | 'reduction_factor': 5, 94 | }) 95 | elif True: # Single Speaker 96 | basic_params.update({ 97 | 'dropout_prob': 0.5, 98 | 99 | 'attention_size': f(128), 100 | 101 | 'post_bank_channel_size': f(128), 102 | #'post_rnn_size': f(128), 103 | 104 | 'reduction_factor': 5, #chhanged from 4 105 | }) 106 | elif False: # Single Speaker with generalization 107 | basic_params.update({ 108 | 'dropout_prob': 0.8, 109 | 110 | 'attention_size': f(256), 111 | 112 | 'dec_prenet_sizes': [f(256), f(128), f(64)], 113 | 'post_bank_channel_size': f(128), 114 | 'post_rnn_size': f(128), 115 | 116 | 'reduction_factor': 4, 117 | }) 118 | 119 | 120 | basic_params.update({ 121 | # Training 122 | 'batch_size': 32, 123 | 'adam_beta1': 0.9, 124 | 'adam_beta2': 0.999, 125 | 'use_fixed_test_inputs': False, 126 | 127 | 'initial_learning_rate': 0.001, 128 | 'decay_learning_rate_mode': 0, # True in deepvoice2 paper 129 | 'initial_data_greedy': True, 130 | 'initial_phase_step': 8000, 131 | 'main_data_greedy_factor': 0, 132 | 'main_data': [''], 133 | 'prioritize_loss': False, 134 | 135 | 'recognition_loss_coeff': 0.2, 136 | 'ignore_recognition_level': 0, # 0: use all, 1: ignore only unmatched_alignment, 2: fully ignore recognition 137 | 138 | # Eval 139 | 'min_tokens': 50,#originally 50, 30 is good for korean, 140 | 'min_iters': 30, 141 | 'max_iters': 200, 142 | 'skip_inadequate': False, 143 | 144 | 'griffin_lim_iters': 60, 
145 | 'power': 1.5, # Power to raise magnitudes to prior to Griffin-Lim 146 | }) 147 | 148 | 149 | # Default hyperparameters: 150 | hparams = tf.contrib.training.HParams(**basic_params) 151 | 152 | 153 | def hparams_debug_string(): 154 | values = hparams.values() 155 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 156 | return 'Hyperparameters:\n' + '\n'.join(hp) 157 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from .tacotron import Tacotron 4 | 5 | 6 | def create_model(hparams): 7 | return Tacotron(hparams) 8 | 9 | 10 | def get_most_recent_checkpoint(checkpoint_dir): 11 | checkpoint_paths = [path for path in glob("{}/*.ckpt-*.data-*".format(checkpoint_dir))] 12 | idxes = [int(os.path.basename(path).split('-')[1].split('.')[0]) for path in checkpoint_paths] 13 | 14 | max_idx = max(idxes) 15 | lastest_checkpoint = os.path.join(checkpoint_dir, "model.ckpt-{}".format(max_idx)) 16 | 17 | #latest_checkpoint=checkpoint_paths[0] 18 | print(" [*] Found lastest checkpoint: {}".format(lastest_checkpoint)) 19 | return lastest_checkpoint 20 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib.seq2seq import Helper 6 | 7 | 8 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 9 | class TacoTestHelper(Helper): 10 | def __init__(self, batch_size, output_dim, r): 11 | with tf.name_scope('TacoTestHelper'): 12 | self._batch_size = batch_size 13 | self._output_dim = output_dim 14 | self._end_token = tf.tile([0.0], [output_dim * r]) 15 | 16 | @property 17 | def batch_size(self): 18 | return self._batch_size 19 | 20 | def initialize(self, name=None): 21 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 22 | 23 | def sample(self, time, outputs, state, name=None): 24 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 25 | 26 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 27 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 28 | with tf.name_scope('TacoTestHelper'): 29 | finished = tf.reduce_all(tf.equal(outputs, self._end_token), axis=1) 30 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 31 | next_inputs = outputs[:, -self._output_dim:] 32 | return (finished, next_inputs, state) 33 | 34 | 35 | class TacoTrainingHelper(Helper): 36 | def __init__(self, inputs, targets, output_dim, r, rnn_decoder_test_mode=False): 37 | # inputs is [N, T_in], targets is [N, T_out, D] 38 | with tf.name_scope('TacoTrainingHelper'): 39 | self._batch_size = tf.shape(inputs)[0] 40 | self._output_dim = output_dim 41 | self._rnn_decoder_test_mode = rnn_decoder_test_mode 42 | 43 | # Feed every r-th target frame as input 44 | self._targets = targets[:, r-1::r, :] 45 | 46 | # Use full length for every target because we don't want to mask the padding frames 47 | num_steps = tf.shape(self._targets)[1] 48 | self._lengths = tf.tile([num_steps], [self._batch_size]) 49 | 50 | @property 51 | def batch_size(self): 52 | return self._batch_size 53 | 54 | def initialize(self, name=None): 55 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 56 | 57 | def sample(self, time, outputs, state, name=None): 58 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 59 | 60 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 61 | with tf.name_scope(name or 'TacoTrainingHelper'): 62 | finished = (time + 1 >= self._lengths) 63 | if self._rnn_decoder_test_mode: 64 | next_inputs = outputs[:, -self._output_dim:] 65 | else: 66 | next_inputs = self._targets[:, time, :] 67 | return (finished, next_inputs, state) 68 | 69 | 70 | def _go_frames(batch_size, output_dim): 71 | '''Returns all-zero frames for a given batch size and output dimension''' 72 | return tf.tile([[0.0]], [batch_size, output_dim]) 73 | 74 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.rnn import GRUCell 5 | from tensorflow.python.layers import core 6 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper \ 7 | import _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention, \ 8 | AttentionWrapper, AttentionWrapperState 9 | 10 | 11 | def get_embed(inputs, num_inputs, embed_size, name): 12 | embed_table = tf.get_variable( 13 | name, [num_inputs, embed_size], dtype=tf.float32, 14 | initializer=tf.truncated_normal_initializer(stddev=0.1)) 15 | return tf.nn.embedding_lookup(embed_table, inputs) 16 | 17 | 18 | def prenet(inputs, is_training, layer_sizes, drop_prob, scope=None): 19 | x = inputs 20 | drop_rate = drop_prob if is_training else 0.0 21 | with tf.variable_scope(scope or 'prenet'): 22 | for i, size in enumerate(layer_sizes): 23 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1)) 24 | x = tf.layers.dropout(dense, rate=drop_rate, name='dropout_%d' % (i+1)) 25 | return x 26 | 27 | def cbhg(inputs, input_lengths, is_training, 28 | bank_size, bank_channel_size, 29 | maxpool_width, highway_depth, rnn_size, 30 | proj_sizes, proj_width, scope, 31 | before_highway=None, encoder_rnn_init_state=None): 32 | 33 | batch_size = tf.shape(inputs)[0] 34 | with tf.variable_scope(scope): 35 | with tf.variable_scope('conv_bank'): 36 | # Convolution bank: concatenate on the last axis 37 | # to stack channels from all convolutions 38 | conv_fn = lambda k: \ 39 | conv1d(inputs, k, bank_channel_size, 40 | tf.nn.relu, is_training, 'conv1d_%d' % k) 41 
| 42 | conv_outputs = tf.concat( 43 | [conv_fn(k) for k in range(1, bank_size+1)], axis=-1, 44 | ) 45 | 46 | # Maxpooling: 47 | maxpool_output = tf.layers.max_pooling1d( 48 | conv_outputs, 49 | pool_size=maxpool_width, 50 | strides=1, 51 | padding='same') 52 | 53 | # Two projection layers: 54 | proj_out = maxpool_output 55 | for idx, proj_size in enumerate(proj_sizes): 56 | activation_fn = None if idx == len(proj_sizes) - 1 else tf.nn.relu 57 | proj_out = conv1d( 58 | proj_out, proj_width, proj_size, activation_fn, 59 | is_training, 'proj_{}'.format(idx + 1)) 60 | 61 | # Residual connection: 62 | if before_highway is not None: 63 | expanded_before_highway = tf.expand_dims(before_highway, [1]) 64 | tiled_before_highway = tf.tile( 65 | expanded_before_highway, [1, tf.shape(proj_out)[1], 1]) 66 | 67 | highway_input = proj_out + inputs + tiled_before_highway 68 | else: 69 | highway_input = proj_out + inputs 70 | 71 | # Handle dimensionality mismatch: 72 | if highway_input.shape[2] != rnn_size: 73 | highway_input = tf.layers.dense(highway_input, rnn_size) 74 | 75 | # 4-layer HighwayNet: 76 | for idx in range(highway_depth): 77 | highway_input = highwaynet(highway_input, 'highway_%d' % (idx+1)) 78 | 79 | rnn_input = highway_input 80 | 81 | # Bidirectional RNN 82 | if encoder_rnn_init_state is not None: 83 | initial_state_fw, initial_state_bw = \ 84 | tf.split(encoder_rnn_init_state, 2, 1) 85 | else: 86 | initial_state_fw, initial_state_bw = None, None 87 | 88 | cell_fw, cell_bw = GRUCell(rnn_size), GRUCell(rnn_size) 89 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 90 | cell_fw, cell_bw, 91 | rnn_input, 92 | sequence_length=input_lengths, 93 | initial_state_fw=initial_state_fw, 94 | initial_state_bw=initial_state_bw, 95 | dtype=tf.float32) 96 | return tf.concat(outputs, axis=2) # Concat forward and backward 97 | 98 | 99 | def batch_tile(tensor, batch_size): 100 | expaneded_tensor = tf.expand_dims(tensor, [0]) 101 | return tf.tile(expaneded_tensor, \ 102 | [batch_size] + [1 for _ in tensor.get_shape()]) 103 | 104 | 105 | def highwaynet(inputs, scope): 106 | highway_dim = int(inputs.get_shape()[-1]) 107 | 108 | with tf.variable_scope(scope): 109 | H = tf.layers.dense( 110 | inputs, 111 | units=highway_dim, 112 | activation=tf.nn.relu, 113 | name='H') 114 | T = tf.layers.dense( 115 | inputs, 116 | units=highway_dim, 117 | activation=tf.nn.sigmoid, 118 | name='T', 119 | bias_initializer=tf.constant_initializer(-1.0)) 120 | return H * T + inputs * (1.0 - T) 121 | 122 | 123 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 124 | with tf.variable_scope(scope): 125 | conv1d_output = tf.layers.conv1d( 126 | inputs, 127 | filters=channels, 128 | kernel_size=kernel_size, 129 | activation=activation, 130 | padding='same') 131 | return tf.layers.batch_normalization(conv1d_output, training=is_training) 132 | -------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, BahdanauMonotonicAttention 6 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 7 | 8 | from utils.infolog import log 9 | from text.symbols import symbols 10 | 11 | from .modules import * 12 | from .helpers import TacoTestHelper, 
TacoTrainingHelper 13 | from .rnn_wrappers import AttentionWrapper, DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 14 | 15 | 16 | class Tacotron(): 17 | def __init__(self, hparams): 18 | self._hparams = hparams 19 | 20 | 21 | def initialize( 22 | self, inputs, input_lengths, num_speakers, speaker_id, 23 | mel_targets=None, linear_targets=None, loss_coeff=None, 24 | rnn_decoder_test_mode=False, is_randomly_initialized=False, 25 | ): 26 | is_training = linear_targets is not None 27 | self.is_randomly_initialized = is_randomly_initialized 28 | 29 | with tf.variable_scope('inference') as scope: 30 | hp = self._hparams 31 | batch_size = tf.shape(inputs)[0] 32 | 33 | # Embeddings 34 | char_embed_table = tf.get_variable( 35 | 'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32, 36 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 37 | # [N, T_in, embedding_size] 38 | char_embedded_inputs = \ 39 | tf.nn.embedding_lookup(char_embed_table, inputs) 40 | 41 | self.num_speakers = num_speakers 42 | if self.num_speakers > 1: 43 | if hp.speaker_embedding_size != 1: 44 | speaker_embed_table = tf.get_variable( 45 | 'speaker_embedding', 46 | [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, 47 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 48 | # [N, T_in, speaker_embedding_size] 49 | speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id) 50 | 51 | if hp.model_type == 'deepvoice': 52 | if hp.speaker_embedding_size == 1: 53 | before_highway = get_embed( 54 | speaker_id, self.num_speakers, 55 | hp.enc_prenet_sizes[-1], "before_highway") 56 | encoder_rnn_init_state = get_embed( 57 | speaker_id, self.num_speakers, 58 | hp.enc_rnn_size * 2, "encoder_rnn_init_state") 59 | 60 | attention_rnn_init_state = get_embed( 61 | speaker_id, self.num_speakers, 62 | hp.attention_state_size, "attention_rnn_init_state") 63 | decoder_rnn_init_states = [get_embed( 64 | speaker_id, self.num_speakers, 65 | hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \ 66 | for idx in range(hp.dec_layer_num)] 67 | else: 68 | deep_dense = lambda x, dim: \ 69 | tf.layers.dense(x, dim, activation=tf.nn.softsign) 70 | 71 | before_highway = deep_dense( 72 | speaker_embed, hp.enc_prenet_sizes[-1]) 73 | encoder_rnn_init_state = deep_dense( 74 | speaker_embed, hp.enc_rnn_size * 2) 75 | 76 | attention_rnn_init_state = deep_dense( 77 | speaker_embed, hp.attention_state_size) 78 | decoder_rnn_init_states = [deep_dense( 79 | speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num)] 80 | 81 | speaker_embed = None # deepvoice does not use speaker_embed directly 82 | elif hp.model_type == 'simple': 83 | before_highway = None 84 | encoder_rnn_init_state = None 85 | attention_rnn_init_state = None 86 | decoder_rnn_init_states = None 87 | else: 88 | raise Exception(" [!] 
Unkown multi-speaker model type: {}".format(hp.model_type)) 89 | else: 90 | speaker_embed = None 91 | before_highway = None 92 | encoder_rnn_init_state = None 93 | attention_rnn_init_state = None 94 | decoder_rnn_init_states = None 95 | 96 | ############## 97 | # Encoder 98 | ############## 99 | 100 | # [N, T_in, enc_prenet_sizes[-1]] 101 | prenet_outputs = prenet(char_embedded_inputs, is_training, 102 | hp.enc_prenet_sizes, hp.dropout_prob, 103 | scope='prenet') 104 | 105 | encoder_outputs = cbhg( 106 | prenet_outputs, input_lengths, is_training, 107 | hp.enc_bank_size, hp.enc_bank_channel_size, 108 | hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, 109 | hp.enc_proj_sizes, hp.enc_proj_width, 110 | scope="encoder_cbhg", 111 | before_highway=before_highway, 112 | encoder_rnn_init_state=encoder_rnn_init_state) 113 | 114 | 115 | ############## 116 | # Attention 117 | ############## 118 | 119 | # For manaul control of attention 120 | self.is_manual_attention = tf.placeholder( 121 | tf.bool, shape=(), name='is_manual_attention', 122 | ) 123 | self.manual_alignments = tf.placeholder( 124 | tf.float32, shape=[None, None, None], name="manual_alignments", 125 | ) 126 | 127 | dec_prenet_outputs = DecoderPrenetWrapper( 128 | GRUCell(hp.attention_state_size), 129 | speaker_embed, 130 | is_training, hp.dec_prenet_sizes, hp.dropout_prob) 131 | 132 | if hp.attention_type == 'bah_mon': 133 | attention_mechanism = BahdanauMonotonicAttention( 134 | hp.attention_size, encoder_outputs) 135 | elif hp.attention_type == 'bah_norm': 136 | attention_mechanism = BahdanauAttention( 137 | hp.attention_size, encoder_outputs, normalize=True) 138 | elif hp.attention_type == 'luong_scaled': 139 | attention_mechanism = LuongAttention( 140 | hp.attention_size, encoder_outputs, scale=True) 141 | elif hp.attention_type == 'luong': 142 | attention_mechanism = LuongAttention( 143 | hp.attention_size, encoder_outputs) 144 | elif hp.attention_type == 'bah': 145 | attention_mechanism = BahdanauAttention( 146 | hp.attention_size, encoder_outputs) 147 | elif hp.attention_type.startswith('ntm2'): 148 | shift_width = int(hp.attention_type.split('-')[-1]) 149 | attention_mechanism = NTMAttention2( 150 | hp.attention_size, encoder_outputs, shift_width=shift_width) 151 | else: 152 | raise Exception(" [!] Unkown attention type: {}".format(hp.attention_type)) 153 | 154 | attention_cell = AttentionWrapper( 155 | dec_prenet_outputs, 156 | attention_mechanism, 157 | self.is_manual_attention, 158 | self.manual_alignments, 159 | initial_cell_state=attention_rnn_init_state, 160 | alignment_history=True, 161 | output_attention=False 162 | ) 163 | 164 | # Concatenate attention context vector and RNN cell output into a 512D vector. 
165 | # [N, T_in, attention_size+attention_state_size] 166 | concat_cell = ConcatOutputAndAttentionWrapper( 167 | attention_cell, embed_to_concat=speaker_embed) 168 | 169 | # Decoder (layers specified bottom to top): 170 | cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)] 171 | for _ in range(hp.dec_layer_num): 172 | cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) 173 | 174 | # [N, T_in, 256] 175 | decoder_cell = MultiRNNCell(cells, state_is_tuple=True) 176 | 177 | # Project onto r mel spectrograms (predict r outputs at each RNN step): 178 | output_cell = OutputProjectionWrapper( 179 | decoder_cell, hp.num_mels * hp.reduction_factor) 180 | decoder_init_state = output_cell.zero_state( 181 | batch_size=batch_size, dtype=tf.float32) 182 | 183 | if hp.model_type == "deepvoice": 184 | # decoder_init_state[0] : AttentionWrapperState 185 | # = cell_state + attention + time + alignments + alignment_history 186 | # decoder_init_state[0][0] = attention_rnn_init_state (already applied) 187 | decoder_init_state = list(decoder_init_state) 188 | 189 | for idx, cell in enumerate(decoder_rnn_init_states): 190 | shape1 = decoder_init_state[idx + 1].get_shape().as_list() 191 | shape2 = cell.get_shape().as_list() 192 | if shape1 != shape2: 193 | raise Exception(" [!] Shape {} and {} should be equal". \ 194 | format(shape1, shape2)) 195 | decoder_init_state[idx + 1] = cell 196 | 197 | decoder_init_state = tuple(decoder_init_state) 198 | 199 | if is_training: 200 | helper = TacoTrainingHelper( 201 | inputs, mel_targets, hp.num_mels, hp.reduction_factor, 202 | rnn_decoder_test_mode) 203 | else: 204 | helper = TacoTestHelper( 205 | batch_size, hp.num_mels, hp.reduction_factor) 206 | 207 | (decoder_outputs, _), final_decoder_state, _ = \ 208 | tf.contrib.seq2seq.dynamic_decode( 209 | BasicDecoder(output_cell, helper, decoder_init_state), 210 | maximum_iterations=hp.max_iters) 211 | 212 | # [N, T_out, M] 213 | mel_outputs = tf.reshape( 214 | decoder_outputs, [batch_size, -1, hp.num_mels]) 215 | 216 | # Add post-processing CBHG: 217 | # [N, T_out, 256] 218 | #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) 219 | post_outputs = cbhg( 220 | mel_outputs, None, is_training, 221 | hp.post_bank_size, hp.post_bank_channel_size, 222 | hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, 223 | hp.post_proj_sizes, hp.post_proj_width, 224 | scope='post_cbhg') 225 | 226 | if speaker_embed is not None and hp.model_type == 'simple': 227 | expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) 228 | tiled_speaker_embedding = tf.tile( 229 | expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) 230 | 231 | # [N, T_out, 256 + alpha] 232 | post_outputs = \ 233 | tf.concat([tiled_speaker_embedding, post_outputs], axis=-1) 234 | 235 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] 236 | 237 | # Grab alignments from the final decoder state: 238 | alignments = tf.transpose( 239 | final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) 240 | 241 | 242 | self.inputs = inputs 243 | self.speaker_id = speaker_id 244 | self.input_lengths = input_lengths 245 | self.loss_coeff = loss_coeff 246 | self.mel_outputs = mel_outputs 247 | self.linear_outputs = linear_outputs 248 | self.alignments = alignments 249 | self.mel_targets = mel_targets 250 | self.linear_targets = linear_targets 251 | self.final_decoder_state = final_decoder_state 252 | 253 | log('='*40) 254 | log(' model_type: %s' % hp.model_type) 255 | log('='*40) 256 | 257 | log('Initialized Tacotron 
model. Dimensions: ') 258 | log(' embedding: %d' % char_embedded_inputs.shape[-1]) 259 | if speaker_embed is not None: 260 | log(' speaker embedding: %d' % speaker_embed.shape[-1]) 261 | else: 262 | log(' speaker embedding: None') 263 | log(' prenet out: %d' % prenet_outputs.shape[-1]) 264 | log(' encoder out: %d' % encoder_outputs.shape[-1]) 265 | log(' attention out: %d' % attention_cell.output_size) 266 | log(' concat attn & out: %d' % concat_cell.output_size) 267 | log(' decoder cell out: %d' % decoder_cell.output_size) 268 | log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) 269 | log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 270 | log(' postnet out: %d' % post_outputs.shape[-1]) 271 | log(' linear out: %d' % linear_outputs.shape[-1]) 272 | 273 | 274 | def add_loss(self): 275 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 276 | with tf.variable_scope('loss') as scope: 277 | hp = self._hparams 278 | mel_loss = tf.abs(self.mel_targets - self.mel_outputs) 279 | 280 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 281 | expanded_loss_coeff = tf.expand_dims( 282 | tf.expand_dims(self.loss_coeff, [-1]), [-1]) 283 | 284 | if hp.prioritize_loss: 285 | # Prioritize loss for frequencies. 286 | upper_priority_freq = int(5000 / (hp.sample_rate * 0.5) * hp.num_freq) 287 | lower_priority_freq = int(165 / (hp.sample_rate * 0.5) * hp.num_freq) 288 | 289 | l1_priority= l1[:,:,lower_priority_freq:upper_priority_freq] 290 | 291 | self.loss = tf.reduce_mean(mel_loss * expanded_loss_coeff) + \ 292 | 0.5 * tf.reduce_mean(l1 * expanded_loss_coeff) + \ 293 | 0.5 * tf.reduce_mean(l1_priority * expanded_loss_coeff) 294 | self.linear_loss = tf.reduce_mean( 295 | 0.5 * (tf.reduce_mean(l1) + tf.reduce_mean(l1_priority))) 296 | else: 297 | self.loss = tf.reduce_mean(mel_loss * expanded_loss_coeff) + \ 298 | tf.reduce_mean(l1 * expanded_loss_coeff) 299 | self.linear_loss = tf.reduce_mean(l1) 300 | 301 | self.mel_loss = tf.reduce_mean(mel_loss) 302 | self.loss_without_coeff = self.mel_loss + self.linear_loss 303 | 304 | 305 | def add_optimizer(self, global_step): 306 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 307 | 308 | Args: 309 | global_step: int32 scalar Tensor representing current global step in training 310 | ''' 311 | with tf.variable_scope('optimizer') as scope: 312 | hp = self._hparams 313 | 314 | step = tf.cast(global_step + 1, dtype=tf.float32) 315 | 316 | if hp.decay_learning_rate_mode == 0: 317 | if self.is_randomly_initialized: 318 | warmup_steps = 4000.0 319 | else: 320 | warmup_steps = 40000.0 321 | self.learning_rate = hp.initial_learning_rate * warmup_steps**0.5 * \ 322 | tf.minimum(step * warmup_steps**-1.5, step**-0.5) 323 | elif hp.decay_learning_rate_mode == 1: 324 | self.learning_rate = hp.initial_learning_rate * \ 325 | tf.train.exponential_decay(1., step, 3000, 0.95) 326 | 327 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 328 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 329 | self.gradients = gradients 330 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 331 | 332 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 333 | # https://github.com/tensorflow/tensorflow/issues/1122 334 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 335 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 336 | global_step=global_step) 337 | 338 | def get_dummy_feed_dict(self): 339 | feed_dict = { 340 | self.is_manual_attention: False, 341 | self.manual_alignments: np.zeros([1, 1, 1]), 342 | } 343 | return feed_dict 344 | -------------------------------------------------------------------------------- /recognition/alignment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import string 4 | import argparse 5 | import operator 6 | from functools import partial 7 | from difflib import SequenceMatcher 8 | 9 | from audio.get_duration import get_durations 10 | from text import remove_puncuations, text_to_sequence 11 | from utils import load_json, write_json, parallel_run, remove_postfix, backup_file 12 | 13 | def plain_text(text): 14 | return "".join(remove_puncuations(text.strip()).split()) 15 | 16 | def add_punctuation(text): 17 | if text.endswith('다'): 18 | return text + "." 19 | else: 20 | return text 21 | 22 | def similarity(text_a, text_b): 23 | text_a = plain_text(text_a) 24 | text_b = plain_text(text_b) 25 | 26 | score = SequenceMatcher(None, text_a, text_b).ratio() 27 | return score 28 | 29 | def first_word_combined_words(text): 30 | words = text.split() 31 | if len(words) > 1: 32 | first_words = [words[0], words[0]+words[1]] 33 | else: 34 | first_words = [words[0]] 35 | return first_words 36 | 37 | def first_word_combined_texts(text): 38 | words = text.split() 39 | if len(words) > 1: 40 | if len(words) > 2: 41 | text2 = " ".join([words[0]+words[1]] + words[2:]) 42 | else: 43 | text2 = words[0]+words[1] 44 | texts = [text, text2] 45 | else: 46 | texts = [text] 47 | return texts 48 | 49 | def search_optimal(found_text, recognition_text): 50 | # 1. found_text is usually more accurate 51 | # 2. 
recognition_text can have more or less word 52 | 53 | optimal = None 54 | 55 | if plain_text(recognition_text) in plain_text(found_text): 56 | optimal = recognition_text 57 | else: 58 | found = False 59 | 60 | for tmp_text in first_word_combined_texts(found_text): 61 | for recognition_first_word in first_word_combined_words(recognition_text): 62 | if recognition_first_word in tmp_text: 63 | start_idx = tmp_text.find(recognition_first_word) 64 | 65 | if tmp_text != found_text: 66 | found_text = found_text[max(0, start_idx-1):].strip() 67 | else: 68 | found_text = found_text[start_idx:].strip() 69 | found = True 70 | break 71 | 72 | if found: 73 | break 74 | 75 | recognition_last_word = recognition_text.split()[-1] 76 | if recognition_last_word in found_text: 77 | end_idx = found_text.find(recognition_last_word) 78 | 79 | punctuation = "" 80 | if len(found_text) > end_idx + len(recognition_last_word): 81 | punctuation = found_text[end_idx + len(recognition_last_word)] 82 | if punctuation not in string.punctuation: 83 | punctuation = "" 84 | 85 | found_text = found_text[:end_idx] + recognition_last_word + punctuation 86 | found = True 87 | 88 | if found: 89 | optimal = found_text 90 | 91 | return optimal 92 | 93 | 94 | def align_text_fn( 95 | item, score_threshold, debug=False): 96 | 97 | audio_path, recognition_text = item 98 | 99 | audio_dir = os.path.dirname(audio_path) 100 | base_dir = os.path.dirname(audio_dir) 101 | 102 | news_path = remove_postfix(audio_path.replace("audio", "assets")) 103 | news_path = os.path.splitext(news_path)[0] + ".txt" 104 | 105 | strip_fn = lambda line: line.strip().replace('"', '').replace("'", "") 106 | candidates = [strip_fn(line) for line in open(news_path, encoding='UTF-8').readlines()] 107 | 108 | scores = { candidate: similarity(candidate, recognition_text) \ 109 | for candidate in candidates} 110 | print(scores) 111 | sorted_scores = sorted(scores.items(), key=operator.itemgetter(1))[::-1] 112 | 113 | try : 114 | first, second = sorted_scores[0], sorted_scores[1] 115 | 116 | if first[1] > second[1] and first[1] >= score_threshold: 117 | found_text, score = first 118 | aligned_text = search_optimal(found_text, recognition_text) 119 | 120 | if debug: 121 | print(" ", audio_path) 122 | print(" ", recognition_text) 123 | print("=> ", found_text) 124 | print("==>", aligned_text) 125 | print("="*30) 126 | 127 | if aligned_text is not None: 128 | result = { audio_path: add_punctuation(aligned_text) } 129 | if abs(len(text_to_sequence(found_text)) - len(text_to_sequence(recognition_text))) > 10: 130 | result = {} 131 | else: 132 | result = { audio_path: add_punctuation(found_text) } 133 | else: 134 | result = {} 135 | # 136 | # if len(result) == 0: 137 | # result = { audio_path: found_text } 138 | 139 | return result 140 | 141 | except: 142 | pass 143 | 144 | def align_text_batch(config): 145 | align_text = partial(align_text_fn, 146 | score_threshold=config.score_threshold) 147 | 148 | results = {} 149 | data = load_json(config.recognition_path, encoding=config.recognition_encoding) 150 | 151 | items = parallel_run( 152 | align_text, data.items(), 153 | desc="align_text_batch", parallel=True) 154 | 155 | for item in items: 156 | results.update(item) 157 | 158 | found_count = sum([type(value) == str for value in results.values()]) 159 | print(" [*] # found: {:.5f}% ({}/{})".format( 160 | len(results)/len(data), len(results), len(data))) 161 | print(" [*] # exact match: {:.5f}% ({}/{})".format( 162 | found_count/len(items), found_count, len(items))) 163 | 164 | 
return results 165 | 166 | if __name__ == '__main__': 167 | parser = argparse.ArgumentParser() 168 | parser.add_argument('--recognition_path', required=True) 169 | parser.add_argument('--alignment_filename', default="alignment.json") 170 | parser.add_argument('--score_threshold', default=0.4, type=float) 171 | parser.add_argument('--recognition_encoding', default='UTF-8') 172 | config, unparsed = parser.parse_known_args() 173 | 174 | results = align_text_batch(config) 175 | 176 | base_dir = os.path.dirname(config.recognition_path) 177 | alignment_path = \ 178 | os.path.join(base_dir, config.alignment_filename) 179 | 180 | if os.path.exists(alignment_path): 181 | backup_file(alignment_path) 182 | 183 | write_json(alignment_path, results) 184 | duration = get_durations(results.keys(), print_detail=False) 185 | -------------------------------------------------------------------------------- /recognition/google.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import json 4 | import argparse 5 | import numpy as np 6 | from glob import glob 7 | from functools import partial 8 | import time 9 | 10 | from utils import parallel_run, remove_file, backup_file, write_json 11 | from audio import load_audio, save_audio, resample_audio, get_duration 12 | 13 | def text_recognition(path, config): 14 | time.sleep(0.7) 15 | root, ext = os.path.splitext(path) 16 | txt_path = root + ".txt" 17 | 18 | if os.path.exists(txt_path): 19 | with open(txt_path) as f: 20 | out = json.loads(open(txt_path).read()) 21 | return out 22 | 23 | from google.cloud import speech 24 | from google.cloud.speech import enums 25 | from google.cloud.speech import types 26 | 27 | out = {} 28 | error_count = 0 29 | 30 | tmp_path = os.path.splitext(path)[0] + ".tmp.wav" 31 | client = speech.SpeechClient() # Fixed 32 | 33 | while True: 34 | try: 35 | # client= speech.SpeechClient() # Causes 10060 max retries exceeded -to OAuth -HK 36 | 37 | content = load_audio( 38 | path, pre_silence_length=config.pre_silence_length, 39 | post_silence_length=config.post_silence_length) 40 | 41 | max_duration = config.max_duration - \ 42 | config.pre_silence_length - config.post_silence_length 43 | audio_duration = get_duration(content) 44 | 45 | if audio_duration >= max_duration: 46 | print(" [!] Skip {} because of duration: {} > {}". \ 47 | format(path, audio_duration, max_duration)) 48 | return {} 49 | 50 | content = resample_audio(content, config.sample_rate) 51 | save_audio(content, tmp_path, config.sample_rate) 52 | 53 | with io.open(tmp_path, 'rb') as f: 54 | audio = types.RecognitionAudio(content=f.read()) 55 | 56 | config = types.RecognitionConfig( 57 | encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, 58 | sample_rate_hertz=config.sample_rate, 59 | language_code='ko-KR') 60 | 61 | response = client.recognize(config, audio) 62 | if len(response.results) > 0: 63 | alternatives = response.results[0].alternatives 64 | 65 | results = [alternative.transcript for alternative in alternatives] 66 | assert len(results) == 1, "More than 1 results: {}".format(results) 67 | 68 | out = { path: "" if len(results) == 0 else results[0] } 69 | print(path, results[0]) 70 | break 71 | break 72 | except Exception as err: 73 | raise Exception("OS error: {0}".format(err)) 74 | 75 | error_count += 1 76 | print("Skip warning for {} for {} times". 
\ 77 | format(path, error_count)) 78 | 79 | if error_count > 5: 80 | break 81 | else: 82 | continue 83 | 84 | remove_file(tmp_path) 85 | with open(txt_path, 'w') as f: 86 | json.dump(out, f, indent=2, ensure_ascii=False) 87 | 88 | return out 89 | 90 | def text_recognition_batch(paths, config): 91 | paths.sort() 92 | 93 | results = {} 94 | items = parallel_run( 95 | partial(text_recognition, config=config), paths, 96 | desc="text_recognition_batch", parallel=True) 97 | for item in items: 98 | results.update(item) 99 | return results 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--audio_pattern', required=True) 105 | parser.add_argument('--recognition_filename', default="recognition.json") 106 | parser.add_argument('--sample_rate', default=16000, type=int) 107 | parser.add_argument('--pre_silence_length', default=1, type=int) 108 | parser.add_argument('--post_silence_length', default=1, type=int) 109 | parser.add_argument('--max_duration', default=60, type=int) 110 | config, unparsed = parser.parse_known_args() 111 | 112 | audio_dir = os.path.dirname(config.audio_pattern) 113 | 114 | for tmp_path in glob(os.path.join(audio_dir, "*.tmp.*")): 115 | remove_file(tmp_path) 116 | 117 | paths = glob(config.audio_pattern) 118 | paths.sort() 119 | results = text_recognition_batch(paths, config) 120 | 121 | base_dir = os.path.dirname(audio_dir) 122 | recognition_path = \ 123 | os.path.join(base_dir, config.recognition_filename) 124 | 125 | if os.path.exists(recognition_path): 126 | backup_file(recognition_path) 127 | 128 | write_json(recognition_path, results) 129 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | audioread==2.1.5 3 | beautifulsoup4==4.6.0 4 | bleach==1.5.0 5 | bs4==0.0.1 6 | cachetools==2.0.1 7 | chardet==3.0.4 8 | click==6.7 9 | cycler==0.10.0 10 | decorator==4.1.2 11 | dill==0.2.7.1 12 | ffprobe==0.5 13 | Flask==0.12.2 14 | Flask-Cors==3.0.3 15 | future==0.16.0 16 | gapic-google-cloud-datastore-v1==0.15.3 17 | gapic-google-cloud-error-reporting-v1beta1==0.15.3 18 | gapic-google-cloud-logging-v2==0.91.3 19 | gapic-google-cloud-pubsub-v1==0.15.4 20 | gapic-google-cloud-spanner-admin-database-v1==0.15.3 21 | gapic-google-cloud-spanner-admin-instance-v1==0.15.3 22 | gapic-google-cloud-spanner-v1==0.15.3 23 | google-auth==1.1.1 24 | google-cloud==0.27.0 25 | google-cloud-bigquery==0.26.0 26 | google-cloud-bigtable==0.26.0 27 | google-cloud-core==0.26.0 28 | google-cloud-datastore==1.2.0 29 | google-cloud-dns==0.26.0 30 | google-cloud-error-reporting==0.26.0 31 | google-cloud-language==0.27.0 32 | google-cloud-logging==1.2.0 33 | google-cloud-monitoring==0.26.0 34 | google-cloud-pubsub==0.27.0 35 | google-cloud-resource-manager==0.26.0 36 | google-cloud-runtimeconfig==0.26.0 37 | google-cloud-spanner==0.26.0 38 | google-cloud-speech==0.28.0 39 | google-cloud-storage==1.3.2 40 | google-cloud-translate==1.1.0 41 | google-cloud-videointelligence==0.25.0 42 | google-cloud-vision==0.26.0 43 | google-api-core==1.1.2 44 | google-resumable-media==0.3.0 45 | googleapis-common-protos==1.5.3 46 | grpc-google-iam-v1==0.11.4 47 | grpcio==1.8.6 48 | html5lib==0.9999999 49 | httplib2==0.10.3 50 | idna==2.6 51 | ipdb==0.10.3 52 | ipython==6.2.1 53 | ipython-genutils==0.2.0 54 | iso8601==0.1.12 55 | itsdangerous==0.24 56 | jamo==0.4.1 57 | jedi==0.11.0 58 | Jinja2==2.9.6 59 | 
joblib==0.11 60 | librosa==0.5.1 61 | #llvmlite==0.20.0 62 | m3u8==0.3.3 63 | Markdown==2.6.9 64 | MarkupSafe==1.0 65 | matplotlib==2.1.0 66 | monotonic==1.3 67 | nltk==3.2.5 68 | numba==0.35.0 69 | numpy==1.13.3 70 | oauth2client==3.0.0 71 | parso==0.1.0 72 | pexpect==4.2.1 73 | pickleshare==0.7.4 74 | ply==3.8 75 | prompt-toolkit==1.0.15 76 | proto-google-cloud-datastore-v1==0.90.4 77 | proto-google-cloud-error-reporting-v1beta1==0.15.3 78 | proto-google-cloud-logging-v2==0.91.3 79 | proto-google-cloud-pubsub-v1==0.15.4 80 | proto-google-cloud-spanner-admin-database-v1==0.15.3 81 | proto-google-cloud-spanner-admin-instance-v1==0.15.3 82 | proto-google-cloud-spanner-v1==0.15.3 83 | protobuf==3.5.1 84 | ptyprocess==0.5.2 85 | pyasn1==0.3.7 86 | pyasn1-modules==0.1.5 87 | pydub==0.20.0 88 | Pygments==2.2.0 89 | pyparsing==2.2.0 90 | python-dateutil==2.6.1 91 | pytz==2017.2 92 | requests==2.18.4 93 | resampy==0.2.0 94 | rsa==3.4.2 95 | scikit-learn==0.19.0 96 | scipy==0.19.1 97 | simplegeneric==0.8.1 98 | six==1.11.0 99 | tenacity==4.4.0 100 | #tensorflow-gpu==1.3.0 101 | #tensorflow-tensorboard==0.1.8 102 | tinytag==0.18.0 103 | tqdm==4.19.2 104 | traitlets==4.3.2 105 | urllib3==1.22 106 | wcwidth==0.1.7 107 | Werkzeug==0.12.2 108 | youtube-dl==2017.10.15.1 109 | unidecode==1.0.22 110 | inflect==0.2.5 111 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/deepvoice2-256-256-krbook-bah-mon-22000-no-priority --dataname=krbook --num_speakers=1 4 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/jtbc_2017-09-25_11-49-23 --dataname=krbook --num_speakers=1 --port=5002 5 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krbook_2017-09-27_17-02-44 --dataname=krbook --num_speakers=1 --port=5001 6 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krfemale_2017-10-10_20-37-38 --dataname=krbook --num_speakers=1 --port=5003 7 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krmale_2017-10-10_17-49-49 --dataname=krbook --num_speakers=1 --port=5005 8 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/park+moon+krbook_2017-10-09_20-43-53 --dataname=krbook --num_speakers=3 --port=5004 9 | -------------------------------------------------------------------------------- /scripts/prepare_son.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 1. Download and extract audio and texts 4 | python -m datasets.jtbc.download 5 | 6 | # 2. Split audios on silence 7 | python -m audio.silence --audio_pattern "./datasets/jtbc/audio/*.wav" --method=pydub 8 | 9 | # 3. Run Google Speech Recognition 10 | python -m recognition.google --audio_pattern "./datasets/jtbc/audio/*.*.wav" 11 | 12 | # 4. Run heuristic text-audio pair search (any improvement on this is welcome) 13 | python -m recognition.alignment --recognition_path "./datasets/jtbc/recognition.json" --score_threshold=0.5 14 | 15 | # 5. 
Remove intro music 16 | rm datasets/jtbc/data/*.0000.npz 17 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | from glob import glob 8 | from tqdm import tqdm 9 | import tensorflow as tf 10 | from functools import partial 11 | 12 | from hparams import hparams 13 | from models import create_model, get_most_recent_checkpoint 14 | from audio import save_audio, inv_spectrogram, inv_preemphasis, \ 15 | inv_spectrogram_tensorflow 16 | from utils import plot, PARAMS_NAME, load_json, load_hparams, \ 17 | add_prefix, add_postfix, get_time, parallel_run, makedirs, str2bool 18 | 19 | from text.korean import tokenize 20 | from text import text_to_sequence, sequence_to_text 21 | 22 | 23 | class Synthesizer(object): 24 | 25 | def close(self): 26 | tf.reset_default_graph() 27 | self.sess.close() 28 | 29 | def load(self, checkpoint_path, num_speakers=2, checkpoint_step=None, model_name='tacotron'): 30 | self.num_speakers = num_speakers 31 | 32 | if os.path.isdir(checkpoint_path): 33 | load_path = checkpoint_path 34 | checkpoint_path = get_most_recent_checkpoint(checkpoint_path, checkpoint_step) 35 | else: 36 | load_path = os.path.dirname(checkpoint_path) 37 | 38 | print('Constructing model: %s' % model_name) 39 | 40 | inputs = tf.placeholder(tf.int32, [None, None], 'inputs') 41 | input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths') 42 | 43 | batch_size = tf.shape(inputs)[0] 44 | speaker_id = tf.placeholder_with_default( 45 | tf.zeros([batch_size], dtype=tf.int32), [None], 'speaker_id') 46 | 47 | load_hparams(hparams, load_path) 48 | with tf.variable_scope('model') as scope: 49 | self.model = create_model(hparams) 50 | 51 | self.model.initialize( 52 | inputs, input_lengths, 53 | self.num_speakers, speaker_id) 54 | self.wav_output = \ 55 | inv_spectrogram_tensorflow(self.model.linear_outputs) 56 | 57 | print('Loading checkpoint: %s' % checkpoint_path) 58 | 59 | sess_config = tf.ConfigProto( 60 | allow_soft_placement=True, 61 | intra_op_parallelism_threads=1, 62 | inter_op_parallelism_threads=2) 63 | sess_config.gpu_options.allow_growth = True 64 | 65 | self.sess = tf.Session(config=sess_config) 66 | self.sess.run(tf.global_variables_initializer()) 67 | saver = tf.train.Saver() 68 | saver.restore(self.sess, checkpoint_path) 69 | 70 | def synthesize(self, 71 | texts=None, tokens=None, 72 | base_path=None, paths=None, speaker_ids=None, 73 | start_of_sentence=None, end_of_sentence=True, 74 | pre_word_num=0, post_word_num=0, 75 | pre_surplus_idx=0, post_surplus_idx=1, 76 | use_short_concat=False, 77 | manual_attention_mode=0, 78 | base_alignment_path=None, 79 | librosa_trim=True, 80 | attention_trim=True, 81 | isKorean=True): 82 | 83 | # Possible inputs: 84 | # 1) text=text 85 | # 2) text=texts 86 | # 3) tokens=tokens, texts=texts # use texts as guide 87 | 88 | if type(texts) == str: 89 | texts = [texts] 90 | 91 | if texts is not None and tokens is None: 92 | sequences = [text_to_sequence(text) for text in texts] 93 | elif tokens is not None: 94 | sequences = tokens 95 | 96 | if paths is None: 97 | paths = [None] * len(sequences) 98 | if texts is None: 99 | texts = [None] * len(sequences) 100 | 101 | time_str = get_time() 102 | def plot_and_save_parallel( 103 | wavs, alignments, use_manual_attention): 104 | 105 | items = list(enumerate(zip( 106 | wavs, alignments, 
paths, texts, sequences))) 107 | 108 | fn = partial( 109 | plot_graph_and_save_audio, 110 | base_path=base_path, 111 | start_of_sentence=start_of_sentence, end_of_sentence=end_of_sentence, 112 | pre_word_num=pre_word_num, post_word_num=post_word_num, 113 | pre_surplus_idx=pre_surplus_idx, post_surplus_idx=post_surplus_idx, 114 | use_short_concat=use_short_concat, 115 | use_manual_attention=use_manual_attention, 116 | librosa_trim=librosa_trim, 117 | attention_trim=attention_trim, 118 | time_str=time_str, 119 | isKorean=isKorean) 120 | return parallel_run(fn, items, 121 | desc="plot_graph_and_save_audio", parallel=False) 122 | 123 | input_lengths = np.argmax(np.array(sequences) == 1, 1) 124 | 125 | fetches = [ 126 | #self.wav_output, 127 | self.model.linear_outputs, 128 | self.model.alignments, 129 | ] 130 | 131 | feed_dict = { 132 | self.model.inputs: sequences, 133 | self.model.input_lengths: input_lengths, 134 | } 135 | if base_alignment_path is None: 136 | feed_dict.update({ 137 | self.model.manual_alignments: np.zeros([1, 1, 1]), 138 | self.model.is_manual_attention: False, 139 | }) 140 | else: 141 | manual_alignments = [] 142 | alignment_path = os.path.join( 143 | base_alignment_path, 144 | os.path.basename(base_path)) 145 | 146 | for idx in range(len(sequences)): 147 | numpy_path = "{}.{}.npy".format(alignment_path, idx) 148 | manual_alignments.append(np.load(numpy_path)) 149 | 150 | alignments_T = np.transpose(manual_alignments, [0, 2, 1]) 151 | feed_dict.update({ 152 | self.model.manual_alignments: alignments_T, 153 | self.model.is_manual_attention: True, 154 | }) 155 | 156 | if speaker_ids is not None: 157 | if type(speaker_ids) == dict: 158 | speaker_embed_table = sess.run( 159 | self.model.speaker_embed_table) 160 | 161 | speaker_embed = [speaker_ids[speaker_id] * \ 162 | speaker_embed_table[speaker_id] for speaker_id in speaker_ids] 163 | feed_dict.update({ 164 | self.model.speaker_embed_table: np.tile() 165 | }) 166 | else: 167 | feed_dict[self.model.speaker_id] = speaker_ids 168 | 169 | wavs, alignments = \ 170 | self.sess.run(fetches, feed_dict=feed_dict) 171 | results = plot_and_save_parallel( 172 | wavs, alignments, True) 173 | 174 | if manual_attention_mode > 0: 175 | # argmax one hot 176 | if manual_attention_mode == 1: 177 | alignments_T = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 178 | new_alignments = np.zeros_like(alignments_T) 179 | 180 | for idx in range(len(alignments)): 181 | argmax = alignments[idx].argmax(1) 182 | new_alignments[idx][(argmax, range(len(argmax)))] = 1 183 | # sharpening 184 | elif manual_attention_mode == 2: 185 | new_alignments = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 186 | 187 | for idx in range(len(alignments)): 188 | var = np.var(new_alignments[idx], 1) 189 | mean_var = var[:input_lengths[idx]].mean() 190 | 191 | new_alignments = np.pow(new_alignments[idx], 2) 192 | # prunning 193 | elif manual_attention_mode == 3: 194 | new_alignments = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 195 | 196 | for idx in range(len(alignments)): 197 | argmax = alignments[idx].argmax(1) 198 | new_alignments[idx][(argmax, range(len(argmax)))] = 1 199 | 200 | feed_dict.update({ 201 | self.model.manual_alignments: new_alignments, 202 | self.model.is_manual_attention: True, 203 | }) 204 | 205 | new_wavs, new_alignments = \ 206 | self.sess.run(fetches, feed_dict=feed_dict) 207 | results = plot_and_save_parallel( 208 | new_wavs, new_alignments, True) 209 | 210 | return "{}/{}.manual.wav".format(base_path, time_str) 211 | 212 | def 
plot_graph_and_save_audio(args, 213 | base_path=None, 214 | start_of_sentence=None, end_of_sentence=None, 215 | pre_word_num=0, post_word_num=0, 216 | pre_surplus_idx=0, post_surplus_idx=1, 217 | use_short_concat=False, 218 | use_manual_attention=False, save_alignment=False, 219 | librosa_trim=False, attention_trim=False, 220 | time_str=None, isKorean=True): 221 | 222 | idx, (wav, alignment, path, text, sequence) = args 223 | 224 | if base_path: 225 | plot_path = "{}/{}.png".format(base_path, time_str) 226 | elif path: 227 | plot_path = path.rsplit('.', 1)[0] + ".png" 228 | else: 229 | plot_path = None 230 | 231 | #plot_path = add_prefix(plot_path, time_str) 232 | if use_manual_attention: 233 | plot_path = add_postfix(plot_path, "manual") 234 | 235 | if plot_path: 236 | plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean) 237 | 238 | if use_short_concat: 239 | wav = short_concat( 240 | wav, alignment, text, 241 | start_of_sentence, end_of_sentence, 242 | pre_word_num, post_word_num, 243 | pre_surplus_idx, post_surplus_idx) 244 | 245 | if attention_trim and end_of_sentence: 246 | end_idx_counter = 0 247 | attention_argmax = alignment.argmax(0) 248 | end_idx = min(len(sequence) - 1, max(attention_argmax)) 249 | max_counter = min((attention_argmax == end_idx).sum(), 5) 250 | 251 | for jdx, attend_idx in enumerate(attention_argmax): 252 | if len(attention_argmax) > jdx + 1: 253 | if attend_idx == end_idx: 254 | end_idx_counter += 1 255 | 256 | if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx: 257 | break 258 | 259 | if end_idx_counter >= max_counter: 260 | break 261 | else: 262 | break 263 | 264 | spec_end_idx = hparams.reduction_factor * jdx + 3 265 | wav = wav[:spec_end_idx] 266 | 267 | audio_out = inv_spectrogram(wav.T) 268 | 269 | if librosa_trim and end_of_sentence: 270 | yt, index = librosa.effects.trim(audio_out, 271 | frame_length=5120, hop_length=256, top_db=50) 272 | print ("index = ", index) 273 | audio_out = audio_out[:index[-1]] 274 | 275 | if save_alignment: 276 | alignment_path = "{}/{}.npy".format(base_path, idx) 277 | np.save(alignment_path, alignment, allow_pickle=False) 278 | 279 | if path or base_path: 280 | if path: 281 | current_path = add_postfix(path, idx) 282 | elif base_path: 283 | current_path = plot_path.replace(".png", ".wav") 284 | 285 | save_audio(audio_out, current_path) 286 | return True 287 | else: 288 | io_out = io.BytesIO() 289 | save_audio(audio_out, io_out) 290 | result = io_out.getvalue() 291 | return result 292 | 293 | def get_most_recent_checkpoint(checkpoint_dir, checkpoint_step=None): 294 | if checkpoint_step is None: 295 | checkpoint_paths = [path for path in glob("{}/*.ckpt-*.data-*".format(checkpoint_dir))] 296 | idxes = [int(os.path.basename(path).split('-')[1].split('.')[0]) for path in checkpoint_paths] 297 | 298 | max_idx = max(idxes) 299 | else: 300 | max_idx = checkpoint_step 301 | lastest_checkpoint = os.path.join(checkpoint_dir, "model.ckpt-{}".format(max_idx)) 302 | print(" [*] Found lastest checkpoint: {}".format(lastest_checkpoint)) 303 | return lastest_checkpoint 304 | 305 | def short_concat( 306 | wav, alignment, text, 307 | start_of_sentence, end_of_sentence, 308 | pre_word_num, post_word_num, 309 | pre_surplus_idx, post_surplus_idx): 310 | 311 | # np.array(list(decomposed_text))[attention_argmax] 312 | attention_argmax = alignment.argmax(0) 313 | 314 | if not start_of_sentence and pre_word_num > 0: 315 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[0])) 316 | start_idx = 
len(surplus_decomposed_text) + 1 317 | 318 | for idx, attend_idx in enumerate(attention_argmax): 319 | if attend_idx == start_idx and attention_argmax[idx - 1] < start_idx: 320 | break 321 | 322 | wav_start_idx = hparams.reduction_factor * idx - 1 - pre_surplus_idx 323 | else: 324 | wav_start_idx = 0 325 | 326 | if not end_of_sentence and post_word_num > 0: 327 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[-1])) 328 | end_idx = len(decomposed_text.replace(surplus_decomposed_text, '')) - 1 329 | 330 | for idx, attend_idx in enumerate(attention_argmax): 331 | if attend_idx == end_idx and attention_argmax[idx + 1] > end_idx: 332 | break 333 | 334 | wav_end_idx = hparams.reduction_factor * idx + 1 + post_surplus_idx 335 | else: 336 | if True: # attention based split 337 | if end_of_sentence: 338 | end_idx = min(len(decomposed_text) - 1, max(attention_argmax)) 339 | else: 340 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[-1])) 341 | end_idx = len(decomposed_text.replace(surplus_decomposed_text, '')) - 1 342 | 343 | while True: 344 | if end_idx in attention_argmax: 345 | break 346 | end_idx -= 1 347 | 348 | end_idx_counter = 0 349 | for idx, attend_idx in enumerate(attention_argmax): 350 | if len(attention_argmax) > idx + 1: 351 | if attend_idx == end_idx: 352 | end_idx_counter += 1 353 | 354 | if attend_idx == end_idx and attention_argmax[idx + 1] > end_idx: 355 | break 356 | 357 | if end_idx_counter > 5: 358 | break 359 | else: 360 | break 361 | 362 | wav_end_idx = hparams.reduction_factor * idx + 1 + post_surplus_idx 363 | else: 364 | wav_end_idx = None 365 | 366 | wav = wav[wav_start_idx:wav_end_idx] 367 | 368 | if end_of_sentence: 369 | wav = np.lib.pad(wav, ((0, 20), (0, 0)), 'constant', constant_values=0) 370 | else: 371 | wav = np.lib.pad(wav, ((0, 10), (0, 0)), 'constant', constant_values=0) 372 | 373 | 374 | if __name__ == "__main__": 375 | parser = argparse.ArgumentParser() 376 | parser.add_argument('--load_path', required=True) 377 | parser.add_argument('--sample_path', default="samples") 378 | parser.add_argument('--text', required=True) 379 | parser.add_argument('--num_speakers', default=1, type=int) 380 | parser.add_argument('--speaker_id', default=0, type=int) 381 | parser.add_argument('--checkpoint_step', default=None, type=int) 382 | parser.add_argument('--is_korean', default=True, type=str2bool) 383 | config = parser.parse_args() 384 | 385 | makedirs(config.sample_path) 386 | 387 | synthesizer = Synthesizer() 388 | synthesizer.load(config.load_path, config.num_speakers, config.checkpoint_step) 389 | 390 | audio = synthesizer.synthesize( 391 | texts=[config.text], 392 | base_path=config.sample_path, 393 | speaker_ids=[config.speaker_id], 394 | attention_trim=False, 395 | isKorean=config.is_korean)[0] 396 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | 5 | from text import cleaners 6 | from hparams import hparams 7 | from text.symbols import symbols, en_symbols, PAD, EOS 8 | from text.korean import jamo_to_korean 9 | 10 | 11 | 12 | # Mappings from symbol to numeric ID and vice versa: 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 15 | isEn=False 16 | 17 | 18 | # Regular expression matching text enclosed in curly braces: 19 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 20 | 21 
| puncuation_table = str.maketrans({key: None for key in string.punctuation}) 22 | 23 | def convert_to_en_symbols(): 24 | '''Converts built-in korean symbols to english, to be used for english training 25 | 26 | ''' 27 | global _symbol_to_id, _id_to_symbol, isEn 28 | if not isEn: 29 | print(" [!] Converting to english mode") 30 | _symbol_to_id = {s: i for i, s in enumerate(en_symbols)} 31 | _id_to_symbol = {i: s for i, s in enumerate(en_symbols)} 32 | isEn=True 33 | 34 | def remove_puncuations(text): 35 | return text.translate(puncuation_table) 36 | 37 | def text_to_sequence(text, as_token=False): 38 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 39 | if ('english_cleaners' in cleaner_names) and isEn==False: 40 | convert_to_en_symbols() 41 | return _text_to_sequence(text, cleaner_names, as_token) 42 | 43 | def _text_to_sequence(text, cleaner_names, as_token): 44 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 45 | 46 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 47 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 48 | 49 | Args: 50 | text: string to convert to a sequence 51 | cleaner_names: names of the cleaner functions to run the text through 52 | 53 | Returns: 54 | List of integers corresponding to the symbols in the text 55 | ''' 56 | sequence = [] 57 | 58 | # Check for curly braces and treat their contents as ARPAbet: 59 | while len(text): 60 | m = _curly_re.match(text) 61 | if not m: 62 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 63 | break 64 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 65 | sequence += _arpabet_to_sequence(m.group(2)) 66 | text = m.group(3) 67 | 68 | # Append EOS token 69 | sequence.append(_symbol_to_id[EOS]) 70 | 71 | if as_token: 72 | return sequence_to_text(sequence, combine_jamo=True) 73 | else: 74 | return np.array(sequence, dtype=np.int32) 75 | 76 | 77 | def sequence_to_text(sequence, skip_eos_and_pad=False, combine_jamo=False): 78 | '''Converts a sequence of IDs back to a string''' 79 | cleaner_names=[x.strip() for x in hparams.cleaners.split(',')] 80 | if 'english_cleaners' in cleaner_names and isEn==False: 81 | convert_to_en_symbols() 82 | 83 | result = '' 84 | for symbol_id in sequence: 85 | if symbol_id in _id_to_symbol: 86 | s = _id_to_symbol[symbol_id] 87 | # Enclose ARPAbet back in curly braces: 88 | if len(s) > 1 and s[0] == '@': 89 | s = '{%s}' % s[1:] 90 | 91 | if not skip_eos_and_pad or s not in [EOS, PAD]: 92 | result += s 93 | 94 | result = result.replace('}{', ' ') 95 | 96 | if combine_jamo: 97 | return jamo_to_korean(result) 98 | else: 99 | return result 100 | 101 | 102 | 103 | def _clean_text(text, cleaner_names): 104 | for name in cleaner_names: 105 | cleaner = getattr(cleaners, name) 106 | if not cleaner: 107 | raise Exception('Unknown cleaner: %s' % name) 108 | text = cleaner(text) 109 | return text 110 | 111 | 112 | def _symbols_to_sequence(symbols): 113 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 114 | 115 | 116 | def _arpabet_to_sequence(text): 117 | return _symbols_to_sequence(['@' + s for s in text.split()]) 118 | 119 | 120 | def _should_keep_symbol(s): 121 | return s in _symbol_to_id and s is not '_' and s is not '~' 122 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | # Code based on 
https://github.com/keithito/tacotron/blob/master/text/cleaners.py 2 | ''' 3 | Cleaners are transformations that run over the input text at both training and eval time. 4 | 5 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 6 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 7 | 1. "english_cleaners" for English text 8 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 9 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 10 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 11 | the symbols in symbols.py to match your data). 12 | ''' 13 | 14 | import re 15 | from .korean import tokenize as ko_tokenize 16 | 17 | # Added to support LJ_speech 18 | from unidecode import unidecode 19 | from .en_numbers import normalize_numbers as en_normalize_numbers 20 | 21 | # Regular expression matching whitespace: 22 | _whitespace_re = re.compile(r'\s+') 23 | 24 | 25 | def korean_cleaners(text): 26 | '''Pipeline for Korean text, including number and abbreviation expansion.''' 27 | text = ko_tokenize(text) 28 | return text 29 | 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | def expand_abbreviations(text): 55 | for regex, replacement in _abbreviations: 56 | text = re.sub(regex, replacement, text) 57 | return text 58 | 59 | 60 | def expand_numbers(text): 61 | return en_normalize_numbers(text) 62 | 63 | 64 | def lowercase(text): 65 | return text.lower() 66 | 67 | 68 | def collapse_whitespace(text): 69 | return re.sub(_whitespace_re, ' ', text) 70 | 71 | def convert_to_ascii(text): 72 | '''Converts to ascii, existed in keithito but deleted in carpedm20''' 73 | return unidecode(text) 74 | 75 | 76 | def basic_cleaners(text): 77 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def transliteration_cleaners(text): 84 | '''Pipeline for non-English text that transliterates to ASCII.''' 85 | text = convert_to_ascii(text) 86 | text = lowercase(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | 90 | 91 | def english_cleaners(text): 92 | '''Pipeline for English text, including number and abbreviation expansion.''' 93 | text = convert_to_ascii(text) 94 | text = lowercase(text) 95 | text = expand_numbers(text) 96 | text = expand_abbreviations(text) 97 | text = collapse_whitespace(text) 98 | return text 99 | 100 | 101 | -------------------------------------------------------------------------------- /text/en_numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = 
re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | # Code from https://github.com/keithito/tacotron/blob/master/util/numbers.py 2 | import inflect 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | 
cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/ko_dictionary.py: -------------------------------------------------------------------------------- 1 | etc_dictionary = { 2 | '2 30대': '이삼십대', 3 | '20~30대': '이삼십대', 4 | '20, 30대': '이십대 삼십대', 5 | '1+1': '원플러스원', 6 | '3에서 6개월인': '3개월에서 육개월인', 7 | } 8 | 9 | english_dictionary = { 10 | 'Devsisters': '데브시스터즈', 11 | 'track': '트랙', 12 | 13 | # krbook 14 | 'LA': '엘에이', 15 | 'LG': '엘지', 16 | 'KOREA': '코리아', 17 | 'JSA': '제이에스에이', 18 | 'PGA': '피지에이', 19 | 'GA': '지에이', 20 | 'idol': '아이돌', 21 | 'KTX': '케이티엑스', 22 | 'AC': '에이씨', 23 | 'DVD': '디비디', 24 | 'US': '유에스', 25 | 'CNN': '씨엔엔', 26 | 'LPGA': '엘피지에이', 27 | 'P': '피', 28 | 'L': '엘', 29 | 'T': '티', 30 | 'B': '비', 31 | 'C': '씨', 32 | 'BIFF': '비아이에프에프', 33 | 'GV': '지비', 34 | 35 | # JTBC 36 | 'IT': '아이티', 37 | 'IQ': '아이큐', 38 | 'JTBC': '제이티비씨', 39 | 'trickle down effect': '트리클 다운 이펙트', 40 | 'trickle up effect': '트리클 업 이펙트', 41 | 'down': '다운', 42 | 'up': '업', 43 | 'FCK': '에프씨케이', 44 | 'AP': '에이피', 45 | 'WHERETHEWILDTHINGSARE': '', 46 | 'Rashomon Effect': '', 47 | 'O': '오', 48 | 'OO': '오오', 49 | 'B': '비', 50 | 'GDP': '지디피', 51 | 'CIPA': '씨아이피에이', 52 | 'YS': '와이에스', 53 | 'Y': '와이', 54 | 'S': '에스', 55 | 'JTBC': '제이티비씨', 56 | 'PC': '피씨', 57 | 'bill': '빌', 58 | 'Halmuny': '하모니', ##### 59 | 'X': '엑스', 60 | 'SNS': '에스엔에스', 61 | 'ability': '어빌리티', 62 | 'shy': '', 63 | 'CCTV': '씨씨티비', 64 | 'IT': '아이티', 65 | 'the tenth man': '더 텐쓰 맨', #### 66 | 'L': '엘', 67 | 'PC': '피씨', 68 | 'YSDJJPMB': '', ######## 69 | 'Content Attitude Timing': '컨텐트 애티튜드 타이밍', 70 | 'CAT': '캣', 71 | 'IS': '아이에스', 72 | 'SNS': '에스엔에스', 73 | 'K': '케이', 74 | 'Y': '와이', 75 | 'KDI': '케이디아이', 76 | 'DOC': '디오씨', 77 | 'CIA': '씨아이에이', 78 | 'PBS': '피비에스', 79 | 'D': '디', 80 | 'PPropertyPositionPowerPrisonP' 81 | 'S': '에스', 82 | 'francisco': '프란시스코', 83 | 'I': '아이', 84 | 'III': '아이아이', ###### 85 | 'No joke': '노 조크', 86 | 'BBK': '비비케이', 87 | 'LA': '엘에이', 88 | 'Don': '', 89 | 't worry be happy': ' 워리 비 해피', 90 | 'NO': '엔오', ##### 91 | 'it was our sky': '잇 워즈 아워 스카이', 92 | 'it is our sky': '잇 이즈 아워 스카이', #### 93 | 'NEIS': '엔이아이에스', ##### 94 | 'IMF': '아이엠에프', 95 | 'apology': '어폴로지', 96 | 'humble': '험블', 97 | 'M': '엠', 98 | 'Nowhere Man': '노웨어 맨', 99 | 'The Tenth Man': '더 텐쓰 맨', 100 | 'PBS': '피비에스', 101 | 'BBC': '비비씨', 102 | 'MRJ': '엠알제이', 103 | 'CCTV': '씨씨티비', 104 | 'Pick me up': '픽 미 업', 105 | 'DNA': '디엔에이', 106 | 'UN': '유엔', 
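    # NOTE: the 'PPropertyPositionPowerPrisonP' entry above has no ':' or value,
    # so Python's implicit string concatenation fuses it with the next line into
    # the single key 'PPropertyPositionPowerPrisonPS': '에스'. If a separate
    # entry was intended, a hedged guess at the fix is something like
    # 'P Property Position Power Prison P': '피 프로퍼티 포지션 파워 프리즌 피',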
107 | 'STOP': '스탑', ##### 108 | 'PRESS': '프레스', ##### 109 | 'not to be': '낫 투비', 110 | 'Denial': '디나이얼', 111 | 'G': '지', 112 | 'IMF': '아이엠에프', 113 | 'GDP': '지디피', 114 | 'JTBC': '제이티비씨', 115 | 'Time flies like an arrow': '타임 플라이즈 라이크 언 애로우', 116 | 'DDT': '디디티', 117 | 'AI': '에이아이', 118 | 'Z': '제트', 119 | 'OECD': '오이씨디', 120 | 'N': '앤', 121 | 'A': '에이', 122 | 'MB': '엠비', 123 | 'EH': '이에이치', 124 | 'IS': '아이에스', 125 | 'TV': '티비', 126 | 'MIT': '엠아이티', 127 | 'KBO': '케이비오', 128 | 'I love America': '아이 러브 아메리카', 129 | 'SF': '에스에프', 130 | 'Q': '큐', 131 | 'KFX': '케이에프엑스', 132 | 'PM': '피엠', 133 | 'Prime Minister': '프라임 미니스터', 134 | 'Swordline': '스워드라인', 135 | 'TBS': '티비에스', 136 | 'DDT': '디디티', 137 | 'CS': '씨에스', 138 | 'Reflecting Absence': '리플렉팅 앱센스', 139 | 'PBS': '피비에스', 140 | 'Drum being beaten by everyone': '드럼 빙 비튼 바이 에브리원', 141 | 'negative pressure': '네거티브 프레셔', 142 | 'F': '에프', 143 | 'KIA': '기아', 144 | 'FTA': '에프티에이', 145 | 'Que sais-je': '', 146 | 'UFC': '유에프씨', 147 | 'P': '피', 148 | 'DJ': '디제이', 149 | 'Chaebol': '채벌', 150 | 'BBC': '비비씨', 151 | 'OECD': '오이씨디', 152 | 'BC': '삐씨', 153 | 'C': '씨', 154 | 'B': '씨', 155 | 'KY': '케이와이', 156 | 'K': '케이', 157 | 'CEO': '씨이오', 158 | 'YH': '와이에치', 159 | 'IS': '아이에스', 160 | 'who are you': '후 얼 유', 161 | 'Y': '와이', 162 | 'The Devils Advocate': '더 데빌즈 어드보카트', 163 | 'YS': '와이에스', 164 | 'so sorry': '쏘 쏘리', 165 | 'Santa': '산타', 166 | 'Big Endian': '빅 엔디안', 167 | 'Small Endian': '스몰 엔디안', 168 | 'Oh Captain My Captain': '오 캡틴 마이 캡틴', 169 | 'AIB': '에이아이비', 170 | 'K': '케이', 171 | 'PBS': '피비에스', 172 | } 173 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | # Code based on 2 | 3 | import re 4 | import os 5 | import ast 6 | import json 7 | from jamo import hangul_to_jamo, h2j, j2h 8 | 9 | from .ko_dictionary import english_dictionary, etc_dictionary 10 | 11 | PAD = '_' 12 | EOS = '~' 13 | PUNC = '!\'(),-.:;?' 
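# Conjoining Hangul jamo ranges defined below: U+1100-U+1112 are the initial
# consonants (choseong), U+1161-U+1175 the medial vowels (jungseong), and
# U+11A8-U+11C2 the final consonants (jongseong). Together with PUNC and SPACE
# they form VALID_CHARS, and PAD/EOS are prepended to build ALL_SYMBOLS, so
# every symbol gets a unique id in char_to_id.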
14 | SPACE = ' ' 15 | 16 | JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) 17 | JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) 18 | JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) 19 | 20 | VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE 21 | ALL_SYMBOLS = PAD + EOS + VALID_CHARS 22 | 23 | char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)} 24 | id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)} 25 | 26 | quote_checker = """([`"'"“‘])(.+?)([`"'"”’])""" 27 | 28 | def is_lead(char): 29 | return char in JAMO_LEADS 30 | 31 | def is_vowel(char): 32 | return char in JAMO_VOWELS 33 | 34 | def is_tail(char): 35 | return char in JAMO_TAILS 36 | 37 | def get_mode(char): 38 | if is_lead(char): 39 | return 0 40 | elif is_vowel(char): 41 | return 1 42 | elif is_tail(char): 43 | return 2 44 | else: 45 | return -1 46 | 47 | def _get_text_from_candidates(candidates): 48 | if len(candidates) == 0: 49 | return "" 50 | elif len(candidates) == 1: 51 | return _jamo_char_to_hcj(candidates[0]) 52 | else: 53 | return j2h(**dict(zip(["lead", "vowel", "tail"], candidates))) 54 | 55 | def jamo_to_korean(text): 56 | text = h2j(text) 57 | 58 | idx = 0 59 | new_text = "" 60 | candidates = [] 61 | 62 | while True: 63 | if idx >= len(text): 64 | new_text += _get_text_from_candidates(candidates) 65 | break 66 | 67 | char = text[idx] 68 | mode = get_mode(char) 69 | 70 | if mode == 0: 71 | new_text += _get_text_from_candidates(candidates) 72 | candidates = [char] 73 | elif mode == -1: 74 | new_text += _get_text_from_candidates(candidates) 75 | new_text += char 76 | candidates = [] 77 | else: 78 | candidates.append(char) 79 | 80 | idx += 1 81 | return new_text 82 | 83 | num_to_kor = { 84 | '0': '영', 85 | '1': '일', 86 | '2': '이', 87 | '3': '삼', 88 | '4': '사', 89 | '5': '오', 90 | '6': '육', 91 | '7': '칠', 92 | '8': '팔', 93 | '9': '구', 94 | } 95 | 96 | unit_to_kor1 = { 97 | '%': '퍼센트', 98 | 'cm': '센치미터', 99 | 'mm': '밀리미터', 100 | 'km': '킬로미터', 101 | 'kg': '킬로그람', 102 | } 103 | unit_to_kor2 = { 104 | 'm': '미터', 105 | } 106 | 107 | upper_to_kor = { 108 | 'A': '에이', 109 | 'B': '비', 110 | 'C': '씨', 111 | 'D': '디', 112 | 'E': '이', 113 | 'F': '에프', 114 | 'G': '지', 115 | 'H': '에이치', 116 | 'I': '아이', 117 | 'J': '제이', 118 | 'K': '케이', 119 | 'L': '엘', 120 | 'M': '엠', 121 | 'N': '엔', 122 | 'O': '오', 123 | 'P': '피', 124 | 'Q': '큐', 125 | 'R': '알', 126 | 'S': '에스', 127 | 'T': '티', 128 | 'U': '유', 129 | 'V': '브이', 130 | 'W': '더블유', 131 | 'X': '엑스', 132 | 'Y': '와이', 133 | 'Z': '지', 134 | } 135 | 136 | def compare_sentence_with_jamo(text1, text2): 137 | return h2j(text1) != h2j(text) 138 | 139 | def tokenize(text, as_id=False): 140 | text = normalize(text) 141 | tokens = list(hangul_to_jamo(text)) 142 | 143 | if as_id: 144 | return [char_to_id[token] for token in tokens] + [char_to_id[EOS]] 145 | else: 146 | return [token for token in tokens] + [EOS] 147 | 148 | def tokenizer_fn(iterator): 149 | return (token for x in iterator for token in tokenize(x, as_id=False)) 150 | 151 | def normalize(text): 152 | text = text.strip() 153 | 154 | text = re.sub('\(\d+일\)', '', text) 155 | text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text) 156 | 157 | text = normalize_with_dictionary(text, etc_dictionary) 158 | text = normalize_english(text) 159 | text = re.sub('[a-zA-Z]+', normalize_upper, text) 160 | 161 | text = normalize_quote(text) 162 | text = normalize_number(text) 163 | 164 | return text 165 | 166 | def normalize_with_dictionary(text, dic): 167 | if any(key in text 
for key in dic.keys()): 168 | pattern = re.compile('|'.join(re.escape(key) for key in dic.keys())) 169 | return pattern.sub(lambda x: dic[x.group()], text) 170 | else: 171 | return text 172 | 173 | def normalize_english(text): 174 | def fn(m): 175 | word = m.group() 176 | if word in english_dictionary: 177 | return english_dictionary.get(word) 178 | else: 179 | return word 180 | 181 | text = re.sub("([A-Za-z]+)", fn, text) 182 | return text 183 | 184 | def normalize_upper(text): 185 | text = text.group(0) 186 | 187 | if all([char.isupper() for char in text]): 188 | return "".join(upper_to_kor[char] for char in text) 189 | else: 190 | return text 191 | 192 | def normalize_quote(text): 193 | def fn(found_text): 194 | from nltk import sent_tokenize # NLTK doesn't along with multiprocessing 195 | 196 | found_text = found_text.group() 197 | unquoted_text = found_text[1:-1] 198 | 199 | sentences = sent_tokenize(unquoted_text) 200 | return " ".join(["'{}'".format(sent) for sent in sentences]) 201 | 202 | return re.sub(quote_checker, fn, text) 203 | 204 | number_checker = "([+-]?\d[\d,]*)[\.]?\d*" 205 | count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)" 206 | 207 | def normalize_number(text): 208 | text = normalize_with_dictionary(text, unit_to_kor1) 209 | text = normalize_with_dictionary(text, unit_to_kor2) 210 | text = re.sub(number_checker + count_checker, 211 | lambda x: number_to_korean(x, True), text) 212 | text = re.sub(number_checker, 213 | lambda x: number_to_korean(x, False), text) 214 | return text 215 | 216 | num_to_kor1 = [""] + list("일이삼사오육칠팔구") 217 | num_to_kor2 = [""] + list("만억조경해") 218 | num_to_kor3 = [""] + list("십백천") 219 | 220 | #count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"] 221 | count_to_kor1 = [""] + ["한","두","세","네","다섯","여섯","일곱","여덟","아홉"] 222 | 223 | count_tenth_dict = { 224 | "십": "열", 225 | "두십": "스물", 226 | "세십": "서른", 227 | "네십": "마흔", 228 | "다섯십": "쉰", 229 | "여섯십": "예순", 230 | "일곱십": "일흔", 231 | "여덟십": "여든", 232 | "아홉십": "아흔", 233 | } 234 | 235 | 236 | 237 | def number_to_korean(num_str, is_count=False): 238 | if is_count: 239 | num_str, unit_str = num_str.group(1), num_str.group(2) 240 | else: 241 | num_str, unit_str = num_str.group(), "" 242 | 243 | num_str = num_str.replace(',', '') 244 | num = ast.literal_eval(num_str) 245 | 246 | if num == 0: 247 | return "영" 248 | 249 | check_float = num_str.split('.') 250 | if len(check_float) == 2: 251 | digit_str, float_str = check_float 252 | elif len(check_float) >= 3: 253 | raise Exception(" [!] Wrong number format") 254 | else: 255 | digit_str, float_str = check_float[0], None 256 | 257 | if is_count and float_str is not None: 258 | raise Exception(" [!] 
`is_count` and float number does not fit each other") 259 | 260 | digit = int(digit_str) 261 | 262 | if digit_str.startswith("-"): 263 | digit, digit_str = abs(digit), str(abs(digit)) 264 | 265 | kor = "" 266 | size = len(str(digit)) 267 | tmp = [] 268 | 269 | for i, v in enumerate(digit_str, start=1): 270 | v = int(v) 271 | 272 | if v != 0: 273 | if is_count: 274 | tmp += count_to_kor1[v] 275 | else: 276 | tmp += num_to_kor1[v] 277 | 278 | tmp += num_to_kor3[(size - i) % 4] 279 | 280 | if (size - i) % 4 == 0 and len(tmp) != 0: 281 | kor += "".join(tmp) 282 | tmp = [] 283 | kor += num_to_kor2[int((size - i) / 4)] 284 | 285 | if is_count: 286 | if kor.startswith("한") and len(kor) > 1: 287 | kor = kor[1:] 288 | 289 | if any(word in kor for word in count_tenth_dict): 290 | kor = re.sub( 291 | '|'.join(count_tenth_dict.keys()), 292 | lambda x: count_tenth_dict[x.group()], kor) 293 | 294 | if not is_count and kor.startswith("일") and len(kor) > 1: 295 | kor = kor[1:] 296 | 297 | if float_str is not None: 298 | kor += "쩜 " 299 | kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) 300 | 301 | if num_str.startswith("+"): 302 | kor = "플러스 " + kor 303 | elif num_str.startswith("-"): 304 | kor = "마이너스 " + kor 305 | 306 | return kor + unit_str 307 | 308 | if __name__ == "__main__": 309 | def test_normalize(text): 310 | print(text) 311 | print(normalize(text)) 312 | print("="*30) 313 | 314 | test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute") 315 | test_normalize("오늘(13일) 101마리 강아지가") 316 | test_normalize('"저돌"(猪突) 입니다.') 317 | test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') 318 | test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") 319 | test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") 320 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from jamo import h2j, j2h 8 | from jamo.jamo import _jamo_char_to_hcj 9 | 10 | from .korean import ALL_SYMBOLS, PAD, EOS 11 | 12 | # For english 13 | en_symbols = PAD+EOS+'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' #<-For deployment(Because korean ALL_SYMBOLS follow this convention) 14 | 15 | symbols = ALL_SYMBOLS # for korean 16 | 17 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import math 4 | import argparse 5 | import traceback 6 | import subprocess 7 | import numpy as np 8 | from jamo import h2j 9 | import tensorflow as tf 10 | from datetime import datetime 11 | from functools import partial 12 | 13 | from hparams import hparams, hparams_debug_string 14 | from models import create_model, get_most_recent_checkpoint 15 | 16 | from utils import ValueWindow, prepare_dirs 17 | from utils import infolog, warning, plot, load_hparams 18 | from utils import get_git_revision_hash, get_git_diff, str2bool, parallel_run 19 | 20 | from audio import save_audio, inv_spectrogram 21 | from text import sequence_to_text, text_to_sequence 22 | from datasets.datafeeder import DataFeeder, _prepare_inputs 23 | 24 | log = infolog.log 25 | 26 | 27 | def create_batch_inputs_from_texts(texts): 28 | sequences = [text_to_sequence(text) for text in texts] 29 | 30 | inputs = _prepare_inputs(sequences) 31 | input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32) 32 | 33 | for idx, (seq, text) in enumerate(zip(inputs, texts)): 34 | recovered_text = sequence_to_text(seq, skip_eos_and_pad=True) 35 | if recovered_text != h2j(text): 36 | log(" [{}] {}".format(idx, text)) 37 | log(" [{}] {}".format(idx, recovered_text)) 38 | log("="*30) 39 | 40 | return inputs, input_lengths 41 | 42 | 43 | def get_git_commit(): 44 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 45 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 46 | log('Git commit: %s' % commit) 47 | return commit 48 | 49 | 50 | def add_stats(model, model2=None, scope_name='train'): 51 | with tf.variable_scope(scope_name) as scope: 52 | summaries = [ 53 | tf.summary.scalar('loss_mel', model.mel_loss), 54 | tf.summary.scalar('loss_linear', model.linear_loss), 55 | tf.summary.scalar('loss', model.loss_without_coeff), 56 | ] 57 | 58 | if scope_name == 'train': 59 | gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None] 60 | 61 | summaries.extend([ 62 | tf.summary.scalar('learning_rate', model.learning_rate), 63 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)), 64 | ]) 65 | 66 | if model2 is not None: 67 | with tf.variable_scope('gap_test-train') as scope: 68 | summaries.extend([ 69 | tf.summary.scalar('loss_mel', 70 | model.mel_loss - model2.mel_loss), 71 | tf.summary.scalar('loss_linear', 72 | model.linear_loss - model2.linear_loss), 73 | tf.summary.scalar('loss', 74 | model.loss_without_coeff - model2.loss_without_coeff), 75 | ]) 76 | 77 | return tf.summary.merge(summaries) 78 | 79 | 80 | def save_and_plot_fn(args, log_dir, step, loss, prefix): 81 | idx, (seq, spec, align) = args 82 | 83 | audio_path = os.path.join( 84 | log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx)) 85 | align_path = os.path.join( 86 | log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx)) 87 | 88 | waveform = inv_spectrogram(spec.T) 89 | save_audio(waveform, audio_path) 90 | 91 | info_text = 'step={:d}, loss={:.5f}'.format(step, loss) 92 | if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]: 93 | log('Training korean : Use jamo') 94 | 
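        # Korean runs recombine the jamo sequence into syllable blocks
        # (combine_jamo=True) so the alignment plot's x-axis labels stay
        # readable; the else-branch below plots the raw symbols instead.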
plot.plot_alignment( 95 | align, align_path, info=info_text, 96 | text=sequence_to_text(seq, 97 | skip_eos_and_pad=True, combine_jamo=True), isKorean=True) 98 | else: 99 | log('Training non-korean : X use jamo') 100 | plot.plot_alignment( 101 | align, align_path, info=info_text, 102 | text=sequence_to_text(seq, 103 | skip_eos_and_pad=True, combine_jamo=False), isKorean=False) 104 | 105 | def save_and_plot(sequences, spectrograms, 106 | alignments, log_dir, step, loss, prefix): 107 | 108 | fn = partial(save_and_plot_fn, 109 | log_dir=log_dir, step=step, loss=loss, prefix=prefix) 110 | items = list(enumerate(zip(sequences, spectrograms, alignments))) 111 | 112 | parallel_run(fn, items, parallel=False) 113 | log('Test finished for step {}.'.format(step)) 114 | 115 | 116 | def train(log_dir, config): 117 | config.data_paths = config.data_paths 118 | 119 | data_dirs = [os.path.join(data_path, "data") \ 120 | for data_path in config.data_paths] 121 | num_speakers = len(data_dirs) 122 | config.num_test = config.num_test_per_speaker * num_speakers 123 | 124 | if num_speakers > 1 and hparams.model_type not in ["deepvoice", "simple"]: 125 | raise Exception("[!] Unkown model_type for multi-speaker: {}".format(config.model_type)) 126 | 127 | commit = get_git_commit() if config.git else 'None' 128 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 129 | 130 | log(' [*] git recv-parse HEAD:\n%s' % get_git_revision_hash()) 131 | log('='*50) 132 | #log(' [*] dit diff:\n%s' % get_git_diff()) 133 | log('='*50) 134 | log(' [*] Checkpoint path: %s' % checkpoint_path) 135 | log(' [*] Loading training data from: %s' % data_dirs) 136 | log(' [*] Using model: %s' % config.model_dir) 137 | log(hparams_debug_string()) 138 | 139 | # Set up DataFeeder: 140 | coord = tf.train.Coordinator() 141 | with tf.variable_scope('datafeeder') as scope: 142 | train_feeder = DataFeeder( 143 | coord, data_dirs, hparams, config, 32, 144 | data_type='train', batch_size=hparams.batch_size) 145 | test_feeder = DataFeeder( 146 | coord, data_dirs, hparams, config, 8, 147 | data_type='test', batch_size=config.num_test) 148 | 149 | # Set up model: 150 | is_randomly_initialized = config.initialize_path is None 151 | global_step = tf.Variable(0, name='global_step', trainable=False) 152 | 153 | with tf.variable_scope('model') as scope: 154 | model = create_model(hparams) 155 | model.initialize( 156 | train_feeder.inputs, train_feeder.input_lengths, 157 | num_speakers, train_feeder.speaker_id, 158 | train_feeder.mel_targets, train_feeder.linear_targets, 159 | train_feeder.loss_coeff, 160 | is_randomly_initialized=is_randomly_initialized) 161 | 162 | model.add_loss() 163 | model.add_optimizer(global_step) 164 | train_stats = add_stats(model, scope_name='stats') # legacy 165 | 166 | with tf.variable_scope('model', reuse=True) as scope: 167 | test_model = create_model(hparams) 168 | test_model.initialize( 169 | test_feeder.inputs, test_feeder.input_lengths, 170 | num_speakers, test_feeder.speaker_id, 171 | test_feeder.mel_targets, test_feeder.linear_targets, 172 | test_feeder.loss_coeff, rnn_decoder_test_mode=True, 173 | is_randomly_initialized=is_randomly_initialized) 174 | test_model.add_loss() 175 | 176 | test_stats = add_stats(test_model, model, scope_name='test') 177 | test_stats = tf.summary.merge([test_stats, train_stats]) 178 | 179 | # Bookkeeping: 180 | step = 0 181 | time_window = ValueWindow(100) 182 | loss_window = ValueWindow(100) 183 | saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=2) 184 | 185 | 
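    # NOTE: sess_config below enables soft placement and gpu_options.allow_growth,
    # but the training session opened a few lines further down uses tf.Session()
    # with the configured variant commented out, so these options never take
    # effect. Re-enabling the commented line is presumably what was intended:
    #   with tf.Session(config=sess_config) as sess: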
sess_config = tf.ConfigProto( 186 | log_device_placement=False, 187 | allow_soft_placement=True) 188 | sess_config.gpu_options.allow_growth=True 189 | 190 | # Train! 191 | #with tf.Session(config=sess_config) as sess: 192 | with tf.Session() as sess: 193 | try: 194 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 195 | sess.run(tf.global_variables_initializer()) 196 | 197 | if config.load_path: 198 | # Restore from a checkpoint if the user requested it. 199 | restore_path = get_most_recent_checkpoint(config.model_dir) 200 | saver.restore(sess, restore_path) 201 | log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 202 | elif config.initialize_path: 203 | restore_path = get_most_recent_checkpoint(config.initialize_path) 204 | saver.restore(sess, restore_path) 205 | log('Initialized from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 206 | 207 | zero_step_assign = tf.assign(global_step, 0) 208 | sess.run(zero_step_assign) 209 | 210 | start_step = sess.run(global_step) 211 | log('='*50) 212 | log(' [*] Global step is reset to {}'. \ 213 | format(start_step)) 214 | log('='*50) 215 | else: 216 | log('Starting new training run at commit: %s' % commit, slack=True) 217 | 218 | start_step = sess.run(global_step) 219 | 220 | train_feeder.start_in_session(sess, start_step) 221 | test_feeder.start_in_session(sess, start_step) 222 | 223 | while not coord.should_stop(): 224 | start_time = time.time() 225 | step, loss, opt = sess.run( 226 | [global_step, model.loss_without_coeff, model.optimize], 227 | feed_dict=model.get_dummy_feed_dict()) 228 | 229 | time_window.append(time.time() - start_time) 230 | loss_window.append(loss) 231 | 232 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 233 | step, time_window.average, loss, loss_window.average) 234 | log(message, slack=(step % config.checkpoint_interval == 0)) 235 | 236 | if loss > 100 or math.isnan(loss): 237 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 238 | raise Exception('Loss Exploded') 239 | 240 | if step % config.summary_interval == 0: 241 | log('Writing summary at step: %d' % step) 242 | 243 | feed_dict = { 244 | **model.get_dummy_feed_dict(), 245 | **test_model.get_dummy_feed_dict() 246 | } 247 | summary_writer.add_summary(sess.run( 248 | test_stats, feed_dict=feed_dict), step) 249 | 250 | if step % config.checkpoint_interval == 0: 251 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 252 | saver.save(sess, checkpoint_path, global_step=step) 253 | 254 | if step % config.test_interval == 0: 255 | log('Saving audio and alignment...') 256 | num_test = config.num_test 257 | 258 | fetches = [ 259 | model.inputs[:num_test], 260 | model.linear_outputs[:num_test], 261 | model.alignments[:num_test], 262 | test_model.inputs[:num_test], 263 | test_model.linear_outputs[:num_test], 264 | test_model.alignments[:num_test], 265 | ] 266 | feed_dict = { 267 | **model.get_dummy_feed_dict(), 268 | **test_model.get_dummy_feed_dict() 269 | } 270 | 271 | sequences, spectrograms, alignments, \ 272 | test_sequences, test_spectrograms, test_alignments = \ 273 | sess.run(fetches, feed_dict=feed_dict) 274 | 275 | save_and_plot(sequences[:1], spectrograms[:1], alignments[:1], 276 | log_dir, step, loss, "train") 277 | save_and_plot(test_sequences, test_spectrograms, test_alignments, 278 | log_dir, step, loss, "test") 279 | 280 | except Exception as e: 281 | log('Exiting due to exception: %s' % e, slack=True) 282 | traceback.print_exc() 283 | coord.request_stop(e) 284 | 285 | 286 | def main(): 287 | parser = argparse.ArgumentParser() 288 | 289 | parser.add_argument('--log_dir', default='logs') 290 | parser.add_argument('--data_paths', default='datasets/kr_example') 291 | parser.add_argument('--load_path', default=None) 292 | parser.add_argument('--initialize_path', default=None) 293 | 294 | parser.add_argument('--num_test_per_speaker', type=int, default=2) 295 | parser.add_argument('--random_seed', type=int, default=123) 296 | parser.add_argument('--summary_interval', type=int, default=100) 297 | parser.add_argument('--test_interval', type=int, default=500) 298 | parser.add_argument('--checkpoint_interval', type=int, default=1000) 299 | parser.add_argument('--skip_path_filter', 300 | type=str2bool, default=False, help='Use only for debugging') 301 | 302 | parser.add_argument('--slack_url', 303 | help='Slack webhook URL to get periodic reports.') 304 | parser.add_argument('--git', action='store_true', 305 | help='If set, verify that the client is clean.') 306 | 307 | config = parser.parse_args() 308 | config.data_paths = config.data_paths.split(",") 309 | setattr(hparams, "num_speakers", len(config.data_paths)) 310 | 311 | prepare_dirs(config, hparams) 312 | 313 | log_path = os.path.join(config.model_dir, 'train.log') 314 | infolog.init(log_path, config.model_dir, config.slack_url) 315 | 316 | tf.set_random_seed(config.random_seed) 317 | print(config.data_paths) 318 | 319 | if any("krbook" not in data_path for data_path in config.data_paths) and \ 320 | hparams.sample_rate != 20000: 321 | warning("Detect non-krbook dataset. May need to set sampling rate from {} to 20000".\ 322 | format(hparams.sample_rate)) 323 | 324 | if any('LJ' in data_path for data_path in config.data_paths) and \ 325 | hparams.sample_rate != 22050: 326 | warning("Detect LJ Speech dataset. 
Set sampling rate from {} to 22050".\ 327 | format(hparams.sample_rate)) 328 | 329 | if config.load_path is not None and config.initialize_path is not None: 330 | raise Exception(" [!] Only one of load_path and initialize_path should be set") 331 | 332 | train(config.model_dir, config) 333 | 334 | 335 | if __name__ == '__main__': 336 | main() 337 | -------------------------------------------------------------------------------- /utils/NanumBarunGothic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/utils/NanumBarunGothic.ttf -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import json 5 | import requests 6 | import subprocess 7 | from tqdm import tqdm 8 | from contextlib import closing 9 | from multiprocessing import Pool 10 | from collections import namedtuple 11 | from datetime import datetime, timedelta 12 | from shutil import copyfile as copy_file 13 | 14 | PARAMS_NAME = "params.json" 15 | 16 | class ValueWindow(): 17 | def __init__(self, window_size=100): 18 | self._window_size = window_size 19 | self._values = [] 20 | 21 | def append(self, x): 22 | self._values = self._values[-(self._window_size - 1):] + [x] 23 | 24 | @property 25 | def sum(self): 26 | return sum(self._values) 27 | 28 | @property 29 | def count(self): 30 | return len(self._values) 31 | 32 | @property 33 | def average(self): 34 | return self.sum / max(1, self.count) 35 | 36 | def reset(self): 37 | self._values = [] 38 | 39 | def prepare_dirs(config, hparams): 40 | if hasattr(config, "data_paths"): 41 | config.datasets = [ 42 | os.path.basename(data_path) for data_path in config.data_paths] 43 | dataset_desc = "+".join(config.datasets) 44 | 45 | if config.load_path: 46 | config.model_dir = config.load_path 47 | else: 48 | config.model_name = "{}_{}".format(dataset_desc, get_time()) 49 | config.model_dir = os.path.join(config.log_dir, config.model_name) 50 | 51 | for path in [config.log_dir, config.model_dir]: 52 | if not os.path.exists(path): 53 | os.makedirs(path) 54 | 55 | if config.load_path: 56 | load_hparams(hparams, config.model_dir) 57 | else: 58 | setattr(hparams, "num_speakers", len(config.datasets)) 59 | 60 | save_hparams(config.model_dir, hparams) 61 | copy_file("hparams.py", os.path.join(config.model_dir, "hparams.py")) 62 | 63 | def makedirs(path): 64 | if not os.path.exists(path): 65 | print(" [*] Make directories : {}".format(path)) 66 | os.makedirs(path) 67 | 68 | def remove_file(path): 69 | if os.path.exists(path): 70 | print(" [*] Removed: {}".format(path)) 71 | os.remove(path) 72 | 73 | def backup_file(path): 74 | root, ext = os.path.splitext(path) 75 | new_path = "{}.backup_{}{}".format(root, get_time(), ext) 76 | 77 | os.rename(path, new_path) 78 | print(" [*] {} has backup: {}".format(path, new_path)) 79 | 80 | def get_time(): 81 | return datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 82 | 83 | def write_json(path, data): 84 | with open(path, 'w',encoding='utf-8') as f: 85 | json.dump(data, f, indent=4, sort_keys=True, ensure_ascii=False) 86 | 87 | def load_json(path, as_class=False, encoding='euc-kr'): 88 | with open(path,encoding=encoding) as f: 89 | content = f.read() 90 | content = re.sub(",\s*}", "}", content) 91 | content = re.sub(",\s*]", "]", content) 92 | 
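    # The two substitutions above strip trailing commas (e.g. '{"a": 1,}' ->
    # '{"a": 1}') so JSON files that are not strictly valid still parse with
    # json.loads below.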
93 | if as_class: 94 | data = json.loads(content, object_hook=\ 95 | lambda data: namedtuple('Data', data.keys())(*data.values())) 96 | else: 97 | data = json.loads(content) 98 | #print(data) 99 | return data 100 | 101 | def save_hparams(model_dir, hparams): 102 | param_path = os.path.join(model_dir, PARAMS_NAME) 103 | 104 | info = eval(hparams.to_json(). \ 105 | replace('true', 'True').replace('false', 'False')) 106 | write_json(param_path, info) 107 | 108 | print(" [*] MODEL dir: {}".format(model_dir)) 109 | print(" [*] PARAM path: {}".format(param_path)) 110 | 111 | def load_hparams(hparams, load_path, skip_list=[]): 112 | path = os.path.join(load_path, PARAMS_NAME) 113 | 114 | new_hparams = load_json(path) 115 | hparams_keys = vars(hparams).keys() 116 | 117 | for key, value in new_hparams.items(): 118 | if key in skip_list or key not in hparams_keys: 119 | print("Skip {} because it not exists".format(key)) 120 | continue 121 | 122 | if key not in ['job_name', 'num_workers', 'display', 'is_train', 'load_path'] or \ 123 | key == "pointer_load_path": 124 | original_value = getattr(hparams, key) 125 | if original_value != value: 126 | print("UPDATE {}: {} -> {}".format(key, getattr(hparams, key), value)) 127 | setattr(hparams, key, value) 128 | 129 | def add_prefix(path, prefix): 130 | dir_path, filename = os.path.dirname(path), os.path.basename(path) 131 | return "{}/{}.{}".format(dir_path, prefix, filename) 132 | 133 | def add_postfix(path, postfix): 134 | path_without_ext, ext = path.rsplit('.', 1) 135 | return "{}.{}.{}".format(path_without_ext, postfix, ext) 136 | 137 | def remove_postfix(path): 138 | items = path.rsplit('.', 2) 139 | return items[0] + "." + items[2] 140 | 141 | def parallel_run(fn, items, desc="", parallel=True): 142 | results = [] 143 | 144 | if parallel: 145 | with closing(Pool()) as pool: 146 | for out in tqdm(pool.imap_unordered( 147 | fn, items), total=len(items), desc=desc): 148 | if out is not None: 149 | results.append(out) 150 | else: 151 | for item in tqdm(items, total=len(items), desc=desc): 152 | out = fn(item) 153 | if out is not None: 154 | results.append(out) 155 | 156 | return results 157 | 158 | def which(program): 159 | if os.name == "nt" and not program.endswith(".exe"): 160 | program += ".exe" 161 | 162 | envdir_list = [os.curdir] + os.environ["PATH"].split(os.pathsep) 163 | 164 | for envdir in envdir_list: 165 | program_path = os.path.join(envdir, program) 166 | if os.path.isfile(program_path) and os.access(program_path, os.X_OK): 167 | return program_path 168 | 169 | def get_encoder_name(): 170 | if which("avconv"): 171 | return "avconv" 172 | elif which("ffmpeg"): 173 | return "ffmpeg" 174 | else: 175 | return "ffmpeg" 176 | 177 | def download_with_url(url, dest_path, chunk_size=32*1024): 178 | with open(dest_path, "wb") as f: 179 | response = requests.get(url, stream=True) 180 | total_size = int(response.headers.get('content-length', 0)) 181 | 182 | for chunk in response.iter_content(chunk_size): 183 | if chunk: # filter out keep-alive new chunks 184 | f.write(chunk) 185 | return True 186 | 187 | def str2bool(v): 188 | return v.lower() in ('true', '1') 189 | 190 | def get_git_revision_hash(): 191 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode("utf-8") 192 | 193 | def get_git_diff(): 194 | return subprocess.check_output(['git', 'diff']).decode("utf-8") 195 | 196 | def warning(msg): 197 | print("="*40) 198 | print(" [!] 
{}".format(msg)) 199 | print("="*40) 200 | print() 201 | 202 | def query_yes_no(question, default=None): 203 | # Code from https://stackoverflow.com/a/3041990 204 | valid = {"yes": True, "y": True, "ye": True, 205 | "no": False, "n": False} 206 | if default is None: 207 | prompt = " [y/n] " 208 | elif default == "yes": 209 | prompt = " [Y/n] " 210 | elif default == "no": 211 | prompt = " [y/N] " 212 | else: 213 | raise ValueError("invalid default answer: '%s'" % default) 214 | 215 | while True: 216 | sys.stdout.write(question + prompt) 217 | choice = input().lower() 218 | if default is not None and choice == '': 219 | return valid[default] 220 | elif choice in valid: 221 | return valid[choice] 222 | else: 223 | sys.stdout.write("Please respond with 'yes' or 'no' " 224 | "(or 'y' or 'n').\n") 225 | -------------------------------------------------------------------------------- /utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib 3 | from jamo import h2j, j2hcj 4 | 5 | matplotlib.use('Agg') 6 | matplotlib.rc('font', family="NanumBarunGothic") 7 | import matplotlib.pyplot as plt 8 | 9 | from text import PAD, EOS 10 | from utils import add_postfix 11 | from text.korean import normalize 12 | 13 | def plot(alignment, info, text, isKorean=True): 14 | char_len, audio_len = alignment.shape # 145, 200 15 | 16 | fig, ax = plt.subplots(figsize=(char_len/5, 5)) 17 | im = ax.imshow( 18 | alignment.T, 19 | aspect='auto', 20 | origin='lower', 21 | interpolation='none') 22 | 23 | xlabel = 'Encoder timestep' 24 | ylabel = 'Decoder timestep' 25 | 26 | if info is not None: 27 | xlabel += '\n{}'.format(info) 28 | 29 | plt.xlabel(xlabel) 30 | plt.ylabel(ylabel) 31 | 32 | if text: 33 | if isKorean: 34 | jamo_text = j2hcj(h2j(normalize(text))) 35 | else: 36 | jamo_text=text 37 | pad = [PAD] * (char_len - len(jamo_text) - 1) 38 | 39 | plt.xticks(range(char_len), 40 | [tok for tok in 
jamo_text] + [EOS] + pad) 41 | 42 | if text is not None: 43 | while True: 44 | if text[-1] in [EOS, PAD]: 45 | text = text[:-1] 46 | else: 47 | break 48 | plt.title(text) 49 | 50 | plt.tight_layout() 51 | 52 | def plot_alignment( 53 | alignment, path, info=None, text=None, isKorean=True): 54 | 55 | if text: 56 | tmp_alignment = alignment[:len(h2j(text)) + 2] 57 | 58 | plot(tmp_alignment, info, text, isKorean) 59 | plt.savefig(path, format='png') 60 | else: 61 | plot(alignment, info, text, isKorean) 62 | plt.savefig(path, format='png') 63 | 64 | print(" [*] Plot saved: {}".format(path)) 65 | -------------------------------------------------------------------------------- /web/static/css/main.css: -------------------------------------------------------------------------------- 1 | @media screen and (min-width: 1452px) { 2 | .container { 3 | max-width: 1152px; 4 | width: 1152px; 5 | } 6 | } 7 | @media screen and (min-width: 1260px) { 8 | .container { 9 | max-width: 960px; 10 | width: 960px; 11 | } 12 | } 13 | @media screen and (min-width: 1068px) { 14 | .container { 15 | max-width: 768px; 16 | width: 768px; 17 | } 18 | } 19 | 20 | .container { 21 | margin: 0 auto; 22 | position: relative; 23 | } 24 | 25 | #wave { 26 | height: 100px; 27 | } 28 | 29 | #waveform { 30 | display: none; 31 | } 32 | 33 | #nav { 34 | position: fixed !important; 35 | top: 0; 36 | left: 0; 37 | right: 0; 38 | z-index: 100; 39 | } 40 | 41 | .card { 42 | padding: 0; 43 | } 44 | 45 | .columns { 46 | margin-left: 0rem; 47 | margin-right: 0rem; 48 | margin-top: 0rem; 49 | } 50 | 51 | #text { 52 | font-size: 1.2em; 53 | padding: 0.7em 1em 0.7em 1em; 54 | background: transparent; 55 | color: white; 56 | } 57 | 58 | .dark { 59 | background-color: black; 60 | } 61 | -------------------------------------------------------------------------------- /web/static/js/main.js: -------------------------------------------------------------------------------- 1 | var sw; 2 | var wavesurfer; 3 | 4 | var defaultSpeed = 0.03; 5 | var defaultAmplitude = 0.3; 6 | 7 | var activeColors = [[32,133,252], [94,252,169], [253,71,103]]; 8 | var inactiveColors = [[241,243,245], [206,212,218], [222,226,230], [173,181,189]]; 9 | 10 | function generate(ip, port, text, speaker_id) { 11 | $("#synthesize").addClass("is-loading"); 12 | 13 | var uri = 'http://' + ip + ':' + port 14 | var url = uri + '/generate?text=' + encodeURIComponent(text) + "&speaker_id=" + speaker_id; 15 | 16 | fetch(url, {cache: 'no-cache', mode: 'cors'}) 17 | .then(function(res) { 18 | if (!res.ok) throw Error(response.statusText) 19 | return res.blob() 20 | }).then(function(blob) { 21 | var url = URL.createObjectURL(blob); 22 | console.log(url); 23 | inProgress = false; 24 | wavesurfer.load(url); 25 | $("#synthesize").removeClass("is-loading"); 26 | }).catch(function(err) { 27 | showWarning("에러가 발생했습니다"); 28 | inProgress = false; 29 | $("#synthesize").removeClass("is-loading"); 30 | }); 31 | } 32 | 33 | (function(window, document, undefined){ 34 | window.onload = init; 35 | 36 | function setDefaultColor(sw, isActive) { 37 | for (idx=0; idx < sw.curves.length; idx++) { 38 | var curve = sw.curves[idx]; 39 | 40 | if (isActive) { 41 | curve.color = activeColors[idx % activeColors.length]; 42 | } else { 43 | curve.color = inactiveColors[idx % inactiveColors.length]; 44 | } 45 | } 46 | } 47 | 48 | function init(){ 49 | sw = new SiriWave9({ 50 | amplitude: defaultAmplitude, 51 | container: document.getElementById('wave'), 52 | autostart: true, 53 | speed: defaultSpeed, 54 | style: 'ios9', 55 
| }); 56 | sw.setSpeed(defaultSpeed); 57 | setDefaultColor(sw, false); 58 | 59 | wavesurfer = WaveSurfer.create({ 60 | container: '#waveform', 61 | waveColor: 'violet', 62 | barWidth: 3, 63 | progressColor: 'purple' 64 | }); 65 | 66 | wavesurfer.on('ready', function () { 67 | this.width = wavesurfer.getDuration() * 68 | wavesurfer.params.minPxPerSec * wavesurfer.params.pixelRatio; 69 | this.peaks = wavesurfer.backend.getPeaks(width); 70 | 71 | wavesurfer.play(); 72 | }); 73 | 74 | wavesurfer.on('audioprocess', function () { 75 | var percent = wavesurfer.backend.getPlayedPercents(); 76 | var height = this.peaks[parseInt(this.peaks.length * percent)]; 77 | if (height > 0) { 78 | sw.setAmplitude(height*3); 79 | } 80 | }); 81 | 82 | wavesurfer.on('finish', function () { 83 | sw.setSpeed(defaultSpeed); 84 | sw.setAmplitude(defaultAmplitude); 85 | setDefaultColor(sw, false); 86 | }); 87 | 88 | $(document).on('click', "#synthesize", function() { 89 | synthesize(); 90 | }); 91 | 92 | function synthesize() { 93 | var text = $("#text").val().trim(); 94 | var text_length = text.length; 95 | 96 | var speaker_id = $('input[name=id]:checked').val(); 97 | var speaker = $('input[name=id]:checked').attr("speaker"); 98 | 99 | generate('0.0.0.0', 51000, text, speaker_id); 100 | 101 | var lowpass = wavesurfer.backend.ac.createGain(); 102 | wavesurfer.backend.setFilter(lowpass); 103 | } 104 | } 105 | })(window, document, undefined); 106 | -------------------------------------------------------------------------------- /web/static/js/siriwave.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 3 | //////////////////// 4 | // SiriWave9Curve // 5 | //////////////////// 6 | 7 | function SiriWave9Curve(opt) { 8 | opt = opt || {}; 9 | this.controller = opt.controller; 10 | this.color = opt.color; 11 | this.tick = 0; 12 | 13 | this.respawn(); 14 | } 15 | 16 | SiriWave9Curve.prototype.respawn = function() { 17 | this.amplitude = 0.3 + Math.random() * 0.7; 18 | this.seed = Math.random(); 19 | this.open_class = 2+(Math.random()*3)|0; 20 | }; 21 | 22 | SiriWave9Curve.prototype.equation = function(i) { 23 | var p = this.tick; 24 | var y = -1 * Math.abs(Math.sin(p)) * this.controller.amplitude * this.amplitude * this.controller.MAX * Math.pow(1/(1+Math.pow(this.open_class*i,2)),2); 25 | if (Math.abs(y) < 0.001) { 26 | this.respawn(); 27 | } 28 | return y; 29 | }; 30 | 31 | SiriWave9Curve.prototype._draw = function(m) { 32 | this.tick += this.controller.speed * (1-0.5*Math.sin(this.seed*Math.PI)); 33 | 34 | var ctx = this.controller.ctx; 35 | ctx.beginPath(); 36 | 37 | var x_base = this.controller.width/2 + (-this.controller.width/4 + this.seed*(this.controller.width/2) ); 38 | var y_base = this.controller.height/2; 39 | 40 | var x, y, x_init; 41 | 42 | var i = -3; 43 | while (i <= 3) { 44 | x = x_base + i * this.controller.width/4; 45 | y = y_base + (m * this.equation(i)); 46 | x_init = x_init || x; 47 | ctx.lineTo(x, y); 48 | i += 0.01; 49 | } 50 | 51 | var h = Math.abs(this.equation(0)); 52 | var gradient = ctx.createRadialGradient(x_base, y_base, h*1.15, x_base, y_base, h * 0.3 ); 53 | gradient.addColorStop(0, 'rgba(' + this.color.join(',') + ',0.4)'); 54 | gradient.addColorStop(1, 'rgba(' + this.color.join(',') + ',0.2)'); 55 | 56 | ctx.fillStyle = gradient; 57 | 58 | ctx.lineTo(x_init, y_base); 59 | ctx.closePath(); 60 | 61 | ctx.fill(); 62 | }; 63 | 64 | SiriWave9Curve.prototype.draw = function() { 65 | this._draw(-1); 66 | this._draw(1); 67 | }; 68 | 69 | 70 | 
////////////// 71 | // SiriWave // 72 | ////////////// 73 | 74 | function SiriWave9(opt) { 75 | opt = opt || {}; 76 | 77 | this.tick = 0; 78 | this.run = false; 79 | 80 | // UI vars 81 | 82 | this.ratio = opt.ratio || window.devicePixelRatio || 1; 83 | 84 | this.width = this.ratio * (opt.width || 320); 85 | this.height = this.ratio * (opt.height || 100); 86 | this.MAX = this.height/2; 87 | 88 | this.speed = 0.1; 89 | this.amplitude = opt.amplitude || 1; 90 | 91 | // Interpolation 92 | 93 | this.speedInterpolationSpeed = opt.speedInterpolationSpeed || 0.005; 94 | this.amplitudeInterpolationSpeed = opt.amplitudeInterpolationSpeed || 0.05; 95 | 96 | this._interpolation = { 97 | speed: this.speed, 98 | amplitude: this.amplitude 99 | }; 100 | 101 | // Canvas 102 | 103 | this.canvas = document.createElement('canvas'); 104 | this.canvas.width = this.width; 105 | this.canvas.height = this.height; 106 | 107 | if (opt.cover) { 108 | this.canvas.style.width = this.canvas.style.height = '100%'; 109 | } else { 110 | this.canvas.style.width = (this.width / this.ratio) + 'px'; 111 | this.canvas.style.height = (this.height / this.ratio) + 'px'; 112 | } 113 | 114 | this.container = opt.container || document.body; 115 | this.container.appendChild(this.canvas); 116 | 117 | this.ctx = this.canvas.getContext('2d'); 118 | 119 | // Create curves 120 | 121 | this.curves = []; 122 | for (var i = 0; i < SiriWave9.prototype.COLORS.length; i++) { 123 | var color = SiriWave9.prototype.COLORS[i]; 124 | for (var j = 0; j < (3 * Math.random())|0; j++) { 125 | this.curves.push(new SiriWave9Curve({ 126 | controller: this, 127 | color: color 128 | })); 129 | } 130 | } 131 | 132 | if (opt.autostart) { 133 | this.start(); 134 | } 135 | } 136 | 137 | SiriWave9.prototype._interpolate = function(propertyStr) { 138 | increment = this[ propertyStr + 'InterpolationSpeed' ]; 139 | 140 | if (Math.abs(this._interpolation[propertyStr] - this[propertyStr]) <= increment) { 141 | this[propertyStr] = this._interpolation[propertyStr]; 142 | } else { 143 | if (this._interpolation[propertyStr] > this[propertyStr]) { 144 | this[propertyStr] += increment; 145 | } else { 146 | this[propertyStr] -= increment; 147 | } 148 | } 149 | }; 150 | 151 | SiriWave9.prototype._clear = function() { 152 | this.ctx.globalCompositeOperation = 'destination-out'; 153 | this.ctx.fillRect(0, 0, this.width, this.height); 154 | this.ctx.globalCompositeOperation = 'lighter'; 155 | }; 156 | 157 | SiriWave9.prototype._draw = function() { 158 | for (var i = 0, len = this.curves.length; i < len; i++) { 159 | this.curves[i].draw(); 160 | } 161 | }; 162 | 163 | SiriWave9.prototype._startDrawCycle = function() { 164 | if (this.run === false) return; 165 | this._clear(); 166 | 167 | // Interpolate values 168 | this._interpolate('amplitude'); 169 | this._interpolate('speed'); 170 | 171 | this._draw(); 172 | this.phase = (this.phase + Math.PI*this.speed) % (2*Math.PI); 173 | 174 | if (window.requestAnimationFrame) { 175 | window.requestAnimationFrame(this._startDrawCycle.bind(this)); 176 | } else { 177 | setTimeout(this._startDrawCycle.bind(this), 20); 178 | } 179 | }; 180 | 181 | SiriWave9.prototype.start = function() { 182 | this.tick = 0; 183 | this.run = true; 184 | this._startDrawCycle(); 185 | }; 186 | 187 | SiriWave9.prototype.stop = function() { 188 | this.tick = 0; 189 | this.run = false; 190 | }; 191 | 192 | SiriWave9.prototype.setSpeed = function(v, increment) { 193 | this._interpolation.speed = v; 194 | }; 195 | 196 | SiriWave9.prototype.setNoise = 
SiriWave9.prototype.setAmplitude = function(v) { 197 | this._interpolation.amplitude = Math.max(Math.min(v, 1), 0); 198 | }; 199 | 200 | SiriWave9.prototype.COLORS = [ 201 | [32,133,252], 202 | [94,252,169], 203 | [253,71,103] 204 | ]; 205 | 206 | if (typeof define === 'function' && define.amd) { 207 | define(function(){ return SiriWave9; }); 208 | } else { 209 | window.SiriWave9 = SiriWave9; 210 | } 211 | 212 | })(); 213 |
-------------------------------------------------------------------------------- /web/templates/index.html: --------------------------------------------------------------------------------
(The markup of this template did not survive the conversion of the repository to plain text; only the page title "D.Voice" and an empty line-number gutter remain. A hypothetical reconstruction, inferred from the selectors that web/static/js/main.js expects, is sketched below.)
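A minimal sketch of a compatible page body follows. The element ids (#wave, #waveform, #text, #synthesize) and the name="id" radio inputs with a speaker attribute are taken from main.js and main.css; everything else (layout classes, speaker labels, script URLs, the choice of <input> over <textarea>) is an assumption and is not recovered from the original file.

<!DOCTYPE html>
<html lang="ko">
<head>
  <meta charset="utf-8">
  <title>D.Voice</title>
  <!-- assumed path; the repo ships web/static/css/main.css -->
  <link rel="stylesheet" href="/static/css/main.css">
</head>
<body class="dark">
  <div class="container">
    <!-- visualiser target: main.js passes document.getElementById('wave') to SiriWave9 -->
    <div id="wave"></div>
    <!-- WaveSurfer container; hidden via #waveform { display: none; } in main.css -->
    <div id="waveform"></div>

    <!-- text to synthesize; main.js reads $("#text").val() -->
    <input id="text" type="text" placeholder="Type text to synthesize">

    <!-- speaker selection; main.js reads $('input[name=id]:checked').val() and .attr("speaker") -->
    <!-- speaker values/labels below are placeholders, not the original ones -->
    <label><input type="radio" name="id" value="0" speaker="speaker_0" checked> Speaker 0</label>
    <label><input type="radio" name="id" value="1" speaker="speaker_1"> Speaker 1</label>

    <!-- main.js binds a click handler to #synthesize and toggles its is-loading class -->
    <button id="synthesize" class="button">Synthesize</button>
  </div>

  <!-- script order is an assumption; main.js depends on jQuery, wavesurfer.js and siriwave.js -->
  <script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
  <script src="https://unpkg.com/wavesurfer.js"></script>
  <script src="/static/js/siriwave.js"></script>
  <script src="/static/js/main.js"></script>
</body>
</html>

Note that main.js also calls a showWarning() helper that is not defined in the static assets shown above, so the real template presumably defines it (or an equivalent) inline.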
73 | 74 | 75 | -------------------------------------------------------------------------------- /김앵커한마디_음성받아오기.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 17, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import os\n", 11 | "import sys\n", 12 | "import m3u8\n", 13 | "import json\n", 14 | "import requests\n", 15 | "import subprocess\n", 16 | "from functools import partial\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "from nltk import sent_tokenize \n", 19 | "import ast\n", 20 | "from utils import get_encoder_name, parallel_run, makedirs" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 18, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "kim_movie_info_list = []\n", 30 | "\n", 31 | "with open('./datasets/kim_anchor/kim_anchor_data_info.json', 'r') as myfile:\n", 32 | " data = myfile.read()\n", 33 | " kim_movie_info_list = ast.literal_eval(data)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 19, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "{'CONTENT': '오늘(3일)의 한마디는 괜찮은 항공사는 없습니까입니다.아시아나 항공 기내식 대란이 사흘째 이어지고 있습니다. 기내식을 싣지 못한 국제선 여객기 지연은 다반사. 그냥 출발하는 항공기들도 속출합니다. 승무원들은 거센 항의를 받으며 자신들은 굶고 라면을 끓여 승객들에게 내놓고, 수백명 목숨을 책임진 기장도 라면과 음료수 하나로 버틴다고 합니다. 이 와중에 기내식 관련 하청업체 대표는 자살했습니다.아시아나는 지난 15년간 기내식을 공급하던 LSG에게 올해 초 6개월 뒤 계약을 종료하겠다고 밝혔습니다. 금호홀딩스 투자를 둘러싼 갈등이 있었다고 합니다. 하지만 새로 선정한 업체에서 석 달 전 불이 나자 아시아나는 하청 업체 하나를 선정하고 대책을 마무리 했습니다. 하루 생산능력 3000명 분인 업체가 2만 5000명분 이상의 기내식을 공급하려다 보니 바로 사고가 터진 것입니다.대한항공 오너 일가의 갑질도 개탄스러운데 아시아나까지 이러니 참 실망스럽습니다. 도대체 대한민국에서 신뢰할 만한 항공사 하나를 찾기가 왜 이리 힘든 것입니까. 이게 다 독과점의 폐해 아닙니까. 뉴스현장 은 여기까지입니다. 
저희는 내일 다시 찾아뵙겠습니다.',\n", 45 | " 'SD_URL': 'http://jtbcvod.fvod.skcdn.com/newsvod/694404E4DE8EC7CB67F11A3108BFDD615EAB00176C1E9061534E5CB22FC65CBE45E9B49C57254C3A12BA8FFDC647CBFBC94AC6A51F864AA7F25918759417C651/playlist.m3u8',\n", 46 | " 'NEWS_URL': 'http://news.jtbc.joins.com/article/article.aspx?news_id=NB11659024',\n", 47 | " 'MOVIE_ID': 'NV10214115'}" 48 | ] 49 | }, 50 | "execution_count": 19, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "kim_movie_info_list[0]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 20, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "1296" 68 | ] 69 | }, 70 | "execution_count": 20, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "len(kim_movie_info_list)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 25, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "ffmpeg\n", 89 | "./test_encoder -y -loglevel panic -i ./test_video -ab 160k -ac 2 -ar 44100 -vn ./test_audio\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "if not os.path.exists('audio_path'):\n", 95 | " encoder = get_encoder_name()\n", 96 | " print(encoder)\n", 97 | " command = \"{} -y -loglevel panic -i {} -ab 160k -ac 2 -ar 44100 -vn {}\".\\\n", 98 | " format('./test_encoder', './test_video', './test_audio')\n", 99 | " print (command)\n", 100 | " subprocess.call(command, shell=True)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | --------------------------------------------------------------------------------