├── .gitignore ├── DISCLAIMER ├── LICENSE ├── README.md ├── app.py ├── assets ├── attention_multi_speaker.gif ├── attention_single_speaker.gif └── model.png ├── audio ├── __init__.py ├── audio_range.py ├── get_duration.py ├── google_speech.py └── silence.py ├── datasets ├── LJSpeech_1_0 │ ├── README │ └── prepare.py ├── __init__.py ├── datafeeder.py ├── generate_data.py ├── kim_anchor │ └── download.py ├── son │ └── download.py └── yuinna │ └── download.py ├── download.py ├── eval.py ├── hparams.py ├── models ├── __init__.py ├── helpers.py ├── modules.py ├── rnn_wrappers.py └── tacotron.py ├── nohup.out ├── recognition ├── alignment.py └── google.py ├── requirements.txt ├── run.sh ├── scripts └── prepare_son.sh ├── synthesizer.py ├── text ├── __init__.py ├── cleaners.py ├── en_numbers.py ├── english.py ├── ko_dictionary.py ├── korean.py └── symbols.py ├── train.py ├── utils ├── NanumBarunGothic.ttf ├── __init__.py ├── infolog.py └── plot.py ├── web ├── static │ ├── css │ │ └── main.css │ └── js │ │ ├── main.js │ │ └── siriwave.js └── templates │ └── index.html └── 김앵커한마디_음성받아오기.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Text 2 | *.png 3 | *.txt 4 | *.json 5 | *.csv 6 | 7 | # Data 8 | logs 9 | *.npy 10 | *.npz 11 | *.tar 12 | *.tar.gz 13 | 14 | # Media 15 | *.mp4 16 | *.mp3 17 | *.flac 18 | *.wav 19 | *.ts 20 | *.avi 21 | 22 | .DS_Store 23 | 24 | # Created by https://www.gitignore.io/api/python,vim 25 | 26 | ### Python ### 27 | # Byte-compiled / optimized / DLL files 28 | __pycache__/ 29 | *.py[cod] 30 | *$py.class 31 | 32 | # C extensions 33 | *.so 34 | 35 | # Distribution / packaging 36 | .Python 37 | env/ 38 | build/ 39 | develop-eggs/ 40 | dist/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib64/ 45 | parts/ 46 | sdist/ 47 | var/ 48 | wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *,cover 72 | .hypothesis/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # celery beat schedule file 102 | celerybeat-schedule 103 | 104 | # dotenv 105 | .env 106 | 107 | # virtualenv 108 | .venv/ 109 | venv/ 110 | ENV/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | 119 | ### Vim ### 120 | # swap 121 | [._]*.s[a-v][a-z] 122 | [._]*.sw[a-p] 123 | [._]s[a-v][a-z] 124 | [._]sw[a-p] 125 | # session 126 | Session.vim 127 | # temporary 128 | .netrwhist 129 | *~ 130 | # auto-generated tag files 131 | tags 132 | 133 | # End of https://www.gitignore.io/api/python,vim 134 | -------------------------------------------------------------------------------- /DISCLAIMER: -------------------------------------------------------------------------------- 1 | This is not an official [DEVSISTERS](http://devsisters.com/) product and is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk. 2 | 3 | 이것은 [데브시스터즈](http://devsisters.com/)의 공식적인 제품이 아닙니다. [데브시스터즈](http://devsisters.com )는 이 코드를 잘못 사용했을 시 발생한 문제나 이슈에 대한 책임을 지지 않으며 이 소프트웨어의 사용은 사용자 자신에>게 전적으로 책임이 있습니다. 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Devsisters 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | 
21 | 
22 | Copyright (c) 2017 Keith Ito
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in
32 | all copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
40 | THE SOFTWARE.
41 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is source code based on the material presented by Taehoon Kim at DEVIEW 2017, "책 읽어주는 딥러닝: 배우 유인나가 해리포터를 읽어준다면" (Deep learning that reads books aloud: if actress Yoo In-na were to read Harry Potter), modified with a small amount of additional source code. For details of the implementation, please refer to the blog posts below :)
2 | 
3 | - Carrying out a project using DeepVoice
4 | - [Implementing TTS (speech synthesis) with the deep voice AI _ anchor Sohn Suk-hee](http://melonicedlatte.com/machinelearning/2018/07/02/215933.html)
5 | - [Implementing TTS (Text-To-Speech) with deep voice _ training on "Kim Anchor's One Word"](http://melonicedlatte.com/machinelearning/2018/07/09/141346.html)
6 | - Original forked repository: [carpedm20/multi-speaker-tacotron-tensorflow](https://github.com/carpedm20/multi-speaker-tacotron-tensorflow)
7 | ---
8 | 
9 | # Multi-Speaker Tacotron in TensorFlow
10 | 
11 | TensorFlow implementation of:
12 | 
13 | - [Deep Voice 2: Multi-Speaker Neural Text-to-Speech](https://arxiv.org/abs/1705.08947)
14 | - [Listening while Speaking: Speech Chain by Deep Learning](https://arxiv.org/abs/1707.04879)
15 | - [Tacotron: Towards End-to-End Speech Synthesis](https://arxiv.org/abs/1703.10135)
16 | 
17 | Sample audio clips (in Korean) can be found [here](http://carpedm20.github.io/tacotron/en.html).
18 | 
19 | ![model](./assets/model.png)
20 | 
21 | 
22 | ## Prerequisites
23 | 
24 | - Python 3.6+
25 | - FFmpeg
26 | - [Tensorflow 1.3](https://www.tensorflow.org/install/)
27 | 
28 | 
29 | ## Usage
30 | 
31 | ### 1. Install prerequisites
32 | 
33 | After preparing [Tensorflow](https://www.tensorflow.org/install/), install prerequisites with:
34 | 
35 |     pip3 install -r requirements.txt
36 |     python -c "import nltk; nltk.download('punkt')"
37 | 
38 | If you want to synthesize speech in Korean directly, follow [2-3. Download pre-trained models](#2-3-download-pre-trained-models).
39 | 
40 | 
41 | ### 2-1. Generate custom datasets
42 | 
43 | The `datasets` directory should look like:
44 | 
45 |     datasets
46 |     ├── son
47 |     │   ├── alignment.json
48 |     │   └── audio
49 |     │       ├── 1.mp3
50 |     │       ├── 2.mp3
51 |     │       ├── 3.mp3
52 |     │       └── ...
53 |     └── YOUR_DATASET
54 |         ├── alignment.json
55 |         └── audio
56 |             ├── 1.mp3
57 |             ├── 2.mp3
58 |             ├── 3.mp3
59 |             └── ...
60 | 
61 | and `YOUR_DATASET/alignment.json` should look like:
62 | 
63 |     {
64 |         "./datasets/YOUR_DATASET/audio/001.mp3": "My name is Taehoon Kim.",
65 |         "./datasets/YOUR_DATASET/audio/002.mp3": "The buses aren't the problem.",
66 |         "./datasets/YOUR_DATASET/audio/003.mp3": "They have discovered a new particle."
67 |     }
68 | 
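If you already have a transcript for every clip, `alignment.json` can be written with a few lines of Python. The sketch below is only illustrative — the `transcripts` dictionary is a placeholder for wherever your own text actually lives:

    import json

    # Hypothetical audio-path -> transcript mapping; replace with your own data.
    transcripts = {
        "./datasets/YOUR_DATASET/audio/001.mp3": "My name is Taehoon Kim.",
        "./datasets/YOUR_DATASET/audio/002.mp3": "The buses aren't the problem.",
    }

    with open("./datasets/YOUR_DATASET/alignment.json", "w", encoding="utf-8") as f:
        json.dump(transcripts, f, ensure_ascii=False, indent=2)
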
69 | After you prepare as described, you should generate preprocessed data with:
70 | 
71 |     python3 -m datasets.generate_data ./datasets/YOUR_DATASET/alignment.json
72 | 
73 | 
74 | ### 2-2. Generate Korean datasets
75 | 
76 | Follow the commands below (the `son` dataset is used as the example).
77 | 
78 | 0. To automate the alignment between sounds and texts, prepare `GOOGLE_APPLICATION_CREDENTIALS` to use the [Google Speech Recognition API](https://cloud.google.com/speech/). To get credentials, read [this](https://developers.google.com/identity/protocols/application-default-credentials).
79 | 
80 |        export GOOGLE_APPLICATION_CREDENTIALS="YOUR-GOOGLE.CREDENTIALS.json"
81 | 
82 | 1. Download speech (or video) and text.
83 | 
84 |        python3 -m datasets.son.download
85 | 
86 | 2. Segment all audio files on silence.
87 | 
88 |        python3 -m audio.silence --audio_pattern "./datasets/son/audio/*.wav" --method=pydub
89 | 
90 | 3. Using the [Google Speech Recognition API](https://cloud.google.com/speech/), predict a sentence for each segmented audio clip.
91 | 
92 |        python3 -m recognition.google --audio_pattern "./datasets/son/audio/*.*.wav"
93 | 
94 | 4. By comparing the original text with the recognised text, save `audio<->text` pair information into `./datasets/son/alignment.json`.
95 | 
96 |        python3 -m recognition.alignment --recognition_path "./datasets/son/recognition.json" --score_threshold=0.5
97 | 
98 | 5. Finally, generate the numpy files which will be used in training.
99 | 
100 |        python3 -m datasets.generate_data ./datasets/son/alignment.json
101 | 
102 | Because the automatic generation is extremely naive, the dataset is noisy. However, if you have enough data (20+ hours when training from random initialization, or 5+ hours when initializing from a pretrained model), you can expect acceptable audio synthesis quality.
103 | 
104 | ### 2-3. Generate English datasets
105 | 
106 | 1. Download the speech dataset from https://keithito.com/LJ-Speech-Dataset/
107 | 
108 | 2. Convert the metadata CSV file to a json file (command-line arguments are available for changing preferences).
109 | 
110 |        python3 -m datasets.LJSpeech_1_0.prepare
111 | 
112 | 3. Finally, generate the numpy files which will be used in training.
113 | 
114 |        python3 -m datasets.generate_data ./datasets/LJSpeech_1_0
115 | 
116 | 
117 | ### 3. Train a model
118 | 
119 | The important hyperparameters for a model are defined in `hparams.py`.
120 | 
121 | (**Change `cleaners` in `hparams.py` from `korean_cleaners` to `english_cleaners` to train with an English dataset.**)
122 | 
123 | To train a single-speaker model:
124 | 
125 |     python3 train.py --data_path=datasets/son
126 |     python3 train.py --data_path=datasets/son --initialize_path=PATH_TO_CHECKPOINT
127 | 
128 | To train a multi-speaker model:
129 | 
130 |     # after changing `model_type` in `hparams.py` to `deepvoice` or `simple`
131 |     python3 train.py --data_path=datasets/son1,datasets/son2
132 | 
133 | To restart training from a previous experiment such as `logs/son-20171015`:
134 | 
135 |     python3 train.py --data_path=datasets/son --load_path logs/son-20171015
136 | 
137 | If you don't have a good and large enough dataset (10+ hours), it is better to use `--initialize_path` so that a well-trained model provides the initial parameters.
138 | 
139 | 
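For orientation, the `cleaners` and `model_type` options mentioned in this section are ordinary fields of the shared `hparams` object defined in `hparams.py`. A minimal sketch of the relevant entries is shown below; the defaults here are assumptions, so check the file itself:

    cleaners='korean_cleaners',   # set to 'english_cleaners' for an English (e.g. LJSpeech) dataset
    model_type='single',          # set to 'deepvoice' or 'simple' to train a multi-speaker model
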
140 | ### 4. Synthesize audio
141 | 
142 | You can run a web demo that synthesizes audio from a trained model with:
143 | 
144 |     python3 app.py --load_path logs/son-20171015 --num_speakers=1
145 | 
146 | or generate audio directly with:
147 | 
148 |     python3 synthesizer.py --load_path logs/son-20171015 --text "이거 실화냐?"
149 | 
150 | ### 4-1. Synthesizing non-Korean (English) audio
151 | 
152 | To generate non-Korean audio, you must set the argument `--is_korean=False`.
153 | 
154 |     python3 app.py --load_path logs/LJSpeech_1_0-20180108 --num_speakers=1 --is_korean=False
155 |     python3 synthesizer.py --load_path logs/LJSpeech_1_0-20180108 --text="Winter is coming." --is_korean=False
156 | 
157 | ## Results
158 | 
159 | Training attention on a single-speaker model:
160 | 
161 | ![model](./assets/attention_single_speaker.gif)
162 | 
163 | Training attention on a multi-speaker model:
164 | 
165 | ![model](./assets/attention_multi_speaker.gif)
166 | 
167 | 
168 | ## Disclaimer
169 | 
170 | This is not an official [DEVSISTERS](http://devsisters.com/) product. This project is not responsible for misuse or for any damage that you may cause. You agree that you use this software at your own risk.
171 | 
172 | 
173 | ## References
174 | 
175 | - [Keith Ito](https://github.com/keithito)'s [tacotron](https://github.com/keithito/tacotron)
176 | - [DEVIEW 2017 presentation](https://www.slideshare.net/carpedm20/deview-2017-80824162)
177 | 
178 | 
179 | ## Author
180 | 
181 | Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)
182 | 
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/app.py
--------------------------------------------------------------------------------
/assets/attention_multi_speaker.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/attention_multi_speaker.gif
--------------------------------------------------------------------------------
/assets/attention_single_speaker.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/attention_single_speaker.gif
--------------------------------------------------------------------------------
/assets/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/assets/model.png
--------------------------------------------------------------------------------
/audio/__init__.py:
--------------------------------------------------------------------------------
1 | # Code based on https://github.com/keithito/tacotron/blob/master/util/audio.py
2 | import math
3 | import numpy as np
4 | import tensorflow as tf
5 | from scipy import signal
6 | from hparams import hparams
7 | 
8 | import librosa
9 | import librosa.filters
10 | 
11 | 
12 | def load_audio(path, pre_silence_length=0, post_silence_length=0):
13 |     audio = librosa.core.load(path, sr=hparams.sample_rate)[0]
14 |     if pre_silence_length > 0 or post_silence_length > 0:
15 |         audio = np.concatenate([
16 |             get_silence(pre_silence_length),
17 |             audio,
18 | 
get_silence(post_silence_length), 19 | ]) 20 | return audio 21 | 22 | def save_audio(audio, path, sample_rate=None): 23 | audio *= 32767 / max(0.01, np.max(np.abs(audio))) 24 | librosa.output.write_wav(path, audio.astype(np.int16), 25 | hparams.sample_rate if sample_rate is None else sample_rate) 26 | 27 | print(" [*] Audio saved: {}".format(path)) 28 | 29 | 30 | def resample_audio(audio, target_sample_rate): 31 | return librosa.core.resample( 32 | audio, hparams.sample_rate, target_sample_rate) 33 | 34 | 35 | def get_duration(audio): 36 | return librosa.core.get_duration(audio, sr=hparams.sample_rate) 37 | 38 | 39 | def frames_to_hours(n_frames): 40 | return sum((n_frame for n_frame in n_frames)) * \ 41 | hparams.frame_shift_ms / (3600 * 1000) 42 | 43 | 44 | def get_silence(sec): 45 | return np.zeros(hparams.sample_rate * sec) 46 | 47 | 48 | def spectrogram(y): 49 | D = _stft(_preemphasis(y)) 50 | S = _amp_to_db(np.abs(D)) - hparams.ref_level_db 51 | return _normalize(S) 52 | 53 | 54 | def inv_spectrogram(spectrogram): 55 | S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db) # Convert back to linear 56 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 57 | 58 | 59 | def inv_spectrogram_tensorflow(spectrogram): 60 | S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db) 61 | return _griffin_lim_tensorflow(tf.pow(S, hparams.power)) 62 | 63 | 64 | def melspectrogram(y): 65 | D = _stft(_preemphasis(y)) 66 | S = _amp_to_db(_linear_to_mel(np.abs(D))) 67 | return _normalize(S) 68 | 69 | 70 | def inv_melspectrogram(melspectrogram): 71 | S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram))) # Convert back to linear 72 | return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase 73 | 74 | 75 | # Based on https://github.com/librosa/librosa/issues/434 76 | def _griffin_lim(S): 77 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 78 | S_complex = np.abs(S).astype(np.complex) 79 | 80 | y = _istft(S_complex * angles) 81 | for i in range(hparams.griffin_lim_iters): 82 | angles = np.exp(1j * np.angle(_stft(y))) 83 | y = _istft(S_complex * angles) 84 | return y 85 | 86 | 87 | def _griffin_lim_tensorflow(S): 88 | with tf.variable_scope('griffinlim'): 89 | S = tf.expand_dims(S, 0) 90 | S_complex = tf.identity(tf.cast(S, dtype=tf.complex64)) 91 | y = _istft_tensorflow(S_complex) 92 | for i in range(hparams.griffin_lim_iters): 93 | est = _stft_tensorflow(y) 94 | angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64) 95 | y = _istft_tensorflow(S_complex * angles) 96 | return tf.squeeze(y, 0) 97 | 98 | 99 | def _stft(y): 100 | n_fft, hop_length, win_length = _stft_parameters() 101 | return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) 102 | 103 | 104 | def _istft(y): 105 | _, hop_length, win_length = _stft_parameters() 106 | return librosa.istft(y, hop_length=hop_length, win_length=win_length) 107 | 108 | 109 | def _stft_tensorflow(signals): 110 | n_fft, hop_length, win_length = _stft_parameters() 111 | return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False) 112 | 113 | 114 | def _istft_tensorflow(stfts): 115 | n_fft, hop_length, win_length = _stft_parameters() 116 | return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft) 117 | 118 | def _stft_parameters(): 119 | n_fft = (hparams.num_freq - 1) * 2 120 | hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate) 121 | win_length = 
int(hparams.frame_length_ms / 1000 * hparams.sample_rate) 122 | return n_fft, hop_length, win_length 123 | 124 | 125 | # Conversions: 126 | 127 | _mel_basis = None 128 | _inv_mel_basis = None 129 | 130 | def _linear_to_mel(spectrogram): 131 | global _mel_basis 132 | if _mel_basis is None: 133 | _mel_basis = _build_mel_basis() 134 | return np.dot(_mel_basis, spectrogram) 135 | 136 | def _mel_to_linear(mel_spectrogram): 137 | global _inv_mel_basis 138 | if _inv_mel_basis is None: 139 | _inv_mel_basis = np.linalg.pinv(_build_mel_basis()) 140 | return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram)) 141 | 142 | def _build_mel_basis(): 143 | n_fft = (hparams.num_freq - 1) * 2 144 | return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels) 145 | 146 | def _amp_to_db(x): 147 | return 20 * np.log10(np.maximum(1e-5, x)) 148 | 149 | def _db_to_amp(x): 150 | return np.power(10.0, x * 0.05) 151 | 152 | def _db_to_amp_tensorflow(x): 153 | return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05) 154 | 155 | def _preemphasis(x): 156 | return signal.lfilter([1, -hparams.preemphasis], [1], x) 157 | 158 | def inv_preemphasis(x): 159 | return signal.lfilter([1], [1, -hparams.preemphasis], x) 160 | 161 | def _normalize(S): 162 | return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1) 163 | 164 | def _denormalize(S): 165 | return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 166 | 167 | def _denormalize_tensorflow(S): 168 | return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db 169 | -------------------------------------------------------------------------------- /audio/audio_range.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import tqdm 4 | 5 | def search(dirname): 6 | try: 7 | filenames = os.listdir(dirname) 8 | for filename in filenames: 9 | full_filename = os.path.join(dirname, filename) 10 | if os.path.isdir(full_filename): 11 | search(full_filename) 12 | else: 13 | ext = os.path.splitext(full_filename)[-1] 14 | if ext == '.py': 15 | print(full_filename) 16 | except PermissionError: 17 | pass 18 | 19 | 20 | def audio_range(_load_path, _min, _max): 21 | base_dir = _load_path 22 | 23 | for (path, dir, files) in os.walk(base_dir): 24 | for filename in tqdm.tqdm(files): 25 | print(filename) 26 | each_size = os.path.getsize(path + '/' + filename) 27 | print(filename, ' / size is == ', each_size) 28 | 29 | ext = os.path.splitext(filename)[-1] 30 | if not ext == '.wav': 31 | print('This folder contains not audio file!! In audio folder, they must have only wav file!!') 32 | return 33 | 34 | print(os.getcwd()) 35 | 36 | # 규정 사이즈 이상은 제거 37 | if not (_min <= each_size and each_size <= _max) : 38 | print(path + '/' + filename, ' is removed!!') 39 | os.remove( path + '/' + filename ) 40 | 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--load_path', required=True) 46 | parser.add_argument('--min', default= 300000) 47 | parser.add_argument('--max', default=1600000) 48 | config = parser.parse_args() 49 | 50 | if not os.path.exists(config.load_path): 51 | print("wrong path!!") 52 | 53 | print (config.load_path) 54 | 55 | if config.load_path in 'kim_anchor': 56 | print("wrong path!! 
path must have kim_anchor") 57 | 58 | else : 59 | audio_range(config.load_path, config.min, config.max) 60 | 61 | # 텍스트에 아무 것도 없는 내용 제거 -------------------------------------------------------------------------------- /audio/get_duration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from glob import glob 4 | from tqdm import tqdm 5 | from tinytag import TinyTag 6 | from collections import defaultdict 7 | from multiprocessing.dummy import Pool 8 | 9 | from utils import load_json 10 | 11 | def second_to_hour(sec): 12 | return str(datetime.timedelta(seconds=int(sec))) 13 | 14 | def get_duration(path): 15 | filename = os.path.basename(path) 16 | candidates = filename.split('.')[0].split('_') 17 | dataset = candidates[0] 18 | 19 | if not os.path.exists(path): 20 | print(" [!] {} not found".format(path)) 21 | return dataset, 0 22 | 23 | if True: # tinytag 24 | tag = TinyTag.get(path) 25 | duration = tag.duration 26 | else: # librosa 27 | y, sr = librosa.load(path) 28 | duration = librosa.get_duration(y=y, sr=sr) 29 | 30 | return dataset, duration 31 | 32 | def get_durations(paths, print_detail=True): 33 | duration_all = 0 34 | duration_book = defaultdict(list) 35 | 36 | pool = Pool() 37 | iterator = pool.imap_unordered(get_duration, paths) 38 | for dataset, duration in tqdm(iterator, total=len(paths)): 39 | duration_all += duration 40 | duration_book[dataset].append(duration) 41 | 42 | total_count = 0 43 | for book, duration in duration_book.items(): 44 | if book: 45 | time = second_to_hour(sum(duration)) 46 | file_count = len(duration) 47 | total_count += file_count 48 | 49 | if print_detail: 50 | print(" [*] Duration of {}: {} (file #: {})". \ 51 | format(book, time, file_count)) 52 | 53 | print(" [*] Total Duration : {} (file #: {})". 
\ 54 | format(second_to_hour(duration_all), total_count)) 55 | print() 56 | return duration_all 57 | 58 | 59 | if __name__ == '__main__': 60 | import argparse 61 | 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--audio-pattern', default=None) # datasets/krbook/audio/*.wav 64 | parser.add_argument('--data-path', default=None) # datasets/jtbc/alignment.json 65 | config, unparsed = parser.parse_known_args() 66 | 67 | if config.audio_pattern is not None: 68 | duration = get_durations(get_paths_by_pattern(config.data_dir)) 69 | elif config.data_path is not None: 70 | paths = load_json(config.data_path).keys() 71 | duration = get_durations(paths) 72 | -------------------------------------------------------------------------------- /audio/silence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import json 5 | import librosa 6 | import argparse 7 | import numpy as np 8 | from tqdm import tqdm 9 | from glob import glob 10 | from pydub import silence 11 | from pydub import AudioSegment 12 | from functools import partial 13 | 14 | from hparams import hparams 15 | from utils import parallel_run, add_postfix 16 | from audio import load_audio, save_audio, get_duration, get_silence 17 | 18 | def abs_mean(x): 19 | return abs(x).mean() 20 | 21 | def remove_breath(audio): 22 | edges = librosa.effects.split( 23 | audio, top_db=40, frame_length=128, hop_length=32) 24 | 25 | for idx in range(len(edges)): 26 | start_idx, end_idx = edges[idx][0], edges[idx][1] 27 | if start_idx < len(audio): 28 | if abs_mean(audio[start_idx:end_idx]) < abs_mean(audio) - 0.05: 29 | audio[start_idx:end_idx] = 0 30 | 31 | return audio 32 | 33 | def split_on_silence_with_librosa( 34 | audio_path, top_db=40, frame_length=1024, hop_length=256, 35 | skip_idx=0, out_ext="wav", 36 | min_segment_length=3, max_segment_length=8, 37 | pre_silence_length=0, post_silence_length=0): 38 | 39 | filename = os.path.basename(audio_path).split('.', 1)[0] 40 | in_ext = audio_path.rsplit(".")[1] 41 | 42 | audio = load_audio(audio_path) 43 | 44 | edges = librosa.effects.split(audio, 45 | top_db=top_db, frame_length=frame_length, hop_length=hop_length) 46 | 47 | new_audio = np.zeros_like(audio) 48 | for idx, (start, end) in enumerate(edges[skip_idx:]): 49 | new_audio[start:end] = remove_breath(audio[start:end]) 50 | 51 | save_audio(new_audio, add_postfix(audio_path, "no_breath")) 52 | audio = new_audio 53 | edges = librosa.effects.split(audio, 54 | top_db=top_db, frame_length=frame_length, hop_length=hop_length) 55 | 56 | audio_paths = [] 57 | for idx, (start, end) in enumerate(edges[skip_idx:]): 58 | segment = audio[start:end] 59 | duration = get_duration(segment) 60 | 61 | if duration <= min_segment_length or duration >= max_segment_length: 62 | continue 63 | 64 | output_path = "{}/{}.{:04d}.{}".format( 65 | os.path.dirname(audio_path), filename, idx, out_ext) 66 | 67 | padded_segment = np.concatenate([ 68 | get_silence(pre_silence_length), 69 | segment, 70 | get_silence(post_silence_length), 71 | ]) 72 | 73 | 74 | 75 | save_audio(padded_segment, output_path) 76 | audio_paths.append(output_path) 77 | 78 | return audio_paths 79 | 80 | def read_audio(audio_path): 81 | return AudioSegment.from_file(audio_path) 82 | 83 | def split_on_silence_with_pydub( 84 | audio_path, skip_idx=0, out_ext="wav", 85 | silence_thresh=-40, min_silence_len=400, 86 | silence_chunk_len=100, keep_silence=100): 87 | 88 | filename = os.path.basename(audio_path).split('.', 1)[0] 
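    # Note: pydub works in milliseconds and dBFS.  detect_nonsilent() below finds chunks
    # louder than `silence_thresh`, scanning with `silence_chunk_len` ms windows; adjacent
    # chunks separated by less than `min_silence_len` ms are then merged, and `keep_silence`
    # ms of padding is kept on both sides of every exported segment.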
89 | in_ext = audio_path.rsplit(".")[1] 90 | 91 | audio = read_audio(audio_path) 92 | not_silence_ranges = silence.detect_nonsilent( 93 | audio, min_silence_len=silence_chunk_len, 94 | silence_thresh=silence_thresh) 95 | 96 | edges = [not_silence_ranges[0]] 97 | 98 | for idx in range(1, len(not_silence_ranges)-1): 99 | cur_start = not_silence_ranges[idx][0] 100 | prev_end = edges[-1][1] 101 | 102 | if cur_start - prev_end < min_silence_len: 103 | edges[-1][1] = not_silence_ranges[idx][1] 104 | else: 105 | edges.append(not_silence_ranges[idx]) 106 | 107 | audio_paths = [] 108 | for idx, (start_idx, end_idx) in enumerate(edges[skip_idx:]): 109 | start_idx = max(0, start_idx - keep_silence) 110 | end_idx += keep_silence 111 | 112 | target_audio_path = "{}/{}.{:04d}.{}".format( 113 | os.path.dirname(audio_path), filename, idx, out_ext) 114 | 115 | segment=audio[start_idx:end_idx] 116 | 117 | segment.export(target_audio_path, out_ext) # for soundsegment 118 | 119 | audio_paths.append(target_audio_path) 120 | 121 | return audio_paths 122 | 123 | def split_on_silence_batch(audio_paths, method, **kargv): 124 | audio_paths.sort() 125 | method = method.lower() 126 | 127 | if method == "librosa": 128 | fn = partial(split_on_silence_with_librosa, **kargv) 129 | elif method == "pydub": 130 | fn = partial(split_on_silence_with_pydub, **kargv) 131 | 132 | parallel_run(fn, audio_paths, 133 | desc="Split on silence", parallel=False) 134 | 135 | if __name__ == "__main__": 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument('--audio_pattern', required=True) 138 | parser.add_argument('--out_ext', default='wav') 139 | parser.add_argument('--method', choices=['librosa', 'pydub'], required=True) 140 | config = parser.parse_args() 141 | 142 | audio_paths = glob(config.audio_pattern) 143 | 144 | split_on_silence_batch( 145 | audio_paths, config.method, 146 | out_ext=config.out_ext, 147 | ) 148 | -------------------------------------------------------------------------------- /datasets/LJSpeech_1_0/README: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------- 2 | The LJ Speech Dataset 3 | 4 | Version 1.0 5 | July 5, 2017 6 | https://keithito.com/LJ-Speech-Dataset 7 | ----------------------------------------------------------------------------- 8 | 9 | 10 | OVERVIEW 11 | 12 | This is a public domain speech dataset consisting of 13,100 short audio clips 13 | of a single speaker reading passages from 7 non-fiction books. A transcription 14 | is provided for each clip. Clips vary in length from 1 to 10 seconds and have 15 | a total length of approximately 24 hours. 16 | 17 | The texts were published between 1884 and 1964, and are in the public domain. 18 | The audio was recorded in 2016-17 by the LibriVox project and is also in the 19 | public domain. 20 | 21 | 22 | 23 | FILE FORMAT 24 | 25 | Metadata is provided in metadata.csv. This file consists of one record per 26 | line, delimited by the pipe character (0x7c). The fields are: 27 | 28 | 1. ID: this is the name of the corresponding .wav file 29 | 2. Transcription: words spoken by the reader (UTF-8) 30 | 3. Normalized Transcription: transcription with numbers, ordinals, and 31 | monetary units expanded into full words (UTF-8). 32 | 33 | Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 34 | 22050 Hz. 
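For reference, a record in this format can be split in Python essentially the way
datasets/LJSpeech_1_0/prepare.py (included later in this repository) does it. The
record shown below is a made-up example, not an actual line from metadata.csv:

    # Illustrative only: the record is invented, not taken from the corpus.
    record = "LJ000-0000|Some raw transcription.|Some normalized transcription."
    file_id, transcription, normalized_transcription = record.split("|")
    wav_path = "wavs/{}.wav".format(file_id)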
35 | 36 | 37 | 38 | STATISTICS 39 | 40 | Total Clips 13,100 41 | Total Words 225,715 42 | Total Characters 1,308,674 43 | Total Duration 23:55:17 44 | Mean Clip Duration 6.57 sec 45 | Min Clip Duration 1.11 sec 46 | Max Clip Duration 10.10 sec 47 | Mean Words per Clip 17.23 48 | Distinct Words 13,821 49 | 50 | 51 | 52 | MISCELLANEOUS 53 | 54 | The audio clips range in length from approximately 1 second to 10 seconds. 55 | They were segmented automatically based on silences in the recording. Clip 56 | boundaries generally align with sentence or clause boundaries, but not always. 57 | 58 | The text was matched to the audio manually, and a QA pass was done to ensure 59 | that the text accurately matched the words spoken in the audio. 60 | 61 | The original LibriVox recordings were distributed as 128 kbps MP3 files. As a 62 | result, they may contain artifacts introduced by the MP3 encoding. 63 | 64 | The following abbreviations appear in the text. They may be expanded as 65 | follows: 66 | 67 | Abbreviation Expansion 68 | -------------------------- 69 | Mr. Mister 70 | Mrs. Misess (*) 71 | Dr. Doctor 72 | No. Number 73 | St. Saint 74 | Co. Company 75 | Jr. Junior 76 | Maj. Major 77 | Gen. General 78 | Drs. Doctors 79 | Rev. Reverend 80 | Lt. Lieutenant 81 | Hon. Honorable 82 | Sgt. Sergeant 83 | Capt. Captain 84 | Esq. Esquire 85 | Ltd. Limited 86 | Col. Colonel 87 | Ft. Fort 88 | 89 | * there's no standard expansion of "Mrs." 90 | 91 | 92 | 19 of the transcriptions contain non-ASCII characters (for example, LJ016-0257 93 | contains "raison d'être"). 94 | 95 | For more information or to report errors, please email kito@kito.us. 96 | 97 | 98 | 99 | LICENSE 100 | 101 | This dataset is in the public domain in the USA (and likely other countries as 102 | well). There are no restrictions on its use. For more information, please see: 103 | https://librivox.org/pages/public-domain. 104 | 105 | 106 | 107 | CREDITS 108 | 109 | This dataset consists of excerpts from the following works: 110 | 111 | * Morris, William, et al. Arts and Crafts Essays. 1893. 112 | * Griffiths, Arthur. The Chronicles of Newgate, Vol. 2. 1884. 113 | * Roosevelt, Franklin D. The Fireside Chats of Franklin Delano Roosevelt. 114 | 1933-42. 115 | * Harland, Marion. Marion Harland's Cookery for Beginners. 1893. 116 | * Rolt-Wheeler, Francis. The Science - History of the Universe, Vol. 5: 117 | Biology. 1910. 118 | * Banks, Edgar J. The Seven Wonders of the Ancient World. 1916. 119 | * President's Commission on the Assassination of President Kennedy. Report 120 | of the President's Commission on the Assassination of President Kennedy. 121 | 1964. 122 | 123 | Recordings by Linda Johnson. Alignment and annotation by Keith Ito. All text, 124 | audio, and annotations are in the public domain. 125 | 126 | If you would like to cite this work, please do so by linking to: 127 | https://keithito.com/LJ-Speech-Dataset 128 | 129 | or by using the citation: 130 | Ito, Keith. The LJ Speech Dataset. 2017. https://keithito.com/LJ-Speech-Dataset. 
131 | -------------------------------------------------------------------------------- /datasets/LJSpeech_1_0/prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 4 10:50:07 2018 4 | # Conversion tool for https://github.com/carpedm20/multi-speaker-tacotron-tensorflow 5 | This prepares LJ-Dataset (available at https://keithito.com/LJ-Speech-Dataset/) to json and wav format 6 | that can be processed into .npz file using datasets.generate_data. 7 | 8 | @author: engiecat (github) 9 | """ 10 | import os 11 | from utils import load_json, write_json, backup_file, str2bool 12 | import argparse 13 | 14 | base_dir = os.path.dirname(os.path.realpath(__file__)) 15 | work_dir = os.getcwd() 16 | class Data(object): 17 | def __init__( 18 | self, audio_name, audio_transcript,audio_normalized_transcript,audio_path='ERR'): 19 | self.audio_name = audio_name 20 | self.audio_transcript = audio_transcript 21 | self.audio_normalized_transcript=audio_normalized_transcript 22 | self.audio_path = audio_path 23 | 24 | def read_csv(path,fn_encoding='UTF8'): 25 | # reads csv file into audio snippet name and its transcript 26 | with open(path, encoding=fn_encoding) as f: 27 | data = [] 28 | temp='' # for storing non-normalized 29 | for line in f: 30 | audio_name, audio_transcript,audio_normalized_transcript = line.split('|') 31 | audio_transcript=audio_transcript.strip() 32 | audio_normalized_transcript=audio_normalized_transcript.strip() 33 | data.append(Data(audio_name, audio_transcript,audio_normalized_transcript)) 34 | return data 35 | 36 | def convert_name_to_path(name, audio_dir, audio_format): 37 | # converts audio snippet name to audio snippet path 38 | abs_audio_dir=os.path.abspath(os.path.join(base_dir,audio_dir)) 39 | # the audio directory is respective to dataset folder(base_dir) 40 | # while the working directory is at the root directory (work_dir) 41 | result= os.path.join('./',os.path.relpath(abs_audio_dir,work_dir), name+'.'+audio_format ) 42 | return result 43 | 44 | def convert_to_json_format(data, is_normalized): 45 | # converts into json format 46 | if is_normalized: 47 | result={data.audio_path:[data.audio_normalized_transcript]} 48 | else: 49 | result={data.audio_path:[data.audio_transcript]} 50 | return result 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('--metadata', default="metadata.csv") 55 | parser.add_argument('--metadata_enconding', default="UTF8") 56 | parser.add_argument('--audio_dir', default="wavs") 57 | parser.add_argument('--audio_format', default='wav') 58 | parser.add_argument('--alignment_filename', default="alignment.json") 59 | parser.add_argument('--use_normalize', default=True, type=str2bool) 60 | config = parser.parse_args() 61 | 62 | print(' [*] Reading metadata file - '+config.metadata) 63 | data = read_csv(os.path.join(base_dir, config.metadata)) 64 | print(' [*] Converting to audio_path...') 65 | results={} 66 | for d in data: 67 | d.audio_path=convert_name_to_path(d.audio_name,config.audio_dir,config.audio_format) 68 | results.update(convert_to_json_format(d, config.use_normalize)) 69 | print(' [*] Saving to json...') 70 | alignment_path = \ 71 | os.path.join(base_dir, config.alignment_filename) 72 | if os.path.exists(alignment_path): 73 | backup_file(alignment_path) 74 | write_json(alignment_path, results) 75 | print(' [!] 
All Done!') 76 | print(work_dir) 77 | 78 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/datafeeder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pprint 4 | import random 5 | import threading 6 | import traceback 7 | import numpy as np 8 | from glob import glob 9 | import tensorflow as tf 10 | from collections import defaultdict 11 | 12 | import text 13 | from utils.infolog import log 14 | from utils import parallel_run, remove_file 15 | from audio import frames_to_hours 16 | from audio.get_duration import get_durations 17 | 18 | 19 | _pad = 0 20 | 21 | def get_frame(path): 22 | data = np.load(path) 23 | n_frame = data["linear"].shape[0] 24 | n_token = len(data["tokens"]) 25 | return (path, n_frame, n_token) 26 | 27 | def get_path_dict( 28 | data_dirs, hparams, config, 29 | data_type, n_test=None, 30 | rng=np.random.RandomState(123)): 31 | 32 | # Load metadata: 33 | path_dict = {} 34 | for data_dir in data_dirs: 35 | paths = glob("{}/*.npz".format(data_dir)) 36 | 37 | if data_type == 'train': 38 | rng.shuffle(paths) 39 | 40 | if not config.skip_path_filter: 41 | items = parallel_run( 42 | get_frame, paths, desc="filter_by_min_max_frame_batch", parallel=True) 43 | 44 | min_n_frame = hparams.reduction_factor * hparams.min_iters 45 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 46 | 47 | new_items = [(path, n) for path, n, n_tokens in items \ 48 | if min_n_frame <= n <= max_n_frame and n_tokens >= hparams.min_tokens] 49 | 50 | if any(check in data_dir for check in ["son", "yuinna"]): 51 | blacklists = [".0000.", ".0001.", "NB11479580.0001"] 52 | new_items = [item for item in new_items \ 53 | if any(check not in item[0] for check in blacklists)] 54 | 55 | new_paths = [path for path, n in new_items] 56 | new_n_frames = [n for path, n in new_items] 57 | 58 | hours = frames_to_hours(new_n_frames) 59 | 60 | log(' [{}] Loaded metadata for {} examples ({:.2f} hours)'. \ 61 | format(data_dir, len(new_n_frames), hours)) 62 | log(' [{}] Max length: {}'.format(data_dir, max(new_n_frames))) 63 | log(' [{}] Min length: {}'.format(data_dir, min(new_n_frames))) 64 | else: 65 | new_paths = paths 66 | 67 | if data_type == 'train': 68 | new_paths = new_paths[:-n_test] 69 | elif data_type == 'test': 70 | new_paths = new_paths[-n_test:] 71 | else: 72 | raise Exception(" [!] 
Unkown data_type: {}".format(data_type)) 73 | 74 | path_dict[data_dir] = new_paths 75 | 76 | return path_dict 77 | 78 | class DataFeeder(threading.Thread): 79 | '''Feeds batches of data into a queue on a background thread.''' 80 | 81 | def __init__(self, coordinator, data_dirs, 82 | hparams, config, batches_per_group, data_type, batch_size): 83 | super(DataFeeder, self).__init__() 84 | 85 | self._coord = coordinator 86 | self._hp = hparams 87 | self._cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 88 | self._step = 0 89 | self._offset = defaultdict(lambda: 2) 90 | self._batches_per_group = batches_per_group 91 | 92 | self.rng = np.random.RandomState(config.random_seed) 93 | self.data_type = data_type 94 | self.batch_size = batch_size 95 | 96 | self.min_tokens = hparams.min_tokens 97 | self.min_n_frame = hparams.reduction_factor * hparams.min_iters 98 | self.max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 99 | self.skip_path_filter = config.skip_path_filter 100 | 101 | # Load metadata: 102 | self.path_dict = get_path_dict( 103 | data_dirs, self._hp, config, self.data_type, 104 | n_test=self.batch_size, rng=self.rng) 105 | 106 | self.data_dirs = list(self.path_dict.keys()) 107 | self.data_dir_to_id = { 108 | data_dir: idx for idx, data_dir in enumerate(self.data_dirs)} 109 | 110 | data_weight = { 111 | data_dir: 1. for data_dir in self.data_dirs 112 | } 113 | 114 | if self._hp.main_data_greedy_factor > 0 and \ 115 | any(main_data in data_dir for data_dir in self.data_dirs \ 116 | for main_data in self._hp.main_data): 117 | for main_data in self._hp.main_data: 118 | for data_dir in self.data_dirs: 119 | if main_data in data_dir: 120 | data_weight[data_dir] += self._hp.main_data_greedy_factor 121 | 122 | weight_Z = sum(data_weight.values()) 123 | self.data_ratio = { 124 | data_dir: weight / weight_Z for data_dir, weight in data_weight.items() 125 | } 126 | 127 | log("="*40) 128 | log(pprint.pformat(self.data_ratio, indent=4)) 129 | log("="*40) 130 | 131 | #audio_paths = [path.replace("/data/", "/audio/"). \ 132 | # replace(".npz", ".wav") for path in self.data_paths] 133 | #duration = get_durations(audio_paths, print_detail=False) 134 | 135 | # Create placeholders for inputs and targets. Don't specify batch size because we want to 136 | # be able to feed different sized batches at eval time. 
137 | 138 | self._placeholders = [ 139 | tf.placeholder(tf.int32, [None, None], 'inputs'), 140 | tf.placeholder(tf.int32, [None], 'input_lengths'), 141 | tf.placeholder(tf.float32, [None], 'loss_coeff'), 142 | tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'), 143 | tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets'), 144 | ] 145 | 146 | # Create queue for buffering data: 147 | dtypes = [tf.int32, tf.int32, tf.float32, tf.float32, tf.float32] 148 | 149 | self.is_multi_speaker = len(self.data_dirs) > 1 150 | 151 | if self.is_multi_speaker: 152 | self._placeholders.append( 153 | tf.placeholder(tf.int32, [None], 'inputs'), 154 | ) 155 | dtypes.append(tf.int32) 156 | 157 | num_worker = 8 if self.data_type == 'train' else 1 158 | queue = tf.FIFOQueue(num_worker, dtypes, name='input_queue') 159 | 160 | self._enqueue_op = queue.enqueue(self._placeholders) 161 | 162 | if self.is_multi_speaker: 163 | self.inputs, self.input_lengths, self.loss_coeff, \ 164 | self.mel_targets, self.linear_targets, self.speaker_id = queue.dequeue() 165 | else: 166 | self.inputs, self.input_lengths, self.loss_coeff, \ 167 | self.mel_targets, self.linear_targets = queue.dequeue() 168 | 169 | self.inputs.set_shape(self._placeholders[0].shape) 170 | self.input_lengths.set_shape(self._placeholders[1].shape) 171 | self.loss_coeff.set_shape(self._placeholders[2].shape) 172 | self.mel_targets.set_shape(self._placeholders[3].shape) 173 | self.linear_targets.set_shape(self._placeholders[4].shape) 174 | 175 | if self.is_multi_speaker: 176 | self.speaker_id.set_shape(self._placeholders[5].shape) 177 | else: 178 | self.speaker_id = None 179 | 180 | if self.data_type == 'test': 181 | examples = [] 182 | while True: 183 | for data_dir in self.data_dirs: 184 | examples.append(self._get_next_example(data_dir)) 185 | #print(data_dir, text.sequence_to_text(examples[-1][0], False, True)) 186 | if len(examples) >= self.batch_size: 187 | break 188 | if len(examples) >= self.batch_size: 189 | break 190 | self.static_batches = [examples for _ in range(self._batches_per_group)] 191 | 192 | else: 193 | self.static_batches = None 194 | 195 | def start_in_session(self, session, start_step): 196 | self._step = start_step 197 | self._session = session 198 | self.start() 199 | 200 | 201 | def run(self): 202 | try: 203 | while not self._coord.should_stop(): 204 | self._enqueue_next_group() 205 | except Exception as e: 206 | traceback.print_exc() 207 | self._coord.request_stop(e) 208 | 209 | 210 | def _enqueue_next_group(self): 211 | start = time.time() 212 | 213 | # Read a group of examples: 214 | n = self.batch_size 215 | r = self._hp.reduction_factor 216 | 217 | if self.static_batches is not None: 218 | batches = self.static_batches 219 | else: 220 | examples = [] 221 | for data_dir in self.data_dirs: 222 | if self._hp.initial_data_greedy: 223 | if self._step < self._hp.initial_phase_step and \ 224 | any("krbook" in data_dir for data_dir in self.data_dirs): 225 | data_dir = [data_dir for data_dir in self.data_dirs if "krbook" in data_dir][0] 226 | 227 | if self._step < self._hp.initial_phase_step: 228 | example = [self._get_next_example(data_dir) \ 229 | for _ in range(int(n * self._batches_per_group // len(self.data_dirs)))] 230 | else: 231 | example = [self._get_next_example(data_dir) \ 232 | for _ in range(int(n * self._batches_per_group * self.data_ratio[data_dir]))] 233 | examples.extend(example) 234 | examples.sort(key=lambda x: x[-1]) 235 | 236 | batches = [examples[i:i+n] for i in range(0, 
len(examples), n)] 237 | self.rng.shuffle(batches) 238 | 239 | log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start)) 240 | for batch in batches: 241 | feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r, self.rng, self.data_type))) 242 | self._session.run(self._enqueue_op, feed_dict=feed_dict) 243 | self._step += 1 244 | 245 | 246 | def _get_next_example(self, data_dir): 247 | '''Loads a single example (input, mel_target, linear_target, cost) from disk''' 248 | data_paths = self.path_dict[data_dir] 249 | 250 | while True: 251 | if self._offset[data_dir] >= len(data_paths): 252 | self._offset[data_dir] = 0 253 | 254 | if self.data_type == 'train': 255 | self.rng.shuffle(data_paths) 256 | 257 | data_path = data_paths[self._offset[data_dir]] 258 | self._offset[data_dir] += 1 259 | 260 | try: 261 | if os.path.exists(data_path): 262 | data = np.load(data_path) 263 | else: 264 | continue 265 | except: 266 | remove_file(data_path) 267 | continue 268 | 269 | if not self.skip_path_filter: 270 | break 271 | 272 | if self.min_n_frame <= data["linear"].shape[0] <= self.max_n_frame and \ 273 | len(data["tokens"]) > self.min_tokens: 274 | break 275 | 276 | input_data = data['tokens'] 277 | mel_target = data['mel'] 278 | 279 | if 'loss_coeff' in data: 280 | loss_coeff = data['loss_coeff'] 281 | else: 282 | loss_coeff = 1 283 | linear_target = data['linear'] 284 | 285 | return (input_data, loss_coeff, mel_target, linear_target, 286 | self.data_dir_to_id[data_dir], len(linear_target)) 287 | 288 | 289 | def _prepare_batch(batch, reduction_factor, rng, data_type=None): 290 | if data_type == 'train': 291 | rng.shuffle(batch) 292 | 293 | inputs = _prepare_inputs([x[0] for x in batch]) 294 | input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) 295 | loss_coeff = np.asarray([x[1] for x in batch], dtype=np.float32) 296 | 297 | mel_targets = _prepare_targets([x[2] for x in batch], reduction_factor) 298 | linear_targets = _prepare_targets([x[3] for x in batch], reduction_factor) 299 | 300 | if len(batch[0]) == 6: 301 | speaker_id = np.asarray([x[4] for x in batch], dtype=np.int32) 302 | return (inputs, input_lengths, loss_coeff, 303 | mel_targets, linear_targets, speaker_id) 304 | else: 305 | return (inputs, input_lengths, loss_coeff, mel_targets, linear_targets) 306 | 307 | 308 | def _prepare_inputs(inputs): 309 | max_len = max((len(x) for x in inputs)) 310 | return np.stack([_pad_input(x, max_len) for x in inputs]) 311 | 312 | 313 | def _prepare_targets(targets, alignment): 314 | max_len = max((len(t) for t in targets)) + 1 315 | return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets]) 316 | 317 | 318 | def _pad_input(x, length): 319 | return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 320 | 321 | 322 | def _pad_target(t, length): 323 | return np.pad(t, [(0, length - t.shape[0]), (0,0)], mode='constant', constant_values=_pad) 324 | 325 | 326 | def _round_up(x, multiple): 327 | remainder = x % multiple 328 | return x if remainder == 0 else x + multiple - remainder 329 | -------------------------------------------------------------------------------- /datasets/generate_data.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/datasets/ljspeech.py 2 | import os 3 | import re 4 | import sys 5 | import json 6 | import argparse 7 | import numpy as np 8 | from tqdm import tqdm 9 | from glob import 
glob 10 | from functools import partial 11 | 12 | from collections import Counter, defaultdict 13 | from concurrent.futures import ProcessPoolExecutor 14 | 15 | import matplotlib 16 | matplotlib.use('agg') 17 | import matplotlib.pyplot as plt 18 | 19 | from hparams import hparams 20 | from text import text_to_sequence 21 | from utils import makedirs, remove_file, warning 22 | from audio import load_audio, spectrogram, melspectrogram, frames_to_hours 23 | 24 | def one(x=None): 25 | return 1 26 | 27 | def build_from_path(config): 28 | warning("Sampling rate: {}".format(hparams.sample_rate)) 29 | 30 | executor = ProcessPoolExecutor(max_workers=config.num_workers) 31 | futures = [] 32 | index = 1 33 | 34 | base_dir = os.path.dirname(config.metadata_path) 35 | data_dir = os.path.join(base_dir, config.data_dirname) 36 | makedirs(data_dir) 37 | 38 | loss_coeff = defaultdict(one) 39 | if config.metadata_path.endswith("json"): 40 | with open(config.metadata_path) as f: 41 | content = f.read() 42 | info = json.loads(content) 43 | elif config.metadata_path.endswith("csv"): 44 | with open(config.metadata_path) as f: 45 | info = {} 46 | for line in f: 47 | path, text = line.strip().split('|') 48 | info[path] = text 49 | else: 50 | raise Exception(" [!] Unkown metadata format: {}".format(config.metadata_path)) 51 | 52 | new_info = {} 53 | for path in info.keys(): 54 | if not os.path.exists(path): 55 | new_path = os.path.join(base_dir, path) 56 | if not os.path.exists(new_path): 57 | print(" [!] Audio not found: {}".format([path, new_path])) 58 | continue 59 | else: 60 | new_path = path 61 | 62 | new_info[new_path] = info[path] 63 | 64 | info = new_info 65 | 66 | for path in info.keys(): 67 | if type(info[path]) == list: 68 | if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \ 69 | hparams.ignore_recognition_level == 2: 70 | loss_coeff[path] = hparams.recognition_loss_coeff 71 | 72 | info[path] = info[path][0] 73 | 74 | ignore_description = { 75 | 0: "use all", 76 | 1: "ignore only unmatched_alignment", 77 | 2: "fully ignore recognitio", 78 | } 79 | 80 | print(" [!] Skip recognition level: {} ({})". 
\ 81 | format(hparams.ignore_recognition_level, 82 | ignore_description[hparams.ignore_recognition_level])) 83 | 84 | for audio_path, text in info.items(): 85 | if hparams.ignore_recognition_level > 0 and loss_coeff[audio_path] != 1: 86 | continue 87 | 88 | if base_dir not in audio_path: 89 | audio_path = os.path.join(base_dir, audio_path) 90 | 91 | try: 92 | tokens = text_to_sequence(text) 93 | except: 94 | continue 95 | 96 | fn = partial( 97 | _process_utterance, 98 | audio_path, data_dir, tokens, loss_coeff[audio_path]) 99 | futures.append(executor.submit(fn)) 100 | 101 | n_frames = [future.result() for future in tqdm(futures)] 102 | n_frames = [n_frame for n_frame in n_frames if n_frame is not None] 103 | 104 | hours = frames_to_hours(n_frames) 105 | 106 | print(' [*] Loaded metadata for {} examples ({:.2f} hours)'.format(len(n_frames), hours)) 107 | print(' [*] Max length: {}'.format(max(n_frames))) 108 | print(' [*] Min length: {}'.format(min(n_frames))) 109 | 110 | plot_n_frames(n_frames, os.path.join( 111 | base_dir, "n_frames_before_filter.png")) 112 | 113 | min_n_frame = hparams.reduction_factor * hparams.min_iters 114 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 115 | 116 | n_frames = [n for n in n_frames if min_n_frame <= n <= max_n_frame] 117 | hours = frames_to_hours(n_frames) 118 | 119 | print(' [*] After filtered: {} examples ({:.2f} hours)'.format(len(n_frames), hours)) 120 | print(' [*] Max length: {}'.format(max(n_frames))) 121 | print(' [*] Min length: {}'.format(min(n_frames))) 122 | 123 | plot_n_frames(n_frames, os.path.join( 124 | base_dir, "n_frames_after_filter.png")) 125 | 126 | def plot_n_frames(n_frames, path): 127 | labels, values = list(zip(*Counter(n_frames).most_common())) 128 | 129 | values = [v for _, v in sorted(zip(labels, values))] 130 | labels = sorted(labels) 131 | 132 | indexes = np.arange(len(labels)) 133 | width = 1 134 | 135 | fig, ax = plt.subplots(figsize=(len(labels) / 2, 5)) 136 | 137 | plt.bar(indexes, values, width) 138 | plt.xticks(indexes + width * 0.5, labels) 139 | 140 | plt.tight_layout() 141 | plt.savefig(path) 142 | 143 | 144 | def _process_utterance(audio_path, data_dir, tokens, loss_coeff): 145 | audio_name = os.path.basename(audio_path) 146 | 147 | filename = audio_name.rsplit('.', 1)[0] + ".npz" 148 | numpy_path = os.path.join(data_dir, filename) 149 | 150 | if not os.path.exists(numpy_path): 151 | wav = load_audio(audio_path) 152 | 153 | linear_spectrogram = spectrogram(wav).astype(np.float32) 154 | mel_spectrogram = melspectrogram(wav).astype(np.float32) 155 | 156 | data = { 157 | "linear": linear_spectrogram.T, 158 | "mel": mel_spectrogram.T, 159 | "tokens": tokens, 160 | "loss_coeff": loss_coeff, 161 | } 162 | 163 | n_frame = linear_spectrogram.shape[1] 164 | 165 | if hparams.skip_inadequate: 166 | min_n_frame = hparams.reduction_factor * hparams.min_iters 167 | max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor 168 | 169 | if min_n_frame <= n_frame <= max_n_frame and len(tokens) >= hparams.min_tokens: 170 | return None 171 | 172 | np.savez(numpy_path, **data, allow_pickle=False) 173 | else: 174 | try: 175 | data = np.load(numpy_path) 176 | n_frame = data["linear"].shape[0] 177 | except: 178 | remove_file(numpy_path) 179 | return _process_utterance(audio_path, data_dir, tokens, loss_coeff) 180 | 181 | return n_frame 182 | 183 | if __name__ == '__main__': 184 | parser = argparse.ArgumentParser(description='spectrogram') 185 | 186 | 
parser.add_argument('metadata_path', type=str) 187 | parser.add_argument('--data_dirname', type=str, default="data") 188 | parser.add_argument('--num_workers', type=int, default=None) 189 | 190 | config = parser.parse_args() 191 | build_from_path(config) 192 | -------------------------------------------------------------------------------- /datasets/kim_anchor/download.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import m3u8 5 | import json 6 | import requests 7 | import subprocess 8 | from functools import partial 9 | from bs4 import BeautifulSoup 10 | from utils import get_encoder_name, parallel_run, makedirs 11 | 12 | BASE_URL = 'http://nsvc.jtbc.joins.com/API/News/Newapp/Default.aspx' 13 | 14 | def soupify(text): 15 | return BeautifulSoup(text, "html.parser") 16 | 17 | def get_news_ids(page_id): 18 | params = { 19 | 'NJC': 'NJC300', 20 | 'CAID': 'NC10011174', 21 | 'PGI': page_id, 22 | } 23 | 24 | response = requests.request( 25 | method='GET', url=BASE_URL, params=params, 26 | ) 27 | soup = soupify(response.text) 28 | 29 | return [item.text for item in soup.find_all('news_id')] 30 | 31 | def download_news_video_and_content( 32 | news_id, base_dir, chunk_size=32*1024, 33 | video_dir="video", asset_dir="assets", audio_dir="audio"): 34 | 35 | video_dir = os.path.join(base_dir, video_dir) 36 | asset_dir = os.path.join(base_dir, asset_dir) 37 | audio_dir = os.path.join(base_dir, audio_dir) 38 | 39 | makedirs(video_dir) 40 | makedirs(asset_dir) 41 | makedirs(audio_dir) 42 | 43 | text_path = os.path.join(asset_dir, "{}.txt".format(news_id)) 44 | original_text_path = os.path.join(asset_dir, "original-{}.txt".format(news_id)) 45 | 46 | video_path = os.path.join(video_dir, "{}.ts".format(news_id)) 47 | audio_path = os.path.join(audio_dir, "{}.wav".format(news_id)) 48 | audio_raw_path = os.path.join(audio_dir + "/../audio_raw/", "{}_raw.wav".format(news_id)) 49 | # prof_path = os.path.join(audio_dir + "/../audio_raw/", "{}.prof".format(news_id)) 50 | 51 | params = { 52 | 'NJC': 'NJC400', 53 | 'NID': news_id, # NB11515152 54 | 'CD': 'A0100', 55 | } 56 | 57 | response = requests.request( 58 | method='GET', url=BASE_URL, params=params, 59 | ) 60 | 61 | soup = soupify(response.text) 62 | 63 | try: 64 | article_title = soup.find_all('article_title') 65 | print(news_id) 66 | print(article_title) 67 | 68 | article_contents = soup.find_all('article_contents') 69 | 70 | assert len(article_contents) == 1, \ 71 | "# of of {} should be 1: {}".format(news_id, response.text) 72 | 73 | text = soupify(article_contents[0].text).get_text() # remove
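        # The scraped article text still contains bracketed reporter quotes, promotional
        # "☞ ... http ..." lines, and raw URLs.  The block below first writes the original
        # text to disk, then strips those patterns with regular expressions, splits the
        # result into sentences with NLTK's sent_tokenize, and drops parenthesized asides
        # before saving one sentence per line.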
74 | 75 | with open(original_text_path, "w") as f: 76 | f.write(text) 77 | 78 | with open(text_path, "w") as f: 79 | from nltk import sent_tokenize 80 | 81 | text = re.sub(r'\[.{0,80} :\s.+]', '', text) # remove quote 82 | text = re.sub(r'☞.+http.+\)', '', text) # remove quote 83 | text = re.sub(r'\(https?:\/\/.*[\r\n]*\)', '', text) # remove url 84 | 85 | sentences = sent_tokenize(text) 86 | sentences = [sent for sentence in sentences for sent in sentence.split('\n') if sent] 87 | 88 | new_texts = [] 89 | for sent in sentences: 90 | sent = sent.strip() 91 | sent = re.sub(r'\([^)]*\)', '', sent) 92 | #sent = re.sub(r'\<.{0,80}\>', '', sent) 93 | sent = sent.replace('…', '.') 94 | new_texts.append(sent) 95 | 96 | f.write("\n".join([sent for sent in new_texts if sent])) 97 | 98 | vod_paths = soup.find_all('vod_path') 99 | 100 | assert len(vod_paths) == 1, \ 101 | "# of of {} should be 1: {}".format(news_id, response.text) 102 | 103 | if not os.path.exists(video_path): 104 | redirect_url = soup.find_all('vod_path')[0].text 105 | 106 | list_url = m3u8.load(redirect_url).playlists[0].absolute_uri 107 | video_urls = [segment.absolute_uri for segment in m3u8.load(list_url).segments] 108 | 109 | with open(video_path, "wb") as f: 110 | for url in video_urls: 111 | response = requests.get(url, stream=True) 112 | total_size = int(response.headers.get('content-length', 0)) 113 | 114 | for chunk in response.iter_content(chunk_size): 115 | if chunk: # filter out keep-alive new chunks 116 | f.write(chunk) 117 | 118 | if not os.path.exists(audio_path): 119 | encoder = get_encoder_name() 120 | # 영상의 오디오 추출 121 | command = '{} -y -loglevel panic -i {} -ab 160k {}'.\ 122 | format(encoder, video_path, audio_path) 123 | subprocess.call(command, shell=True) 124 | 125 | 126 | # 잡음 제거 127 | command = 'noiseclean/noiseclean.sh {} {} '.format(audio_raw_path, audio_path) 128 | subprocess.call(command, shell=True) 129 | 130 | ################### 131 | 132 | # # prof 파일 생성 133 | # command = 'sox {} -n noiseprof {} '.\ 134 | # format(audio_raw_path, prof_path) 135 | # subprocess.call(command, shell=True) 136 | 137 | # # 잡음 제거 138 | # command = 'sox -S --multi-threaded --buffer 131072 {} {} noisered {} 0.21 norm;'.\ 139 | # format(audio_raw_path, audio_path , prof_path) 140 | # subprocess.call(command, shell=True) 141 | 142 | except Exception as ex: 143 | print (ex) 144 | 145 | return True 146 | 147 | if __name__ == '__main__': 148 | news_ids = [] 149 | page_idx = 1 150 | 151 | base_dir = os.path.dirname(os.path.realpath(__file__)) 152 | news_id_path = os.path.join(base_dir, "news_ids.json") 153 | 154 | if not os.path.exists(news_id_path): 155 | while True: 156 | tmp_ids = get_news_ids(page_idx) 157 | if len(tmp_ids) == 0: 158 | break 159 | 160 | news_ids.extend(tmp_ids) 161 | print(" [*] Download page {}: {}/{}".format(page_idx, len(tmp_ids), len(news_ids))) 162 | 163 | page_idx += 1 164 | 165 | with open(news_id_path, "w") as f: 166 | json.dump(news_ids, f, indent=2, ensure_ascii=False) 167 | else: 168 | with open(news_id_path) as f: 169 | news_ids = json.loads(f.read()) 170 | 171 | exceptions = ["NB10830162"] 172 | news_ids = list(set(news_ids) - set(exceptions)) 173 | 174 | fn = partial(download_news_video_and_content, base_dir=base_dir) 175 | 176 | results = parallel_run( 177 | fn, news_ids, desc="Download news video+text", parallel=True) 178 | -------------------------------------------------------------------------------- /datasets/son/download.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import m3u8 5 | import json 6 | import requests 7 | import subprocess 8 | from functools import partial 9 | from bs4 import BeautifulSoup 10 | 11 | from utils import get_encoder_name, parallel_run, makedirs 12 | 13 | API_URL = 'http://api.jtbc.joins.com/ad/pre/NV10173083' 14 | BASE_URL = 'http://nsvc.jtbc.joins.com/API/News/Newapp/Default.aspx' 15 | 16 | def soupify(text): 17 | return BeautifulSoup(text, "html.parser") 18 | 19 | def get_news_ids(page_id): 20 | params = { 21 | 'NJC': 'NJC300', 22 | 'CAID': 'NC10011174', 23 | 'PGI': page_id, 24 | } 25 | 26 | response = requests.request( 27 | method='GET', url=BASE_URL, params=params, 28 | ) 29 | soup = soupify(response.text) 30 | 31 | return [item.text for item in soup.find_all('news_id')] 32 | 33 | def download_news_video_and_content( 34 | news_id, base_dir, chunk_size=32*1024, 35 | video_dir="video", asset_dir="assets", audio_dir="audio"): 36 | 37 | video_dir = os.path.join(base_dir, video_dir) 38 | asset_dir = os.path.join(base_dir, asset_dir) 39 | audio_dir = os.path.join(base_dir, audio_dir) 40 | 41 | makedirs(video_dir) 42 | makedirs(asset_dir) 43 | makedirs(audio_dir) 44 | 45 | text_path = os.path.join(asset_dir, "{}.txt".format(news_id)) 46 | original_text_path = os.path.join(asset_dir, "original-{}.txt".format(news_id)) 47 | 48 | video_path = os.path.join(video_dir, "{}.ts".format(news_id)) 49 | audio_path = os.path.join(audio_dir, "{}.wav".format(news_id)) 50 | 51 | params = { 52 | 'NJC': 'NJC400', 53 | 'NID': news_id, # NB11515152 54 | 'CD': 'A0100', 55 | } 56 | 57 | response = requests.request( 58 | method='GET', url=BASE_URL, params=params, 59 | ) 60 | soup = soupify(response.text) 61 | 62 | article_contents = soup.find_all('article_contents') 63 | 64 | assert len(article_contents) == 1, \ 65 | "# of of {} should be 1: {}".format(news_id, response.text) 66 | 67 | text = soupify(article_contents[0].text).get_text() # remove
68 | 69 | with open(original_text_path, "w") as f: 70 | f.write(text) 71 | 72 | with open(text_path, "w") as f: 73 | from nltk import sent_tokenize 74 | 75 | text = re.sub(r'\[.{0,80} :\s.+]', '', text) # remove quote 76 | text = re.sub(r'☞.+http.+\)', '', text) # remove quote 77 | text = re.sub(r'\(https?:\/\/.*[\r\n]*\)', '', text) # remove url 78 | 79 | sentences = sent_tokenize(text) 80 | sentences = [sent for sentence in sentences for sent in sentence.split('\n') if sent] 81 | 82 | new_texts = [] 83 | for sent in sentences: 84 | sent = sent.strip() 85 | sent = re.sub(r'\([^)]*\)', '', sent) 86 | #sent = re.sub(r'\<.{0,80}\>', '', sent) 87 | sent = sent.replace('…', '.') 88 | new_texts.append(sent) 89 | 90 | f.write("\n".join([sent for sent in new_texts if sent])) 91 | 92 | vod_paths = soup.find_all('vod_path') 93 | 94 | assert len(vod_paths) == 1, \ 95 | "# of of {} should be 1: {}".format(news_id, response.text) 96 | 97 | if not os.path.exists(video_path): 98 | redirect_url = soup.find_all('vod_path')[0].text 99 | 100 | list_url = m3u8.load(redirect_url).playlists[0].absolute_uri 101 | video_urls = [segment.absolute_uri for segment in m3u8.load(list_url).segments] 102 | 103 | with open(video_path, "wb") as f: 104 | for url in video_urls: 105 | response = requests.get(url, stream=True) 106 | total_size = int(response.headers.get('content-length', 0)) 107 | 108 | for chunk in response.iter_content(chunk_size): 109 | if chunk: # filter out keep-alive new chunks 110 | f.write(chunk) 111 | 112 | if not os.path.exists(audio_path): 113 | encoder = get_encoder_name() 114 | command = "{} -y -loglevel panic -i {} -ab 160k -ac 2 -ar 44100 -vn {}".\ 115 | format(encoder, video_path, audio_path) 116 | subprocess.call(command, shell=True) 117 | 118 | return True 119 | 120 | if __name__ == '__main__': 121 | news_ids = [] 122 | page_idx = 1 123 | 124 | base_dir = os.path.dirname(os.path.realpath(__file__)) 125 | news_id_path = os.path.join(base_dir, "news_ids.json") 126 | 127 | if not os.path.exists(news_id_path): 128 | while True: 129 | tmp_ids = get_news_ids(page_idx) 130 | if len(tmp_ids) == 0: 131 | break 132 | 133 | news_ids.extend(tmp_ids) 134 | print(" [*] Download page {}: {}/{}".format(page_idx, len(tmp_ids), len(news_ids))) 135 | 136 | page_idx += 1 137 | 138 | with open(news_id_path, "w") as f: 139 | json.dump(news_ids, f, indent=2, ensure_ascii=False) 140 | else: 141 | with open(news_id_path) as f: 142 | news_ids = json.loads(f.read()) 143 | 144 | exceptions = ["NB10830162"] 145 | news_ids = list(set(news_ids) - set(exceptions)) 146 | 147 | fn = partial(download_news_video_and_content, base_dir=base_dir) 148 | 149 | results = parallel_run( 150 | fn, news_ids, desc="Download news video+text", parallel=True) 151 | -------------------------------------------------------------------------------- /datasets/yuinna/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from functools import partial 8 | 9 | from utils import download_with_url, makedirs, parallel_run 10 | 11 | base_path = os.path.dirname(os.path.realpath(__file__)) 12 | RSS_URL = "http://enabler.kbs.co.kr/api/podcast_channel/feed.xml?channel_id=R2010-0440" 13 | 14 | def itunes_download(item): 15 | audio_dir = os.path.join(base_path, "audio") 16 | 17 | date, url = item 18 | path = os.path.join(audio_dir, "{}.mp4".format(date)) 19 | 20 | if not os.path.exists(path): 21 | 
download_with_url(url, path) 22 | 23 | def download_all(config): 24 | audio_dir = os.path.join(base_path, "audio") 25 | makedirs(audio_dir) 26 | 27 | soup = BeautifulSoup(requests.get(RSS_URL).text, "html5lib") 28 | 29 | items = [item for item in soup.find_all('item')] 30 | 31 | titles = [item.find('title').text[9:-3] for item in items] 32 | guids = [item.find('guid').text for item in items] 33 | 34 | accept_list = ['친절한 인나씨', '반납예정일', '귀욤열매 드세요'] 35 | 36 | new_guids = [guid for title, guid in zip(titles, guids) \ 37 | if any(accept in title for accept in accept_list) and '-' not in title] 38 | new_titles = [title for title, _ in zip(titles, guids) \ 39 | if any(accept in title for accept in accept_list) and '-' not in title] 40 | 41 | for idx, title in enumerate(new_titles): 42 | print(" [{:3d}] {}, {}".format(idx + 1, title, 43 | os.path.basename(new_guids[idx]).split('_')[2])) 44 | if idx == config.max_num: print("="*30) 45 | 46 | urls = { 47 | os.path.basename(guid).split('_')[2]: guid \ 48 | for guid in new_guids[:config.max_num] 49 | } 50 | 51 | parallel_run(itunes_download, urls.items(), 52 | desc=" [*] Itunes download", parallel=True) 53 | 54 | if __name__ == '__main__': 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--max_num', default=100, type=int) 57 | config, unparsed = parser.parse_known_args() 58 | 59 | download_all(config) 60 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/carpedm20/DCGAN-tensorflow/blob/master/download.py 2 | 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import gzip 7 | import json 8 | import tarfile 9 | import zipfile 10 | import argparse 11 | import requests 12 | from tqdm import tqdm 13 | from six.moves import urllib 14 | 15 | from utils import query_yes_no 16 | 17 | parser = argparse.ArgumentParser(description='Download model checkpoints.') 18 | parser.add_argument('checkpoints', metavar='N', type=str, nargs='+', choices=['son', 'park'], 19 | help='name of checkpoints to download [son, park]') 20 | 21 | def download(url, dirpath): 22 | filename = url.split('/')[-1] 23 | filepath = os.path.join(dirpath, filename) 24 | u = urllib.request.urlopen(url) 25 | f = open(filepath, 'wb') 26 | filesize = int(u.headers["Content-Length"]) 27 | print("Downloading: %s Bytes: %s" % (filename, filesize)) 28 | 29 | downloaded = 0 30 | block_sz = 8192 31 | status_width = 70 32 | while True: 33 | buf = u.read(block_sz) 34 | if not buf: 35 | print('') 36 | break 37 | else: 38 | print('', end='\r') 39 | downloaded += len(buf) 40 | f.write(buf) 41 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") % 42 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. 
/ filesize)) 43 | print(status, end='') 44 | sys.stdout.flush() 45 | f.close() 46 | return filepath 47 | 48 | def download_file_from_google_drive(id, destination): 49 | URL = "https://docs.google.com/uc?export=download" 50 | session = requests.Session() 51 | 52 | response = session.get(URL, params={ 'id': id }, stream=True) 53 | token = get_confirm_token(response) 54 | 55 | if token: 56 | params = { 'id' : id, 'confirm' : token } 57 | response = session.get(URL, params=params, stream=True) 58 | 59 | save_response_content(response, destination) 60 | 61 | def get_confirm_token(response): 62 | for key, value in response.cookies.items(): 63 | if key.startswith('download_warning'): 64 | return value 65 | return None 66 | 67 | def save_response_content(response, destination, chunk_size=32*1024): 68 | total_size = int(response.headers.get('content-length', 0)) 69 | with open(destination, "wb") as f: 70 | for chunk in tqdm(response.iter_content(chunk_size), total=total_size, 71 | unit='B', unit_scale=True, desc=destination): 72 | if chunk: # filter out keep-alive new chunks 73 | f.write(chunk) 74 | 75 | def unzip(filepath): 76 | print("Extracting: " + filepath) 77 | dirpath = os.path.dirname(filepath) 78 | with zipfile.ZipFile(filepath) as zf: 79 | zf.extractall(dirpath) 80 | os.remove(filepath) 81 | 82 | def download_checkpoint(checkpoint): 83 | if checkpoint == "son": 84 | save_path, drive_id = "son-20171015.tar.gz", "0B_7wC-DuR6ORcmpaY1A5V1AzZUU" 85 | elif checkpoint == "park": 86 | save_path, drive_id = "park-20171015.tar.gz", "0B_7wC-DuR6ORYjhlekl5bVlkQ2c" 87 | else: 88 | raise Exception(" [!] Unknown checkpoint: {}".format(checkpoint)) 89 | 90 | if os.path.exists(save_path): 91 | print('[*] {} already exists'.format(save_path)) 92 | else: 93 | download_file_from_google_drive(drive_id, save_path) 94 | 95 | if save_path.endswith(".zip"): 96 | zip_dir = '' 97 | with zipfile.ZipFile(save_path) as zf: 98 | zip_dir = zf.namelist()[0] 99 | zf.extractall(dirpath) 100 | os.remove(save_path) 101 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, data_dir)) 102 | elif save_path.endswith("tar.gz"): 103 | tar = tarfile.open(save_path, "r:gz") 104 | tar.extractall() 105 | tar.close() 106 | elif save_path.endswith("tar"): 107 | tar = tarfile.open(save_path, "r:") 108 | tar.extractall() 109 | tar.close() 110 | 111 | if __name__ == '__main__': 112 | args = parser.parse_args() 113 | 114 | print(" [!] The pre-trained models are being made available for research purpose only") 115 | print(" [!] 학습된 모델을 연구 이외의 목적으로 사용하는 것을 금지합니다.") 116 | print() 117 | 118 | if query_yes_no(" [?] Are you agree on this? 
이에 동의하십니까?"): 119 | if 'park' in args.checkpoints: 120 | download_checkpoint('park') 121 | if 'son' in args.checkpoints: 122 | download_checkpoint('son') 123 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import math 4 | import argparse 5 | from glob import glob 6 | 7 | from synthesizer import Synthesizer 8 | from train import create_batch_inputs_from_texts 9 | from utils import makedirs, str2bool, backup_file 10 | from hparams import hparams, hparams_debug_string 11 | 12 | 13 | texts = [ 14 | '텍스트를 음성으로 읽어주는 "음성합성" 기술은 시각 장애인을 위한 오디오북, 음성 안내 시스템, 대화 인공지능 등 많은 분야에 활용할 수 있습니다.', 15 | "하지만 개인이 원하는 목소리로 음성합성 엔진을 만들기에는 어려움이 많았고 소수의 기업만이 기술을 보유하고 있었습니다.", 16 | "최근 딥러닝 기술의 발전은 음성합성 기술의 진입 장벽을 많이 낮췄고 이제는 누구나 손쉽게 음성합성 엔진을 만들 수 있게 되었습니다.", 17 | 18 | "본 세션에서는 딥러닝을 활용한 음성합성 기술을 소개하고 개발 경험과 그 과정에서 얻었던 팁을 공유하고자 합니다.", 19 | "음성합성 엔진을 구현하는데 사용한 세 가지 연구를 소개하고 각각의 기술이 얼마나 자연스러운 목소리를 만들 수 있는지를 공유합니다.", 20 | 21 | # Harry Potter 22 | "그리고 헤르미온느는 겁에 질려 마룻바닥에 쓰러져 있었다.", 23 | "그러자 론은 요술지팡이를 꺼냈다. 무엇을 할지도 모르면서 그는 머리에 처음으로 떠오른 주문을 외치고 있었다.", 24 | "윙가르디움 레비오우사.... 하지만, 그렇게 소리쳤다.", 25 | "그러자 그 방망이가 갑자기 트롤의 손에서 벗어나, 저 위로 올라가더니 탁하며 그 주인의 머리 위에 떨어졌다.", 26 | "그러자 트롤이 그 자리에서 비틀거리더니 방 전체를 흔들어버릴 것 같은 커다란 소리를 내며 쿵 하고 넘어졌다. ", 27 | "그러자 조그맣게 펑 하는 소리가 나면서 가장 가까이 있는 가로등이 꺼졌다.", 28 | "그리고 그가 다시 찰깍하자 그 다음 가로등이 깜박거리며 나가 버렸다.", 29 | 30 | #"그가 그렇게 가로등 끄기를 열두번 하자, 이제 그 거리에 남아 있는 불빛이라곤, ", 31 | #"바늘로 꼭 질러둔 것처럼 작게 보이는 멀리서 그를 지켜보고 있는 고양이의 두 눈뿐이었다.", 32 | #"프리벳가 4번지에 살고 있는 더즐리 부부는 자신들이 정상적이라는 것을 아주 자랑스럽게 여기는 사람들이었다. ", 33 | #"그들은 기이하거나 신비스런 일과는 전혀 무관해 보였다.", 34 | #"아니, 그런 터무니없는 것은 도저히 참아내지 못했다.", 35 | #"더즐리 씨는 그루닝스라는 드릴제작회사의 중역이었다.", 36 | #"그는 목이 거의 없을 정도로 살이 뒤룩뒤룩 찐 몸집이 큰 사내로, 코밑에는 커다란 콧수염을 기르고 있었다.", 37 | #"더즐리 부인은 마른 체구의 금발이었고, 목이 보통사람보다 두 배는 길어서, 담 너머로 고개를 쭉 배고 이웃 사람들을 몰래 훔쳐보는 그녀의 취미에는 더없이 제격이었다.", 38 | 39 | # From Yoo Inna's Audiobook (http://campaign.happybean.naver.com/yooinna_audiobook): 40 | #'16세기 중엽 어느 가을날 옛 런던 시의 가난한 캔티 집안에 사내아이 하나가 태어났다.', 41 | #'그런데 그 집안에서는 그 사내아이를 별로 반기지 않았다.', 42 | #'바로 같은 날 또 한 명의 사내아이가 영국의 부유한 튜터 가문에서 태어났다.', 43 | #'그런데 그 가문에서는 그 아이를 무척이나 반겼다.', 44 | #'온 영국이 다 함께 그 아이를 반겼다.', 45 | 46 | ## From NAVER's Audiobook (http://campaign.happybean.naver.com/yooinna_audiobook): 47 | #'부랑자 패거리는 이른 새벽에 일찍 출발하여 길을 떠났다.', 48 | #'하늘은 찌푸렸고, 발밑의 땅은 질퍽거렸으며, 겨울의 냉기가 공기 중에 감돌았다.', 49 | #'지난밤의 흥겨움은 온데간데없이 사라졌다.', 50 | #'시무룩하게 말이 없는 사람들도 있었고, 안달복달하며 조바심을 내는 사람들도 있었지만, 기분이 좋은 사람은 하나도 없었다.', 51 | 52 | ## From NAVER's nVoice example (https://www.facebook.com/naverlabs/videos/422780217913446): 53 | #'감사합니다. 
Devsisters 김태훈 님의 발표였습니다.', 54 | #'이것으로 금일 마련된 track 2의 모든 세션이 종료되었습니다.', 55 | #'장시간 끝까지 참석해주신 개발자 여러분들께 진심으로 감사의 말씀을 드리며,', 56 | #'잠시 후 5시 15분부터 특정 주제에 관심 있는 사람들이 모여 자유롭게 이야기하는 오프미팅이 진행될 예정이므로', 57 | #'참여신청을 해주신 분들은 진행 요원의 안내에 따라 이동해주시기 바랍니다.', 58 | 59 | ## From Kakao's Son Seok hee example (https://www.youtube.com/watch?v=ScfdAH2otrY): 60 | #'소설가 마크 트웨인이 말했습니다.', 61 | #'인생에 가장 중요한 이틀이 있는데, 하나는 세상에 태어난 날이고 다른 하나는 왜 이 세상에 왔는가를 깨닫는 날이다.', 62 | #'그런데 그 첫번째 날은 누구나 다 알지만 두번째 날은 참 어려운 것 같습니다.', 63 | #'누구나 그 두번째 날을 만나기 위해 애쓰는게 삶인지도 모르겠습니다.', 64 | #'뉴스룸도 그런 면에서 똑같습니다.', 65 | #'저희들도 그 두번째의 날을 만나고 기억하기 위해 매일 매일 최선을 다하겠습니다.', 66 | ] 67 | 68 | 69 | def get_output_base_path(load_path, eval_dirname="eval"): 70 | if not os.path.isdir(load_path): 71 | base_dir = os.path.dirname(load_path) 72 | else: 73 | base_dir = load_path 74 | 75 | base_dir = os.path.join(base_dir, eval_dirname) 76 | if os.path.exists(base_dir): 77 | backup_file(base_dir) 78 | makedirs(base_dir) 79 | 80 | m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(load_path) 81 | base_path = os.path.join(base_dir, 82 | 'eval-%d' % int(m.group(1)) if m else 'eval') 83 | return base_path 84 | 85 | 86 | def run_eval(args): 87 | print(hparams_debug_string()) 88 | 89 | load_paths = glob(args.load_path_pattern) 90 | 91 | for load_path in load_paths: 92 | if not os.path.exists(os.path.join(load_path, "checkpoint")): 93 | print(" [!] Skip non model directory: {}".format(load_path)) 94 | continue 95 | 96 | synth = Synthesizer() 97 | synth.load(load_path) 98 | 99 | for speaker_id in range(synth.num_speakers): 100 | base_path = get_output_base_path(load_path, "eval-{}".format(speaker_id)) 101 | 102 | inputs, input_lengths = create_batch_inputs_from_texts(texts) 103 | 104 | for idx in range(math.ceil(len(inputs) / args.batch_size)): 105 | start_idx, end_idx = idx*args.batch_size, (idx+1)*args.batch_size 106 | 107 | cur_texts = texts[start_idx:end_idx] 108 | cur_inputs = inputs[start_idx:end_idx] 109 | 110 | synth.synthesize( 111 | texts=cur_texts, 112 | speaker_ids=[speaker_id] * len(cur_texts), 113 | tokens=cur_inputs, 114 | base_path="{}-{}".format(base_path, idx), 115 | manual_attention_mode=args.manual_attention_mode, 116 | base_alignment_path=args.base_alignment_path, 117 | ) 118 | 119 | synth.close() 120 | 121 | def main(): 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--batch_size', default=16) 124 | parser.add_argument('--load_path_pattern', required=True) 125 | parser.add_argument('--base_alignment_path', default=None) 126 | parser.add_argument('--manual_attention_mode', default=0, type=int, 127 | help="0: None, 1: Argmax, 2: Sharpening, 3. Pruning") 128 | parser.add_argument('--hparams', default='', 129 | help='Hyperparameter overrides as a comma-separated list of name=value pairs') 130 | args = parser.parse_args() 131 | 132 | #hparams.max_iters = 100 133 | #hparams.parse(args.hparams) 134 | run_eval(args) 135 | 136 | 137 | if __name__ == '__main__': 138 | main() 139 | -------------------------------------------------------------------------------- /hparams.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | SCALE_FACTOR = 1 4 | 5 | def f(num): 6 | return num // SCALE_FACTOR 7 | 8 | basic_params = { 9 | # Comma-separated list of cleaners to run on text prior to training and eval. For non-English 10 | # text, you may want to use "basic_cleaners" or "transliteration_cleaners" See TRAINING_DATA.md. 
11 | 'cleaners': 'korean_cleaners' #originally korean_cleaners 12 | } 13 | 14 | basic_params.update({ 15 | # Audio 16 | 'num_mels': 80, 17 | 'num_freq': 1025, 18 | 'sample_rate': 24000, # trained as 20000 but need to be 24000 19 | 'frame_length_ms': 50, 20 | 'frame_shift_ms': 12.5, 21 | 'preemphasis': 0.97, 22 | 'min_level_db': -100, 23 | 'ref_level_db': 20, 24 | }) 25 | 26 | if True: 27 | basic_params.update({ 28 | 'sample_rate': 22050, #originally 24000 (krbook), 22050(lj-data), 20000(others) 29 | }) 30 | 31 | basic_params.update({ 32 | # Model 33 | 'model_type': 'single', # [single, simple, deepvoice] 34 | 'speaker_embedding_size': f(16), 35 | 36 | 'embedding_size': f(256), 37 | 'dropout_prob': 0.5, 38 | 39 | # Encoder 40 | 'enc_prenet_sizes': [f(256), f(128)], 41 | 'enc_bank_size': 16, 42 | 'enc_bank_channel_size': f(128), 43 | 'enc_maxpool_width': 2, 44 | 'enc_highway_depth': 4, 45 | 'enc_rnn_size': f(128), 46 | 'enc_proj_sizes': [f(128), f(128)], 47 | 'enc_proj_width': 3, 48 | 49 | # Attention 50 | 'attention_type': 'bah_mon', # ntm2-5 51 | 'attention_size': f(256), 52 | 'attention_state_size': f(256), 53 | 54 | # Decoder recurrent network 55 | 'dec_layer_num': 2, 56 | 'dec_rnn_size': f(256), 57 | 58 | # Decoder 59 | 'dec_prenet_sizes': [f(256), f(128)], 60 | 'post_bank_size': 8, 61 | 'post_bank_channel_size': f(256), 62 | 'post_maxpool_width': 2, 63 | 'post_highway_depth': 4, 64 | 'post_rnn_size': f(128), 65 | 'post_proj_sizes': [f(256), 80], # num_mels=80 66 | 'post_proj_width': 3, 67 | 68 | 'reduction_factor': 4, 69 | }) 70 | 71 | if False: # Deep Voice 2 AudioBook Dataset 72 | basic_params.update({ 73 | 'dropout_prob': 0.8, 74 | 75 | 'attention_size': f(512), 76 | 77 | 'dec_prenet_sizes': [f(256), f(128), f(64)], 78 | 'post_bank_channel_size': f(512), 79 | 'post_rnn_size': f(256), 80 | 81 | 'reduction_factor': 5, # changed from 4 82 | }) 83 | elif False: # Deep Voice 2 VCTK dataset 84 | basic_params.update({ 85 | 'dropout_prob': 0.8, 86 | 87 | #'attention_size': f(512), 88 | 89 | #'dec_prenet_sizes': [f(256), f(128)], 90 | #'post_bank_channel_size': f(512), 91 | 'post_rnn_size': f(256), 92 | 93 | 'reduction_factor': 5, 94 | }) 95 | elif True: # Single Speaker 96 | basic_params.update({ 97 | 'dropout_prob': 0.5, 98 | 99 | 'attention_size': f(128), 100 | 101 | 'post_bank_channel_size': f(128), 102 | #'post_rnn_size': f(128), 103 | 104 | 'reduction_factor': 5, #chhanged from 4 105 | }) 106 | elif False: # Single Speaker with generalization 107 | basic_params.update({ 108 | 'dropout_prob': 0.8, 109 | 110 | 'attention_size': f(256), 111 | 112 | 'dec_prenet_sizes': [f(256), f(128), f(64)], 113 | 'post_bank_channel_size': f(128), 114 | 'post_rnn_size': f(128), 115 | 116 | 'reduction_factor': 4, 117 | }) 118 | 119 | 120 | basic_params.update({ 121 | # Training 122 | 'batch_size': 32, 123 | 'adam_beta1': 0.9, 124 | 'adam_beta2': 0.999, 125 | 'use_fixed_test_inputs': False, 126 | 127 | 'initial_learning_rate': 0.001, 128 | 'decay_learning_rate_mode': 0, # True in deepvoice2 paper 129 | 'initial_data_greedy': True, 130 | 'initial_phase_step': 8000, 131 | 'main_data_greedy_factor': 0, 132 | 'main_data': [''], 133 | 'prioritize_loss': False, 134 | 135 | 'recognition_loss_coeff': 0.2, 136 | 'ignore_recognition_level': 0, # 0: use all, 1: ignore only unmatched_alignment, 2: fully ignore recognition 137 | 138 | # Eval 139 | 'min_tokens': 50,#originally 50, 30 is good for korean, 140 | 'min_iters': 30, 141 | 'max_iters': 200, 142 | 'skip_inadequate': False, 143 | 144 | 'griffin_lim_iters': 60, 
145 | 'power': 1.5, # Power to raise magnitudes to prior to Griffin-Lim 146 | }) 147 | 148 | 149 | # Default hyperparameters: 150 | hparams = tf.contrib.training.HParams(**basic_params) 151 | 152 | 153 | def hparams_debug_string(): 154 | values = hparams.values() 155 | hp = [' %s: %s' % (name, values[name]) for name in sorted(values)] 156 | return 'Hyperparameters:\n' + '\n'.join(hp) 157 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from .tacotron import Tacotron 4 | 5 | 6 | def create_model(hparams): 7 | return Tacotron(hparams) 8 | 9 | 10 | def get_most_recent_checkpoint(checkpoint_dir): 11 | checkpoint_paths = [path for path in glob("{}/*.ckpt-*.data-*".format(checkpoint_dir))] 12 | idxes = [int(os.path.basename(path).split('-')[1].split('.')[0]) for path in checkpoint_paths] 13 | 14 | max_idx = max(idxes) 15 | lastest_checkpoint = os.path.join(checkpoint_dir, "model.ckpt-{}".format(max_idx)) 16 | 17 | #latest_checkpoint=checkpoint_paths[0] 18 | print(" [*] Found lastest checkpoint: {}".format(lastest_checkpoint)) 19 | return lastest_checkpoint 20 | -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib.seq2seq import Helper 6 | 7 | 8 | # Adapted from tf.contrib.seq2seq.GreedyEmbeddingHelper 9 | class TacoTestHelper(Helper): 10 | def __init__(self, batch_size, output_dim, r): 11 | with tf.name_scope('TacoTestHelper'): 12 | self._batch_size = batch_size 13 | self._output_dim = output_dim 14 | self._end_token = tf.tile([0.0], [output_dim * r]) 15 | 16 | @property 17 | def batch_size(self): 18 | return self._batch_size 19 | 20 | def initialize(self, name=None): 21 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 22 | 23 | def sample(self, time, outputs, state, name=None): 24 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 25 | 26 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 27 | '''Stop on EOS. Otherwise, pass the last output as the next input and pass through state.''' 28 | with tf.name_scope('TacoTestHelper'): 29 | finished = tf.reduce_all(tf.equal(outputs, self._end_token), axis=1) 30 | # Feed last output frame as next input. 
outputs is [N, output_dim * r] 31 | next_inputs = outputs[:, -self._output_dim:] 32 | return (finished, next_inputs, state) 33 | 34 | 35 | class TacoTrainingHelper(Helper): 36 | def __init__(self, inputs, targets, output_dim, r, rnn_decoder_test_mode=False): 37 | # inputs is [N, T_in], targets is [N, T_out, D] 38 | with tf.name_scope('TacoTrainingHelper'): 39 | self._batch_size = tf.shape(inputs)[0] 40 | self._output_dim = output_dim 41 | self._rnn_decoder_test_mode = rnn_decoder_test_mode 42 | 43 | # Feed every r-th target frame as input 44 | self._targets = targets[:, r-1::r, :] 45 | 46 | # Use full length for every target because we don't want to mask the padding frames 47 | num_steps = tf.shape(self._targets)[1] 48 | self._lengths = tf.tile([num_steps], [self._batch_size]) 49 | 50 | @property 51 | def batch_size(self): 52 | return self._batch_size 53 | 54 | def initialize(self, name=None): 55 | return (tf.tile([False], [self._batch_size]), _go_frames(self._batch_size, self._output_dim)) 56 | 57 | def sample(self, time, outputs, state, name=None): 58 | return tf.tile([0], [self._batch_size]) # Return all 0; we ignore them 59 | 60 | def next_inputs(self, time, outputs, state, sample_ids, name=None): 61 | with tf.name_scope(name or 'TacoTrainingHelper'): 62 | finished = (time + 1 >= self._lengths) 63 | if self._rnn_decoder_test_mode: 64 | next_inputs = outputs[:, -self._output_dim:] 65 | else: 66 | next_inputs = self._targets[:, time, :] 67 | return (finished, next_inputs, state) 68 | 69 | 70 | def _go_frames(batch_size, output_dim): 71 | '''Returns all-zero frames for a given batch size and output dimension''' 72 | return tf.tile([[0.0]], [batch_size, output_dim]) 73 | 74 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib.rnn import GRUCell 5 | from tensorflow.python.layers import core 6 | from tensorflow.contrib.seq2seq.python.ops.attention_wrapper \ 7 | import _bahdanau_score, _BaseAttentionMechanism, BahdanauAttention, \ 8 | AttentionWrapper, AttentionWrapperState 9 | 10 | 11 | def get_embed(inputs, num_inputs, embed_size, name): 12 | embed_table = tf.get_variable( 13 | name, [num_inputs, embed_size], dtype=tf.float32, 14 | initializer=tf.truncated_normal_initializer(stddev=0.1)) 15 | return tf.nn.embedding_lookup(embed_table, inputs) 16 | 17 | 18 | def prenet(inputs, is_training, layer_sizes, drop_prob, scope=None): 19 | x = inputs 20 | drop_rate = drop_prob if is_training else 0.0 21 | with tf.variable_scope(scope or 'prenet'): 22 | for i, size in enumerate(layer_sizes): 23 | dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_%d' % (i+1)) 24 | x = tf.layers.dropout(dense, rate=drop_rate, name='dropout_%d' % (i+1)) 25 | return x 26 | 27 | def cbhg(inputs, input_lengths, is_training, 28 | bank_size, bank_channel_size, 29 | maxpool_width, highway_depth, rnn_size, 30 | proj_sizes, proj_width, scope, 31 | before_highway=None, encoder_rnn_init_state=None): 32 | 33 | batch_size = tf.shape(inputs)[0] 34 | with tf.variable_scope(scope): 35 | with tf.variable_scope('conv_bank'): 36 | # Convolution bank: concatenate on the last axis 37 | # to stack channels from all convolutions 38 | conv_fn = lambda k: \ 39 | conv1d(inputs, k, bank_channel_size, 40 | tf.nn.relu, is_training, 'conv1d_%d' % k) 41 
| 42 | conv_outputs = tf.concat( 43 | [conv_fn(k) for k in range(1, bank_size+1)], axis=-1, 44 | ) 45 | 46 | # Maxpooling: 47 | maxpool_output = tf.layers.max_pooling1d( 48 | conv_outputs, 49 | pool_size=maxpool_width, 50 | strides=1, 51 | padding='same') 52 | 53 | # Two projection layers: 54 | proj_out = maxpool_output 55 | for idx, proj_size in enumerate(proj_sizes): 56 | activation_fn = None if idx == len(proj_sizes) - 1 else tf.nn.relu 57 | proj_out = conv1d( 58 | proj_out, proj_width, proj_size, activation_fn, 59 | is_training, 'proj_{}'.format(idx + 1)) 60 | 61 | # Residual connection: 62 | if before_highway is not None: 63 | expanded_before_highway = tf.expand_dims(before_highway, [1]) 64 | tiled_before_highway = tf.tile( 65 | expanded_before_highway, [1, tf.shape(proj_out)[1], 1]) 66 | 67 | highway_input = proj_out + inputs + tiled_before_highway 68 | else: 69 | highway_input = proj_out + inputs 70 | 71 | # Handle dimensionality mismatch: 72 | if highway_input.shape[2] != rnn_size: 73 | highway_input = tf.layers.dense(highway_input, rnn_size) 74 | 75 | # 4-layer HighwayNet: 76 | for idx in range(highway_depth): 77 | highway_input = highwaynet(highway_input, 'highway_%d' % (idx+1)) 78 | 79 | rnn_input = highway_input 80 | 81 | # Bidirectional RNN 82 | if encoder_rnn_init_state is not None: 83 | initial_state_fw, initial_state_bw = \ 84 | tf.split(encoder_rnn_init_state, 2, 1) 85 | else: 86 | initial_state_fw, initial_state_bw = None, None 87 | 88 | cell_fw, cell_bw = GRUCell(rnn_size), GRUCell(rnn_size) 89 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 90 | cell_fw, cell_bw, 91 | rnn_input, 92 | sequence_length=input_lengths, 93 | initial_state_fw=initial_state_fw, 94 | initial_state_bw=initial_state_bw, 95 | dtype=tf.float32) 96 | return tf.concat(outputs, axis=2) # Concat forward and backward 97 | 98 | 99 | def batch_tile(tensor, batch_size): 100 | expaneded_tensor = tf.expand_dims(tensor, [0]) 101 | return tf.tile(expaneded_tensor, \ 102 | [batch_size] + [1 for _ in tensor.get_shape()]) 103 | 104 | 105 | def highwaynet(inputs, scope): 106 | highway_dim = int(inputs.get_shape()[-1]) 107 | 108 | with tf.variable_scope(scope): 109 | H = tf.layers.dense( 110 | inputs, 111 | units=highway_dim, 112 | activation=tf.nn.relu, 113 | name='H') 114 | T = tf.layers.dense( 115 | inputs, 116 | units=highway_dim, 117 | activation=tf.nn.sigmoid, 118 | name='T', 119 | bias_initializer=tf.constant_initializer(-1.0)) 120 | return H * T + inputs * (1.0 - T) 121 | 122 | 123 | def conv1d(inputs, kernel_size, channels, activation, is_training, scope): 124 | with tf.variable_scope(scope): 125 | conv1d_output = tf.layers.conv1d( 126 | inputs, 127 | filters=channels, 128 | kernel_size=kernel_size, 129 | activation=activation, 130 | padding='same') 131 | return tf.layers.batch_normalization(conv1d_output, training=is_training) 132 | -------------------------------------------------------------------------------- /models/tacotron.py: -------------------------------------------------------------------------------- 1 | # Code based on https://github.com/keithito/tacotron/blob/master/models/tacotron.py 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from tensorflow.contrib.seq2seq import BasicDecoder, BahdanauAttention, BahdanauMonotonicAttention 6 | from tensorflow.contrib.rnn import GRUCell, MultiRNNCell, OutputProjectionWrapper, ResidualWrapper 7 | 8 | from utils.infolog import log 9 | from text.symbols import symbols 10 | 11 | from .modules import * 12 | from .helpers import TacoTestHelper, 
TacoTrainingHelper 13 | from .rnn_wrappers import AttentionWrapper, DecoderPrenetWrapper, ConcatOutputAndAttentionWrapper 14 | 15 | 16 | class Tacotron(): 17 | def __init__(self, hparams): 18 | self._hparams = hparams 19 | 20 | 21 | def initialize( 22 | self, inputs, input_lengths, num_speakers, speaker_id, 23 | mel_targets=None, linear_targets=None, loss_coeff=None, 24 | rnn_decoder_test_mode=False, is_randomly_initialized=False, 25 | ): 26 | is_training = linear_targets is not None 27 | self.is_randomly_initialized = is_randomly_initialized 28 | 29 | with tf.variable_scope('inference') as scope: 30 | hp = self._hparams 31 | batch_size = tf.shape(inputs)[0] 32 | 33 | # Embeddings 34 | char_embed_table = tf.get_variable( 35 | 'embedding', [len(symbols), hp.embedding_size], dtype=tf.float32, 36 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 37 | # [N, T_in, embedding_size] 38 | char_embedded_inputs = \ 39 | tf.nn.embedding_lookup(char_embed_table, inputs) 40 | 41 | self.num_speakers = num_speakers 42 | if self.num_speakers > 1: 43 | if hp.speaker_embedding_size != 1: 44 | speaker_embed_table = tf.get_variable( 45 | 'speaker_embedding', 46 | [self.num_speakers, hp.speaker_embedding_size], dtype=tf.float32, 47 | initializer=tf.truncated_normal_initializer(stddev=0.5)) 48 | # [N, T_in, speaker_embedding_size] 49 | speaker_embed = tf.nn.embedding_lookup(speaker_embed_table, speaker_id) 50 | 51 | if hp.model_type == 'deepvoice': 52 | if hp.speaker_embedding_size == 1: 53 | before_highway = get_embed( 54 | speaker_id, self.num_speakers, 55 | hp.enc_prenet_sizes[-1], "before_highway") 56 | encoder_rnn_init_state = get_embed( 57 | speaker_id, self.num_speakers, 58 | hp.enc_rnn_size * 2, "encoder_rnn_init_state") 59 | 60 | attention_rnn_init_state = get_embed( 61 | speaker_id, self.num_speakers, 62 | hp.attention_state_size, "attention_rnn_init_state") 63 | decoder_rnn_init_states = [get_embed( 64 | speaker_id, self.num_speakers, 65 | hp.dec_rnn_size, "decoder_rnn_init_states{}".format(idx + 1)) \ 66 | for idx in range(hp.dec_layer_num)] 67 | else: 68 | deep_dense = lambda x, dim: \ 69 | tf.layers.dense(x, dim, activation=tf.nn.softsign) 70 | 71 | before_highway = deep_dense( 72 | speaker_embed, hp.enc_prenet_sizes[-1]) 73 | encoder_rnn_init_state = deep_dense( 74 | speaker_embed, hp.enc_rnn_size * 2) 75 | 76 | attention_rnn_init_state = deep_dense( 77 | speaker_embed, hp.attention_state_size) 78 | decoder_rnn_init_states = [deep_dense( 79 | speaker_embed, hp.dec_rnn_size) for _ in range(hp.dec_layer_num)] 80 | 81 | speaker_embed = None # deepvoice does not use speaker_embed directly 82 | elif hp.model_type == 'simple': 83 | before_highway = None 84 | encoder_rnn_init_state = None 85 | attention_rnn_init_state = None 86 | decoder_rnn_init_states = None 87 | else: 88 | raise Exception(" [!] 
Unkown multi-speaker model type: {}".format(hp.model_type)) 89 | else: 90 | speaker_embed = None 91 | before_highway = None 92 | encoder_rnn_init_state = None 93 | attention_rnn_init_state = None 94 | decoder_rnn_init_states = None 95 | 96 | ############## 97 | # Encoder 98 | ############## 99 | 100 | # [N, T_in, enc_prenet_sizes[-1]] 101 | prenet_outputs = prenet(char_embedded_inputs, is_training, 102 | hp.enc_prenet_sizes, hp.dropout_prob, 103 | scope='prenet') 104 | 105 | encoder_outputs = cbhg( 106 | prenet_outputs, input_lengths, is_training, 107 | hp.enc_bank_size, hp.enc_bank_channel_size, 108 | hp.enc_maxpool_width, hp.enc_highway_depth, hp.enc_rnn_size, 109 | hp.enc_proj_sizes, hp.enc_proj_width, 110 | scope="encoder_cbhg", 111 | before_highway=before_highway, 112 | encoder_rnn_init_state=encoder_rnn_init_state) 113 | 114 | 115 | ############## 116 | # Attention 117 | ############## 118 | 119 | # For manaul control of attention 120 | self.is_manual_attention = tf.placeholder( 121 | tf.bool, shape=(), name='is_manual_attention', 122 | ) 123 | self.manual_alignments = tf.placeholder( 124 | tf.float32, shape=[None, None, None], name="manual_alignments", 125 | ) 126 | 127 | dec_prenet_outputs = DecoderPrenetWrapper( 128 | GRUCell(hp.attention_state_size), 129 | speaker_embed, 130 | is_training, hp.dec_prenet_sizes, hp.dropout_prob) 131 | 132 | if hp.attention_type == 'bah_mon': 133 | attention_mechanism = BahdanauMonotonicAttention( 134 | hp.attention_size, encoder_outputs) 135 | elif hp.attention_type == 'bah_norm': 136 | attention_mechanism = BahdanauAttention( 137 | hp.attention_size, encoder_outputs, normalize=True) 138 | elif hp.attention_type == 'luong_scaled': 139 | attention_mechanism = LuongAttention( 140 | hp.attention_size, encoder_outputs, scale=True) 141 | elif hp.attention_type == 'luong': 142 | attention_mechanism = LuongAttention( 143 | hp.attention_size, encoder_outputs) 144 | elif hp.attention_type == 'bah': 145 | attention_mechanism = BahdanauAttention( 146 | hp.attention_size, encoder_outputs) 147 | elif hp.attention_type.startswith('ntm2'): 148 | shift_width = int(hp.attention_type.split('-')[-1]) 149 | attention_mechanism = NTMAttention2( 150 | hp.attention_size, encoder_outputs, shift_width=shift_width) 151 | else: 152 | raise Exception(" [!] Unkown attention type: {}".format(hp.attention_type)) 153 | 154 | attention_cell = AttentionWrapper( 155 | dec_prenet_outputs, 156 | attention_mechanism, 157 | self.is_manual_attention, 158 | self.manual_alignments, 159 | initial_cell_state=attention_rnn_init_state, 160 | alignment_history=True, 161 | output_attention=False 162 | ) 163 | 164 | # Concatenate attention context vector and RNN cell output into a 512D vector. 
165 | # [N, T_in, attention_size+attention_state_size] 166 | concat_cell = ConcatOutputAndAttentionWrapper( 167 | attention_cell, embed_to_concat=speaker_embed) 168 | 169 | # Decoder (layers specified bottom to top): 170 | cells = [OutputProjectionWrapper(concat_cell, hp.dec_rnn_size)] 171 | for _ in range(hp.dec_layer_num): 172 | cells.append(ResidualWrapper(GRUCell(hp.dec_rnn_size))) 173 | 174 | # [N, T_in, 256] 175 | decoder_cell = MultiRNNCell(cells, state_is_tuple=True) 176 | 177 | # Project onto r mel spectrograms (predict r outputs at each RNN step): 178 | output_cell = OutputProjectionWrapper( 179 | decoder_cell, hp.num_mels * hp.reduction_factor) 180 | decoder_init_state = output_cell.zero_state( 181 | batch_size=batch_size, dtype=tf.float32) 182 | 183 | if hp.model_type == "deepvoice": 184 | # decoder_init_state[0] : AttentionWrapperState 185 | # = cell_state + attention + time + alignments + alignment_history 186 | # decoder_init_state[0][0] = attention_rnn_init_state (already applied) 187 | decoder_init_state = list(decoder_init_state) 188 | 189 | for idx, cell in enumerate(decoder_rnn_init_states): 190 | shape1 = decoder_init_state[idx + 1].get_shape().as_list() 191 | shape2 = cell.get_shape().as_list() 192 | if shape1 != shape2: 193 | raise Exception(" [!] Shape {} and {} should be equal". \ 194 | format(shape1, shape2)) 195 | decoder_init_state[idx + 1] = cell 196 | 197 | decoder_init_state = tuple(decoder_init_state) 198 | 199 | if is_training: 200 | helper = TacoTrainingHelper( 201 | inputs, mel_targets, hp.num_mels, hp.reduction_factor, 202 | rnn_decoder_test_mode) 203 | else: 204 | helper = TacoTestHelper( 205 | batch_size, hp.num_mels, hp.reduction_factor) 206 | 207 | (decoder_outputs, _), final_decoder_state, _ = \ 208 | tf.contrib.seq2seq.dynamic_decode( 209 | BasicDecoder(output_cell, helper, decoder_init_state), 210 | maximum_iterations=hp.max_iters) 211 | 212 | # [N, T_out, M] 213 | mel_outputs = tf.reshape( 214 | decoder_outputs, [batch_size, -1, hp.num_mels]) 215 | 216 | # Add post-processing CBHG: 217 | # [N, T_out, 256] 218 | #post_outputs = post_cbhg(mel_outputs, hp.num_mels, is_training) 219 | post_outputs = cbhg( 220 | mel_outputs, None, is_training, 221 | hp.post_bank_size, hp.post_bank_channel_size, 222 | hp.post_maxpool_width, hp.post_highway_depth, hp.post_rnn_size, 223 | hp.post_proj_sizes, hp.post_proj_width, 224 | scope='post_cbhg') 225 | 226 | if speaker_embed is not None and hp.model_type == 'simple': 227 | expanded_speaker_emb = tf.expand_dims(speaker_embed, [1]) 228 | tiled_speaker_embedding = tf.tile( 229 | expanded_speaker_emb, [1, tf.shape(post_outputs)[1], 1]) 230 | 231 | # [N, T_out, 256 + alpha] 232 | post_outputs = \ 233 | tf.concat([tiled_speaker_embedding, post_outputs], axis=-1) 234 | 235 | linear_outputs = tf.layers.dense(post_outputs, hp.num_freq) # [N, T_out, F] 236 | 237 | # Grab alignments from the final decoder state: 238 | alignments = tf.transpose( 239 | final_decoder_state[0].alignment_history.stack(), [1, 2, 0]) 240 | 241 | 242 | self.inputs = inputs 243 | self.speaker_id = speaker_id 244 | self.input_lengths = input_lengths 245 | self.loss_coeff = loss_coeff 246 | self.mel_outputs = mel_outputs 247 | self.linear_outputs = linear_outputs 248 | self.alignments = alignments 249 | self.mel_targets = mel_targets 250 | self.linear_targets = linear_targets 251 | self.final_decoder_state = final_decoder_state 252 | 253 | log('='*40) 254 | log(' model_type: %s' % hp.model_type) 255 | log('='*40) 256 | 257 | log('Initialized Tacotron 
model. Dimensions: ') 258 | log(' embedding: %d' % char_embedded_inputs.shape[-1]) 259 | if speaker_embed is not None: 260 | log(' speaker embedding: %d' % speaker_embed.shape[-1]) 261 | else: 262 | log(' speaker embedding: None') 263 | log(' prenet out: %d' % prenet_outputs.shape[-1]) 264 | log(' encoder out: %d' % encoder_outputs.shape[-1]) 265 | log(' attention out: %d' % attention_cell.output_size) 266 | log(' concat attn & out: %d' % concat_cell.output_size) 267 | log(' decoder cell out: %d' % decoder_cell.output_size) 268 | log(' decoder out (%d frames): %d' % (hp.reduction_factor, decoder_outputs.shape[-1])) 269 | log(' decoder out (1 frame): %d' % mel_outputs.shape[-1]) 270 | log(' postnet out: %d' % post_outputs.shape[-1]) 271 | log(' linear out: %d' % linear_outputs.shape[-1]) 272 | 273 | 274 | def add_loss(self): 275 | '''Adds loss to the model. Sets "loss" field. initialize must have been called.''' 276 | with tf.variable_scope('loss') as scope: 277 | hp = self._hparams 278 | mel_loss = tf.abs(self.mel_targets - self.mel_outputs) 279 | 280 | l1 = tf.abs(self.linear_targets - self.linear_outputs) 281 | expanded_loss_coeff = tf.expand_dims( 282 | tf.expand_dims(self.loss_coeff, [-1]), [-1]) 283 | 284 | if hp.prioritize_loss: 285 | # Prioritize loss for frequencies. 286 | upper_priority_freq = int(5000 / (hp.sample_rate * 0.5) * hp.num_freq) 287 | lower_priority_freq = int(165 / (hp.sample_rate * 0.5) * hp.num_freq) 288 | 289 | l1_priority= l1[:,:,lower_priority_freq:upper_priority_freq] 290 | 291 | self.loss = tf.reduce_mean(mel_loss * expanded_loss_coeff) + \ 292 | 0.5 * tf.reduce_mean(l1 * expanded_loss_coeff) + \ 293 | 0.5 * tf.reduce_mean(l1_priority * expanded_loss_coeff) 294 | self.linear_loss = tf.reduce_mean( 295 | 0.5 * (tf.reduce_mean(l1) + tf.reduce_mean(l1_priority))) 296 | else: 297 | self.loss = tf.reduce_mean(mel_loss * expanded_loss_coeff) + \ 298 | tf.reduce_mean(l1 * expanded_loss_coeff) 299 | self.linear_loss = tf.reduce_mean(l1) 300 | 301 | self.mel_loss = tf.reduce_mean(mel_loss) 302 | self.loss_without_coeff = self.mel_loss + self.linear_loss 303 | 304 | 305 | def add_optimizer(self, global_step): 306 | '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 307 | 308 | Args: 309 | global_step: int32 scalar Tensor representing current global step in training 310 | ''' 311 | with tf.variable_scope('optimizer') as scope: 312 | hp = self._hparams 313 | 314 | step = tf.cast(global_step + 1, dtype=tf.float32) 315 | 316 | if hp.decay_learning_rate_mode == 0: 317 | if self.is_randomly_initialized: 318 | warmup_steps = 4000.0 319 | else: 320 | warmup_steps = 40000.0 321 | self.learning_rate = hp.initial_learning_rate * warmup_steps**0.5 * \ 322 | tf.minimum(step * warmup_steps**-1.5, step**-0.5) 323 | elif hp.decay_learning_rate_mode == 1: 324 | self.learning_rate = hp.initial_learning_rate * \ 325 | tf.train.exponential_decay(1., step, 3000, 0.95) 326 | 327 | optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2) 328 | gradients, variables = zip(*optimizer.compute_gradients(self.loss)) 329 | self.gradients = gradients 330 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) 331 | 332 | # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. 
See: 333 | # https://github.com/tensorflow/tensorflow/issues/1122 334 | with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): 335 | self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables), 336 | global_step=global_step) 337 | 338 | def get_dummy_feed_dict(self): 339 | feed_dict = { 340 | self.is_manual_attention: False, 341 | self.manual_alignments: np.zeros([1, 1, 1]), 342 | } 343 | return feed_dict 344 | -------------------------------------------------------------------------------- /recognition/alignment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import string 4 | import argparse 5 | import operator 6 | from functools import partial 7 | from difflib import SequenceMatcher 8 | 9 | from audio.get_duration import get_durations 10 | from text import remove_puncuations, text_to_sequence 11 | from utils import load_json, write_json, parallel_run, remove_postfix, backup_file 12 | 13 | def plain_text(text): 14 | return "".join(remove_puncuations(text.strip()).split()) 15 | 16 | def add_punctuation(text): 17 | if text.endswith('다'): 18 | return text + "." 19 | else: 20 | return text 21 | 22 | def similarity(text_a, text_b): 23 | text_a = plain_text(text_a) 24 | text_b = plain_text(text_b) 25 | 26 | score = SequenceMatcher(None, text_a, text_b).ratio() 27 | return score 28 | 29 | def first_word_combined_words(text): 30 | words = text.split() 31 | if len(words) > 1: 32 | first_words = [words[0], words[0]+words[1]] 33 | else: 34 | first_words = [words[0]] 35 | return first_words 36 | 37 | def first_word_combined_texts(text): 38 | words = text.split() 39 | if len(words) > 1: 40 | if len(words) > 2: 41 | text2 = " ".join([words[0]+words[1]] + words[2:]) 42 | else: 43 | text2 = words[0]+words[1] 44 | texts = [text, text2] 45 | else: 46 | texts = [text] 47 | return texts 48 | 49 | def search_optimal(found_text, recognition_text): 50 | # 1. found_text is usually more accurate 51 | # 2. 
recognition_text can have more or less word 52 | 53 | optimal = None 54 | 55 | if plain_text(recognition_text) in plain_text(found_text): 56 | optimal = recognition_text 57 | else: 58 | found = False 59 | 60 | for tmp_text in first_word_combined_texts(found_text): 61 | for recognition_first_word in first_word_combined_words(recognition_text): 62 | if recognition_first_word in tmp_text: 63 | start_idx = tmp_text.find(recognition_first_word) 64 | 65 | if tmp_text != found_text: 66 | found_text = found_text[max(0, start_idx-1):].strip() 67 | else: 68 | found_text = found_text[start_idx:].strip() 69 | found = True 70 | break 71 | 72 | if found: 73 | break 74 | 75 | recognition_last_word = recognition_text.split()[-1] 76 | if recognition_last_word in found_text: 77 | end_idx = found_text.find(recognition_last_word) 78 | 79 | punctuation = "" 80 | if len(found_text) > end_idx + len(recognition_last_word): 81 | punctuation = found_text[end_idx + len(recognition_last_word)] 82 | if punctuation not in string.punctuation: 83 | punctuation = "" 84 | 85 | found_text = found_text[:end_idx] + recognition_last_word + punctuation 86 | found = True 87 | 88 | if found: 89 | optimal = found_text 90 | 91 | return optimal 92 | 93 | 94 | def align_text_fn( 95 | item, score_threshold, debug=False): 96 | 97 | audio_path, recognition_text = item 98 | 99 | audio_dir = os.path.dirname(audio_path) 100 | base_dir = os.path.dirname(audio_dir) 101 | 102 | news_path = remove_postfix(audio_path.replace("audio", "assets")) 103 | news_path = os.path.splitext(news_path)[0] + ".txt" 104 | 105 | strip_fn = lambda line: line.strip().replace('"', '').replace("'", "") 106 | candidates = [strip_fn(line) for line in open(news_path, encoding='UTF-8').readlines()] 107 | 108 | scores = { candidate: similarity(candidate, recognition_text) \ 109 | for candidate in candidates} 110 | print(scores) 111 | sorted_scores = sorted(scores.items(), key=operator.itemgetter(1))[::-1] 112 | 113 | try : 114 | first, second = sorted_scores[0], sorted_scores[1] 115 | 116 | if first[1] > second[1] and first[1] >= score_threshold: 117 | found_text, score = first 118 | aligned_text = search_optimal(found_text, recognition_text) 119 | 120 | if debug: 121 | print(" ", audio_path) 122 | print(" ", recognition_text) 123 | print("=> ", found_text) 124 | print("==>", aligned_text) 125 | print("="*30) 126 | 127 | if aligned_text is not None: 128 | result = { audio_path: add_punctuation(aligned_text) } 129 | if abs(len(text_to_sequence(found_text)) - len(text_to_sequence(recognition_text))) > 10: 130 | result = {} 131 | else: 132 | result = { audio_path: add_punctuation(found_text) } 133 | else: 134 | result = {} 135 | # 136 | # if len(result) == 0: 137 | # result = { audio_path: found_text } 138 | 139 | return result 140 | 141 | except: 142 | pass 143 | 144 | def align_text_batch(config): 145 | align_text = partial(align_text_fn, 146 | score_threshold=config.score_threshold) 147 | 148 | results = {} 149 | data = load_json(config.recognition_path, encoding=config.recognition_encoding) 150 | 151 | items = parallel_run( 152 | align_text, data.items(), 153 | desc="align_text_batch", parallel=True) 154 | 155 | for item in items: 156 | results.update(item) 157 | 158 | found_count = sum([type(value) == str for value in results.values()]) 159 | print(" [*] # found: {:.5f}% ({}/{})".format( 160 | len(results)/len(data), len(results), len(data))) 161 | print(" [*] # exact match: {:.5f}% ({}/{})".format( 162 | found_count/len(items), found_count, len(items))) 163 | 164 | 
return results 165 | 166 | if __name__ == '__main__': 167 | parser = argparse.ArgumentParser() 168 | parser.add_argument('--recognition_path', required=True) 169 | parser.add_argument('--alignment_filename', default="alignment.json") 170 | parser.add_argument('--score_threshold', default=0.4, type=float) 171 | parser.add_argument('--recognition_encoding', default='UTF-8') 172 | config, unparsed = parser.parse_known_args() 173 | 174 | results = align_text_batch(config) 175 | 176 | base_dir = os.path.dirname(config.recognition_path) 177 | alignment_path = \ 178 | os.path.join(base_dir, config.alignment_filename) 179 | 180 | if os.path.exists(alignment_path): 181 | backup_file(alignment_path) 182 | 183 | write_json(alignment_path, results) 184 | duration = get_durations(results.keys(), print_detail=False) 185 | -------------------------------------------------------------------------------- /recognition/google.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import json 4 | import argparse 5 | import numpy as np 6 | from glob import glob 7 | from functools import partial 8 | import time 9 | 10 | from utils import parallel_run, remove_file, backup_file, write_json 11 | from audio import load_audio, save_audio, resample_audio, get_duration 12 | 13 | def text_recognition(path, config): 14 | time.sleep(0.7) 15 | root, ext = os.path.splitext(path) 16 | txt_path = root + ".txt" 17 | 18 | if os.path.exists(txt_path): 19 | with open(txt_path) as f: 20 | out = json.loads(open(txt_path).read()) 21 | return out 22 | 23 | from google.cloud import speech 24 | from google.cloud.speech import enums 25 | from google.cloud.speech import types 26 | 27 | out = {} 28 | error_count = 0 29 | 30 | tmp_path = os.path.splitext(path)[0] + ".tmp.wav" 31 | client = speech.SpeechClient() # Fixed 32 | 33 | while True: 34 | try: 35 | # client= speech.SpeechClient() # Causes 10060 max retries exceeded -to OAuth -HK 36 | 37 | content = load_audio( 38 | path, pre_silence_length=config.pre_silence_length, 39 | post_silence_length=config.post_silence_length) 40 | 41 | max_duration = config.max_duration - \ 42 | config.pre_silence_length - config.post_silence_length 43 | audio_duration = get_duration(content) 44 | 45 | if audio_duration >= max_duration: 46 | print(" [!] Skip {} because of duration: {} > {}". \ 47 | format(path, audio_duration, max_duration)) 48 | return {} 49 | 50 | content = resample_audio(content, config.sample_rate) 51 | save_audio(content, tmp_path, config.sample_rate) 52 | 53 | with io.open(tmp_path, 'rb') as f: 54 | audio = types.RecognitionAudio(content=f.read()) 55 | 56 | config = types.RecognitionConfig( 57 | encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, 58 | sample_rate_hertz=config.sample_rate, 59 | language_code='ko-KR') 60 | 61 | response = client.recognize(config, audio) 62 | if len(response.results) > 0: 63 | alternatives = response.results[0].alternatives 64 | 65 | results = [alternative.transcript for alternative in alternatives] 66 | assert len(results) == 1, "More than 1 results: {}".format(results) 67 | 68 | out = { path: "" if len(results) == 0 else results[0] } 69 | print(path, results[0]) 70 | break 71 | break 72 | except Exception as err: 73 | raise Exception("OS error: {0}".format(err)) 74 | 75 | error_count += 1 76 | print("Skip warning for {} for {} times". 
\ 77 | format(path, error_count)) 78 | 79 | if error_count > 5: 80 | break 81 | else: 82 | continue 83 | 84 | remove_file(tmp_path) 85 | with open(txt_path, 'w') as f: 86 | json.dump(out, f, indent=2, ensure_ascii=False) 87 | 88 | return out 89 | 90 | def text_recognition_batch(paths, config): 91 | paths.sort() 92 | 93 | results = {} 94 | items = parallel_run( 95 | partial(text_recognition, config=config), paths, 96 | desc="text_recognition_batch", parallel=True) 97 | for item in items: 98 | results.update(item) 99 | return results 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--audio_pattern', required=True) 105 | parser.add_argument('--recognition_filename', default="recognition.json") 106 | parser.add_argument('--sample_rate', default=16000, type=int) 107 | parser.add_argument('--pre_silence_length', default=1, type=int) 108 | parser.add_argument('--post_silence_length', default=1, type=int) 109 | parser.add_argument('--max_duration', default=60, type=int) 110 | config, unparsed = parser.parse_known_args() 111 | 112 | audio_dir = os.path.dirname(config.audio_pattern) 113 | 114 | for tmp_path in glob(os.path.join(audio_dir, "*.tmp.*")): 115 | remove_file(tmp_path) 116 | 117 | paths = glob(config.audio_pattern) 118 | paths.sort() 119 | results = text_recognition_batch(paths, config) 120 | 121 | base_dir = os.path.dirname(audio_dir) 122 | recognition_path = \ 123 | os.path.join(base_dir, config.recognition_filename) 124 | 125 | if os.path.exists(recognition_path): 126 | backup_file(recognition_path) 127 | 128 | write_json(recognition_path, results) 129 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appnope==0.1.0 2 | audioread==2.1.5 3 | beautifulsoup4==4.6.0 4 | bleach==1.5.0 5 | bs4==0.0.1 6 | cachetools==2.0.1 7 | chardet==3.0.4 8 | click==6.7 9 | cycler==0.10.0 10 | decorator==4.1.2 11 | dill==0.2.7.1 12 | ffprobe==0.5 13 | Flask==0.12.2 14 | Flask-Cors==3.0.3 15 | future==0.16.0 16 | gapic-google-cloud-datastore-v1==0.15.3 17 | gapic-google-cloud-error-reporting-v1beta1==0.15.3 18 | gapic-google-cloud-logging-v2==0.91.3 19 | gapic-google-cloud-pubsub-v1==0.15.4 20 | gapic-google-cloud-spanner-admin-database-v1==0.15.3 21 | gapic-google-cloud-spanner-admin-instance-v1==0.15.3 22 | gapic-google-cloud-spanner-v1==0.15.3 23 | google-auth==1.1.1 24 | google-cloud==0.27.0 25 | google-cloud-bigquery==0.26.0 26 | google-cloud-bigtable==0.26.0 27 | google-cloud-core==0.26.0 28 | google-cloud-datastore==1.2.0 29 | google-cloud-dns==0.26.0 30 | google-cloud-error-reporting==0.26.0 31 | google-cloud-language==0.27.0 32 | google-cloud-logging==1.2.0 33 | google-cloud-monitoring==0.26.0 34 | google-cloud-pubsub==0.27.0 35 | google-cloud-resource-manager==0.26.0 36 | google-cloud-runtimeconfig==0.26.0 37 | google-cloud-spanner==0.26.0 38 | google-cloud-speech==0.28.0 39 | google-cloud-storage==1.3.2 40 | google-cloud-translate==1.1.0 41 | google-cloud-videointelligence==0.25.0 42 | google-cloud-vision==0.26.0 43 | google-api-core==1.1.2 44 | google-resumable-media==0.3.0 45 | googleapis-common-protos==1.5.3 46 | grpc-google-iam-v1==0.11.4 47 | grpcio==1.8.6 48 | html5lib==0.9999999 49 | httplib2==0.10.3 50 | idna==2.6 51 | ipdb==0.10.3 52 | ipython==6.2.1 53 | ipython-genutils==0.2.0 54 | iso8601==0.1.12 55 | itsdangerous==0.24 56 | jamo==0.4.1 57 | jedi==0.11.0 58 | Jinja2==2.9.6 59 | 
joblib==0.11 60 | librosa==0.5.1 61 | #llvmlite==0.20.0 62 | m3u8==0.3.3 63 | Markdown==2.6.9 64 | MarkupSafe==1.0 65 | matplotlib==2.1.0 66 | monotonic==1.3 67 | nltk==3.2.5 68 | numba==0.35.0 69 | numpy==1.13.3 70 | oauth2client==3.0.0 71 | parso==0.1.0 72 | pexpect==4.2.1 73 | pickleshare==0.7.4 74 | ply==3.8 75 | prompt-toolkit==1.0.15 76 | proto-google-cloud-datastore-v1==0.90.4 77 | proto-google-cloud-error-reporting-v1beta1==0.15.3 78 | proto-google-cloud-logging-v2==0.91.3 79 | proto-google-cloud-pubsub-v1==0.15.4 80 | proto-google-cloud-spanner-admin-database-v1==0.15.3 81 | proto-google-cloud-spanner-admin-instance-v1==0.15.3 82 | proto-google-cloud-spanner-v1==0.15.3 83 | protobuf==3.5.1 84 | ptyprocess==0.5.2 85 | pyasn1==0.3.7 86 | pyasn1-modules==0.1.5 87 | pydub==0.20.0 88 | Pygments==2.2.0 89 | pyparsing==2.2.0 90 | python-dateutil==2.6.1 91 | pytz==2017.2 92 | requests==2.18.4 93 | resampy==0.2.0 94 | rsa==3.4.2 95 | scikit-learn==0.19.0 96 | scipy==0.19.1 97 | simplegeneric==0.8.1 98 | six==1.11.0 99 | tenacity==4.4.0 100 | #tensorflow-gpu==1.3.0 101 | #tensorflow-tensorboard==0.1.8 102 | tinytag==0.18.0 103 | tqdm==4.19.2 104 | traitlets==4.3.2 105 | urllib3==1.22 106 | wcwidth==0.1.7 107 | Werkzeug==0.12.2 108 | youtube-dl==2017.10.15.1 109 | unidecode==1.0.22 110 | inflect==0.2.5 111 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/deepvoice2-256-256-krbook-bah-mon-22000-no-priority --dataname=krbook --num_speakers=1 4 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/jtbc_2017-09-25_11-49-23 --dataname=krbook --num_speakers=1 --port=5002 5 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krbook_2017-09-27_17-02-44 --dataname=krbook --num_speakers=1 --port=5001 6 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krfemale_2017-10-10_20-37-38 --dataname=krbook --num_speakers=1 --port=5003 7 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/krmale_2017-10-10_17-49-49 --dataname=krbook --num_speakers=1 --port=5005 8 | CUDA_VISIBLE_DEVICES= python app.py --load_path logs/park+moon+krbook_2017-10-09_20-43-53 --dataname=krbook --num_speakers=3 --port=5004 9 | -------------------------------------------------------------------------------- /scripts/prepare_son.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # 1. Download and extract audio and texts 4 | python -m datasets.jtbc.download 5 | 6 | # 2. Split audios on silence 7 | python -m audio.silence --audio_pattern "./datasets/jtbc/audio/*.wav" --method=pydub 8 | 9 | # 3. Run Google Speech Recognition 10 | python -m recognition.google --audio_pattern "./datasets/jtbc/audio/*.*.wav" 11 | 12 | # 4. Run heuristic text-audio pair search (any improvement on this is welcome) 13 | python -m recognition.alignment --recognition_path "./datasets/jtbc/recognition.json" --score_threshold=0.5 14 | 15 | # 5. 
Remove intro music 16 | rm datasets/jtbc/data/*.0000.npz 17 | -------------------------------------------------------------------------------- /synthesizer.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | import librosa 5 | import argparse 6 | import numpy as np 7 | from glob import glob 8 | from tqdm import tqdm 9 | import tensorflow as tf 10 | from functools import partial 11 | 12 | from hparams import hparams 13 | from models import create_model, get_most_recent_checkpoint 14 | from audio import save_audio, inv_spectrogram, inv_preemphasis, \ 15 | inv_spectrogram_tensorflow 16 | from utils import plot, PARAMS_NAME, load_json, load_hparams, \ 17 | add_prefix, add_postfix, get_time, parallel_run, makedirs, str2bool 18 | 19 | from text.korean import tokenize 20 | from text import text_to_sequence, sequence_to_text 21 | 22 | 23 | class Synthesizer(object): 24 | 25 | def close(self): 26 | tf.reset_default_graph() 27 | self.sess.close() 28 | 29 | def load(self, checkpoint_path, num_speakers=2, checkpoint_step=None, model_name='tacotron'): 30 | self.num_speakers = num_speakers 31 | 32 | if os.path.isdir(checkpoint_path): 33 | load_path = checkpoint_path 34 | checkpoint_path = get_most_recent_checkpoint(checkpoint_path, checkpoint_step) 35 | else: 36 | load_path = os.path.dirname(checkpoint_path) 37 | 38 | print('Constructing model: %s' % model_name) 39 | 40 | inputs = tf.placeholder(tf.int32, [None, None], 'inputs') 41 | input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths') 42 | 43 | batch_size = tf.shape(inputs)[0] 44 | speaker_id = tf.placeholder_with_default( 45 | tf.zeros([batch_size], dtype=tf.int32), [None], 'speaker_id') 46 | 47 | load_hparams(hparams, load_path) 48 | with tf.variable_scope('model') as scope: 49 | self.model = create_model(hparams) 50 | 51 | self.model.initialize( 52 | inputs, input_lengths, 53 | self.num_speakers, speaker_id) 54 | self.wav_output = \ 55 | inv_spectrogram_tensorflow(self.model.linear_outputs) 56 | 57 | print('Loading checkpoint: %s' % checkpoint_path) 58 | 59 | sess_config = tf.ConfigProto( 60 | allow_soft_placement=True, 61 | intra_op_parallelism_threads=1, 62 | inter_op_parallelism_threads=2) 63 | sess_config.gpu_options.allow_growth = True 64 | 65 | self.sess = tf.Session(config=sess_config) 66 | self.sess.run(tf.global_variables_initializer()) 67 | saver = tf.train.Saver() 68 | saver.restore(self.sess, checkpoint_path) 69 | 70 | def synthesize(self, 71 | texts=None, tokens=None, 72 | base_path=None, paths=None, speaker_ids=None, 73 | start_of_sentence=None, end_of_sentence=True, 74 | pre_word_num=0, post_word_num=0, 75 | pre_surplus_idx=0, post_surplus_idx=1, 76 | use_short_concat=False, 77 | manual_attention_mode=0, 78 | base_alignment_path=None, 79 | librosa_trim=True, 80 | attention_trim=True, 81 | isKorean=True): 82 | 83 | # Possible inputs: 84 | # 1) text=text 85 | # 2) text=texts 86 | # 3) tokens=tokens, texts=texts # use texts as guide 87 | 88 | if type(texts) == str: 89 | texts = [texts] 90 | 91 | if texts is not None and tokens is None: 92 | sequences = [text_to_sequence(text) for text in texts] 93 | elif tokens is not None: 94 | sequences = tokens 95 | 96 | if paths is None: 97 | paths = [None] * len(sequences) 98 | if texts is None: 99 | texts = [None] * len(sequences) 100 | 101 | time_str = get_time() 102 | def plot_and_save_parallel( 103 | wavs, alignments, use_manual_attention): 104 | 105 | items = list(enumerate(zip( 106 | wavs, alignments, 
paths, texts, sequences))) 107 | 108 | fn = partial( 109 | plot_graph_and_save_audio, 110 | base_path=base_path, 111 | start_of_sentence=start_of_sentence, end_of_sentence=end_of_sentence, 112 | pre_word_num=pre_word_num, post_word_num=post_word_num, 113 | pre_surplus_idx=pre_surplus_idx, post_surplus_idx=post_surplus_idx, 114 | use_short_concat=use_short_concat, 115 | use_manual_attention=use_manual_attention, 116 | librosa_trim=librosa_trim, 117 | attention_trim=attention_trim, 118 | time_str=time_str, 119 | isKorean=isKorean) 120 | return parallel_run(fn, items, 121 | desc="plot_graph_and_save_audio", parallel=False) 122 | 123 | input_lengths = np.argmax(np.array(sequences) == 1, 1) 124 | 125 | fetches = [ 126 | #self.wav_output, 127 | self.model.linear_outputs, 128 | self.model.alignments, 129 | ] 130 | 131 | feed_dict = { 132 | self.model.inputs: sequences, 133 | self.model.input_lengths: input_lengths, 134 | } 135 | if base_alignment_path is None: 136 | feed_dict.update({ 137 | self.model.manual_alignments: np.zeros([1, 1, 1]), 138 | self.model.is_manual_attention: False, 139 | }) 140 | else: 141 | manual_alignments = [] 142 | alignment_path = os.path.join( 143 | base_alignment_path, 144 | os.path.basename(base_path)) 145 | 146 | for idx in range(len(sequences)): 147 | numpy_path = "{}.{}.npy".format(alignment_path, idx) 148 | manual_alignments.append(np.load(numpy_path)) 149 | 150 | alignments_T = np.transpose(manual_alignments, [0, 2, 1]) 151 | feed_dict.update({ 152 | self.model.manual_alignments: alignments_T, 153 | self.model.is_manual_attention: True, 154 | }) 155 | 156 | if speaker_ids is not None: 157 | if type(speaker_ids) == dict: 158 | speaker_embed_table = sess.run( 159 | self.model.speaker_embed_table) 160 | 161 | speaker_embed = [speaker_ids[speaker_id] * \ 162 | speaker_embed_table[speaker_id] for speaker_id in speaker_ids] 163 | feed_dict.update({ 164 | self.model.speaker_embed_table: np.tile() 165 | }) 166 | else: 167 | feed_dict[self.model.speaker_id] = speaker_ids 168 | 169 | wavs, alignments = \ 170 | self.sess.run(fetches, feed_dict=feed_dict) 171 | results = plot_and_save_parallel( 172 | wavs, alignments, True) 173 | 174 | if manual_attention_mode > 0: 175 | # argmax one hot 176 | if manual_attention_mode == 1: 177 | alignments_T = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 178 | new_alignments = np.zeros_like(alignments_T) 179 | 180 | for idx in range(len(alignments)): 181 | argmax = alignments[idx].argmax(1) 182 | new_alignments[idx][(argmax, range(len(argmax)))] = 1 183 | # sharpening 184 | elif manual_attention_mode == 2: 185 | new_alignments = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 186 | 187 | for idx in range(len(alignments)): 188 | var = np.var(new_alignments[idx], 1) 189 | mean_var = var[:input_lengths[idx]].mean() 190 | 191 | new_alignments = np.pow(new_alignments[idx], 2) 192 | # prunning 193 | elif manual_attention_mode == 3: 194 | new_alignments = np.transpose(alignments, [0, 2, 1]) # [N, E, D] 195 | 196 | for idx in range(len(alignments)): 197 | argmax = alignments[idx].argmax(1) 198 | new_alignments[idx][(argmax, range(len(argmax)))] = 1 199 | 200 | feed_dict.update({ 201 | self.model.manual_alignments: new_alignments, 202 | self.model.is_manual_attention: True, 203 | }) 204 | 205 | new_wavs, new_alignments = \ 206 | self.sess.run(fetches, feed_dict=feed_dict) 207 | results = plot_and_save_parallel( 208 | new_wavs, new_alignments, True) 209 | 210 | return "{}/{}.manual.wav".format(base_path, time_str) 211 | 212 | def 
plot_graph_and_save_audio(args, 213 | base_path=None, 214 | start_of_sentence=None, end_of_sentence=None, 215 | pre_word_num=0, post_word_num=0, 216 | pre_surplus_idx=0, post_surplus_idx=1, 217 | use_short_concat=False, 218 | use_manual_attention=False, save_alignment=False, 219 | librosa_trim=False, attention_trim=False, 220 | time_str=None, isKorean=True): 221 | 222 | idx, (wav, alignment, path, text, sequence) = args 223 | 224 | if base_path: 225 | plot_path = "{}/{}.png".format(base_path, time_str) 226 | elif path: 227 | plot_path = path.rsplit('.', 1)[0] + ".png" 228 | else: 229 | plot_path = None 230 | 231 | #plot_path = add_prefix(plot_path, time_str) 232 | if use_manual_attention: 233 | plot_path = add_postfix(plot_path, "manual") 234 | 235 | if plot_path: 236 | plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean) 237 | 238 | if use_short_concat: 239 | wav = short_concat( 240 | wav, alignment, text, 241 | start_of_sentence, end_of_sentence, 242 | pre_word_num, post_word_num, 243 | pre_surplus_idx, post_surplus_idx) 244 | 245 | if attention_trim and end_of_sentence: 246 | end_idx_counter = 0 247 | attention_argmax = alignment.argmax(0) 248 | end_idx = min(len(sequence) - 1, max(attention_argmax)) 249 | max_counter = min((attention_argmax == end_idx).sum(), 5) 250 | 251 | for jdx, attend_idx in enumerate(attention_argmax): 252 | if len(attention_argmax) > jdx + 1: 253 | if attend_idx == end_idx: 254 | end_idx_counter += 1 255 | 256 | if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx: 257 | break 258 | 259 | if end_idx_counter >= max_counter: 260 | break 261 | else: 262 | break 263 | 264 | spec_end_idx = hparams.reduction_factor * jdx + 3 265 | wav = wav[:spec_end_idx] 266 | 267 | audio_out = inv_spectrogram(wav.T) 268 | 269 | if librosa_trim and end_of_sentence: 270 | yt, index = librosa.effects.trim(audio_out, 271 | frame_length=5120, hop_length=256, top_db=50) 272 | print ("index = ", index) 273 | audio_out = audio_out[:index[-1]] 274 | 275 | if save_alignment: 276 | alignment_path = "{}/{}.npy".format(base_path, idx) 277 | np.save(alignment_path, alignment, allow_pickle=False) 278 | 279 | if path or base_path: 280 | if path: 281 | current_path = add_postfix(path, idx) 282 | elif base_path: 283 | current_path = plot_path.replace(".png", ".wav") 284 | 285 | save_audio(audio_out, current_path) 286 | return True 287 | else: 288 | io_out = io.BytesIO() 289 | save_audio(audio_out, io_out) 290 | result = io_out.getvalue() 291 | return result 292 | 293 | def get_most_recent_checkpoint(checkpoint_dir, checkpoint_step=None): 294 | if checkpoint_step is None: 295 | checkpoint_paths = [path for path in glob("{}/*.ckpt-*.data-*".format(checkpoint_dir))] 296 | idxes = [int(os.path.basename(path).split('-')[1].split('.')[0]) for path in checkpoint_paths] 297 | 298 | max_idx = max(idxes) 299 | else: 300 | max_idx = checkpoint_step 301 | lastest_checkpoint = os.path.join(checkpoint_dir, "model.ckpt-{}".format(max_idx)) 302 | print(" [*] Found lastest checkpoint: {}".format(lastest_checkpoint)) 303 | return lastest_checkpoint 304 | 305 | def short_concat( 306 | wav, alignment, text, 307 | start_of_sentence, end_of_sentence, 308 | pre_word_num, post_word_num, 309 | pre_surplus_idx, post_surplus_idx): 310 | 311 | # np.array(list(decomposed_text))[attention_argmax] 312 | attention_argmax = alignment.argmax(0) 313 | 314 | if not start_of_sentence and pre_word_num > 0: 315 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[0])) 316 | start_idx = 
len(surplus_decomposed_text) + 1 317 | 318 | for idx, attend_idx in enumerate(attention_argmax): 319 | if attend_idx == start_idx and attention_argmax[idx - 1] < start_idx: 320 | break 321 | 322 | wav_start_idx = hparams.reduction_factor * idx - 1 - pre_surplus_idx 323 | else: 324 | wav_start_idx = 0 325 | 326 | if not end_of_sentence and post_word_num > 0: 327 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[-1])) 328 | end_idx = len(decomposed_text.replace(surplus_decomposed_text, '')) - 1 329 | 330 | for idx, attend_idx in enumerate(attention_argmax): 331 | if attend_idx == end_idx and attention_argmax[idx + 1] > end_idx: 332 | break 333 | 334 | wav_end_idx = hparams.reduction_factor * idx + 1 + post_surplus_idx 335 | else: 336 | if True: # attention based split 337 | if end_of_sentence: 338 | end_idx = min(len(decomposed_text) - 1, max(attention_argmax)) 339 | else: 340 | surplus_decomposed_text = decompose_ko_text("".join(text.split()[-1])) 341 | end_idx = len(decomposed_text.replace(surplus_decomposed_text, '')) - 1 342 | 343 | while True: 344 | if end_idx in attention_argmax: 345 | break 346 | end_idx -= 1 347 | 348 | end_idx_counter = 0 349 | for idx, attend_idx in enumerate(attention_argmax): 350 | if len(attention_argmax) > idx + 1: 351 | if attend_idx == end_idx: 352 | end_idx_counter += 1 353 | 354 | if attend_idx == end_idx and attention_argmax[idx + 1] > end_idx: 355 | break 356 | 357 | if end_idx_counter > 5: 358 | break 359 | else: 360 | break 361 | 362 | wav_end_idx = hparams.reduction_factor * idx + 1 + post_surplus_idx 363 | else: 364 | wav_end_idx = None 365 | 366 | wav = wav[wav_start_idx:wav_end_idx] 367 | 368 | if end_of_sentence: 369 | wav = np.lib.pad(wav, ((0, 20), (0, 0)), 'constant', constant_values=0) 370 | else: 371 | wav = np.lib.pad(wav, ((0, 10), (0, 0)), 'constant', constant_values=0) 372 | 373 | 374 | if __name__ == "__main__": 375 | parser = argparse.ArgumentParser() 376 | parser.add_argument('--load_path', required=True) 377 | parser.add_argument('--sample_path', default="samples") 378 | parser.add_argument('--text', required=True) 379 | parser.add_argument('--num_speakers', default=1, type=int) 380 | parser.add_argument('--speaker_id', default=0, type=int) 381 | parser.add_argument('--checkpoint_step', default=None, type=int) 382 | parser.add_argument('--is_korean', default=True, type=str2bool) 383 | config = parser.parse_args() 384 | 385 | makedirs(config.sample_path) 386 | 387 | synthesizer = Synthesizer() 388 | synthesizer.load(config.load_path, config.num_speakers, config.checkpoint_step) 389 | 390 | audio = synthesizer.synthesize( 391 | texts=[config.text], 392 | base_path=config.sample_path, 393 | speaker_ids=[config.speaker_id], 394 | attention_trim=False, 395 | isKorean=config.is_korean)[0] 396 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import numpy as np 4 | 5 | from text import cleaners 6 | from hparams import hparams 7 | from text.symbols import symbols, en_symbols, PAD, EOS 8 | from text.korean import jamo_to_korean 9 | 10 | 11 | 12 | # Mappings from symbol to numeric ID and vice versa: 13 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 14 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 15 | isEn=False 16 | 17 | 18 | # Regular expression matching text enclosed in curly braces: 19 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 20 | 21 
| puncuation_table = str.maketrans({key: None for key in string.punctuation}) 22 | 23 | def convert_to_en_symbols(): 24 | '''Converts built-in korean symbols to english, to be used for english training 25 | 26 | ''' 27 | global _symbol_to_id, _id_to_symbol, isEn 28 | if not isEn: 29 | print(" [!] Converting to english mode") 30 | _symbol_to_id = {s: i for i, s in enumerate(en_symbols)} 31 | _id_to_symbol = {i: s for i, s in enumerate(en_symbols)} 32 | isEn=True 33 | 34 | def remove_puncuations(text): 35 | return text.translate(puncuation_table) 36 | 37 | def text_to_sequence(text, as_token=False): 38 | cleaner_names = [x.strip() for x in hparams.cleaners.split(',')] 39 | if ('english_cleaners' in cleaner_names) and isEn==False: 40 | convert_to_en_symbols() 41 | return _text_to_sequence(text, cleaner_names, as_token) 42 | 43 | def _text_to_sequence(text, cleaner_names, as_token): 44 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 45 | 46 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 47 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 48 | 49 | Args: 50 | text: string to convert to a sequence 51 | cleaner_names: names of the cleaner functions to run the text through 52 | 53 | Returns: 54 | List of integers corresponding to the symbols in the text 55 | ''' 56 | sequence = [] 57 | 58 | # Check for curly braces and treat their contents as ARPAbet: 59 | while len(text): 60 | m = _curly_re.match(text) 61 | if not m: 62 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 63 | break 64 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 65 | sequence += _arpabet_to_sequence(m.group(2)) 66 | text = m.group(3) 67 | 68 | # Append EOS token 69 | sequence.append(_symbol_to_id[EOS]) 70 | 71 | if as_token: 72 | return sequence_to_text(sequence, combine_jamo=True) 73 | else: 74 | return np.array(sequence, dtype=np.int32) 75 | 76 | 77 | def sequence_to_text(sequence, skip_eos_and_pad=False, combine_jamo=False): 78 | '''Converts a sequence of IDs back to a string''' 79 | cleaner_names=[x.strip() for x in hparams.cleaners.split(',')] 80 | if 'english_cleaners' in cleaner_names and isEn==False: 81 | convert_to_en_symbols() 82 | 83 | result = '' 84 | for symbol_id in sequence: 85 | if symbol_id in _id_to_symbol: 86 | s = _id_to_symbol[symbol_id] 87 | # Enclose ARPAbet back in curly braces: 88 | if len(s) > 1 and s[0] == '@': 89 | s = '{%s}' % s[1:] 90 | 91 | if not skip_eos_and_pad or s not in [EOS, PAD]: 92 | result += s 93 | 94 | result = result.replace('}{', ' ') 95 | 96 | if combine_jamo: 97 | return jamo_to_korean(result) 98 | else: 99 | return result 100 | 101 | 102 | 103 | def _clean_text(text, cleaner_names): 104 | for name in cleaner_names: 105 | cleaner = getattr(cleaners, name) 106 | if not cleaner: 107 | raise Exception('Unknown cleaner: %s' % name) 108 | text = cleaner(text) 109 | return text 110 | 111 | 112 | def _symbols_to_sequence(symbols): 113 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 114 | 115 | 116 | def _arpabet_to_sequence(text): 117 | return _symbols_to_sequence(['@' + s for s in text.split()]) 118 | 119 | 120 | def _should_keep_symbol(s): 121 | return s in _symbol_to_id and s is not '_' and s is not '~' 122 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | # Code based on 
https://github.com/keithito/tacotron/blob/master/text/cleaners.py 2 | ''' 3 | Cleaners are transformations that run over the input text at both training and eval time. 4 | 5 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 6 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 7 | 1. "english_cleaners" for English text 8 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 9 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 10 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 11 | the symbols in symbols.py to match your data). 12 | ''' 13 | 14 | import re 15 | from .korean import tokenize as ko_tokenize 16 | 17 | # Added to support LJ_speech 18 | from unidecode import unidecode 19 | from .en_numbers import normalize_numbers as en_normalize_numbers 20 | 21 | # Regular expression matching whitespace: 22 | _whitespace_re = re.compile(r'\s+') 23 | 24 | 25 | def korean_cleaners(text): 26 | '''Pipeline for Korean text, including number and abbreviation expansion.''' 27 | text = ko_tokenize(text) 28 | return text 29 | 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | def expand_abbreviations(text): 55 | for regex, replacement in _abbreviations: 56 | text = re.sub(regex, replacement, text) 57 | return text 58 | 59 | 60 | def expand_numbers(text): 61 | return en_normalize_numbers(text) 62 | 63 | 64 | def lowercase(text): 65 | return text.lower() 66 | 67 | 68 | def collapse_whitespace(text): 69 | return re.sub(_whitespace_re, ' ', text) 70 | 71 | def convert_to_ascii(text): 72 | '''Converts to ascii, existed in keithito but deleted in carpedm20''' 73 | return unidecode(text) 74 | 75 | 76 | def basic_cleaners(text): 77 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def transliteration_cleaners(text): 84 | '''Pipeline for non-English text that transliterates to ASCII.''' 85 | text = convert_to_ascii(text) 86 | text = lowercase(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | 90 | 91 | def english_cleaners(text): 92 | '''Pipeline for English text, including number and abbreviation expansion.''' 93 | text = convert_to_ascii(text) 94 | text = lowercase(text) 95 | text = expand_numbers(text) 96 | text = expand_abbreviations(text) 97 | text = collapse_whitespace(text) 98 | return text 99 | 100 | 101 | -------------------------------------------------------------------------------- /text/en_numbers.py: -------------------------------------------------------------------------------- 1 | import inflect 2 | import re 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = 
re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize_numbers(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/english.py: -------------------------------------------------------------------------------- 1 | # Code from https://github.com/keithito/tacotron/blob/master/util/numbers.py 2 | import inflect 3 | 4 | 5 | _inflect = inflect.engine() 6 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 7 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 8 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 9 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 10 | _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') 11 | _number_re = re.compile(r'[0-9]+') 12 | 13 | 14 | def _remove_commas(m): 15 | return m.group(1).replace(',', '') 16 | 17 | 18 | def _expand_decimal_point(m): 19 | return m.group(1).replace('.', ' point ') 20 | 21 | 22 | def _expand_dollars(m): 23 | match = m.group(1) 24 | parts = match.split('.') 25 | if len(parts) > 2: 26 | return match + ' dollars' # Unexpected format 27 | dollars = int(parts[0]) if parts[0] else 0 28 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 29 | if dollars and cents: 30 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 31 | cent_unit = 'cent' if cents == 1 else 'cents' 32 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 33 | elif dollars: 34 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 35 | return '%s %s' % (dollars, dollar_unit) 36 | elif cents: 37 | 
cent_unit = 'cent' if cents == 1 else 'cents' 38 | return '%s %s' % (cents, cent_unit) 39 | else: 40 | return 'zero dollars' 41 | 42 | 43 | def _expand_ordinal(m): 44 | return _inflect.number_to_words(m.group(0)) 45 | 46 | 47 | def _expand_number(m): 48 | num = int(m.group(0)) 49 | if num > 1000 and num < 3000: 50 | if num == 2000: 51 | return 'two thousand' 52 | elif num > 2000 and num < 2010: 53 | return 'two thousand ' + _inflect.number_to_words(num % 100) 54 | elif num % 100 == 0: 55 | return _inflect.number_to_words(num // 100) + ' hundred' 56 | else: 57 | return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') 58 | else: 59 | return _inflect.number_to_words(num, andword='') 60 | 61 | 62 | def normalize(text): 63 | text = re.sub(_comma_number_re, _remove_commas, text) 64 | text = re.sub(_pounds_re, r'\1 pounds', text) 65 | text = re.sub(_dollars_re, _expand_dollars, text) 66 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 67 | text = re.sub(_ordinal_re, _expand_ordinal, text) 68 | text = re.sub(_number_re, _expand_number, text) 69 | return text 70 | -------------------------------------------------------------------------------- /text/ko_dictionary.py: -------------------------------------------------------------------------------- 1 | etc_dictionary = { 2 | '2 30대': '이삼십대', 3 | '20~30대': '이삼십대', 4 | '20, 30대': '이십대 삼십대', 5 | '1+1': '원플러스원', 6 | '3에서 6개월인': '3개월에서 육개월인', 7 | } 8 | 9 | english_dictionary = { 10 | 'Devsisters': '데브시스터즈', 11 | 'track': '트랙', 12 | 13 | # krbook 14 | 'LA': '엘에이', 15 | 'LG': '엘지', 16 | 'KOREA': '코리아', 17 | 'JSA': '제이에스에이', 18 | 'PGA': '피지에이', 19 | 'GA': '지에이', 20 | 'idol': '아이돌', 21 | 'KTX': '케이티엑스', 22 | 'AC': '에이씨', 23 | 'DVD': '디비디', 24 | 'US': '유에스', 25 | 'CNN': '씨엔엔', 26 | 'LPGA': '엘피지에이', 27 | 'P': '피', 28 | 'L': '엘', 29 | 'T': '티', 30 | 'B': '비', 31 | 'C': '씨', 32 | 'BIFF': '비아이에프에프', 33 | 'GV': '지비', 34 | 35 | # JTBC 36 | 'IT': '아이티', 37 | 'IQ': '아이큐', 38 | 'JTBC': '제이티비씨', 39 | 'trickle down effect': '트리클 다운 이펙트', 40 | 'trickle up effect': '트리클 업 이펙트', 41 | 'down': '다운', 42 | 'up': '업', 43 | 'FCK': '에프씨케이', 44 | 'AP': '에이피', 45 | 'WHERETHEWILDTHINGSARE': '', 46 | 'Rashomon Effect': '', 47 | 'O': '오', 48 | 'OO': '오오', 49 | 'B': '비', 50 | 'GDP': '지디피', 51 | 'CIPA': '씨아이피에이', 52 | 'YS': '와이에스', 53 | 'Y': '와이', 54 | 'S': '에스', 55 | 'JTBC': '제이티비씨', 56 | 'PC': '피씨', 57 | 'bill': '빌', 58 | 'Halmuny': '하모니', ##### 59 | 'X': '엑스', 60 | 'SNS': '에스엔에스', 61 | 'ability': '어빌리티', 62 | 'shy': '', 63 | 'CCTV': '씨씨티비', 64 | 'IT': '아이티', 65 | 'the tenth man': '더 텐쓰 맨', #### 66 | 'L': '엘', 67 | 'PC': '피씨', 68 | 'YSDJJPMB': '', ######## 69 | 'Content Attitude Timing': '컨텐트 애티튜드 타이밍', 70 | 'CAT': '캣', 71 | 'IS': '아이에스', 72 | 'SNS': '에스엔에스', 73 | 'K': '케이', 74 | 'Y': '와이', 75 | 'KDI': '케이디아이', 76 | 'DOC': '디오씨', 77 | 'CIA': '씨아이에이', 78 | 'PBS': '피비에스', 79 | 'D': '디', 80 | 'PPropertyPositionPowerPrisonP' 81 | 'S': '에스', 82 | 'francisco': '프란시스코', 83 | 'I': '아이', 84 | 'III': '아이아이', ###### 85 | 'No joke': '노 조크', 86 | 'BBK': '비비케이', 87 | 'LA': '엘에이', 88 | 'Don': '', 89 | 't worry be happy': ' 워리 비 해피', 90 | 'NO': '엔오', ##### 91 | 'it was our sky': '잇 워즈 아워 스카이', 92 | 'it is our sky': '잇 이즈 아워 스카이', #### 93 | 'NEIS': '엔이아이에스', ##### 94 | 'IMF': '아이엠에프', 95 | 'apology': '어폴로지', 96 | 'humble': '험블', 97 | 'M': '엠', 98 | 'Nowhere Man': '노웨어 맨', 99 | 'The Tenth Man': '더 텐쓰 맨', 100 | 'PBS': '피비에스', 101 | 'BBC': '비비씨', 102 | 'MRJ': '엠알제이', 103 | 'CCTV': '씨씨티비', 104 | 'Pick me up': '픽 미 업', 105 | 'DNA': '디엔에이', 106 | 'UN': '유엔', 
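    # NOTE: the 'PPropertyPositionPowerPrisonP' entry above has no ':' or value,
    # so Python's implicit string concatenation fuses it with the next line into
    # the single key 'PPropertyPositionPowerPrisonPS': '에스'. If a separate
    # entry was intended, a hedged guess at the fix is something like
    # 'P Property Position Power Prison P': '피 프로퍼티 포지션 파워 프리즌 피',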
107 | 'STOP': '스탑', ##### 108 | 'PRESS': '프레스', ##### 109 | 'not to be': '낫 투비', 110 | 'Denial': '디나이얼', 111 | 'G': '지', 112 | 'IMF': '아이엠에프', 113 | 'GDP': '지디피', 114 | 'JTBC': '제이티비씨', 115 | 'Time flies like an arrow': '타임 플라이즈 라이크 언 애로우', 116 | 'DDT': '디디티', 117 | 'AI': '에이아이', 118 | 'Z': '제트', 119 | 'OECD': '오이씨디', 120 | 'N': '앤', 121 | 'A': '에이', 122 | 'MB': '엠비', 123 | 'EH': '이에이치', 124 | 'IS': '아이에스', 125 | 'TV': '티비', 126 | 'MIT': '엠아이티', 127 | 'KBO': '케이비오', 128 | 'I love America': '아이 러브 아메리카', 129 | 'SF': '에스에프', 130 | 'Q': '큐', 131 | 'KFX': '케이에프엑스', 132 | 'PM': '피엠', 133 | 'Prime Minister': '프라임 미니스터', 134 | 'Swordline': '스워드라인', 135 | 'TBS': '티비에스', 136 | 'DDT': '디디티', 137 | 'CS': '씨에스', 138 | 'Reflecting Absence': '리플렉팅 앱센스', 139 | 'PBS': '피비에스', 140 | 'Drum being beaten by everyone': '드럼 빙 비튼 바이 에브리원', 141 | 'negative pressure': '네거티브 프레셔', 142 | 'F': '에프', 143 | 'KIA': '기아', 144 | 'FTA': '에프티에이', 145 | 'Que sais-je': '', 146 | 'UFC': '유에프씨', 147 | 'P': '피', 148 | 'DJ': '디제이', 149 | 'Chaebol': '채벌', 150 | 'BBC': '비비씨', 151 | 'OECD': '오이씨디', 152 | 'BC': '삐씨', 153 | 'C': '씨', 154 | 'B': '씨', 155 | 'KY': '케이와이', 156 | 'K': '케이', 157 | 'CEO': '씨이오', 158 | 'YH': '와이에치', 159 | 'IS': '아이에스', 160 | 'who are you': '후 얼 유', 161 | 'Y': '와이', 162 | 'The Devils Advocate': '더 데빌즈 어드보카트', 163 | 'YS': '와이에스', 164 | 'so sorry': '쏘 쏘리', 165 | 'Santa': '산타', 166 | 'Big Endian': '빅 엔디안', 167 | 'Small Endian': '스몰 엔디안', 168 | 'Oh Captain My Captain': '오 캡틴 마이 캡틴', 169 | 'AIB': '에이아이비', 170 | 'K': '케이', 171 | 'PBS': '피비에스', 172 | } 173 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | # Code based on 2 | 3 | import re 4 | import os 5 | import ast 6 | import json 7 | from jamo import hangul_to_jamo, h2j, j2h 8 | 9 | from .ko_dictionary import english_dictionary, etc_dictionary 10 | 11 | PAD = '_' 12 | EOS = '~' 13 | PUNC = '!\'(),-.:;?' 
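# Conjoining Hangul jamo ranges defined below: U+1100-U+1112 are the initial
# consonants (choseong), U+1161-U+1175 the medial vowels (jungseong), and
# U+11A8-U+11C2 the final consonants (jongseong). Together with PUNC and SPACE
# they form VALID_CHARS, and PAD/EOS are prepended to build ALL_SYMBOLS, so
# every symbol gets a unique id in char_to_id.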
14 | SPACE = ' ' 15 | 16 | JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) 17 | JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) 18 | JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) 19 | 20 | VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE 21 | ALL_SYMBOLS = PAD + EOS + VALID_CHARS 22 | 23 | char_to_id = {c: i for i, c in enumerate(ALL_SYMBOLS)} 24 | id_to_char = {i: c for i, c in enumerate(ALL_SYMBOLS)} 25 | 26 | quote_checker = """([`"'"“‘])(.+?)([`"'"”’])""" 27 | 28 | def is_lead(char): 29 | return char in JAMO_LEADS 30 | 31 | def is_vowel(char): 32 | return char in JAMO_VOWELS 33 | 34 | def is_tail(char): 35 | return char in JAMO_TAILS 36 | 37 | def get_mode(char): 38 | if is_lead(char): 39 | return 0 40 | elif is_vowel(char): 41 | return 1 42 | elif is_tail(char): 43 | return 2 44 | else: 45 | return -1 46 | 47 | def _get_text_from_candidates(candidates): 48 | if len(candidates) == 0: 49 | return "" 50 | elif len(candidates) == 1: 51 | return _jamo_char_to_hcj(candidates[0]) 52 | else: 53 | return j2h(**dict(zip(["lead", "vowel", "tail"], candidates))) 54 | 55 | def jamo_to_korean(text): 56 | text = h2j(text) 57 | 58 | idx = 0 59 | new_text = "" 60 | candidates = [] 61 | 62 | while True: 63 | if idx >= len(text): 64 | new_text += _get_text_from_candidates(candidates) 65 | break 66 | 67 | char = text[idx] 68 | mode = get_mode(char) 69 | 70 | if mode == 0: 71 | new_text += _get_text_from_candidates(candidates) 72 | candidates = [char] 73 | elif mode == -1: 74 | new_text += _get_text_from_candidates(candidates) 75 | new_text += char 76 | candidates = [] 77 | else: 78 | candidates.append(char) 79 | 80 | idx += 1 81 | return new_text 82 | 83 | num_to_kor = { 84 | '0': '영', 85 | '1': '일', 86 | '2': '이', 87 | '3': '삼', 88 | '4': '사', 89 | '5': '오', 90 | '6': '육', 91 | '7': '칠', 92 | '8': '팔', 93 | '9': '구', 94 | } 95 | 96 | unit_to_kor1 = { 97 | '%': '퍼센트', 98 | 'cm': '센치미터', 99 | 'mm': '밀리미터', 100 | 'km': '킬로미터', 101 | 'kg': '킬로그람', 102 | } 103 | unit_to_kor2 = { 104 | 'm': '미터', 105 | } 106 | 107 | upper_to_kor = { 108 | 'A': '에이', 109 | 'B': '비', 110 | 'C': '씨', 111 | 'D': '디', 112 | 'E': '이', 113 | 'F': '에프', 114 | 'G': '지', 115 | 'H': '에이치', 116 | 'I': '아이', 117 | 'J': '제이', 118 | 'K': '케이', 119 | 'L': '엘', 120 | 'M': '엠', 121 | 'N': '엔', 122 | 'O': '오', 123 | 'P': '피', 124 | 'Q': '큐', 125 | 'R': '알', 126 | 'S': '에스', 127 | 'T': '티', 128 | 'U': '유', 129 | 'V': '브이', 130 | 'W': '더블유', 131 | 'X': '엑스', 132 | 'Y': '와이', 133 | 'Z': '지', 134 | } 135 | 136 | def compare_sentence_with_jamo(text1, text2): 137 | return h2j(text1) != h2j(text) 138 | 139 | def tokenize(text, as_id=False): 140 | text = normalize(text) 141 | tokens = list(hangul_to_jamo(text)) 142 | 143 | if as_id: 144 | return [char_to_id[token] for token in tokens] + [char_to_id[EOS]] 145 | else: 146 | return [token for token in tokens] + [EOS] 147 | 148 | def tokenizer_fn(iterator): 149 | return (token for x in iterator for token in tokenize(x, as_id=False)) 150 | 151 | def normalize(text): 152 | text = text.strip() 153 | 154 | text = re.sub('\(\d+일\)', '', text) 155 | text = re.sub('\([⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+\)', '', text) 156 | 157 | text = normalize_with_dictionary(text, etc_dictionary) 158 | text = normalize_english(text) 159 | text = re.sub('[a-zA-Z]+', normalize_upper, text) 160 | 161 | text = normalize_quote(text) 162 | text = normalize_number(text) 163 | 164 | return text 165 | 166 | def normalize_with_dictionary(text, dic): 167 | if any(key in text 
for key in dic.keys()): 168 | pattern = re.compile('|'.join(re.escape(key) for key in dic.keys())) 169 | return pattern.sub(lambda x: dic[x.group()], text) 170 | else: 171 | return text 172 | 173 | def normalize_english(text): 174 | def fn(m): 175 | word = m.group() 176 | if word in english_dictionary: 177 | return english_dictionary.get(word) 178 | else: 179 | return word 180 | 181 | text = re.sub("([A-Za-z]+)", fn, text) 182 | return text 183 | 184 | def normalize_upper(text): 185 | text = text.group(0) 186 | 187 | if all([char.isupper() for char in text]): 188 | return "".join(upper_to_kor[char] for char in text) 189 | else: 190 | return text 191 | 192 | def normalize_quote(text): 193 | def fn(found_text): 194 | from nltk import sent_tokenize # NLTK doesn't along with multiprocessing 195 | 196 | found_text = found_text.group() 197 | unquoted_text = found_text[1:-1] 198 | 199 | sentences = sent_tokenize(unquoted_text) 200 | return " ".join(["'{}'".format(sent) for sent in sentences]) 201 | 202 | return re.sub(quote_checker, fn, text) 203 | 204 | number_checker = "([+-]?\d[\d,]*)[\.]?\d*" 205 | count_checker = "(시|명|가지|살|마리|포기|송이|수|톨|통|점|개|벌|척|채|다발|그루|자루|줄|켤레|그릇|잔|마디|상자|사람|곡|병|판)" 206 | 207 | def normalize_number(text): 208 | text = normalize_with_dictionary(text, unit_to_kor1) 209 | text = normalize_with_dictionary(text, unit_to_kor2) 210 | text = re.sub(number_checker + count_checker, 211 | lambda x: number_to_korean(x, True), text) 212 | text = re.sub(number_checker, 213 | lambda x: number_to_korean(x, False), text) 214 | return text 215 | 216 | num_to_kor1 = [""] + list("일이삼사오육칠팔구") 217 | num_to_kor2 = [""] + list("만억조경해") 218 | num_to_kor3 = [""] + list("십백천") 219 | 220 | #count_to_kor1 = [""] + ["하나","둘","셋","넷","다섯","여섯","일곱","여덟","아홉"] 221 | count_to_kor1 = [""] + ["한","두","세","네","다섯","여섯","일곱","여덟","아홉"] 222 | 223 | count_tenth_dict = { 224 | "십": "열", 225 | "두십": "스물", 226 | "세십": "서른", 227 | "네십": "마흔", 228 | "다섯십": "쉰", 229 | "여섯십": "예순", 230 | "일곱십": "일흔", 231 | "여덟십": "여든", 232 | "아홉십": "아흔", 233 | } 234 | 235 | 236 | 237 | def number_to_korean(num_str, is_count=False): 238 | if is_count: 239 | num_str, unit_str = num_str.group(1), num_str.group(2) 240 | else: 241 | num_str, unit_str = num_str.group(), "" 242 | 243 | num_str = num_str.replace(',', '') 244 | num = ast.literal_eval(num_str) 245 | 246 | if num == 0: 247 | return "영" 248 | 249 | check_float = num_str.split('.') 250 | if len(check_float) == 2: 251 | digit_str, float_str = check_float 252 | elif len(check_float) >= 3: 253 | raise Exception(" [!] Wrong number format") 254 | else: 255 | digit_str, float_str = check_float[0], None 256 | 257 | if is_count and float_str is not None: 258 | raise Exception(" [!] 
`is_count` and float number does not fit each other") 259 | 260 | digit = int(digit_str) 261 | 262 | if digit_str.startswith("-"): 263 | digit, digit_str = abs(digit), str(abs(digit)) 264 | 265 | kor = "" 266 | size = len(str(digit)) 267 | tmp = [] 268 | 269 | for i, v in enumerate(digit_str, start=1): 270 | v = int(v) 271 | 272 | if v != 0: 273 | if is_count: 274 | tmp += count_to_kor1[v] 275 | else: 276 | tmp += num_to_kor1[v] 277 | 278 | tmp += num_to_kor3[(size - i) % 4] 279 | 280 | if (size - i) % 4 == 0 and len(tmp) != 0: 281 | kor += "".join(tmp) 282 | tmp = [] 283 | kor += num_to_kor2[int((size - i) / 4)] 284 | 285 | if is_count: 286 | if kor.startswith("한") and len(kor) > 1: 287 | kor = kor[1:] 288 | 289 | if any(word in kor for word in count_tenth_dict): 290 | kor = re.sub( 291 | '|'.join(count_tenth_dict.keys()), 292 | lambda x: count_tenth_dict[x.group()], kor) 293 | 294 | if not is_count and kor.startswith("일") and len(kor) > 1: 295 | kor = kor[1:] 296 | 297 | if float_str is not None: 298 | kor += "쩜 " 299 | kor += re.sub('\d', lambda x: num_to_kor[x.group()], float_str) 300 | 301 | if num_str.startswith("+"): 302 | kor = "플러스 " + kor 303 | elif num_str.startswith("-"): 304 | kor = "마이너스 " + kor 305 | 306 | return kor + unit_str 307 | 308 | if __name__ == "__main__": 309 | def test_normalize(text): 310 | print(text) 311 | print(normalize(text)) 312 | print("="*30) 313 | 314 | test_normalize("JTBC는 JTBCs를 DY는 A가 Absolute") 315 | test_normalize("오늘(13일) 101마리 강아지가") 316 | test_normalize('"저돌"(猪突) 입니다.') 317 | test_normalize('비대위원장이 지난 1월 이런 말을 했습니다. “난 그냥 산돼지처럼 돌파하는 스타일이다”') 318 | test_normalize("지금은 -12.35%였고 종류는 5가지와 19가지, 그리고 55가지였다") 319 | test_normalize("JTBC는 TH와 K 양이 2017년 9월 12일 오후 12시에 24살이 된다") 320 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Defines the set of symbols used in text input to the model. 3 | 4 | The default is a set of ASCII characters that works well for English or text that has been run 5 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 6 | ''' 7 | from jamo import h2j, j2h 8 | from jamo.jamo import _jamo_char_to_hcj 9 | 10 | from .korean import ALL_SYMBOLS, PAD, EOS 11 | 12 | # For english 13 | en_symbols = PAD+EOS+'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? 
' #<-For deployment(Because korean ALL_SYMBOLS follow this convention) 14 | 15 | symbols = ALL_SYMBOLS # for korean 16 | 17 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import math 4 | import argparse 5 | import traceback 6 | import subprocess 7 | import numpy as np 8 | from jamo import h2j 9 | import tensorflow as tf 10 | from datetime import datetime 11 | from functools import partial 12 | 13 | from hparams import hparams, hparams_debug_string 14 | from models import create_model, get_most_recent_checkpoint 15 | 16 | from utils import ValueWindow, prepare_dirs 17 | from utils import infolog, warning, plot, load_hparams 18 | from utils import get_git_revision_hash, get_git_diff, str2bool, parallel_run 19 | 20 | from audio import save_audio, inv_spectrogram 21 | from text import sequence_to_text, text_to_sequence 22 | from datasets.datafeeder import DataFeeder, _prepare_inputs 23 | 24 | log = infolog.log 25 | 26 | 27 | def create_batch_inputs_from_texts(texts): 28 | sequences = [text_to_sequence(text) for text in texts] 29 | 30 | inputs = _prepare_inputs(sequences) 31 | input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32) 32 | 33 | for idx, (seq, text) in enumerate(zip(inputs, texts)): 34 | recovered_text = sequence_to_text(seq, skip_eos_and_pad=True) 35 | if recovered_text != h2j(text): 36 | log(" [{}] {}".format(idx, text)) 37 | log(" [{}] {}".format(idx, recovered_text)) 38 | log("="*30) 39 | 40 | return inputs, input_lengths 41 | 42 | 43 | def get_git_commit(): 44 | subprocess.check_output(['git', 'diff-index', '--quiet', 'HEAD']) # Verify client is clean 45 | commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()[:10] 46 | log('Git commit: %s' % commit) 47 | return commit 48 | 49 | 50 | def add_stats(model, model2=None, scope_name='train'): 51 | with tf.variable_scope(scope_name) as scope: 52 | summaries = [ 53 | tf.summary.scalar('loss_mel', model.mel_loss), 54 | tf.summary.scalar('loss_linear', model.linear_loss), 55 | tf.summary.scalar('loss', model.loss_without_coeff), 56 | ] 57 | 58 | if scope_name == 'train': 59 | gradient_norms = [tf.norm(grad) for grad in model.gradients if grad is not None] 60 | 61 | summaries.extend([ 62 | tf.summary.scalar('learning_rate', model.learning_rate), 63 | tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)), 64 | ]) 65 | 66 | if model2 is not None: 67 | with tf.variable_scope('gap_test-train') as scope: 68 | summaries.extend([ 69 | tf.summary.scalar('loss_mel', 70 | model.mel_loss - model2.mel_loss), 71 | tf.summary.scalar('loss_linear', 72 | model.linear_loss - model2.linear_loss), 73 | tf.summary.scalar('loss', 74 | model.loss_without_coeff - model2.loss_without_coeff), 75 | ]) 76 | 77 | return tf.summary.merge(summaries) 78 | 79 | 80 | def save_and_plot_fn(args, log_dir, step, loss, prefix): 81 | idx, (seq, spec, align) = args 82 | 83 | audio_path = os.path.join( 84 | log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx)) 85 | align_path = os.path.join( 86 | log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx)) 87 | 88 | waveform = inv_spectrogram(spec.T) 89 | save_audio(waveform, audio_path) 90 | 91 | info_text = 'step={:d}, loss={:.5f}'.format(step, loss) 92 | if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]: 93 | log('Training korean : Use jamo') 94 | 
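        # Korean runs recombine the jamo sequence into syllable blocks
        # (combine_jamo=True) so the alignment plot's x-axis labels stay
        # readable; the else-branch below plots the raw symbols instead.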
plot.plot_alignment( 95 | align, align_path, info=info_text, 96 | text=sequence_to_text(seq, 97 | skip_eos_and_pad=True, combine_jamo=True), isKorean=True) 98 | else: 99 | log('Training non-korean : X use jamo') 100 | plot.plot_alignment( 101 | align, align_path, info=info_text, 102 | text=sequence_to_text(seq, 103 | skip_eos_and_pad=True, combine_jamo=False), isKorean=False) 104 | 105 | def save_and_plot(sequences, spectrograms, 106 | alignments, log_dir, step, loss, prefix): 107 | 108 | fn = partial(save_and_plot_fn, 109 | log_dir=log_dir, step=step, loss=loss, prefix=prefix) 110 | items = list(enumerate(zip(sequences, spectrograms, alignments))) 111 | 112 | parallel_run(fn, items, parallel=False) 113 | log('Test finished for step {}.'.format(step)) 114 | 115 | 116 | def train(log_dir, config): 117 | config.data_paths = config.data_paths 118 | 119 | data_dirs = [os.path.join(data_path, "data") \ 120 | for data_path in config.data_paths] 121 | num_speakers = len(data_dirs) 122 | config.num_test = config.num_test_per_speaker * num_speakers 123 | 124 | if num_speakers > 1 and hparams.model_type not in ["deepvoice", "simple"]: 125 | raise Exception("[!] Unkown model_type for multi-speaker: {}".format(config.model_type)) 126 | 127 | commit = get_git_commit() if config.git else 'None' 128 | checkpoint_path = os.path.join(log_dir, 'model.ckpt') 129 | 130 | log(' [*] git recv-parse HEAD:\n%s' % get_git_revision_hash()) 131 | log('='*50) 132 | #log(' [*] dit diff:\n%s' % get_git_diff()) 133 | log('='*50) 134 | log(' [*] Checkpoint path: %s' % checkpoint_path) 135 | log(' [*] Loading training data from: %s' % data_dirs) 136 | log(' [*] Using model: %s' % config.model_dir) 137 | log(hparams_debug_string()) 138 | 139 | # Set up DataFeeder: 140 | coord = tf.train.Coordinator() 141 | with tf.variable_scope('datafeeder') as scope: 142 | train_feeder = DataFeeder( 143 | coord, data_dirs, hparams, config, 32, 144 | data_type='train', batch_size=hparams.batch_size) 145 | test_feeder = DataFeeder( 146 | coord, data_dirs, hparams, config, 8, 147 | data_type='test', batch_size=config.num_test) 148 | 149 | # Set up model: 150 | is_randomly_initialized = config.initialize_path is None 151 | global_step = tf.Variable(0, name='global_step', trainable=False) 152 | 153 | with tf.variable_scope('model') as scope: 154 | model = create_model(hparams) 155 | model.initialize( 156 | train_feeder.inputs, train_feeder.input_lengths, 157 | num_speakers, train_feeder.speaker_id, 158 | train_feeder.mel_targets, train_feeder.linear_targets, 159 | train_feeder.loss_coeff, 160 | is_randomly_initialized=is_randomly_initialized) 161 | 162 | model.add_loss() 163 | model.add_optimizer(global_step) 164 | train_stats = add_stats(model, scope_name='stats') # legacy 165 | 166 | with tf.variable_scope('model', reuse=True) as scope: 167 | test_model = create_model(hparams) 168 | test_model.initialize( 169 | test_feeder.inputs, test_feeder.input_lengths, 170 | num_speakers, test_feeder.speaker_id, 171 | test_feeder.mel_targets, test_feeder.linear_targets, 172 | test_feeder.loss_coeff, rnn_decoder_test_mode=True, 173 | is_randomly_initialized=is_randomly_initialized) 174 | test_model.add_loss() 175 | 176 | test_stats = add_stats(test_model, model, scope_name='test') 177 | test_stats = tf.summary.merge([test_stats, train_stats]) 178 | 179 | # Bookkeeping: 180 | step = 0 181 | time_window = ValueWindow(100) 182 | loss_window = ValueWindow(100) 183 | saver = tf.train.Saver(max_to_keep=None, keep_checkpoint_every_n_hours=2) 184 | 185 | 
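    # NOTE: sess_config below enables soft placement and gpu_options.allow_growth,
    # but the training session opened a few lines further down uses tf.Session()
    # with the configured variant commented out, so these options never take
    # effect. Re-enabling the commented line is presumably what was intended:
    #   with tf.Session(config=sess_config) as sess: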
sess_config = tf.ConfigProto( 186 | log_device_placement=False, 187 | allow_soft_placement=True) 188 | sess_config.gpu_options.allow_growth=True 189 | 190 | # Train! 191 | #with tf.Session(config=sess_config) as sess: 192 | with tf.Session() as sess: 193 | try: 194 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 195 | sess.run(tf.global_variables_initializer()) 196 | 197 | if config.load_path: 198 | # Restore from a checkpoint if the user requested it. 199 | restore_path = get_most_recent_checkpoint(config.model_dir) 200 | saver.restore(sess, restore_path) 201 | log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 202 | elif config.initialize_path: 203 | restore_path = get_most_recent_checkpoint(config.initialize_path) 204 | saver.restore(sess, restore_path) 205 | log('Initialized from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True) 206 | 207 | zero_step_assign = tf.assign(global_step, 0) 208 | sess.run(zero_step_assign) 209 | 210 | start_step = sess.run(global_step) 211 | log('='*50) 212 | log(' [*] Global step is reset to {}'. \ 213 | format(start_step)) 214 | log('='*50) 215 | else: 216 | log('Starting new training run at commit: %s' % commit, slack=True) 217 | 218 | start_step = sess.run(global_step) 219 | 220 | train_feeder.start_in_session(sess, start_step) 221 | test_feeder.start_in_session(sess, start_step) 222 | 223 | while not coord.should_stop(): 224 | start_time = time.time() 225 | step, loss, opt = sess.run( 226 | [global_step, model.loss_without_coeff, model.optimize], 227 | feed_dict=model.get_dummy_feed_dict()) 228 | 229 | time_window.append(time.time() - start_time) 230 | loss_window.append(loss) 231 | 232 | message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % ( 233 | step, time_window.average, loss, loss_window.average) 234 | log(message, slack=(step % config.checkpoint_interval == 0)) 235 | 236 | if loss > 100 or math.isnan(loss): 237 | log('Loss exploded to %.05f at step %d!' 
% (loss, step), slack=True) 238 | raise Exception('Loss Exploded') 239 | 240 | if step % config.summary_interval == 0: 241 | log('Writing summary at step: %d' % step) 242 | 243 | feed_dict = { 244 | **model.get_dummy_feed_dict(), 245 | **test_model.get_dummy_feed_dict() 246 | } 247 | summary_writer.add_summary(sess.run( 248 | test_stats, feed_dict=feed_dict), step) 249 | 250 | if step % config.checkpoint_interval == 0: 251 | log('Saving checkpoint to: %s-%d' % (checkpoint_path, step)) 252 | saver.save(sess, checkpoint_path, global_step=step) 253 | 254 | if step % config.test_interval == 0: 255 | log('Saving audio and alignment...') 256 | num_test = config.num_test 257 | 258 | fetches = [ 259 | model.inputs[:num_test], 260 | model.linear_outputs[:num_test], 261 | model.alignments[:num_test], 262 | test_model.inputs[:num_test], 263 | test_model.linear_outputs[:num_test], 264 | test_model.alignments[:num_test], 265 | ] 266 | feed_dict = { 267 | **model.get_dummy_feed_dict(), 268 | **test_model.get_dummy_feed_dict() 269 | } 270 | 271 | sequences, spectrograms, alignments, \ 272 | test_sequences, test_spectrograms, test_alignments = \ 273 | sess.run(fetches, feed_dict=feed_dict) 274 | 275 | save_and_plot(sequences[:1], spectrograms[:1], alignments[:1], 276 | log_dir, step, loss, "train") 277 | save_and_plot(test_sequences, test_spectrograms, test_alignments, 278 | log_dir, step, loss, "test") 279 | 280 | except Exception as e: 281 | log('Exiting due to exception: %s' % e, slack=True) 282 | traceback.print_exc() 283 | coord.request_stop(e) 284 | 285 | 286 | def main(): 287 | parser = argparse.ArgumentParser() 288 | 289 | parser.add_argument('--log_dir', default='logs') 290 | parser.add_argument('--data_paths', default='datasets/kr_example') 291 | parser.add_argument('--load_path', default=None) 292 | parser.add_argument('--initialize_path', default=None) 293 | 294 | parser.add_argument('--num_test_per_speaker', type=int, default=2) 295 | parser.add_argument('--random_seed', type=int, default=123) 296 | parser.add_argument('--summary_interval', type=int, default=100) 297 | parser.add_argument('--test_interval', type=int, default=500) 298 | parser.add_argument('--checkpoint_interval', type=int, default=1000) 299 | parser.add_argument('--skip_path_filter', 300 | type=str2bool, default=False, help='Use only for debugging') 301 | 302 | parser.add_argument('--slack_url', 303 | help='Slack webhook URL to get periodic reports.') 304 | parser.add_argument('--git', action='store_true', 305 | help='If set, verify that the client is clean.') 306 | 307 | config = parser.parse_args() 308 | config.data_paths = config.data_paths.split(",") 309 | setattr(hparams, "num_speakers", len(config.data_paths)) 310 | 311 | prepare_dirs(config, hparams) 312 | 313 | log_path = os.path.join(config.model_dir, 'train.log') 314 | infolog.init(log_path, config.model_dir, config.slack_url) 315 | 316 | tf.set_random_seed(config.random_seed) 317 | print(config.data_paths) 318 | 319 | if any("krbook" not in data_path for data_path in config.data_paths) and \ 320 | hparams.sample_rate != 20000: 321 | warning("Detect non-krbook dataset. May need to set sampling rate from {} to 20000".\ 322 | format(hparams.sample_rate)) 323 | 324 | if any('LJ' in data_path for data_path in config.data_paths) and \ 325 | hparams.sample_rate != 22050: 326 | warning("Detect LJ Speech dataset. 
Set sampling rate from {} to 22050".\ 327 | format(hparams.sample_rate)) 328 | 329 | if config.load_path is not None and config.initialize_path is not None: 330 | raise Exception(" [!] Only one of load_path and initialize_path should be set") 331 | 332 | train(config.model_dir, config) 333 | 334 | 335 | if __name__ == '__main__': 336 | main() 337 | -------------------------------------------------------------------------------- /utils/NanumBarunGothic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melonicedlatte/multi-speaker-tacotron-tensorflow/b98628b467e0e577541d48d344dc3347d99c8392/utils/NanumBarunGothic.ttf -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import json 5 | import requests 6 | import subprocess 7 | from tqdm import tqdm 8 | from contextlib import closing 9 | from multiprocessing import Pool 10 | from collections import namedtuple 11 | from datetime import datetime, timedelta 12 | from shutil import copyfile as copy_file 13 | 14 | PARAMS_NAME = "params.json" 15 | 16 | class ValueWindow(): 17 | def __init__(self, window_size=100): 18 | self._window_size = window_size 19 | self._values = [] 20 | 21 | def append(self, x): 22 | self._values = self._values[-(self._window_size - 1):] + [x] 23 | 24 | @property 25 | def sum(self): 26 | return sum(self._values) 27 | 28 | @property 29 | def count(self): 30 | return len(self._values) 31 | 32 | @property 33 | def average(self): 34 | return self.sum / max(1, self.count) 35 | 36 | def reset(self): 37 | self._values = [] 38 | 39 | def prepare_dirs(config, hparams): 40 | if hasattr(config, "data_paths"): 41 | config.datasets = [ 42 | os.path.basename(data_path) for data_path in config.data_paths] 43 | dataset_desc = "+".join(config.datasets) 44 | 45 | if config.load_path: 46 | config.model_dir = config.load_path 47 | else: 48 | config.model_name = "{}_{}".format(dataset_desc, get_time()) 49 | config.model_dir = os.path.join(config.log_dir, config.model_name) 50 | 51 | for path in [config.log_dir, config.model_dir]: 52 | if not os.path.exists(path): 53 | os.makedirs(path) 54 | 55 | if config.load_path: 56 | load_hparams(hparams, config.model_dir) 57 | else: 58 | setattr(hparams, "num_speakers", len(config.datasets)) 59 | 60 | save_hparams(config.model_dir, hparams) 61 | copy_file("hparams.py", os.path.join(config.model_dir, "hparams.py")) 62 | 63 | def makedirs(path): 64 | if not os.path.exists(path): 65 | print(" [*] Make directories : {}".format(path)) 66 | os.makedirs(path) 67 | 68 | def remove_file(path): 69 | if os.path.exists(path): 70 | print(" [*] Removed: {}".format(path)) 71 | os.remove(path) 72 | 73 | def backup_file(path): 74 | root, ext = os.path.splitext(path) 75 | new_path = "{}.backup_{}{}".format(root, get_time(), ext) 76 | 77 | os.rename(path, new_path) 78 | print(" [*] {} has backup: {}".format(path, new_path)) 79 | 80 | def get_time(): 81 | return datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 82 | 83 | def write_json(path, data): 84 | with open(path, 'w',encoding='utf-8') as f: 85 | json.dump(data, f, indent=4, sort_keys=True, ensure_ascii=False) 86 | 87 | def load_json(path, as_class=False, encoding='euc-kr'): 88 | with open(path,encoding=encoding) as f: 89 | content = f.read() 90 | content = re.sub(",\s*}", "}", content) 91 | content = re.sub(",\s*]", "]", content) 92 | 
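    # The two substitutions above strip trailing commas (e.g. '{"a": 1,}' ->
    # '{"a": 1}') so JSON files that are not strictly valid still parse with
    # json.loads below.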
93 | if as_class: 94 | data = json.loads(content, object_hook=\ 95 | lambda data: namedtuple('Data', data.keys())(*data.values())) 96 | else: 97 | data = json.loads(content) 98 | #print(data) 99 | return data 100 | 101 | def save_hparams(model_dir, hparams): 102 | param_path = os.path.join(model_dir, PARAMS_NAME) 103 | 104 | info = eval(hparams.to_json(). \ 105 | replace('true', 'True').replace('false', 'False')) 106 | write_json(param_path, info) 107 | 108 | print(" [*] MODEL dir: {}".format(model_dir)) 109 | print(" [*] PARAM path: {}".format(param_path)) 110 | 111 | def load_hparams(hparams, load_path, skip_list=[]): 112 | path = os.path.join(load_path, PARAMS_NAME) 113 | 114 | new_hparams = load_json(path) 115 | hparams_keys = vars(hparams).keys() 116 | 117 | for key, value in new_hparams.items(): 118 | if key in skip_list or key not in hparams_keys: 119 | print("Skip {} because it not exists".format(key)) 120 | continue 121 | 122 | if key not in ['job_name', 'num_workers', 'display', 'is_train', 'load_path'] or \ 123 | key == "pointer_load_path": 124 | original_value = getattr(hparams, key) 125 | if original_value != value: 126 | print("UPDATE {}: {} -> {}".format(key, getattr(hparams, key), value)) 127 | setattr(hparams, key, value) 128 | 129 | def add_prefix(path, prefix): 130 | dir_path, filename = os.path.dirname(path), os.path.basename(path) 131 | return "{}/{}.{}".format(dir_path, prefix, filename) 132 | 133 | def add_postfix(path, postfix): 134 | path_without_ext, ext = path.rsplit('.', 1) 135 | return "{}.{}.{}".format(path_without_ext, postfix, ext) 136 | 137 | def remove_postfix(path): 138 | items = path.rsplit('.', 2) 139 | return items[0] + "." + items[2] 140 | 141 | def parallel_run(fn, items, desc="", parallel=True): 142 | results = [] 143 | 144 | if parallel: 145 | with closing(Pool()) as pool: 146 | for out in tqdm(pool.imap_unordered( 147 | fn, items), total=len(items), desc=desc): 148 | if out is not None: 149 | results.append(out) 150 | else: 151 | for item in tqdm(items, total=len(items), desc=desc): 152 | out = fn(item) 153 | if out is not None: 154 | results.append(out) 155 | 156 | return results 157 | 158 | def which(program): 159 | if os.name == "nt" and not program.endswith(".exe"): 160 | program += ".exe" 161 | 162 | envdir_list = [os.curdir] + os.environ["PATH"].split(os.pathsep) 163 | 164 | for envdir in envdir_list: 165 | program_path = os.path.join(envdir, program) 166 | if os.path.isfile(program_path) and os.access(program_path, os.X_OK): 167 | return program_path 168 | 169 | def get_encoder_name(): 170 | if which("avconv"): 171 | return "avconv" 172 | elif which("ffmpeg"): 173 | return "ffmpeg" 174 | else: 175 | return "ffmpeg" 176 | 177 | def download_with_url(url, dest_path, chunk_size=32*1024): 178 | with open(dest_path, "wb") as f: 179 | response = requests.get(url, stream=True) 180 | total_size = int(response.headers.get('content-length', 0)) 181 | 182 | for chunk in response.iter_content(chunk_size): 183 | if chunk: # filter out keep-alive new chunks 184 | f.write(chunk) 185 | return True 186 | 187 | def str2bool(v): 188 | return v.lower() in ('true', '1') 189 | 190 | def get_git_revision_hash(): 191 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode("utf-8") 192 | 193 | def get_git_diff(): 194 | return subprocess.check_output(['git', 'diff']).decode("utf-8") 195 | 196 | def warning(msg): 197 | print("="*40) 198 | print(" [!] 
{}".format(msg)) 199 | print("="*40) 200 | print() 201 | 202 | def query_yes_no(question, default=None): 203 | # Code from https://stackoverflow.com/a/3041990 204 | valid = {"yes": True, "y": True, "ye": True, 205 | "no": False, "n": False} 206 | if default is None: 207 | prompt = " [y/n] " 208 | elif default == "yes": 209 | prompt = " [Y/n] " 210 | elif default == "no": 211 | prompt = " [y/N] " 212 | else: 213 | raise ValueError("invalid default answer: '%s'" % default) 214 | 215 | while True: 216 | sys.stdout.write(question + prompt) 217 | choice = input().lower() 218 | if default is not None and choice == '': 219 | return valid[default] 220 | elif choice in valid: 221 | return valid[choice] 222 | else: 223 | sys.stdout.write("Please respond with 'yes' or 'no' " 224 | "(or 'y' or 'n').\n") 225 | -------------------------------------------------------------------------------- /utils/infolog.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime 3 | import json 4 | from threading import Thread 5 | from urllib.request import Request, urlopen 6 | 7 | 8 | _format = '%Y-%m-%d %H:%M:%S.%f' 9 | _file = None 10 | _run_name = None 11 | _slack_url = None 12 | 13 | 14 | def init(filename, run_name, slack_url=None): 15 | global _file, _run_name, _slack_url 16 | _close_logfile() 17 | _file = open(filename, 'a') 18 | _file.write('\n-----------------------------------------------------------------\n') 19 | _file.write('Starting new training run\n') 20 | _file.write('-----------------------------------------------------------------\n') 21 | _run_name = run_name 22 | _slack_url = slack_url 23 | 24 | 25 | def log(msg, slack=False): 26 | print(msg) 27 | if _file is not None: 28 | _file.write('[%s] %s\n' % (datetime.now().strftime(_format)[:-3], msg)) 29 | if slack and _slack_url is not None: 30 | Thread(target=_send_slack, args=(msg,)).start() 31 | 32 | 33 | def _close_logfile(): 34 | global _file 35 | if _file is not None: 36 | _file.close() 37 | _file = None 38 | 39 | 40 | def _send_slack(msg): 41 | req = Request(_slack_url) 42 | req.add_header('Content-Type', 'application/json') 43 | urlopen(req, json.dumps({ 44 | 'username': 'tacotron', 45 | 'icon_emoji': ':taco:', 46 | 'text': '*%s*: %s' % (_run_name, msg) 47 | }).encode()) 48 | 49 | 50 | atexit.register(_close_logfile) 51 | -------------------------------------------------------------------------------- /utils/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib 3 | from jamo import h2j, j2hcj 4 | 5 | matplotlib.use('Agg') 6 | matplotlib.rc('font', family="NanumBarunGothic") 7 | import matplotlib.pyplot as plt 8 | 9 | from text import PAD, EOS 10 | from utils import add_postfix 11 | from text.korean import normalize 12 | 13 | def plot(alignment, info, text, isKorean=True): 14 | char_len, audio_len = alignment.shape # 145, 200 15 | 16 | fig, ax = plt.subplots(figsize=(char_len/5, 5)) 17 | im = ax.imshow( 18 | alignment.T, 19 | aspect='auto', 20 | origin='lower', 21 | interpolation='none') 22 | 23 | xlabel = 'Encoder timestep' 24 | ylabel = 'Decoder timestep' 25 | 26 | if info is not None: 27 | xlabel += '\n{}'.format(info) 28 | 29 | plt.xlabel(xlabel) 30 | plt.ylabel(ylabel) 31 | 32 | if text: 33 | if isKorean: 34 | jamo_text = j2hcj(h2j(normalize(text))) 35 | else: 36 | jamo_text=text 37 | pad = [PAD] * (char_len - len(jamo_text) - 1) 38 | 39 | plt.xticks(range(char_len), 40 | [tok for tok in 
jamo_text] + [EOS] + pad) 41 | 42 | if text is not None: 43 | while True: 44 | if text[-1] in [EOS, PAD]: 45 | text = text[:-1] 46 | else: 47 | break 48 | plt.title(text) 49 | 50 | plt.tight_layout() 51 | 52 | def plot_alignment( 53 | alignment, path, info=None, text=None, isKorean=True): 54 | 55 | if text: 56 | tmp_alignment = alignment[:len(h2j(text)) + 2] 57 | 58 | plot(tmp_alignment, info, text, isKorean) 59 | plt.savefig(path, format='png') 60 | else: 61 | plot(alignment, info, text, isKorean) 62 | plt.savefig(path, format='png') 63 | 64 | print(" [*] Plot saved: {}".format(path)) 65 | -------------------------------------------------------------------------------- /web/static/css/main.css: -------------------------------------------------------------------------------- 1 | @media screen and (min-width: 1452px) { 2 | .container { 3 | max-width: 1152px; 4 | width: 1152px; 5 | } 6 | } 7 | @media screen and (min-width: 1260px) { 8 | .container { 9 | max-width: 960px; 10 | width: 960px; 11 | } 12 | } 13 | @media screen and (min-width: 1068px) { 14 | .container { 15 | max-width: 768px; 16 | width: 768px; 17 | } 18 | } 19 | 20 | .container { 21 | margin: 0 auto; 22 | position: relative; 23 | } 24 | 25 | #wave { 26 | height: 100px; 27 | } 28 | 29 | #waveform { 30 | display: none; 31 | } 32 | 33 | #nav { 34 | position: fixed !important; 35 | top: 0; 36 | left: 0; 37 | right: 0; 38 | z-index: 100; 39 | } 40 | 41 | .card { 42 | padding: 0; 43 | } 44 | 45 | .columns { 46 | margin-left: 0rem; 47 | margin-right: 0rem; 48 | margin-top: 0rem; 49 | } 50 | 51 | #text { 52 | font-size: 1.2em; 53 | padding: 0.7em 1em 0.7em 1em; 54 | background: transparent; 55 | color: white; 56 | } 57 | 58 | .dark { 59 | background-color: black; 60 | } 61 | -------------------------------------------------------------------------------- /web/static/js/main.js: -------------------------------------------------------------------------------- 1 | var sw; 2 | var wavesurfer; 3 | 4 | var defaultSpeed = 0.03; 5 | var defaultAmplitude = 0.3; 6 | 7 | var activeColors = [[32,133,252], [94,252,169], [253,71,103]]; 8 | var inactiveColors = [[241,243,245], [206,212,218], [222,226,230], [173,181,189]]; 9 | 10 | function generate(ip, port, text, speaker_id) { 11 | $("#synthesize").addClass("is-loading"); 12 | 13 | var uri = 'http://' + ip + ':' + port 14 | var url = uri + '/generate?text=' + encodeURIComponent(text) + "&speaker_id=" + speaker_id; 15 | 16 | fetch(url, {cache: 'no-cache', mode: 'cors'}) 17 | .then(function(res) { 18 | if (!res.ok) throw Error(response.statusText) 19 | return res.blob() 20 | }).then(function(blob) { 21 | var url = URL.createObjectURL(blob); 22 | console.log(url); 23 | inProgress = false; 24 | wavesurfer.load(url); 25 | $("#synthesize").removeClass("is-loading"); 26 | }).catch(function(err) { 27 | showWarning("에러가 발생했습니다"); 28 | inProgress = false; 29 | $("#synthesize").removeClass("is-loading"); 30 | }); 31 | } 32 | 33 | (function(window, document, undefined){ 34 | window.onload = init; 35 | 36 | function setDefaultColor(sw, isActive) { 37 | for (idx=0; idx < sw.curves.length; idx++) { 38 | var curve = sw.curves[idx]; 39 | 40 | if (isActive) { 41 | curve.color = activeColors[idx % activeColors.length]; 42 | } else { 43 | curve.color = inactiveColors[idx % inactiveColors.length]; 44 | } 45 | } 46 | } 47 | 48 | function init(){ 49 | sw = new SiriWave9({ 50 | amplitude: defaultAmplitude, 51 | container: document.getElementById('wave'), 52 | autostart: true, 53 | speed: defaultSpeed, 54 | style: 'ios9', 55 
| }); 56 | sw.setSpeed(defaultSpeed); 57 | setDefaultColor(sw, false); 58 | 59 | wavesurfer = WaveSurfer.create({ 60 | container: '#waveform', 61 | waveColor: 'violet', 62 | barWidth: 3, 63 | progressColor: 'purple' 64 | }); 65 | 66 | wavesurfer.on('ready', function () { 67 | this.width = wavesurfer.getDuration() * 68 | wavesurfer.params.minPxPerSec * wavesurfer.params.pixelRatio; 69 | this.peaks = wavesurfer.backend.getPeaks(width); 70 | 71 | wavesurfer.play(); 72 | }); 73 | 74 | wavesurfer.on('audioprocess', function () { 75 | var percent = wavesurfer.backend.getPlayedPercents(); 76 | var height = this.peaks[parseInt(this.peaks.length * percent)]; 77 | if (height > 0) { 78 | sw.setAmplitude(height*3); 79 | } 80 | }); 81 | 82 | wavesurfer.on('finish', function () { 83 | sw.setSpeed(defaultSpeed); 84 | sw.setAmplitude(defaultAmplitude); 85 | setDefaultColor(sw, false); 86 | }); 87 | 88 | $(document).on('click', "#synthesize", function() { 89 | synthesize(); 90 | }); 91 | 92 | function synthesize() { 93 | var text = $("#text").val().trim(); 94 | var text_length = text.length; 95 | 96 | var speaker_id = $('input[name=id]:checked').val(); 97 | var speaker = $('input[name=id]:checked').attr("speaker"); 98 | 99 | generate('0.0.0.0', 51000, text, speaker_id); 100 | 101 | var lowpass = wavesurfer.backend.ac.createGain(); 102 | wavesurfer.backend.setFilter(lowpass); 103 | } 104 | } 105 | })(window, document, undefined); 106 | -------------------------------------------------------------------------------- /web/static/js/siriwave.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 3 | //////////////////// 4 | // SiriWave9Curve // 5 | //////////////////// 6 | 7 | function SiriWave9Curve(opt) { 8 | opt = opt || {}; 9 | this.controller = opt.controller; 10 | this.color = opt.color; 11 | this.tick = 0; 12 | 13 | this.respawn(); 14 | } 15 | 16 | SiriWave9Curve.prototype.respawn = function() { 17 | this.amplitude = 0.3 + Math.random() * 0.7; 18 | this.seed = Math.random(); 19 | this.open_class = 2+(Math.random()*3)|0; 20 | }; 21 | 22 | SiriWave9Curve.prototype.equation = function(i) { 23 | var p = this.tick; 24 | var y = -1 * Math.abs(Math.sin(p)) * this.controller.amplitude * this.amplitude * this.controller.MAX * Math.pow(1/(1+Math.pow(this.open_class*i,2)),2); 25 | if (Math.abs(y) < 0.001) { 26 | this.respawn(); 27 | } 28 | return y; 29 | }; 30 | 31 | SiriWave9Curve.prototype._draw = function(m) { 32 | this.tick += this.controller.speed * (1-0.5*Math.sin(this.seed*Math.PI)); 33 | 34 | var ctx = this.controller.ctx; 35 | ctx.beginPath(); 36 | 37 | var x_base = this.controller.width/2 + (-this.controller.width/4 + this.seed*(this.controller.width/2) ); 38 | var y_base = this.controller.height/2; 39 | 40 | var x, y, x_init; 41 | 42 | var i = -3; 43 | while (i <= 3) { 44 | x = x_base + i * this.controller.width/4; 45 | y = y_base + (m * this.equation(i)); 46 | x_init = x_init || x; 47 | ctx.lineTo(x, y); 48 | i += 0.01; 49 | } 50 | 51 | var h = Math.abs(this.equation(0)); 52 | var gradient = ctx.createRadialGradient(x_base, y_base, h*1.15, x_base, y_base, h * 0.3 ); 53 | gradient.addColorStop(0, 'rgba(' + this.color.join(',') + ',0.4)'); 54 | gradient.addColorStop(1, 'rgba(' + this.color.join(',') + ',0.2)'); 55 | 56 | ctx.fillStyle = gradient; 57 | 58 | ctx.lineTo(x_init, y_base); 59 | ctx.closePath(); 60 | 61 | ctx.fill(); 62 | }; 63 | 64 | SiriWave9Curve.prototype.draw = function() { 65 | this._draw(-1); 66 | this._draw(1); 67 | }; 68 | 69 | 70 | 
////////////// 71 | // SiriWave // 72 | ////////////// 73 | 74 | function SiriWave9(opt) { 75 | opt = opt || {}; 76 | 77 | this.tick = 0; 78 | this.run = false; 79 | 80 | // UI vars 81 | 82 | this.ratio = opt.ratio || window.devicePixelRatio || 1; 83 | 84 | this.width = this.ratio * (opt.width || 320); 85 | this.height = this.ratio * (opt.height || 100); 86 | this.MAX = this.height/2; 87 | 88 | this.speed = 0.1; 89 | this.amplitude = opt.amplitude || 1; 90 | 91 | // Interpolation 92 | 93 | this.speedInterpolationSpeed = opt.speedInterpolationSpeed || 0.005; 94 | this.amplitudeInterpolationSpeed = opt.amplitudeInterpolationSpeed || 0.05; 95 | 96 | this._interpolation = { 97 | speed: this.speed, 98 | amplitude: this.amplitude 99 | }; 100 | 101 | // Canvas 102 | 103 | this.canvas = document.createElement('canvas'); 104 | this.canvas.width = this.width; 105 | this.canvas.height = this.height; 106 | 107 | if (opt.cover) { 108 | this.canvas.style.width = this.canvas.style.height = '100%'; 109 | } else { 110 | this.canvas.style.width = (this.width / this.ratio) + 'px'; 111 | this.canvas.style.height = (this.height / this.ratio) + 'px'; 112 | } 113 | 114 | this.container = opt.container || document.body; 115 | this.container.appendChild(this.canvas); 116 | 117 | this.ctx = this.canvas.getContext('2d'); 118 | 119 | // Create curves 120 | 121 | this.curves = []; 122 | for (var i = 0; i < SiriWave9.prototype.COLORS.length; i++) { 123 | var color = SiriWave9.prototype.COLORS[i]; 124 | for (var j = 0; j < (3 * Math.random())|0; j++) { 125 | this.curves.push(new SiriWave9Curve({ 126 | controller: this, 127 | color: color 128 | })); 129 | } 130 | } 131 | 132 | if (opt.autostart) { 133 | this.start(); 134 | } 135 | } 136 | 137 | SiriWave9.prototype._interpolate = function(propertyStr) { 138 | increment = this[ propertyStr + 'InterpolationSpeed' ]; 139 | 140 | if (Math.abs(this._interpolation[propertyStr] - this[propertyStr]) <= increment) { 141 | this[propertyStr] = this._interpolation[propertyStr]; 142 | } else { 143 | if (this._interpolation[propertyStr] > this[propertyStr]) { 144 | this[propertyStr] += increment; 145 | } else { 146 | this[propertyStr] -= increment; 147 | } 148 | } 149 | }; 150 | 151 | SiriWave9.prototype._clear = function() { 152 | this.ctx.globalCompositeOperation = 'destination-out'; 153 | this.ctx.fillRect(0, 0, this.width, this.height); 154 | this.ctx.globalCompositeOperation = 'lighter'; 155 | }; 156 | 157 | SiriWave9.prototype._draw = function() { 158 | for (var i = 0, len = this.curves.length; i < len; i++) { 159 | this.curves[i].draw(); 160 | } 161 | }; 162 | 163 | SiriWave9.prototype._startDrawCycle = function() { 164 | if (this.run === false) return; 165 | this._clear(); 166 | 167 | // Interpolate values 168 | this._interpolate('amplitude'); 169 | this._interpolate('speed'); 170 | 171 | this._draw(); 172 | this.phase = (this.phase + Math.PI*this.speed) % (2*Math.PI); 173 | 174 | if (window.requestAnimationFrame) { 175 | window.requestAnimationFrame(this._startDrawCycle.bind(this)); 176 | } else { 177 | setTimeout(this._startDrawCycle.bind(this), 20); 178 | } 179 | }; 180 | 181 | SiriWave9.prototype.start = function() { 182 | this.tick = 0; 183 | this.run = true; 184 | this._startDrawCycle(); 185 | }; 186 | 187 | SiriWave9.prototype.stop = function() { 188 | this.tick = 0; 189 | this.run = false; 190 | }; 191 | 192 | SiriWave9.prototype.setSpeed = function(v, increment) { 193 | this._interpolation.speed = v; 194 | }; 195 | 196 | SiriWave9.prototype.setNoise = 
SiriWave9.prototype.setAmplitude = function(v) { 197 | this._interpolation.amplitude = Math.max(Math.min(v, 1), 0); 198 | }; 199 | 200 | SiriWave9.prototype.COLORS = [ 201 | [32,133,252], 202 | [94,252,169], 203 | [253,71,103] 204 | ]; 205 | 206 | if (typeof define === 'function' && define.amd) { 207 | define(function(){ return SiriWave9; }); 208 | } else { 209 | window.SiriWave9 = SiriWave9; 210 | } 211 | 212 | })(); 213 |
-------------------------------------------------------------------------------- /web/templates/index.html: --------------------------------------------------------------------------------
(The markup of this template did not survive the conversion of the repository to plain text; only the page title "D.Voice" and an empty line-number gutter remain. A hypothetical reconstruction, inferred from the selectors that web/static/js/main.js expects, is sketched below.)
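A minimal sketch of a compatible page body follows. The element ids (#wave, #waveform, #text, #synthesize) and the name="id" radio inputs with a speaker attribute are taken from main.js and main.css; everything else (layout classes, speaker labels, script URLs, the choice of <input> over <textarea>) is an assumption and is not recovered from the original file.

<!DOCTYPE html>
<html lang="ko">
<head>
  <meta charset="utf-8">
  <title>D.Voice</title>
  <!-- assumed path; the repo ships web/static/css/main.css -->
  <link rel="stylesheet" href="/static/css/main.css">
</head>
<body class="dark">
  <div class="container">
    <!-- visualiser target: main.js passes document.getElementById('wave') to SiriWave9 -->
    <div id="wave"></div>
    <!-- WaveSurfer container; hidden via #waveform { display: none; } in main.css -->
    <div id="waveform"></div>

    <!-- text to synthesize; main.js reads $("#text").val() -->
    <input id="text" type="text" placeholder="Type text to synthesize">

    <!-- speaker selection; main.js reads $('input[name=id]:checked').val() and .attr("speaker") -->
    <!-- speaker values/labels below are placeholders, not the original ones -->
    <label><input type="radio" name="id" value="0" speaker="speaker_0" checked> Speaker 0</label>
    <label><input type="radio" name="id" value="1" speaker="speaker_1"> Speaker 1</label>

    <!-- main.js binds a click handler to #synthesize and toggles its is-loading class -->
    <button id="synthesize" class="button">Synthesize</button>
  </div>

  <!-- script order is an assumption; main.js depends on jQuery, wavesurfer.js and siriwave.js -->
  <script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
  <script src="https://unpkg.com/wavesurfer.js"></script>
  <script src="/static/js/siriwave.js"></script>
  <script src="/static/js/main.js"></script>
</body>
</html>

Note that main.js also calls a showWarning() helper that is not defined in the static assets shown above, so the real template presumably defines it (or an equivalent) inline.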
73 | 74 | 75 | -------------------------------------------------------------------------------- /김앵커한마디_음성받아오기.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 17, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import os\n", 11 | "import sys\n", 12 | "import m3u8\n", 13 | "import json\n", 14 | "import requests\n", 15 | "import subprocess\n", 16 | "from functools import partial\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "from nltk import sent_tokenize \n", 19 | "import ast\n", 20 | "from utils import get_encoder_name, parallel_run, makedirs" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 18, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "kim_movie_info_list = []\n", 30 | "\n", 31 | "with open('./datasets/kim_anchor/kim_anchor_data_info.json', 'r') as myfile:\n", 32 | " data = myfile.read()\n", 33 | " kim_movie_info_list = ast.literal_eval(data)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 19, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "{'CONTENT': '오늘(3일)의 한마디는 괜찮은 항공사는 없습니까입니다.아시아나 항공 기내식 대란이 사흘째 이어지고 있습니다. 기내식을 싣지 못한 국제선 여객기 지연은 다반사. 그냥 출발하는 항공기들도 속출합니다. 승무원들은 거센 항의를 받으며 자신들은 굶고 라면을 끓여 승객들에게 내놓고, 수백명 목숨을 책임진 기장도 라면과 음료수 하나로 버틴다고 합니다. 이 와중에 기내식 관련 하청업체 대표는 자살했습니다.아시아나는 지난 15년간 기내식을 공급하던 LSG에게 올해 초 6개월 뒤 계약을 종료하겠다고 밝혔습니다. 금호홀딩스 투자를 둘러싼 갈등이 있었다고 합니다. 하지만 새로 선정한 업체에서 석 달 전 불이 나자 아시아나는 하청 업체 하나를 선정하고 대책을 마무리 했습니다. 하루 생산능력 3000명 분인 업체가 2만 5000명분 이상의 기내식을 공급하려다 보니 바로 사고가 터진 것입니다.대한항공 오너 일가의 갑질도 개탄스러운데 아시아나까지 이러니 참 실망스럽습니다. 도대체 대한민국에서 신뢰할 만한 항공사 하나를 찾기가 왜 이리 힘든 것입니까. 이게 다 독과점의 폐해 아닙니까. 뉴스현장 은 여기까지입니다. 
저희는 내일 다시 찾아뵙겠습니다.',\n", 45 | " 'SD_URL': 'http://jtbcvod.fvod.skcdn.com/newsvod/694404E4DE8EC7CB67F11A3108BFDD615EAB00176C1E9061534E5CB22FC65CBE45E9B49C57254C3A12BA8FFDC647CBFBC94AC6A51F864AA7F25918759417C651/playlist.m3u8',\n", 46 | " 'NEWS_URL': 'http://news.jtbc.joins.com/article/article.aspx?news_id=NB11659024',\n", 47 | " 'MOVIE_ID': 'NV10214115'}" 48 | ] 49 | }, 50 | "execution_count": 19, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "kim_movie_info_list[0]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 20, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "1296" 68 | ] 69 | }, 70 | "execution_count": 20, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "len(kim_movie_info_list)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 25, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "ffmpeg\n", 89 | "./test_encoder -y -loglevel panic -i ./test_video -ab 160k -ac 2 -ar 44100 -vn ./test_audio\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "if not os.path.exists('audio_path'):\n", 95 | " encoder = get_encoder_name()\n", 96 | " print(encoder)\n", 97 | " command = \"{} -y -loglevel panic -i {} -ab 160k -ac 2 -ar 44100 -vn {}\".\\\n", 98 | " format('./test_encoder', './test_video', './test_audio')\n", 99 | " print (command)\n", 100 | " subprocess.call(command, shell=True)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | --------------------------------------------------------------------------------