├── fig ├── proposed_model.png └── with_tsukuyomi_chan.png ├── monotonic_align ├── setup.py ├── __init__.py └── core.pyx ├── requirements.txt ├── convert_to_22050.py ├── filelists ├── filelist_val2.txt.cleaned ├── vctk_audio_sid_text_val_filelist.txt ├── vctk_audio_sid_text_val_filelist.txt.cleaned ├── ljs_audio_text_val_filelist.txt ├── filelist_train2.txt.cleaned └── ljs_audio_text_val_filelist.txt.cleaned ├── text ├── symbols.py ├── LICENSE ├── __init__.py ├── py2kn.json ├── japanese.py ├── cleaners.py └── korean.py ├── preprocess.py ├── configs ├── tsukuyomi_chan.json ├── ljs_istft_vits.json ├── ljs_mb_istft_vits.json ├── ljs_mini_istft_vits.json ├── ljs_mini_mb_istft_vits.json └── ljs_ms_istft_vits.json ├── losses.py ├── inference.ipynb ├── README.md ├── mel_processing.py ├── pqmf.py ├── stft_loss.py ├── commons.py ├── utils.py ├── transforms.py ├── stft.py ├── LICENSE ├── train_latest.py ├── train_latest_ms.py ├── attentions.py └── modules.py /fig/proposed_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/proposed_model.png -------------------------------------------------------------------------------- /fig/with_tsukuyomi_chan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/with_tsukuyomi_chan.png -------------------------------------------------------------------------------- /monotonic_align/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | import numpy 4 | 5 | setup( 6 | name = 'monotonic_align', 7 | ext_modules = cythonize("core.pyx"), 8 | include_dirs=[numpy.get_include()] 9 | ) 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.29.21 2 | librosa==0.8.0 3 | matplotlib==3.3.1 4 | numpy==1.18.5 5 | phonemizer==2.2.1 6 | scipy==1.5.2 7 | tensorboard==2.3.0 8 | torch==1.6.0 9 | torchvision==0.7.0 10 | Unidecode==1.1.1 11 | pysoundfile==0.9.0.post1 12 | pyopenjtalk==0.2.0 13 | jamo==0.4.1 14 | ko_pron==1.3 15 | -------------------------------------------------------------------------------- /convert_to_22050.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import argparse 4 | import soundfile as sf 5 | 6 | if __name__ == '__main__': 7 | parser = argparse.ArgumentParser() 8 | # defaults follow the tutorial layout; pass --in_path/--out_path to override 9 | parser.add_argument("--in_path", default="./tsukuyomi_raw/") 10 | parser.add_argument("--out_path", default="./tsukuyomi/") 11 | 12 | args = parser.parse_args() 13 | 14 | os.makedirs(args.out_path, exist_ok=True) 15 | filenames = os.listdir(args.in_path) 16 | for filename in filenames: 17 | print(os.path.join(args.in_path, filename)) 18 | y, sr = librosa.core.load(os.path.join(args.in_path, filename), sr=22050, mono=True) 19 | sf.write(os.path.join(args.out_path, filename), y, sr, subtype="PCM_16") 20 | -------------------------------------------------------------------------------- /monotonic_align/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from .monotonic_align.core import maximum_path_c 4 | 5 | 6 | def maximum_path(neg_cent, mask): 7 | """ Cython optimized
version. 8 | neg_cent: [b, t_t, t_s] 9 | mask: [b, t_t, t_s] 10 | """ 11 | device = neg_cent.device 12 | dtype = neg_cent.dtype 13 | neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) 14 | path = np.zeros(neg_cent.shape, dtype=np.int32) 15 | 16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) 17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) 18 | maximum_path_c(path, neg_cent, t_t_max, t_s_max) 19 | return torch.from_numpy(path).to(device=device, dtype=dtype) 20 | -------------------------------------------------------------------------------- /filelists/filelist_val2.txt.cleaned: -------------------------------------------------------------------------------- 1 | ./tsukuyomi/VOICEACTRESS100_096.wav|pe↑Nʃirubenia↓ʃuu, pi↑Qtsuba↓aguno, a↑regeeniiko↓okooo so↑tsugyoo ʃ i, ka↑riforuniada↓igaku, ba↑akuree↓kooni nyu↑ugaku. 2 | ./tsukuyomi/VOICEACTRESS100_097.wav|ko↑no ga↓ineNno do↑onyuuniyoQte, sa↑ma↓zamana ba↑rie↓eʃoNno, ryu↑utaino ko↑Npyuutaaʃimyure↓eʃoNga, ta↑ka↓i se↓edode ka↑nooto na↓Qta. 3 | ./tsukuyomi/VOICEACTRESS100_098.wav|i↓nui do↓Qkuni nyu↓ukyo ʃI↑te, o↑obaaho↓oru su↑be↓kIka do↓oka, pa↑fo↓omaNsuga ʧe↓QkU sa↑reta. 4 | ./tsukuyomi/VOICEACTRESS100_099.wav|de↑byuuwe↓etowa, su↑upaabaNtamu↓kyuudewanaku, fe↑zaa↓kyuudaQta. 5 | ./tsukuyomi/VOICEACTRESS100_100.wav|a↓ariiwa, ko↓ouno na↓kao, mi↑namino ba↑ajinia↓ʃuu, wi↑NʧesUtaaʧi↓kakuno, fi↑Qʃaazuhi↓rumade, gu↓No ʃi↑rizo↓ita. -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Defines the set of symbols used in text input to the model. 5 | ''' 6 | _pad = '_' 7 | _punctuation = ';:,.!?¡¿—…"«»“” ' 8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ' 9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" 10 | 11 | '''# korean_cleaners 12 | _pad = '_' 13 | _punctuation = ',.!?…~' 14 | _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ ' 15 | ''' 16 | 17 | # Export all symbols: 18 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) 19 | 20 | # Special symbol ids 21 | SPACE_ID = symbols.index(" ") 22 | -------------------------------------------------------------------------------- /text/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Keith Ito 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import text 3 | from utils import load_filepaths_and_text 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--out_extension", default="cleaned") 8 | parser.add_argument("--text_index", default=1, type=int) 9 | parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"]) 10 | parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"]) 11 | 12 | args = parser.parse_args() 13 | 14 | 15 | for filelist in args.filelists: 16 | print("START:", filelist) 17 | filepaths_and_text = load_filepaths_and_text(filelist) 18 | for i in range(len(filepaths_and_text)): 19 | original_text = filepaths_and_text[i][args.text_index] 20 | cleaned_text = text._clean_text(original_text, args.text_cleaners) 21 | filepaths_and_text[i][args.text_index] = cleaned_text 22 | 23 | new_filelist = filelist + "." + args.out_extension 24 | with open(new_filelist, "w", encoding="utf-8") as f: 25 | f.writelines(["|".join(x) + "\n" for x in filepaths_and_text]) 26 | -------------------------------------------------------------------------------- /monotonic_align/core.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | from cython.parallel import prange 3 | 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: 8 | cdef int x 9 | cdef int y 10 | cdef float v_prev 11 | cdef float v_cur 12 | cdef float tmp 13 | cdef int index = t_x - 1 14 | 15 | for y in range(t_y): 16 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): 17 | if x == y: 18 | v_cur = max_neg_val 19 | else: 20 | v_cur = value[y-1, x] 21 | if x == 0: 22 | if y == 0: 23 | v_prev = 0. 
24 | else: 25 | v_prev = max_neg_val 26 | else: 27 | v_prev = value[y-1, x-1] 28 | value[y, x] += max(v_prev, v_cur) 29 | 30 | for y in range(t_y - 1, -1, -1): 31 | path[y, index] = 1 32 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): 33 | index = index - 1 34 | 35 | 36 | @cython.boundscheck(False) 37 | @cython.wraparound(False) 38 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil: 39 | cdef int b = paths.shape[0] 40 | cdef int i 41 | for i in prange(b, nogil=True): 42 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) 43 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | from text import cleaners 3 | from text.symbols import symbols 4 | 5 | 6 | # Mappings from symbol to numeric ID and vice versa: 7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 9 | 10 | 11 | def text_to_sequence(text, cleaner_names): 12 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 13 | Args: 14 | text: string to convert to a sequence 15 | cleaner_names: names of the cleaner functions to run the text through 16 | Returns: 17 | List of integers corresponding to the symbols in the text 18 | ''' 19 | sequence = [] 20 | 21 | clean_text = _clean_text(text, cleaner_names) 22 | for symbol in clean_text: 23 | symbol_id = _symbol_to_id[symbol] 24 | sequence += [symbol_id] 25 | return sequence 26 | 27 | 28 | def cleaned_text_to_sequence(cleaned_text): 29 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 
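Unlike text_to_sequence above, no cleaners are run here: the input is assumed to be already-cleaned text, e.g. read from a *.cleaned filelist produced by preprocess.py.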
30 | Args: 31 | text: string to convert to a sequence 32 | Returns: 33 | List of integers corresponding to the symbols in the text 34 | ''' 35 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] 36 | return sequence 37 | 38 | 39 | def sequence_to_text(sequence): 40 | '''Converts a sequence of IDs back to a string''' 41 | result = '' 42 | for symbol_id in sequence: 43 | s = _id_to_symbol[symbol_id] 44 | result += s 45 | return result 46 | 47 | 48 | def _clean_text(text, cleaner_names): 49 | for name in cleaner_names: 50 | cleaner = getattr(cleaners, name) 51 | if not cleaner: 52 | raise Exception('Unknown cleaner: %s' % name) 53 | text = cleaner(text) 54 | return text 55 | -------------------------------------------------------------------------------- /configs/tsukuyomi_chan.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 1000, 5 | "seed": 1234, 6 | "epochs": 10000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 32, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | "training_files":"./filelists/filelist_train2.txt.cleaned", 25 | "validation_files":"./filelists/filelist_val2.txt.cleaned", 26 | "text_cleaners":["japanese_cleaners"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | "hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": false, 41 | "mb_istft_vits": true, 42 | "istft_vits": false, 43 | "subbands": 4, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [4,4], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /configs/ljs_istft_vits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 100000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 26 | "text_cleaners":["english_cleaners2"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | 
"hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": false, 41 | "mb_istft_vits": false, 42 | "istft_vits": true, 43 | "subbands": false, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [8,8], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /configs/ljs_mb_istft_vits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 100000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 26 | "text_cleaners":["english_cleaners2"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | "hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": false, 41 | "mb_istft_vits": true, 42 | "istft_vits": false, 43 | "subbands": 4, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [4,4], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /configs/ljs_mini_istft_vits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 100000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | 
"training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 26 | "text_cleaners":["english_cleaners2"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | "hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": false, 41 | "mb_istft_vits": false, 42 | "istft_vits": true, 43 | "subbands": false, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 96, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 3, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [8,8], 57 | "upsample_initial_channel": 256, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /configs/ljs_mini_mb_istft_vits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 100000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 26 | "text_cleaners":["english_cleaners2"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | "hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": false, 41 | "mb_istft_vits": true, 42 | "istft_vits": false, 43 | "subbands": 4, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 96, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 3, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [4,4], 57 | "upsample_initial_channel": 256, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from stft_loss import MultiResolutionSTFTLoss 4 | 5 | 6 | import commons 7 | 8 | 9 | def feature_loss(fmap_r, fmap_g): 10 | loss = 0 11 | for dr, dg in zip(fmap_r, fmap_g): 12 | for rl, gl in zip(dr, dg): 13 | rl = rl.float().detach() 14 | gl = gl.float() 15 | 
loss += torch.mean(torch.abs(rl - gl)) 16 | 17 | return loss * 2 18 | 19 | 20 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 21 | loss = 0 22 | r_losses = [] 23 | g_losses = [] 24 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 25 | dr = dr.float() 26 | dg = dg.float() 27 | r_loss = torch.mean((1-dr)**2) 28 | g_loss = torch.mean(dg**2) 29 | loss += (r_loss + g_loss) 30 | r_losses.append(r_loss.item()) 31 | g_losses.append(g_loss.item()) 32 | 33 | return loss, r_losses, g_losses 34 | 35 | 36 | def generator_loss(disc_outputs): 37 | loss = 0 38 | gen_losses = [] 39 | for dg in disc_outputs: 40 | dg = dg.float() 41 | l = torch.mean((1-dg)**2) 42 | gen_losses.append(l) 43 | loss += l 44 | 45 | return loss, gen_losses 46 | 47 | 48 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): 49 | """ 50 | z_p, logs_q: [b, h, t_t] 51 | m_p, logs_p: [b, h, t_t] 52 | """ 53 | z_p = z_p.float() 54 | logs_q = logs_q.float() 55 | m_p = m_p.float() 56 | logs_p = logs_p.float() 57 | z_mask = z_mask.float() 58 | 59 | kl = logs_p - logs_q - 0.5 60 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p) 61 | kl = torch.sum(kl * z_mask) 62 | l = kl / torch.sum(z_mask) 63 | return l 64 | 65 | def subband_stft_loss(h, y_mb, y_hat_mb): 66 | sub_stft_loss = MultiResolutionSTFTLoss(h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths) 67 | y_mb = y_mb.view(-1, y_mb.size(2)) 68 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2)) 69 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, :y_mb.size(-1)], y_mb) 70 | return sub_sc_loss+sub_mag_loss 71 | 72 | -------------------------------------------------------------------------------- /configs/ljs_ms_istft_vits.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "log_interval": 200, 4 | "eval_interval": 100000, 5 | "seed": 1234, 6 | "epochs": 20000, 7 | "learning_rate": 2e-4, 8 | "betas": [0.8, 0.99], 9 | "eps": 1e-9, 10 | "batch_size": 64, 11 | "fp16_run": false, 12 | "lr_decay": 0.999875, 13 | "segment_size": 8192, 14 | "init_lr_ratio": 1, 15 | "warmup_epochs": 0, 16 | "c_mel": 45, 17 | "c_kl": 1.0, 18 | "fft_sizes": [384, 683, 171], 19 | "hop_sizes": [30, 60, 10], 20 | "win_lengths": [150, 300, 60], 21 | "window": "hann_window" 22 | }, 23 | "data": { 24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned", 25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned", 26 | "text_cleaners":["english_cleaners2"], 27 | "max_wav_value": 32768.0, 28 | "sampling_rate": 22050, 29 | "filter_length": 1024, 30 | "hop_length": 256, 31 | "win_length": 1024, 32 | "n_mel_channels": 80, 33 | "mel_fmin": 0.0, 34 | "mel_fmax": null, 35 | "add_blank": true, 36 | "n_speakers": 0, 37 | "cleaned_text": true 38 | }, 39 | "model": { 40 | "ms_istft_vits": true, 41 | "mb_istft_vits": false, 42 | "istft_vits": false, 43 | "subbands": 4, 44 | "gen_istft_n_fft": 16, 45 | "gen_istft_hop_size": 4, 46 | "inter_channels": 192, 47 | "hidden_channels": 192, 48 | "filter_channels": 768, 49 | "n_heads": 2, 50 | "n_layers": 6, 51 | "kernel_size": 3, 52 | "p_dropout": 0.1, 53 | "resblock": "1", 54 | "resblock_kernel_sizes": [3,7,11], 55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 56 | "upsample_rates": [4,4], 57 | "upsample_initial_channel": 512, 58 | "upsample_kernel_sizes": [16,16], 59 | "n_layers_q": 3, 60 | "use_spectral_norm": false, 61 | "use_sdp": false 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- 
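A note on the configs above (my reading of them, not part of the repository): whichever variant is chosen, the total upsampling factor from latent frames to waveform samples has to match `data.hop_length` (256). The variants only split that factor differently between the transposed convolutions (`upsample_rates`), the iSTFT hop (`gen_istft_hop_size`), and, for the multi-band/multi-stream models, the number of PQMF subbands (`subbands`). A minimal sanity check, assuming it is run from the repository root:

```python
import json

for name in ["ljs_istft_vits", "ljs_mb_istft_vits", "ljs_ms_istft_vits"]:
    cfg = json.load(open("configs/%s.json" % name))
    m, d = cfg["model"], cfg["data"]
    up = 1
    for r in m["upsample_rates"]:
        up *= r                        # e.g. 4 * 4 = 16 for the multi-band model
    sub = m["subbands"] or 1           # "subbands" is false for plain iSTFT-VITS
    assert up * m["gen_istft_hop_size"] * sub == d["hop_length"]  # == 256
```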
/inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import IPython.display as ipd\n", 12 | "\n", 13 | "import os\n", 14 | "import json\n", 15 | "import math\n", 16 | "import torch\n", 17 | "from torch import nn\n", 18 | "from torch.nn import functional as F\n", 19 | "from torch.utils.data import DataLoader\n", 20 | "\n", 21 | "import commons\n", 22 | "import utils\n", 23 | "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n", 24 | "from models import SynthesizerTrn\n", 25 | "from text.symbols import symbols\n", 26 | "from text import text_to_sequence\n", 27 | "\n", 28 | "from scipy.io.wavfile import write\n", 29 | "\n", 30 | "\n", 31 | "def get_text(text, hps):\n", 32 | " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n", 33 | " if hps.data.add_blank:\n", 34 | " text_norm = commons.intersperse(text_norm, 0)\n", 35 | " text_norm = torch.LongTensor(text_norm)\n", 36 | " return text_norm" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## MB-iSTFT-VITS" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "hps = utils.get_hparams_from_file(\"./configs/tsukuyomi_chan.json\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "net_g = SynthesizerTrn(\n", 62 | " len(symbols),\n", 63 | " hps.data.filter_length // 2 + 1,\n", 64 | " hps.train.segment_size // hps.data.hop_length,\n", 65 | " **hps.model).cuda()\n", 66 | "_ = net_g.eval()\n", 67 | "\n", 68 | "_ = utils.load_checkpoint(\"./logs/tsukuyomi/G_100000.pth\", net_g, None)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "stn_tst = get_text(\"こんにちは。\", hps)\n", 78 | "with torch.no_grad():\n", 79 | " x_tst = stn_tst.cuda().unsqueeze(0)\n", 80 | " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n", 81 | " audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n", 82 | "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))" 83 | ] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.8.13" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 4 107 | } 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MB-iSTFT-VITS with Multilingual Implementations 2 | 3 | 4 | This is a multilingual implementation of [MB-iSTFT-VITS](https://github.com/MasayaKawamura/MB-iSTFT-VITS), extended to support training and synthesis in various languages. MB-iSTFT-VITS showed 4.1 times faster inference than the original VITS!
5 | Preprocessed Japanese single-speaker training material is provided, based on [つくよみちゃんコーパス (tsukuyomi-chan corpus)](https://tyc.rei-yumesaki.net/material/corpus/). Download the corpus and place its 100 `.wav` files in `./tsukuyomi_raw`. 6 |
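Before preprocessing, it can be worth verifying the corpus layout; a minimal sketch (assuming the default `./tsukuyomi_raw` directory from above):

```python
import glob
import soundfile as sf  # pysoundfile is already in requirements.txt

wavs = sorted(glob.glob("./tsukuyomi_raw/*.wav"))
print(len(wavs))         # expect 100 clips
print(sf.info(wavs[0]))  # if not 22050 Hz / mono / PCM_16, run convert_to_22050.py (see below)
```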
7 | 8 | - Currently supported: Japanese / Korean 9 | - Chinese, CJKE, and other languages will be added soon! 10 | 11 | 12 | # How to use 13 | Python >= 3.6 (Python == 3.7 is suggested) 14 | 15 | ## Clone this repository 16 | ```sh 17 | git clone https://github.com/misakiudon/MB-iSTFT-VITS-multilingual.git 18 | ``` 19 | 20 | ## Install requirements 21 | ```sh 22 | pip install -r requirements.txt 23 | ``` 24 | You may need to install espeak first: `apt-get install espeak` 25 | 26 | ## Create manifest data 27 | ### Single speaker 28 | `"n_speakers"` should be 0 in the config JSON 29 | ``` 30 | path/to/XXX.wav|transcript 31 | ``` 32 | - Example 33 | ``` 34 | dataset/001.wav|こんにちは。 35 | ``` 36 | 37 | ### Multiple speakers 38 | Speaker IDs should start from 0 39 | ``` 40 | path/to/XXX.wav|speaker id|transcript 41 | ``` 42 | - Example 43 | ``` 44 | dataset/001.wav|0|こんにちは。 45 | ``` 46 | 47 | ## Preprocess 48 | Preprocessed Japanese manifest files are provided as `filelists/filelist_train2.txt.cleaned` and `filelists/filelist_val2.txt.cleaned`. 49 | ```sh 50 | # Single speaker 51 | python preprocess.py --text_index 1 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners' 52 | 53 | # Multiple speakers 54 | python preprocess.py --text_index 2 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners' 55 | ``` 56 | 57 | If your speech files are not `22050 Hz / mono / PCM-16`, you should resample them first. 58 | ```sh 59 | python convert_to_22050.py --in_path path/to/original_wav_dir/ --out_path path/to/output_wav_dir/ 60 | ``` 61 | 62 | ## Build monotonic alignment search 63 | ```sh 64 | # Cython-version Monotonic Alignment Search 65 | cd monotonic_align 66 | mkdir monotonic_align 67 | python setup.py build_ext --inplace 68 | ``` 69 | 70 | ## Setting the json file in [configs](configs) 71 | 72 | | Model | How to set up the json file in [configs](configs) | Sample json configuration | 73 | | :---: | :---: | :---: | 74 | | iSTFT-VITS | ```"istft_vits": true, ```
``` "upsample_rates": [8,8], ``` | ljs_istft_vits.json | 75 | | MB-iSTFT-VITS | ```"subbands": 4,```
```"mb_istft_vits": true, ```
``` "upsample_rates": [4,4], ``` | ljs_mb_istft_vits.json | 76 | | MS-iSTFT-VITS | ```"subbands": 4,```
```"ms_istft_vits": true, ```
``` "upsample_rates": [4,4], ``` | ljs_ms_istft_vits.json | 77 | 78 | For tutorial, check `config/tsukuyomi_chan.json` for more examples 79 | - If you have done preprocessing, set "cleaned_text" to true. 80 | - Change `training_files` and `validation_files` to the path of preprocessed manifest files. 81 | - Select same `text_cleaners` you used in preprocessing step. 82 | 83 | ## Train 84 | ```sh 85 | # Single speaker 86 | python train_latest.py -c -m 87 | 88 | # Mutiple speakers 89 | python train_latest_ms.py -c -m 90 | ``` 91 | In the case of training MB-iSTFT-VITS with Japanese tutorial corpus, run the following script. Resume training from lastest checkpoint is automatic. 92 | ```sh 93 | python train_latest.py -c configs/tsukuyomi_chan.json -m tsukuyomi 94 | ``` 95 | 96 | After the training, you can check inference audio using [inference.ipynb](inference.ipynb) 97 | 98 | ## References 99 | - https://github.com/MasayaKawamura/MB-iSTFT-VITS 100 | - https://github.com/CjangCjengh/vits 101 | - https://github.com/Francis-Komizu/VITS 102 | -------------------------------------------------------------------------------- /mel_processing.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | from torch import nn 6 | import torch.nn.functional as F 7 | import torch.utils.data 8 | import numpy as np 9 | import librosa 10 | import librosa.util as librosa_util 11 | from librosa.util import normalize, pad_center, tiny 12 | from scipy.signal import get_window 13 | from scipy.io.wavfile import read 14 | from librosa.filters import mel as librosa_mel_fn 15 | 16 | MAX_WAV_VALUE = 32768.0 17 | 18 | 19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 20 | """ 21 | PARAMS 22 | ------ 23 | C: compression factor 24 | """ 25 | return torch.log(torch.clamp(x, min=clip_val) * C) 26 | 27 | 28 | def dynamic_range_decompression_torch(x, C=1): 29 | """ 30 | PARAMS 31 | ------ 32 | C: compression factor used to compress 33 | """ 34 | return torch.exp(x) / C 35 | 36 | 37 | def spectral_normalize_torch(magnitudes): 38 | output = dynamic_range_compression_torch(magnitudes) 39 | return output 40 | 41 | 42 | def spectral_de_normalize_torch(magnitudes): 43 | output = dynamic_range_decompression_torch(magnitudes) 44 | return output 45 | 46 | 47 | mel_basis = {} 48 | hann_window = {} 49 | 50 | 51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): 52 | if torch.min(y) < -1.: 53 | print('min value is ', torch.min(y)) 54 | if torch.max(y) > 1.: 55 | print('max value is ', torch.max(y)) 56 | 57 | global hann_window 58 | dtype_device = str(y.dtype) + '_' + str(y.device) 59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 60 | if wnsize_dtype_device not in hann_window: 61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 62 | 63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 70 | return spec 71 | 72 | 73 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): 74 | global mel_basis 75 | dtype_device = str(spec.dtype) + '_' + str(spec.device) 76 | fmax_dtype_device = str(fmax) + 
'_' + dtype_device 77 | if fmax_dtype_device not in mel_basis: 78 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 79 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) 80 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 81 | spec = spectral_normalize_torch(spec) 82 | return spec 83 | 84 | 85 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 86 | if torch.min(y) < -1.: 87 | print('min value is ', torch.min(y)) 88 | if torch.max(y) > 1.: 89 | print('max value is ', torch.max(y)) 90 | 91 | global mel_basis, hann_window 92 | dtype_device = str(y.dtype) + '_' + str(y.device) 93 | fmax_dtype_device = str(fmax) + '_' + dtype_device 94 | wnsize_dtype_device = str(win_size) + '_' + dtype_device 95 | if fmax_dtype_device not in mel_basis: 96 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 97 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) 98 | if wnsize_dtype_device not in hann_window: 99 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) 100 | 101 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 102 | y = y.squeeze(1) 103 | 104 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], 105 | center=center, pad_mode='reflect', normalized=False, onesided=True) 106 | 107 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) 108 | 109 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec) 110 | spec = spectral_normalize_torch(spec) 111 | 112 | return spec 113 | -------------------------------------------------------------------------------- /pqmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2020 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """Pseudo QMF modules.""" 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | from scipy.signal import kaiser 13 | 14 | 15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0): 16 | """Design prototype filter for PQMF. 17 | This method is based on `A Kaiser window approach for the design of prototype 18 | filters of cosine modulated filterbanks`_. 19 | Args: 20 | taps (int): The number of filter taps. 21 | cutoff_ratio (float): Cut-off frequency ratio. 22 | beta (float): Beta coefficient for kaiser window. 23 | Returns: 24 | ndarray: Impulse response of prototype filter (taps + 1,). 25 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: 26 | https://ieeexplore.ieee.org/abstract/document/681427 27 | """ 28 | # check the arguments are valid 29 | assert taps % 2 == 0, "The number of taps must be an even number." 30 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0." 31 | 32 | # make initial filter 33 | omega_c = np.pi * cutoff_ratio 34 | with np.errstate(invalid='ignore'): 35 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \ 36 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps)) 37 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form 38 | 39 | # apply kaiser window 40 | w = kaiser(taps + 1, beta) 41 | h = h_i * w 42 | 43 | return h 44 | 45 | 46 | class PQMF(torch.nn.Module): 47 | """PQMF module.
48 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. 49 | .. _`Near-perfect-reconstruction pseudo-QMF banks`: 50 | https://ieeexplore.ieee.org/document/258122 51 | """ 52 | 53 | def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0): 54 | """Initialize PQMF module. 55 | Args: 56 | subbands (int): The number of subbands. 57 | taps (int): The number of filter taps. 58 | cutoff_ratio (float): Cut-off frequency ratio. 59 | beta (float): Beta coefficient for kaiser window. 60 | """ 61 | super(PQMF, self).__init__() 62 | 63 | # define filter coefficient 64 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta) 65 | h_analysis = np.zeros((subbands, len(h_proto))) 66 | h_synthesis = np.zeros((subbands, len(h_proto))) 67 | for k in range(subbands): 68 | h_analysis[k] = 2 * h_proto * np.cos( 69 | (2 * k + 1) * (np.pi / (2 * subbands)) * 70 | (np.arange(taps + 1) - ((taps - 1) / 2)) + 71 | (-1) ** k * np.pi / 4) 72 | h_synthesis[k] = 2 * h_proto * np.cos( 73 | (2 * k + 1) * (np.pi / (2 * subbands)) * 74 | (np.arange(taps + 1) - ((taps - 1) / 2)) - 75 | (-1) ** k * np.pi / 4) 76 | 77 | # convert to tensor 78 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device) 79 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device) 80 | 81 | # register coefficients as buffer 82 | self.register_buffer("analysis_filter", analysis_filter) 83 | self.register_buffer("synthesis_filter", synthesis_filter) 84 | 85 | # filter for downsampling & upsampling 86 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device) 87 | for k in range(subbands): 88 | updown_filter[k, k, 0] = 1.0 89 | self.register_buffer("updown_filter", updown_filter) 90 | self.subbands = subbands 91 | 92 | # keep padding info 93 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0) 94 | 95 | def analysis(self, x): 96 | """Analysis with PQMF. 97 | Args: 98 | x (Tensor): Input tensor (B, 1, T). 99 | Returns: 100 | Tensor: Output tensor (B, subbands, T // subbands). 101 | """ 102 | x = F.conv1d(self.pad_fn(x), self.analysis_filter) 103 | return F.conv1d(x, self.updown_filter, stride=self.subbands) 104 | 105 | def synthesis(self, x): 106 | """Synthesis with PQMF. 107 | Args: 108 | x (Tensor): Input tensor (B, subbands, T // subbands). 109 | Returns: 110 | Tensor: Output tensor (B, 1, T). 111 | """ 112 | # NOTE(kan-bayashi): Power will be decreased so here multiply by # subbands. 113 | # Not sure this is the correct way, it is better to check again.
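# (Upsampling by zero-insertion attenuates the signal by a factor of `subbands` after the synthesis lowpass filter, so the filter is pre-scaled by `subbands` to compensate.)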
114 | # TODO(kan-bayashi): Understand the reconstruction procedure 115 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands) 116 | return F.conv1d(self.pad_fn(x), self.synthesis_filter) -------------------------------------------------------------------------------- /text/py2kn.json: -------------------------------------------------------------------------------- 1 | {"a": "アー", "ai": "アイ", "an": "アン", "ang": "アン", "ao": "アオ", "ba": "バー", "bai": "バイ", "ban": "バン", "bang": "バン", "bao": "バオ", "bei": "ベイ", "ben": "ベン", "beng": "ボン", "bi": "ビー", "bian": "ビィェン", "biao": "ビィャォ", "bie": "ビィェ", "bin": "ビン", "bing": "ビン", "bo": "ブォ", "bu": "ブー", "ca": "ツァ", "cai": "ツァィ", "can": "ツァン", "cang": "ツァン", "cao": "ツァォ", "ce": "ツェ", "cen": "ツェン", "ceng": "ツォン", "cha": "チャ", "chai": "チャイ", "chan": "チャン", "chang": "チャン", "chao": "チャオ", "che": "チェ", "chen": "チェン", "cheng": "チォン", "chi": "チー", "chong": "チョン", "chou": "チョウ", "chu": "チュ", "chuan": "チュァン", "chuai": "チュァイ", "chuang": "チュゥァン", "chui": "チュイ", "chun": "チュン", "chuo": "チャオ", "ci": "ツー", "cong": "ツォン", "cou": "ツォゥ ", "cu": "ツゥ", "cuan": "ツァン", "cui": "ツイ", "cun": "ツン", "cuo": "ツゥォ", "da": "ダー", "dai": "ダイ", "dan": "ダン", "dang": "ダン", "dao": "ダオ", "de": "デェ", "dei": "デイ", "dun": "ドゥン", "deng": "ドン", "di": "ディ", "dian": "ディェン", "diao": "ディァォ", "die": "ディェ", "ding": "ディン", "diu": "ディゥ", "dong": "ドン", "dou": "ドウ", "du": "ドゥ", "duan": "ドゥァン", "dui": "ドゥイ", "duo": "ドゥォ", "e": "ェ", "ei": "ェイ", "en": "エン", "eng": "鞥", "er": "ェ", "fa": "ファ", "fan": "ファン", "fang": "ファン", "fei": "フェイ", "fen": "フェン", "feng": "フォン", "fuo": "フォ", "fou": "フォウ", "fu": "フー", "ga": "ガー", "gai": "ガイ", "gan": "ガン", "gang": "ガン", "gao": "ガオ", "ge": "グェ", "gei": "ゲイ", "gen": "ゲン", "geng": "ゴン", "gong": "ゴン", "gou": "ゴウ", "gu": "グー", "gua": "グァ", "guai": "グゥァイ", "guan": "グァン", "guang": "グゥァン", "gui": "グゥイ", "gun": "ガン", "guo": "グゥォ", "ha": "ハー", "hai": "ハイ", "han": "ハン", "hang": "ハン", "hao": "ハオ", "he": "フェ゛ァ", "hei": "ヘイ", "hen": "ヘン", "heng": "ホン", "hong": "ホン", "hou": "ホウ", "hu": "フー", "hua": "ファ", "huai": "フゥァイ", "huan": "ファン", "huang": "フゥァン", "hui": "フゥイ", "hun": "フン", "huo": "フォ", "ji": "ジー", "jia": "ジャ", "jian": "ジィェン", "jiang": "ジィァン", "jiao": "ジャオ", "jie": "ジェ", "jin": "ジン", "jing": "ジン", "jiong": "ジィォン", "jiu": "ジゥ", "ju": "ジュ", "juan": "ジュェン", "jue": "ジュェ", "jun": "ジュン", "ka": "カー", "kai": "カイ", "kan": "カン", "kang": "カン", "kao": "カオ", "ke": "クェ゛ァ", "ken": "ケン", "keng": "コン", "kong": "コン", "kou": "コウ", "ku": "クー", "kua": "クァ", "kuai": "クァィ", "kuan": "クァン", "kuang": "クゥァン", "kui": "クゥイ", "kun": "クン", "kuo": "クォ", "la": "ラー", "lai": "ライ", "lan": "ラン", "lang": "ラン", "lao": "ラオ", "le": "ラ", "lei": "レイ", "leng": "ラン", "li": "リー", "liang": "リィァン", "lian": "リィェン", "liao": "リィァォ", "lie": "リィェ", "lin": "リン", "ling": "リン", "liu": "リィゥ", "long": "ロン", "lou": "ロウ", "lu": "ルー", "lv": "リュ", "luan": "ルゥァン", "lue": "リュェ", "lun": "ルゥン", "luo": "ルゥォ", "ma": "マー", "mai": "マイ", "man": "マン", "mang": "マン", "mao": "マオ", "me": "ムェ", "mei": "メイ", "men": "メン", "meng": "モン", "mi": "ミィ", "mian": "ミィェン", "miao": "ミィァォ", "mie": "ミィェ", "min": "ミン", "ming": "ミン", "miu": "ミィゥ", "mo": "ムォ", "mou": "モウ", "mu": "ムー", "na": "ナー", "nai": "ナイ", "nan": "ナン", "nang": "ナン", "nao": "ナオ", "ne": "ヌェ゛ァ", "nei": "ネイ", "nen": "ネン", "neng": "ノン", "ni": "ニー", "nian": "ニィェン", "niang": "ニィァン", "niao": "ニィァォ", "nie": "ニィェ", "nin": "ニン", "ning": "ニン", "niu": "ニュェ", "nong": "ノン", "nou": "ノウ", "nu": "ヌー", "nv": "ニュ", "nuan": "ヌァン", "nuo": "ヌオ", "o": "オ", "ou": "オウ", "pa": "パー", "pai": "パイ", "pan": "パン", 
"pang": "パン", "pao": "パオ", "pei": "ペイ", "pen": "ペン", "peng": "ポン", "pi": "ピー", "pian": "ピィェン", "piao": "ピィァオ", "pie": "ピェ", "pin": "ピン", "ping": "ピン", "po": "ポォ", "pou": "ポウ", "pu": "プー", "qi": "チー", "qia": "チィァ", "qian": "チィェン", "qiang": "チィァン", "qiao": "チィァォ", "qie": "チィェ", "qin": "チン", "qing": "チン", "qiong": "チォン", "qiu": "チィゥ", "qu": "チュ", "quan": "チュェン", "que": "チュェ", "qun": "チュン", "ran": "ラン", "rang": "ラン", "rao": "ラオ", "re": "レ", "ren": "レン", "reng": "ロン", "ri": "リ", "rong": "ロン", "rou": "ロウ", "ru": "ルー", "ruan": "ルァン", "rui": "ルイ", "run": "ルン", "ruo": "ルォ", "sa": "サー", "sai": "サオ", "san": "サン", "sang": "サン", "se": "スェ", "sen": "セン", "seng": "ソン", "sha": "シャ", "shai": "シャイ", "shan": "シャン", "shang": "シャン", "shao": "シャオ", "she": "シェ", "shen": "シェン", "sheng": "シォン", "shi": "シー", "shou": "ショウ", "shu": "シュ", "shua": "シュァ", "shuai": "シュァイ", "shuan": "シュァン", "shuang": "シュゥァン", "shui": "シュイ", "shun": "シュン", "shuo": "シュォ", "si": "スー", "song": "ソン", "sou": "ソウ", "su": "スー", "suan": "スゥァン", "sui": "スイ", "sun": "スン", "suo": "スォ", "ta": "ター", "tai": "タイ", "tan": "タン", "tang": "タン", "tao": "タオ", "te": "テェ", "teng": "トン", "ti": "ティ", "tian": "ティェン", "tiao": "ティァォ", "tie": "ティェ", "ting": "ティン", "tong": "トン", "tou": "トウ", "tu": "トゥ", "tuan": "トゥァン", "tui": "トゥイ", "tun": "トゥン", "tuo": "トゥォ", "wa": "ウァ", "wai": "ワィ", "wan": "ワン", "wang": "ワン", "wei": "ウェイ", "wen": "ウェン", "weng": "ウォン", "wo": "ウォ", "wu": "ウー", "xi": "シー", "xia": "シァ", "xian": "シィェン", "xiang": "シィァン", "xiao": "シァォ", "xie": "シェ", "xin": "シン", "xing": "シン", "xiong": "シィォン", "xiu": "シゥ", "xu": "シュ", "xuan": "シュェン", "xue": "シュェ", "xun": "シュン", "ya": "ヤー", "yan": "イェン", "yang": "ヤン", "yao": "イャォ", "ye": "イェ", "yi": "イー", "yin": "イン", "ying": "イン", "yong": "ヨン", "you": "ヨウ", "yu": "ユー", "yuan": "ユェン", "yue": "ユェ", "yun": "ユン", "za": "ザー", "zai": "ヂャイ", "zan": "ザン", "zang": "ザン", "zao": "ザオ", "ze": "ゼェ", "zei": "ゼイ", "zen": "ゼン", "zeng": "ゾン", "zhan": "ヂャン", "zhang": "ヂャン", "zhao": "ヂャオ", "zhe": "ヂェ゛ァ", "zhen": "ヂェン", "zheng": "ヂォン", "zhi": "ヂー", "zhong": "ヂョン", "zhou": "ヂョウ", "zhu": "ヂュ", "zhua": "ヂュア", "zhuai": "ヂュァイ", "zhuan": "ヂュァン", "zhuang": "ヂュゥァン", "zhui": "ヂュイ", "zhun": "ヂュン", "zhuo": "ヂュオ", "zi": "ズー", "zong": "ゾン", "zou": "ゾウ", "zu": "ズー", "zuan": "ズァン", "zui": "ズイ", "zun": "ズン", "zuo": "ズゥォ", ",": "、", "。": "。", "!": "!", "?": "?", "……": "。"} 2 | -------------------------------------------------------------------------------- /stft_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright 2019 Tomoki Hayashi 4 | # MIT License (https://opensource.org/licenses/MIT) 5 | 6 | """STFT-based Loss modules.""" 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | def stft(x, fft_size, hop_size, win_length, window): 13 | """Perform STFT and convert to magnitude spectrogram. 14 | Args: 15 | x (Tensor): Input signal tensor (B, T). 16 | fft_size (int): FFT size. 17 | hop_size (int): Hop size. 18 | win_length (int): Window length. 19 | window (str): Window function type. 20 | Returns: 21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 
22 | """ 23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device)) 24 | real = x_stft[..., 0] 25 | imag = x_stft[..., 1] 26 | 27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf 28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1) 29 | 30 | 31 | class SpectralConvergengeLoss(torch.nn.Module): 32 | """Spectral convergence loss module.""" 33 | 34 | def __init__(self): 35 | """Initilize spectral convergence loss module.""" 36 | super(SpectralConvergengeLoss, self).__init__() 37 | 38 | def forward(self, x_mag, y_mag): 39 | """Calculate forward propagation. 40 | Args: 41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 43 | Returns: 44 | Tensor: Spectral convergence loss value. 45 | """ 46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro") 47 | 48 | 49 | class LogSTFTMagnitudeLoss(torch.nn.Module): 50 | """Log STFT magnitude loss module.""" 51 | 52 | def __init__(self): 53 | """Initilize los STFT magnitude loss module.""" 54 | super(LogSTFTMagnitudeLoss, self).__init__() 55 | 56 | def forward(self, x_mag, y_mag): 57 | """Calculate forward propagation. 58 | Args: 59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). 61 | Returns: 62 | Tensor: Log STFT magnitude loss value. 63 | """ 64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag)) 65 | 66 | 67 | class STFTLoss(torch.nn.Module): 68 | """STFT loss module.""" 69 | 70 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"): 71 | """Initialize STFT loss module.""" 72 | super(STFTLoss, self).__init__() 73 | self.fft_size = fft_size 74 | self.shift_size = shift_size 75 | self.win_length = win_length 76 | self.window = getattr(torch, window)(win_length) 77 | self.spectral_convergenge_loss = SpectralConvergengeLoss() 78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() 79 | 80 | def forward(self, x, y): 81 | """Calculate forward propagation. 82 | Args: 83 | x (Tensor): Predicted signal (B, T). 84 | y (Tensor): Groundtruth signal (B, T). 85 | Returns: 86 | Tensor: Spectral convergence loss value. 87 | Tensor: Log STFT magnitude loss value. 88 | """ 89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) 90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window) 91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag) 92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) 93 | 94 | return sc_loss, mag_loss 95 | 96 | 97 | class MultiResolutionSTFTLoss(torch.nn.Module): 98 | """Multi resolution STFT loss module.""" 99 | 100 | def __init__(self, 101 | fft_sizes=[1024, 2048, 512], 102 | hop_sizes=[120, 240, 50], 103 | win_lengths=[600, 1200, 240], 104 | window="hann_window"): 105 | """Initialize Multi resolution STFT loss module. 106 | Args: 107 | fft_sizes (list): List of FFT sizes. 108 | hop_sizes (list): List of hop sizes. 109 | win_lengths (list): List of window lengths. 110 | window (str): Window function type. 
111 | """ 112 | super(MultiResolutionSTFTLoss, self).__init__() 113 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) 114 | self.stft_losses = torch.nn.ModuleList() 115 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): 116 | self.stft_losses += [STFTLoss(fs, ss, wl, window)] 117 | 118 | def forward(self, x, y): 119 | """Calculate forward propagation. 120 | Args: 121 | x (Tensor): Predicted signal (B, T). 122 | y (Tensor): Groundtruth signal (B, T). 123 | Returns: 124 | Tensor: Multi resolution spectral convergence loss value. 125 | Tensor: Multi resolution log STFT magnitude loss value. 126 | """ 127 | sc_loss = 0.0 128 | mag_loss = 0.0 129 | for f in self.stft_losses: 130 | sc_l, mag_l = f(x, y) 131 | sc_loss += sc_l 132 | mag_loss += mag_l 133 | sc_loss /= len(self.stft_losses) 134 | mag_loss /= len(self.stft_losses) 135 | 136 | return sc_loss, mag_loss -------------------------------------------------------------------------------- /commons.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | 8 | def init_weights(m, mean=0.0, std=0.01): 9 | classname = m.__class__.__name__ 10 | if classname.find("Conv") != -1: 11 | m.weight.data.normal_(mean, std) 12 | 13 | 14 | def get_padding(kernel_size, dilation=1): 15 | return int((kernel_size*dilation - dilation)/2) 16 | 17 | 18 | def convert_pad_shape(pad_shape): 19 | l = pad_shape[::-1] 20 | pad_shape = [item for sublist in l for item in sublist] 21 | return pad_shape 22 | 23 | 24 | def intersperse(lst, item): 25 | result = [item] * (len(lst) * 2 + 1) 26 | result[1::2] = lst 27 | return result 28 | 29 | 30 | def kl_divergence(m_p, logs_p, m_q, logs_q): 31 | """KL(P||Q)""" 32 | kl = (logs_q - logs_p) - 0.5 33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) 34 | return kl 35 | 36 | 37 | def rand_gumbel(shape): 38 | """Sample from the Gumbel distribution, protect from overflows.""" 39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 40 | return -torch.log(-torch.log(uniform_samples)) 41 | 42 | 43 | def rand_gumbel_like(x): 44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) 45 | return g 46 | 47 | 48 | def slice_segments(x, ids_str, segment_size=4): 49 | ret = torch.zeros_like(x[:, :, :segment_size]) 50 | for i in range(x.size(0)): 51 | idx_str = ids_str[i] 52 | idx_end = idx_str + segment_size 53 | ret[i] = x[i, :, idx_str:idx_end] 54 | return ret 55 | 56 | 57 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 58 | b, d, t = x.size() 59 | if x_lengths is None: 60 | x_lengths = t 61 | ids_str_max = x_lengths - segment_size + 1 62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) 63 | ret = slice_segments(x, ids_str, segment_size) 64 | return ret, ids_str 65 | 66 | 67 | def get_timing_signal_1d( 68 | length, channels, min_timescale=1.0, max_timescale=1.0e4): 69 | position = torch.arange(length, dtype=torch.float) 70 | num_timescales = channels // 2 71 | log_timescale_increment = ( 72 | math.log(float(max_timescale) / float(min_timescale)) / 73 | (num_timescales - 1)) 74 | inv_timescales = min_timescale * torch.exp( 75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) 76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) 77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) 78 | signal = F.pad(signal, [0, 0, 0, channels % 2]) 79 | signal = signal.view(1, channels, length) 80 | return signal 81 | 82 | 83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): 84 | b, channels, length = x.size() 85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 86 | return x + signal.to(dtype=x.dtype, device=x.device) 87 | 88 | 89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): 90 | b, channels, length = x.size() 91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) 92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) 93 | 94 | 95 | def subsequent_mask(length): 96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) 97 | return mask 98 | 99 | 100 | @torch.jit.script 101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 102 | n_channels_int = n_channels[0] 103 | in_act = input_a + input_b 104 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 106 | acts = t_act * s_act 107 | return acts 108 | 109 | 110 | def convert_pad_shape(pad_shape): 111 | l = pad_shape[::-1] 112 | pad_shape = [item for sublist in l for item in sublist] 113 | return pad_shape 114 | 115 | 116 | def shift_1d(x): 117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] 118 | return x 119 | 120 | 121 | def sequence_mask(length, max_length=None): 122 | if max_length is None: 123 | max_length = length.max() 124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 125 | return x.unsqueeze(0) < length.unsqueeze(1) 126 | 127 | 128 | def generate_path(duration, mask): 129 | """ 130 | duration: [b, 1, t_x] 131 | mask: [b, 1, t_y, t_x] 132 | """ 133 | device = duration.device 134 | 135 | b, _, t_y, t_x = mask.shape 136 | cum_duration = torch.cumsum(duration, -1) 137 | 138 | cum_duration_flat = 
cum_duration.view(b * t_x) 139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) 140 | path = path.view(b, t_x, t_y) 141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] 142 | path = path.unsqueeze(1).transpose(2,3) * mask 143 | return path 144 | 145 | 146 | def clip_grad_value_(parameters, clip_value, norm_type=2): 147 | if isinstance(parameters, torch.Tensor): 148 | parameters = [parameters] 149 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 150 | norm_type = float(norm_type) 151 | if clip_value is not None: 152 | clip_value = float(clip_value) 153 | 154 | total_norm = 0 155 | for p in parameters: 156 | param_norm = p.grad.data.norm(norm_type) 157 | total_norm += param_norm.item() ** norm_type 158 | if clip_value is not None: 159 | p.grad.data.clamp_(min=-clip_value, max=clip_value) 160 | total_norm = total_norm ** (1. / norm_type) 161 | return total_norm 162 | -------------------------------------------------------------------------------- /text/japanese.py: -------------------------------------------------------------------------------- 1 | import re 2 | from unidecode import unidecode 3 | import pyopenjtalk 4 | 5 | 6 | # Regular expression matching Japanese without punctuation marks: 7 | _japanese_characters = re.compile( 8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 9 | 10 | # Regular expression matching non-Japanese characters or punctuation marks: 11 | _japanese_marks = re.compile( 12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 13 | 14 | # List of (symbol, Japanese) pairs for marks: 15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [ 16 | ('%', 'パーセント') 17 | ]] 18 | 19 | # List of (romaji, ipa) pairs for marks: 20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [ 21 | ('ts', 'ʦ'), 22 | ('u', 'ɯ'), 23 | ('j', 'ʥ'), 24 | ('y', 'j'), 25 | ('ni', 'n^i'), 26 | ('nj', 'n^'), 27 | ('hi', 'çi'), 28 | ('hj', 'ç'), 29 | ('f', 'ɸ'), 30 | ('I', 'i*'), 31 | ('U', 'ɯ*'), 32 | ('r', 'ɾ') 33 | ]] 34 | 35 | # List of (romaji, ipa2) pairs for marks: 36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [ 37 | ('u', 'ɯ'), 38 | ('ʧ', 'tʃ'), 39 | ('j', 'dʑ'), 40 | ('y', 'j'), 41 | ('ni', 'n^i'), 42 | ('nj', 'n^'), 43 | ('hi', 'çi'), 44 | ('hj', 'ç'), 45 | ('f', 'ɸ'), 46 | ('I', 'i*'), 47 | ('U', 'ɯ*'), 48 | ('r', 'ɾ') 49 | ]] 50 | 51 | # List of (consonant, sokuon) pairs: 52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 53 | (r'Q([↑↓]*[kg])', r'k#\1'), 54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'), 55 | (r'Q([↑↓]*[sʃ])', r's\1'), 56 | (r'Q([↑↓]*[pb])', r'p#\1') 57 | ]] 58 | 59 | # List of (consonant, hatsuon) pairs: 60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [ 61 | (r'N([↑↓]*[pbm])', r'm\1'), 62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'), 63 | (r'N([↑↓]*[tdn])', r'n\1'), 64 | (r'N([↑↓]*[kg])', r'ŋ\1') 65 | ]] 66 | 67 | 68 | def symbols_to_japanese(text): 69 | for regex, replacement in _symbols_to_japanese: 70 | text = re.sub(regex, replacement, text) 71 | return text 72 | 73 | 74 | def japanese_to_romaji_with_accent(text): 75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html''' 76 | text = symbols_to_japanese(text) 77 | sentences = re.split(_japanese_marks, text) 78 | marks = re.findall(_japanese_marks, text) 79 | text = '' 80 | for i, sentence in enumerate(sentences): 81 | if re.match(_japanese_characters, sentence): 82 | if text 
!= '': 83 | text += ' ' 84 | labels = pyopenjtalk.extract_fullcontext(sentence) 85 | for n, label in enumerate(labels): 86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1) 87 | if phoneme not in ['sil', 'pau']: 88 | text += phoneme.replace('ch', 'ʧ').replace('sh', 89 | 'ʃ').replace('cl', 'Q') 90 | else: 91 | continue 92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1)) 93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1)) 94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1)) 95 | a3 = int(re.search(r"\+(\d+)/", label).group(1)) 96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']: 97 | a2_next = -1 98 | else: 99 | a2_next = int( 100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1)) 101 | # Accent phrase boundary 102 | if a3 == 1 and a2_next == 1: 103 | text += ' ' 104 | # Falling 105 | elif a1 == 0 and a2_next == a2 + 1: 106 | text += '↓' 107 | # Rising 108 | elif a2 == 1 and a2_next == 2: 109 | text += '↑' 110 | if i < len(marks): 111 | text += unidecode(marks[i]).replace(' ', '') 112 | return text 113 | 114 | 115 | def get_real_sokuon(text): 116 | for regex, replacement in _real_sokuon: 117 | text = re.sub(regex, replacement, text) 118 | return text 119 | 120 | 121 | def get_real_hatsuon(text): 122 | for regex, replacement in _real_hatsuon: 123 | text = re.sub(regex, replacement, text) 124 | return text 125 | 126 | 127 | def japanese_to_ipa(text): 128 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 129 | text = re.sub( 130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 131 | text = get_real_sokuon(text) 132 | text = get_real_hatsuon(text) 133 | for regex, replacement in _romaji_to_ipa: 134 | text = re.sub(regex, replacement, text) 135 | return text 136 | 137 | 138 | def japanese_to_ipa2(text): 139 | text = japanese_to_romaji_with_accent(text).replace('...', '…') 140 | text = get_real_sokuon(text) 141 | text = get_real_hatsuon(text) 142 | for regex, replacement in _romaji_to_ipa2: 143 | text = re.sub(regex, replacement, text) 144 | return text 145 | 146 | 147 | def japanese_to_ipa3(text): 148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace( 149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a') 150 | text = re.sub( 151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text) 152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text) 153 | return text 154 | -------------------------------------------------------------------------------- /text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 
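As a quick illustrative sketch (basic_cleaners is defined later in this module):

    >>> basic_cleaners('Hello,   World!')
    'hello, world!'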
13 | ''' 14 | 15 | import re 16 | from unidecode import unidecode 17 | from phonemizer import phonemize 18 | import pyopenjtalk 19 | from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3 20 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa 21 | 22 | # Regular expression matching whitespace: 23 | _whitespace_re = re.compile(r'\s+') 24 | 25 | # Regular expression matching Japanese without punctuation marks: 26 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 27 | 28 | # Regular expression matching non-Japanese characters or punctuation marks: 29 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]') 30 | 31 | # List of (regular expression, replacement) pairs for abbreviations: 32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ 33 | ('mrs', 'misess'), 34 | ('mr', 'mister'), 35 | ('dr', 'doctor'), 36 | ('st', 'saint'), 37 | ('co', 'company'), 38 | ('jr', 'junior'), 39 | ('maj', 'major'), 40 | ('gen', 'general'), 41 | ('drs', 'doctors'), 42 | ('rev', 'reverend'), 43 | ('lt', 'lieutenant'), 44 | ('hon', 'honorable'), 45 | ('sgt', 'sergeant'), 46 | ('capt', 'captain'), 47 | ('esq', 'esquire'), 48 | ('ltd', 'limited'), 49 | ('col', 'colonel'), 50 | ('ft', 'fort'), 51 | ]] 52 | 53 | 54 | def expand_abbreviations(text): 55 | for regex, replacement in _abbreviations: 56 | text = re.sub(regex, replacement, text) 57 | return text 58 | 59 | 60 | def expand_numbers(text): 61 | return normalize_numbers(text) 62 | 63 | 64 | def lowercase(text): 65 | return text.lower() 66 | 67 | 68 | def collapse_whitespace(text): 69 | return re.sub(_whitespace_re, ' ', text) 70 | 71 | 72 | def convert_to_ascii(text): 73 | return unidecode(text) 74 | 75 | 76 | def basic_cleaners(text): 77 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 78 | text = lowercase(text) 79 | text = collapse_whitespace(text) 80 | return text 81 | 82 | 83 | def transliteration_cleaners(text): 84 | '''Pipeline for non-English text that transliterates to ASCII.''' 85 | text = convert_to_ascii(text) 86 | text = lowercase(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | 90 | 91 | def english_cleaners(text): 92 | '''Pipeline for English text, including abbreviation expansion.''' 93 | text = convert_to_ascii(text) 94 | text = lowercase(text) 95 | text = expand_abbreviations(text) 96 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) 97 | phonemes = collapse_whitespace(phonemes) 98 | return phonemes 99 | 100 | 101 | def english_cleaners2(text): 102 | '''Pipeline for English text, including abbreviation expansion. 
+ punctuation + stress''' 103 | text = convert_to_ascii(text) 104 | text = lowercase(text) 105 | text = expand_abbreviations(text) 106 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) 107 | phonemes = collapse_whitespace(phonemes) 108 | return phonemes 109 | 110 | 111 | def japanese_cleaners(text): 112 | text = japanese_to_romaji_with_accent(text) 113 | text = re.sub(r'([A-Za-z])$', r'\1.', text) 114 | return text 115 | 116 | 117 | def japanese_cleaners2(text): 118 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…') 119 | 120 | 121 | def korean_cleaners(text): 122 | '''Pipeline for Korean text''' 123 | text = latin_to_hangul(text) 124 | text = number_to_hangul(text) 125 | text = divide_hangul(text) 126 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text) 127 | return text 128 | 129 | 130 | def japanese_triphone_cleaners(text): 131 | sentences = re.split(_japanese_marks, text) 132 | marks = re.findall(_japanese_marks, text) 133 | text = '' 134 | for i, sentence in enumerate(sentences): 135 | phones = pyopenjtalk.g2p(sentence, kana=False) 136 | phones = phones.replace(' ','') 137 | phones = phones.replace('A', 'a').replace('I', 'i').replace('U', 'u').replace('E', 'e').replace('O', 'o') 138 | phones = phones.replace('ch','ʧ').replace('sh','ʃ').replace('cl','Q') 139 | triphones = [] 140 | length = len(phones) 141 | for j, phone in enumerate(phones): 142 | if length == 1: 143 | triphone = phone 144 | else: 145 | if j == 0: 146 | triphone = f'{phone}+{phones[j+1]}' 147 | elif j == length - 1: 148 | triphone = f'{phones[j-1]}-{phone}' 149 | else: 150 | triphone = f'{phones[j-1]}-{phone}+{phones[j+1]}' 151 | triphones.append(triphone) 152 | subtext = ' '.join(triphones) 153 | text += subtext 154 | if i < len(marks): 155 | text += unidecode(marks[i]).replace(' ', '') 156 | if len(text) > 0 and re.match('[A-Za-z]',text[-1]): 157 | text += '.' 158 | 159 | return text 160 | -------------------------------------------------------------------------------- /text/korean.py: -------------------------------------------------------------------------------- 1 | import re 2 | from jamo import h2j, j2hcj 3 | import ko_pron 4 | 5 | 6 | # This is a list of Korean classifiers preceded by pure Korean numerals. 
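# For example, '마리' counts animals ('두 마리' = 'two animals'). Native-Korean
# numerals (하나, 둘, ...) precede these counters, which is why number_to_hangul()
# below spells such numbers out via hangul_number(num, sino=False).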
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통' 8 | 9 | # List of (hangul, hangul divided) pairs: 10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [ 11 | ('ㄳ', 'ㄱㅅ'), 12 | ('ㄵ', 'ㄴㅈ'), 13 | ('ㄶ', 'ㄴㅎ'), 14 | ('ㄺ', 'ㄹㄱ'), 15 | ('ㄻ', 'ㄹㅁ'), 16 | ('ㄼ', 'ㄹㅂ'), 17 | ('ㄽ', 'ㄹㅅ'), 18 | ('ㄾ', 'ㄹㅌ'), 19 | ('ㄿ', 'ㄹㅍ'), 20 | ('ㅀ', 'ㄹㅎ'), 21 | ('ㅄ', 'ㅂㅅ'), 22 | ('ㅘ', 'ㅗㅏ'), 23 | ('ㅙ', 'ㅗㅐ'), 24 | ('ㅚ', 'ㅗㅣ'), 25 | ('ㅝ', 'ㅜㅓ'), 26 | ('ㅞ', 'ㅜㅔ'), 27 | ('ㅟ', 'ㅜㅣ'), 28 | ('ㅢ', 'ㅡㅣ'), 29 | ('ㅑ', 'ㅣㅏ'), 30 | ('ㅒ', 'ㅣㅐ'), 31 | ('ㅕ', 'ㅣㅓ'), 32 | ('ㅖ', 'ㅣㅔ'), 33 | ('ㅛ', 'ㅣㅗ'), 34 | ('ㅠ', 'ㅣㅜ') 35 | ]] 36 | 37 | # List of (Latin alphabet, hangul) pairs: 38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 39 | ('a', '에이'), 40 | ('b', '비'), 41 | ('c', '시'), 42 | ('d', '디'), 43 | ('e', '이'), 44 | ('f', '에프'), 45 | ('g', '지'), 46 | ('h', '에이치'), 47 | ('i', '아이'), 48 | ('j', '제이'), 49 | ('k', '케이'), 50 | ('l', '엘'), 51 | ('m', '엠'), 52 | ('n', '엔'), 53 | ('o', '오'), 54 | ('p', '피'), 55 | ('q', '큐'), 56 | ('r', '아르'), 57 | ('s', '에스'), 58 | ('t', '티'), 59 | ('u', '유'), 60 | ('v', '브이'), 61 | ('w', '더블유'), 62 | ('x', '엑스'), 63 | ('y', '와이'), 64 | ('z', '제트') 65 | ]] 66 | 67 | # List of (ipa, lazy ipa) pairs: 68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [ 69 | ('t͡ɕ','ʧ'), 70 | ('d͡ʑ','ʥ'), 71 | ('ɲ','n^'), 72 | ('ɕ','ʃ'), 73 | ('ʷ','w'), 74 | ('ɭ','l`'), 75 | ('ʎ','ɾ'), 76 | ('ɣ','ŋ'), 77 | ('ɰ','ɯ'), 78 | ('ʝ','j'), 79 | ('ʌ','ə'), 80 | ('ɡ','g'), 81 | ('\u031a','#'), 82 | ('\u0348','='), 83 | ('\u031e',''), 84 | ('\u0320',''), 85 | ('\u0339','') 86 | ]] 87 | 88 | 89 | def latin_to_hangul(text): 90 | for regex, replacement in _latin_to_hangul: 91 | text = re.sub(regex, replacement, text) 92 | return text 93 | 94 | 95 | def divide_hangul(text): 96 | text = j2hcj(h2j(text)) 97 | for regex, replacement in _hangul_divided: 98 | text = re.sub(regex, replacement, text) 99 | return text 100 | 101 | 102 | def hangul_number(num, sino=True): 103 | '''Reference https://github.com/Kyubyong/g2pK''' 104 | num = re.sub(',', '', num) 105 | 106 | if num == '0': 107 | return '영' 108 | if not sino and num == '20': 109 | return '스무' 110 | 111 | digits = '123456789' 112 | names = '일이삼사오육칠팔구' 113 | digit2name = {d: n for d, n in zip(digits, names)} 114 | 115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉' 116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔' 117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())} 118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())} 119 | 120 | spelledout = [] 121 | for i, digit in enumerate(num): 122 | i = len(num) - i - 1 123 | if sino: 124 | if i == 0: 125 | name = digit2name.get(digit, '') 126 | elif i == 1: 127 | name = digit2name.get(digit, '') + '십' 128 | name = name.replace('일십', '십') 129 | else: 130 | if i == 0: 131 | name = digit2mod.get(digit, '') 132 | elif i == 1: 133 | name = digit2dec.get(digit, '') 134 | if digit == '0': 135 | if i % 4 == 0: 136 | last_three = spelledout[-min(3, len(spelledout)):] 137 | if ''.join(last_three) == '': 138 | spelledout.append('') 139 | continue 140 | else: 141 | spelledout.append('') 142 | continue 143 | if i == 2: 144 | name = digit2name.get(digit, '') + '백' 145 | name = name.replace('일백', '백') 146 | elif i == 3: 147 | name = digit2name.get(digit, '') + '천' 148 | name = name.replace('일천', '천') 149 | elif i == 4: 150 | name = digit2name.get(digit, '') + '만' 151 | name = name.replace('일만', '만') 152 | elif i == 5: 
153 | name = digit2name.get(digit, '') + '십' 154 | name = name.replace('일십', '십') 155 | elif i == 6: 156 | name = digit2name.get(digit, '') + '백' 157 | name = name.replace('일백', '백') 158 | elif i == 7: 159 | name = digit2name.get(digit, '') + '천' 160 | name = name.replace('일천', '천') 161 | elif i == 8: 162 | name = digit2name.get(digit, '') + '억' 163 | elif i == 9: 164 | name = digit2name.get(digit, '') + '십' 165 | elif i == 10: 166 | name = digit2name.get(digit, '') + '백' 167 | elif i == 11: 168 | name = digit2name.get(digit, '') + '천' 169 | elif i == 12: 170 | name = digit2name.get(digit, '') + '조' 171 | elif i == 13: 172 | name = digit2name.get(digit, '') + '십' 173 | elif i == 14: 174 | name = digit2name.get(digit, '') + '백' 175 | elif i == 15: 176 | name = digit2name.get(digit, '') + '천' 177 | spelledout.append(name) 178 | return ''.join(elem for elem in spelledout) 179 | 180 | 181 | def number_to_hangul(text): 182 | '''Reference https://github.com/Kyubyong/g2pK''' 183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)) 184 | for token in tokens: 185 | num, classifier = token 186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers: 187 | spelledout = hangul_number(num, sino=False) 188 | else: 189 | spelledout = hangul_number(num, sino=True) 190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}') 191 | # digit by digit for remaining digits 192 | digits = '0123456789' 193 | names = '영일이삼사오육칠팔구' 194 | for d, n in zip(digits, names): 195 | text = text.replace(d, n) 196 | return text 197 | 198 | 199 | def korean_to_lazy_ipa(text): 200 | text = latin_to_hangul(text) 201 | text = number_to_hangul(text) 202 | text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text) 203 | for regex, replacement in _ipa_to_lazy_ipa: 204 | text = re.sub(regex, replacement, text) 205 | return text 206 | 207 | 208 | def korean_to_ipa(text): 209 | text = korean_to_lazy_ipa(text) 210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ') 211 | -------------------------------------------------------------------------------- /filelists/vctk_audio_sid_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | DUMMY2/p364/p364_240.wav|88|It had happened to him. 2 | DUMMY2/p280/p280_148.wav|52|It is open season on the Old Firm. 3 | DUMMY2/p231/p231_320.wav|50|However, he is a coach, and he remains a coach at heart. 4 | DUMMY2/p282/p282_129.wav|83|It is not a U-turn. 5 | DUMMY2/p254/p254_015.wav|41|The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain. 6 | DUMMY2/p228/p228_285.wav|57|The songs are just so good. 7 | DUMMY2/p334/p334_307.wav|38|If they don't, they can expect their funding to be cut. 8 | DUMMY2/p287/p287_081.wav|77|I've never seen anything like it. 9 | DUMMY2/p247/p247_083.wav|14|It is a job creation scheme.) 10 | DUMMY2/p264/p264_051.wav|65|We were leading by two goals.) 11 | DUMMY2/p335/p335_058.wav|49|Let's see that increase over the years. 12 | DUMMY2/p236/p236_225.wav|75|There is no quick fix. 13 | DUMMY2/p374/p374_353.wav|11|And that brings us to the point. 14 | DUMMY2/p272/p272_076.wav|69|Sounds like The Sixth Sense? 15 | DUMMY2/p271/p271_152.wav|27|The petition was formally presented at Downing Street yesterday. 16 | DUMMY2/p228/p228_127.wav|57|They've got to account for it. 17 | DUMMY2/p276/p276_223.wav|106|It's been a humbling year. 
18 | DUMMY2/p262/p262_248.wav|45|The project has already secured the support of Sir Sean Connery. 19 | DUMMY2/p314/p314_086.wav|51|The team this year is going places. 20 | DUMMY2/p225/p225_038.wav|101|Diving is no part of football. 21 | DUMMY2/p279/p279_088.wav|25|The shareholders will vote to wind up the company on Friday morning. 22 | DUMMY2/p272/p272_018.wav|69|Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain. 23 | DUMMY2/p256/p256_098.wav|90|She told The Herald. 24 | DUMMY2/p261/p261_218.wav|100|All will be revealed in due course. 25 | DUMMY2/p265/p265_063.wav|73|IT shouldn't come as a surprise, but it does. 26 | DUMMY2/p314/p314_042.wav|51|It is all about people being assaulted, abused. 27 | DUMMY2/p241/p241_188.wav|86|I wish I could say something. 28 | DUMMY2/p283/p283_111.wav|95|It's good to have a voice. 29 | DUMMY2/p275/p275_006.wav|40|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. 30 | DUMMY2/p228/p228_092.wav|57|Today I couldn't run on it. 31 | DUMMY2/p295/p295_343.wav|92|The atmosphere is businesslike. 32 | DUMMY2/p228/p228_187.wav|57|They will run a mile. 33 | DUMMY2/p294/p294_317.wav|104|It didn't put me off. 34 | DUMMY2/p231/p231_445.wav|50|It sounded like a bomb. 35 | DUMMY2/p272/p272_086.wav|69|Today she has been released. 36 | DUMMY2/p255/p255_210.wav|31|It was worth a photograph. 37 | DUMMY2/p229/p229_060.wav|67|And a film maker was born. 38 | DUMMY2/p260/p260_232.wav|81|The Home Office would not release any further details about the group. 39 | DUMMY2/p245/p245_025.wav|59|Johnson was pretty low. 40 | DUMMY2/p333/p333_185.wav|64|This area is perfect for children. 41 | DUMMY2/p244/p244_242.wav|78|He is a man of the people. 42 | DUMMY2/p376/p376_187.wav|71|"It is a terrible loss." 43 | DUMMY2/p239/p239_156.wav|48|It is a good lifestyle. 44 | DUMMY2/p307/p307_037.wav|22|He released a half-dozen solo albums. 45 | DUMMY2/p305/p305_185.wav|54|I am not even thinking about that. 46 | DUMMY2/p272/p272_081.wav|69|It was magic. 47 | DUMMY2/p302/p302_297.wav|30|I'm trying to stay open on that. 48 | DUMMY2/p275/p275_320.wav|40|We are in the end game. 49 | DUMMY2/p239/p239_231.wav|48|Then we will face the Danish champions. 50 | DUMMY2/p268/p268_301.wav|87|It was only later that the condition was diagnosed. 51 | DUMMY2/p336/p336_088.wav|98|They failed to reach agreement yesterday. 52 | DUMMY2/p278/p278_255.wav|10|They made such decisions in London. 53 | DUMMY2/p361/p361_132.wav|79|That got me out. 54 | DUMMY2/p307/p307_146.wav|22|You hope he prevails. 55 | DUMMY2/p244/p244_147.wav|78|They could not ignore the will of parliament, he claimed. 56 | DUMMY2/p294/p294_283.wav|104|This is our unfinished business. 57 | DUMMY2/p283/p283_300.wav|95|I would have the hammer in the crowd. 58 | DUMMY2/p239/p239_079.wav|48|I can understand the frustrations of our fans. 59 | DUMMY2/p264/p264_009.wav|65|There is , according to legend, a boiling pot of gold at one end. ) 60 | DUMMY2/p307/p307_348.wav|22|He did not oppose the divorce. 61 | DUMMY2/p304/p304_308.wav|72|We are the gateway to justice. 62 | DUMMY2/p281/p281_056.wav|36|None has ever been found. 63 | DUMMY2/p267/p267_158.wav|0|We were given a warm and friendly reception. 64 | DUMMY2/p300/p300_169.wav|102|Who do these people think they are? 65 | DUMMY2/p276/p276_177.wav|106|They exist in name alone. 66 | DUMMY2/p228/p228_245.wav|57|It is a policy which has the full support of the minister. 
67 | DUMMY2/p300/p300_303.wav|102|I'm wondering what you feel about the youngest. 68 | DUMMY2/p362/p362_247.wav|15|This would give Scotland around eight members. 69 | DUMMY2/p326/p326_031.wav|28|United were in control without always being dominant. 70 | DUMMY2/p361/p361_288.wav|79|I did not think it was very proper. 71 | DUMMY2/p286/p286_145.wav|63|Tiger is not the norm. 72 | DUMMY2/p234/p234_071.wav|3|She did that for the rest of her life. 73 | DUMMY2/p263/p263_296.wav|39|The decision was announced at its annual conference in Dunfermline. 74 | DUMMY2/p323/p323_228.wav|34|She became a heroine of my childhood. 75 | DUMMY2/p280/p280_346.wav|52|It was a bit like having children. 76 | DUMMY2/p333/p333_080.wav|64|But the tragedy did not stop there. 77 | DUMMY2/p226/p226_268.wav|43|That decision is for the British Parliament and people. 78 | DUMMY2/p362/p362_314.wav|15|Is that right? 79 | DUMMY2/p240/p240_047.wav|93|It is so sad. 80 | DUMMY2/p250/p250_207.wav|24|You could feel the heat. 81 | DUMMY2/p273/p273_176.wav|56|Neither side would reveal the details of the offer. 82 | DUMMY2/p316/p316_147.wav|85|And frankly, it's been a while. 83 | DUMMY2/p265/p265_047.wav|73|It is unique. 84 | DUMMY2/p336/p336_353.wav|98|Sometimes you get them, sometimes you don't. 85 | DUMMY2/p230/p230_376.wav|35|This hasn't happened in a vacuum. 86 | DUMMY2/p308/p308_209.wav|107|There is great potential on this river. 87 | DUMMY2/p250/p250_442.wav|24|We have not yet received a letter from the Irish. 88 | DUMMY2/p260/p260_037.wav|81|It's a fact. 89 | DUMMY2/p299/p299_345.wav|58|We're very excited and challenged by the project. 90 | DUMMY2/p269/p269_218.wav|94|A Grampian Police spokesman said. 91 | DUMMY2/p306/p306_014.wav|12|To the Hebrews it was a token that there would be no more universal floods. 92 | DUMMY2/p271/p271_292.wav|27|It's a record label, not a form of music. 93 | DUMMY2/p247/p247_225.wav|14|I am considered a teenager.) 94 | DUMMY2/p294/p294_094.wav|104|It should be a condition of employment. 95 | DUMMY2/p269/p269_031.wav|94|Is this accurate? 96 | DUMMY2/p275/p275_116.wav|40|It's not fair. 97 | DUMMY2/p265/p265_006.wav|73|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. 98 | DUMMY2/p285/p285_072.wav|2|Mr Irvine said Mr Rafferty was now in good spirits. 99 | DUMMY2/p270/p270_167.wav|8|We did what we had to do. 100 | DUMMY2/p360/p360_397.wav|60|It is a relief. 101 | -------------------------------------------------------------------------------- /filelists/vctk_audio_sid_text_val_filelist.txt.cleaned: -------------------------------------------------------------------------------- 1 | DUMMY2/p364/p364_240.wav|88|ɪt hɐd hˈæpənd tə hˌɪm. 2 | DUMMY2/p280/p280_148.wav|52|ɪt ɪz ˈoʊpən sˈiːzən ɑːnðɪ ˈoʊld fˈɜːm. 3 | DUMMY2/p231/p231_320.wav|50|haʊˈɛvɚ, hiː ɪz ɐ kˈoʊtʃ, ænd hiː ɹɪmˈeɪnz ɐ kˈoʊtʃ æt hˈɑːɹt. 4 | DUMMY2/p282/p282_129.wav|83|ɪt ɪz nˌɑːɾə jˈuːtˈɜːn. 5 | DUMMY2/p254/p254_015.wav|41|ðə ɡɹˈiːks jˈuːzd tʊ ɪmˈædʒɪn ðˌɐɾɪt wʌzɐ sˈaɪn fɹʌmðə ɡˈɑːdz tə foːɹtˈɛl wˈɔːɹ ɔːɹ hˈɛvi ɹˈeɪn. 6 | DUMMY2/p228/p228_285.wav|57|ðə sˈɔŋz ɑːɹ dʒˈʌst sˌoʊ ɡˈʊd. 7 | DUMMY2/p334/p334_307.wav|38|ɪf ðeɪ dˈoʊnt, ðeɪ kæn ɛkspˈɛkt ðɛɹ fˈʌndɪŋ təbi kˈʌt. 8 | DUMMY2/p287/p287_081.wav|77|aɪv nˈɛvɚ sˈiːn ˈɛnɪθˌɪŋ lˈaɪk ɪt. 9 | DUMMY2/p247/p247_083.wav|14|ɪt ɪz ɐ dʒˈɑːb kɹiːˈeɪʃən skˈiːm. 10 | DUMMY2/p264/p264_051.wav|65|wiː wɜː lˈiːdɪŋ baɪ tˈuː ɡˈoʊlz. 11 | DUMMY2/p335/p335_058.wav|49|lˈɛts sˈiː ðæt ˈɪnkɹiːs ˌoʊvɚ ðə jˈɪɹz. 12 | DUMMY2/p236/p236_225.wav|75|ðɛɹ ɪz nˈoʊ kwˈɪk fˈɪks. 
13 | DUMMY2/p374/p374_353.wav|11|ænd ðæt bɹˈɪŋz ˌʌs tə ðə pˈɔɪnt. 14 | DUMMY2/p272/p272_076.wav|69|sˈaʊndz lˈaɪk ðə sˈɪksθ sˈɛns? 15 | DUMMY2/p271/p271_152.wav|27|ðə pətˈɪʃən wʌz fˈɔːɹməli pɹɪzˈɛntᵻd æt dˈaʊnɪŋ stɹˈiːt jˈɛstɚdˌeɪ. 16 | DUMMY2/p228/p228_127.wav|57|ðeɪv ɡɑːt tʊ ɐkˈaʊnt fɔːɹ ɪt. 17 | DUMMY2/p276/p276_223.wav|106|ɪts bˌɪn ɐ hˈʌmblɪŋ jˈɪɹ. 18 | DUMMY2/p262/p262_248.wav|45|ðə pɹˈɑːdʒɛkt hɐz ɔːlɹˌɛdi sɪkjˈʊɹd ðə səpˈoːɹt ʌv sˌɜː ʃˈɔːn kɑːnɚɹi. 19 | DUMMY2/p314/p314_086.wav|51|ðə tˈiːm ðɪs jˈɪɹ ɪz ɡˌoʊɪŋ plˈeɪsᵻz. 20 | DUMMY2/p225/p225_038.wav|101|dˈaɪvɪŋ ɪz nˈoʊ pˈɑːɹt ʌv fˈʊtbɔːl. 21 | DUMMY2/p279/p279_088.wav|25|ðə ʃˈɛɹhoʊldɚz wɪl vˈoʊt tə wˈaɪnd ˈʌp ðə kˈʌmpəni ˌɑːn fɹˈaɪdeɪ mˈɔːɹnɪŋ. 22 | DUMMY2/p272/p272_018.wav|69|ˈæɹɪstˌɑːɾəl θˈɔːt ðætðə ɹˈeɪnboʊ wʌz kˈɔːzd baɪ ɹɪflˈɛkʃən ʌvðə sˈʌnz ɹˈeɪz baɪ ðə ɹˈeɪn. 23 | DUMMY2/p256/p256_098.wav|90|ʃiː tˈoʊld ðə hˈɛɹəld. 24 | DUMMY2/p261/p261_218.wav|100|ˈɔːl wɪl biː ɹɪvˈiːld ɪn dˈuː kˈoːɹs. 25 | DUMMY2/p265/p265_063.wav|73|ɪt ʃˌʊdənt kˈʌm æz ɐ sɚpɹˈaɪz, bˌʌt ɪt dˈʌz. 26 | DUMMY2/p314/p314_042.wav|51|ɪt ɪz ˈɔːl ɐbˌaʊt pˈiːpəl bˌiːɪŋ ɐsˈɑːltᵻd, ɐbjˈuːsd. 27 | DUMMY2/p241/p241_188.wav|86|ˈaɪ wˈɪʃ ˈaɪ kʊd sˈeɪ sˈʌmθɪŋ. 28 | DUMMY2/p283/p283_111.wav|95|ɪts ɡˈʊd tə hæv ɐ vˈɔɪs. 29 | DUMMY2/p275/p275_006.wav|40|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ. 30 | DUMMY2/p228/p228_092.wav|57|tədˈeɪ ˈaɪ kˌʊdənt ɹˈʌn ˈɑːn ɪt. 31 | DUMMY2/p295/p295_343.wav|92|ðɪ ˈætməsfˌɪɹ ɪz bˈɪznəslˌaɪk. 32 | DUMMY2/p228/p228_187.wav|57|ðeɪ wɪl ɹˈʌn ɐ mˈaɪl. 33 | DUMMY2/p294/p294_317.wav|104|ɪt dˈɪdnt pˌʊt mˌiː ˈɔf. 34 | DUMMY2/p231/p231_445.wav|50|ɪt sˈaʊndᵻd lˈaɪk ɐ bˈɑːm. 35 | DUMMY2/p272/p272_086.wav|69|tədˈeɪ ʃiː hɐzbɪn ɹɪlˈiːsd. 36 | DUMMY2/p255/p255_210.wav|31|ɪt wʌz wˈɜːθ ɐ fˈoʊɾəɡɹˌæf. 37 | DUMMY2/p229/p229_060.wav|67|ænd ɐ fˈɪlm mˈeɪkɚ wʌz bˈɔːɹn. 38 | DUMMY2/p260/p260_232.wav|81|ðə hˈoʊm ˈɑːfɪs wʊd nˌɑːt ɹɪlˈiːs ˌɛni fˈɜːðɚ diːtˈeɪlz ɐbˌaʊt ðə ɡɹˈuːp. 39 | DUMMY2/p245/p245_025.wav|59|dʒˈɑːnsən wʌz pɹˈɪɾi lˈoʊ. 40 | DUMMY2/p333/p333_185.wav|64|ðɪs ˈɛɹiə ɪz pˈɜːfɛkt fɔːɹ tʃˈɪldɹən. 41 | DUMMY2/p244/p244_242.wav|78|hiː ɪz ɐ mˈæn ʌvðə pˈiːpəl. 42 | DUMMY2/p376/p376_187.wav|71|"ɪt ɪz ɐ tˈɛɹəbəl lˈɔs." 43 | DUMMY2/p239/p239_156.wav|48|ɪt ɪz ɐ ɡˈʊd lˈaɪfstaɪl. 44 | DUMMY2/p307/p307_037.wav|22|hiː ɹɪlˈiːsd ɐ hˈæfdˈʌzən sˈoʊloʊ ˈælbəmz. 45 | DUMMY2/p305/p305_185.wav|54|ˈaɪ æm nˌɑːt ˈiːvən θˈɪŋkɪŋ ɐbˌaʊt ðˈæt. 46 | DUMMY2/p272/p272_081.wav|69|ɪt wʌz mˈædʒɪk. 47 | DUMMY2/p302/p302_297.wav|30|aɪm tɹˈaɪɪŋ tə stˈeɪ ˈoʊpən ˌɑːn ðˈæt. 48 | DUMMY2/p275/p275_320.wav|40|wiː ɑːɹ ɪnðɪ ˈɛnd ɡˈeɪm. 49 | DUMMY2/p239/p239_231.wav|48|ðˈɛn wiː wɪl fˈeɪs ðə dˈeɪnɪʃ tʃˈæmpiənz. 50 | DUMMY2/p268/p268_301.wav|87|ɪt wʌz ˈoʊnli lˈeɪɾɚ ðætðə kəndˈɪʃən wʌz dˌaɪəɡnˈoʊzd. 51 | DUMMY2/p336/p336_088.wav|98|ðeɪ fˈeɪld tə ɹˈiːtʃ ɐɡɹˈiːmənt jˈɛstɚdˌeɪ. 52 | DUMMY2/p278/p278_255.wav|10|ðeɪ mˌeɪd sˈʌtʃ dᵻsˈɪʒənz ɪn lˈʌndən. 53 | DUMMY2/p361/p361_132.wav|79|ðæt ɡɑːt mˌiː ˈaʊt. 54 | DUMMY2/p307/p307_146.wav|22|juː hˈoʊp hiː pɹɪvˈeɪlz. 55 | DUMMY2/p244/p244_147.wav|78|ðeɪ kʊd nˌɑːt ɪɡnˈoːɹ ðə wɪl ʌv pˈɑːɹləmənt, hiː klˈeɪmd. 56 | DUMMY2/p294/p294_283.wav|104|ðɪs ɪz ˌaʊɚɹ ʌnfˈɪnɪʃt bˈɪznəs. 57 | DUMMY2/p283/p283_300.wav|95|ˈaɪ wʊdhɐv ðə hˈæmɚɹ ɪnðə kɹˈaʊd. 58 | DUMMY2/p239/p239_079.wav|48|ˈaɪ kæn ˌʌndɚstˈænd ðə fɹʌstɹˈeɪʃənz ʌv ˌaʊɚ fˈænz. 59 | DUMMY2/p264/p264_009.wav|65|ðɛɹˈɪz , ɐkˈoːɹdɪŋ tə lˈɛdʒənd, ɐ bˈɔɪlɪŋ pˈɑːt ʌv ɡˈoʊld æt wˈʌn ˈɛnd. 60 | DUMMY2/p307/p307_348.wav|22|hiː dɪdnˌɑːt əpˈoʊz ðə dɪvˈoːɹs. 
61 | DUMMY2/p304/p304_308.wav|72|wiː ɑːɹ ðə ɡˈeɪtweɪ tə dʒˈʌstɪs. 62 | DUMMY2/p281/p281_056.wav|36|nˈʌn hɐz ˈɛvɚ bˌɪn fˈaʊnd. 63 | DUMMY2/p267/p267_158.wav|0|wiː wɜː ɡˈɪvən ɐ wˈɔːɹm ænd fɹˈɛndli ɹɪsˈɛpʃən. 64 | DUMMY2/p300/p300_169.wav|102|hˌuː dˈuː ðiːz pˈiːpəl θˈɪŋk ðeɪ ɑːɹ? 65 | DUMMY2/p276/p276_177.wav|106|ðeɪ ɛɡzˈɪst ɪn nˈeɪm ɐlˈoʊn. 66 | DUMMY2/p228/p228_245.wav|57|ɪt ɪz ɐ pˈɑːlɪsi wˌɪtʃ hɐz ðə fˈʊl səpˈoːɹt ʌvðə mˈɪnɪstɚ. 67 | DUMMY2/p300/p300_303.wav|102|aɪm wˈʌndɚɹɪŋ wˌʌt juː fˈiːl ɐbˌaʊt ðə jˈʌŋɡəst. 68 | DUMMY2/p362/p362_247.wav|15|ðɪs wʊd ɡˈɪv skˈɑːtlənd ɐɹˈaʊnd ˈeɪt mˈɛmbɚz. 69 | DUMMY2/p326/p326_031.wav|28|juːnˈaɪɾᵻd wɜːɹ ɪn kəntɹˈoʊl wɪðˌaʊt ˈɔːlweɪz bˌiːɪŋ dˈɑːmɪnənt. 70 | DUMMY2/p361/p361_288.wav|79|ˈaɪ dɪdnˌɑːt θˈɪŋk ɪt wʌz vˈɛɹi pɹˈɑːpɚ. 71 | DUMMY2/p286/p286_145.wav|63|tˈaɪɡɚɹ ɪz nˌɑːt ðə nˈɔːɹm. 72 | DUMMY2/p234/p234_071.wav|3|ʃiː dˈɪd ðæt fɚðə ɹˈɛst ʌv hɜː lˈaɪf. 73 | DUMMY2/p263/p263_296.wav|39|ðə dᵻsˈɪʒən wʌz ɐnˈaʊnst æt ɪts ˈænjuːəl kˈɑːnfɹəns ɪn dˈʌnfɚmlˌaɪn. 74 | DUMMY2/p323/p323_228.wav|34|ʃiː bɪkˌeɪm ɐ hˈɛɹoʊˌɪn ʌv maɪ tʃˈaɪldhʊd. 75 | DUMMY2/p280/p280_346.wav|52|ɪt wʌzɐ bˈɪt lˈaɪk hˌævɪŋ tʃˈɪldɹən. 76 | DUMMY2/p333/p333_080.wav|64|bˌʌt ðə tɹˈædʒədi dɪdnˌɑːt stˈɑːp ðˈɛɹ. 77 | DUMMY2/p226/p226_268.wav|43|ðæt dᵻsˈɪʒən ɪz fɚðə bɹˈɪɾɪʃ pˈɑːɹləmənt ænd pˈiːpəl. 78 | DUMMY2/p362/p362_314.wav|15|ɪz ðæt ɹˈaɪt? 79 | DUMMY2/p240/p240_047.wav|93|ɪt ɪz sˌoʊ sˈæd. 80 | DUMMY2/p250/p250_207.wav|24|juː kʊd fˈiːl ðə hˈiːt. 81 | DUMMY2/p273/p273_176.wav|56|nˈiːðɚ sˈaɪd wʊd ɹɪvˈiːl ðə diːtˈeɪlz ʌvðɪ ˈɑːfɚ. 82 | DUMMY2/p316/p316_147.wav|85|ænd fɹˈæŋkli, ɪts bˌɪn ɐ wˈaɪl. 83 | DUMMY2/p265/p265_047.wav|73|ɪt ɪz juːnˈiːk. 84 | DUMMY2/p336/p336_353.wav|98|sˈʌmtaɪmz juː ɡˈɛt ðˌɛm, sˈʌmtaɪmz juː dˈoʊnt. 85 | DUMMY2/p230/p230_376.wav|35|ðɪs hˈæzənt hˈæpənd ɪn ɐ vˈækjuːm. 86 | DUMMY2/p308/p308_209.wav|107|ðɛɹ ɪz ɡɹˈeɪt pətˈɛnʃəl ˌɑːn ðɪs ɹˈɪvɚ. 87 | DUMMY2/p250/p250_442.wav|24|wiː hɐvnˌɑːt jˈɛt ɹɪsˈiːvd ɐ lˈɛɾɚ fɹʌmðɪ ˈaɪɹɪʃ. 88 | DUMMY2/p260/p260_037.wav|81|ɪts ɐ fˈækt. 89 | DUMMY2/p299/p299_345.wav|58|wɪɹ vˈɛɹi ɛksˈaɪɾᵻd ænd tʃˈælɪndʒd baɪ ðə pɹˈɑːdʒɛkt. 90 | DUMMY2/p269/p269_218.wav|94|ɐ ɡɹˈæmpiən pəlˈiːs spˈoʊksmən sˈɛd. 91 | DUMMY2/p306/p306_014.wav|12|tə ðə hˈiːbɹuːz ɪt wʌzɐ tˈoʊkən ðæt ðɛɹ wʊd biː nˈoʊmˌoːɹ jˌuːnɪvˈɜːsəl flˈʌdz. 92 | DUMMY2/p271/p271_292.wav|27|ɪts ɐ ɹˈɛkɚd lˈeɪbəl, nˌɑːɾə fˈɔːɹm ʌv mjˈuːzɪk. 93 | DUMMY2/p247/p247_225.wav|14|ˈaɪ æm kənsˈɪdɚd ɐ tˈiːneɪdʒɚ. 94 | DUMMY2/p294/p294_094.wav|104|ɪt ʃˌʊd biː ɐ kəndˈɪʃən ʌv ɛmplˈɔɪmənt. 95 | DUMMY2/p269/p269_031.wav|94|ɪz ðɪs ˈækjʊɹət? 96 | DUMMY2/p275/p275_116.wav|40|ɪts nˌɑːt fˈɛɹ. 97 | DUMMY2/p265/p265_006.wav|73|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ. 98 | DUMMY2/p285/p285_072.wav|2|mˈɪstɚɹ ˈɜːvaɪn sˈɛd mˈɪstɚ ɹˈæfɚɾi wʌz nˈaʊ ɪn ɡˈʊd spˈɪɹɪts. 99 | DUMMY2/p270/p270_167.wav|8|wiː dˈɪd wˌʌt wiː hædtə dˈuː. 100 | DUMMY2/p360/p360_397.wav|60|ɪt ɪz ɐ ɹɪlˈiːf. 
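(Each entry above follows the audio_path|speaker_id|phonemized_text convention. A minimal parsing sketch; read_val_filelist is a hypothetical helper that mirrors load_filepaths_and_text() in utils.py below:

    def read_val_filelist(path, split="|"):
        # one row per line: [audio_path, speaker_id, phonemized_text]
        with open(path, encoding="utf-8") as f:
            return [line.strip().split(split) for line in f]

    rows = read_val_filelist("filelists/vctk_audio_sid_text_val_filelist.txt.cleaned")
    wav_path, speaker_id, phonemes = rows[0]
    # -> 'DUMMY2/p364/p364_240.wav', '88', 'ɪt hɐd hˈæpənd tə hˌɪm.'
)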
101 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import sys
4 | import argparse
5 | import logging
6 | import json
7 | import subprocess
8 | import numpy as np
9 | from scipy.io.wavfile import read
10 | import torch
11 | 
12 | MATPLOTLIB_FLAG = False
13 | 
14 | logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
15 | logger = logging
16 | 
17 | 
18 | def load_checkpoint(checkpoint_path, model, optimizer=None):
19 |   assert os.path.isfile(checkpoint_path)
20 |   checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
21 |   iteration = checkpoint_dict['iteration']
22 |   learning_rate = checkpoint_dict['learning_rate']
23 |   if optimizer is not None:
24 |     optimizer.load_state_dict(checkpoint_dict['optimizer'])
25 |   saved_state_dict = checkpoint_dict['model']
26 |   if hasattr(model, 'module'):
27 |     state_dict = model.module.state_dict()
28 |   else:
29 |     state_dict = model.state_dict()
30 |   new_state_dict = {}
31 |   for k, v in state_dict.items():
32 |     try:
33 |       new_state_dict[k] = saved_state_dict[k]
34 |     except KeyError:
35 |       logger.info("%s is not in the checkpoint" % k)
36 |       new_state_dict[k] = v
37 |   if hasattr(model, 'module'):
38 |     model.module.load_state_dict(new_state_dict)
39 |   else:
40 |     model.load_state_dict(new_state_dict)
41 |   logger.info("Loaded checkpoint '{}' (iteration {})".format(
42 |     checkpoint_path, iteration))
43 |   return model, optimizer, learning_rate, iteration
44 | 
45 | 
46 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
47 |   logger.info("Saving model and optimizer state at iteration {} to {}".format(
48 |     iteration, checkpoint_path))
49 |   if hasattr(model, 'module'):
50 |     state_dict = model.module.state_dict()
51 |   else:
52 |     state_dict = model.state_dict()
53 |   torch.save({'model': state_dict,
54 |               'iteration': iteration,
55 |               'optimizer': optimizer.state_dict(),
56 |               'learning_rate': learning_rate}, checkpoint_path)
57 | 
58 | 
59 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
60 |   for k, v in scalars.items():
61 |     writer.add_scalar(k, v, global_step)
62 |   for k, v in histograms.items():
63 |     writer.add_histogram(k, v, global_step)
64 |   for k, v in images.items():
65 |     writer.add_image(k, v, global_step, dataformats='HWC')
66 |   for k, v in audios.items():
67 |     writer.add_audio(k, v, global_step, audio_sampling_rate)
68 | 
69 | 
70 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
71 |   f_list = glob.glob(os.path.join(dir_path, regex))
72 |   f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
73 |   x = f_list[-1]
74 |   print(x)
75 |   return x
76 | 
77 | 
78 | def plot_spectrogram_to_numpy(spectrogram):
79 |   global MATPLOTLIB_FLAG
80 |   if not MATPLOTLIB_FLAG:
81 |     import matplotlib
82 |     matplotlib.use("Agg")
83 |     MATPLOTLIB_FLAG = True
84 |     mpl_logger = logging.getLogger('matplotlib')
85 |     mpl_logger.setLevel(logging.WARNING)
86 |   import matplotlib.pylab as plt
87 |   import numpy as np
88 | 
89 |   fig, ax = plt.subplots(figsize=(10,2))
90 |   im = ax.imshow(spectrogram, aspect="auto", origin="lower",
91 |                  interpolation='none')
92 |   plt.colorbar(im, ax=ax)
93 |   plt.xlabel("Frames")
94 |   plt.ylabel("Channels")
95 |   plt.tight_layout()
96 | 
97 |   fig.canvas.draw()
98 |   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
99 |   data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
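  # The Agg canvas buffer is reshaped to (height, width, 3) RGB; summarize()
  # above forwards such arrays to TensorBoard via add_image(..., dataformats='HWC').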
100 |   plt.close()
101 |   return data
102 | 
103 | 
104 | def plot_alignment_to_numpy(alignment, info=None):
105 |   global MATPLOTLIB_FLAG
106 |   if not MATPLOTLIB_FLAG:
107 |     import matplotlib
108 |     matplotlib.use("Agg")
109 |     MATPLOTLIB_FLAG = True
110 |     mpl_logger = logging.getLogger('matplotlib')
111 |     mpl_logger.setLevel(logging.WARNING)
112 |   import matplotlib.pylab as plt
113 |   import numpy as np
114 | 
115 |   fig, ax = plt.subplots(figsize=(6, 4))
116 |   im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
117 |                  interpolation='none')
118 |   fig.colorbar(im, ax=ax)
119 |   xlabel = 'Decoder timestep'
120 |   if info is not None:
121 |     xlabel += '\n\n' + info
122 |   plt.xlabel(xlabel)
123 |   plt.ylabel('Encoder timestep')
124 |   plt.tight_layout()
125 | 
126 |   fig.canvas.draw()
127 |   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
128 |   data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
129 |   plt.close()
130 |   return data
131 | 
132 | 
133 | def load_wav_to_torch(full_path):
134 |   sampling_rate, data = read(full_path)
135 |   return torch.FloatTensor(data.astype(np.float32)), sampling_rate
136 | 
137 | 
138 | def load_filepaths_and_text(filename, split="|"):
139 |   with open(filename, encoding='utf-8') as f:
140 |     filepaths_and_text = [line.strip().split(split) for line in f]
141 |   return filepaths_and_text
142 | 
143 | 
144 | def get_hparams(init=True):
145 |   parser = argparse.ArgumentParser()
146 |   parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
147 |                       help='JSON file for configuration')
148 |   parser.add_argument('-m', '--model', type=str, required=True,
149 |                       help='Model name')
150 | 
151 |   args = parser.parse_args()
152 |   model_dir = os.path.join("./logs", args.model)
153 | 
154 |   if not os.path.exists(model_dir):
155 |     os.makedirs(model_dir)
156 | 
157 |   config_path = args.config
158 |   config_save_path = os.path.join(model_dir, "config.json")
159 |   if init:
160 |     with open(config_path, "r") as f:
161 |       data = f.read()
162 |     with open(config_save_path, "w") as f:
163 |       f.write(data)
164 |   else:
165 |     with open(config_save_path, "r") as f:
166 |       data = f.read()
167 |   config = json.loads(data)
168 | 
169 |   hparams = HParams(**config)
170 |   hparams.model_dir = model_dir
171 |   return hparams
172 | 
173 | 
174 | def get_hparams_from_dir(model_dir):
175 |   config_save_path = os.path.join(model_dir, "config.json")
176 |   with open(config_save_path, "r") as f:
177 |     data = f.read()
178 |   config = json.loads(data)
179 | 
180 |   hparams = HParams(**config)
181 |   hparams.model_dir = model_dir
182 |   return hparams
183 | 
184 | 
185 | def get_hparams_from_file(config_path):
186 |   with open(config_path, "r") as f:
187 |     data = f.read()
188 |   config = json.loads(data)
189 | 
190 |   hparams = HParams(**config)
191 |   return hparams
192 | 
193 | 
194 | def check_git_hash(model_dir):
195 |   source_dir = os.path.dirname(os.path.realpath(__file__))
196 |   if not os.path.exists(os.path.join(source_dir, ".git")):
197 |     logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
198 |       source_dir
199 |     ))
200 |     return
201 | 
202 |   cur_hash = subprocess.getoutput("git rev-parse HEAD")
203 | 
204 |   path = os.path.join(model_dir, "githash")
205 |   if os.path.exists(path):
206 |     saved_hash = open(path).read()
207 |     if saved_hash != cur_hash:
208 |       logger.warning("git hash values are different. 
{}(saved) != {}(current)".format( 209 | saved_hash[:8], cur_hash[:8])) 210 | else: 211 | open(path, "w").write(cur_hash) 212 | 213 | 214 | def get_logger(model_dir, filename="train.log"): 215 | global logger 216 | logger = logging.getLogger(os.path.basename(model_dir)) 217 | logger.setLevel(logging.DEBUG) 218 | 219 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") 220 | if not os.path.exists(model_dir): 221 | os.makedirs(model_dir) 222 | h = logging.FileHandler(os.path.join(model_dir, filename)) 223 | h.setLevel(logging.DEBUG) 224 | h.setFormatter(formatter) 225 | logger.addHandler(h) 226 | return logger 227 | 228 | 229 | class HParams(): 230 | def __init__(self, **kwargs): 231 | for k, v in kwargs.items(): 232 | if type(v) == dict: 233 | v = HParams(**v) 234 | self[k] = v 235 | 236 | def keys(self): 237 | return self.__dict__.keys() 238 | 239 | def items(self): 240 | return self.__dict__.items() 241 | 242 | def values(self): 243 | return self.__dict__.values() 244 | 245 | def __len__(self): 246 | return len(self.__dict__) 247 | 248 | def __getitem__(self, key): 249 | return getattr(self, key) 250 | 251 | def __setitem__(self, key, value): 252 | return setattr(self, key, value) 253 | 254 | def __contains__(self, key): 255 | return key in self.__dict__ 256 | 257 | def __repr__(self): 258 | return self.__dict__.__repr__() 259 | -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | 4 | import numpy as np 5 | 6 | 7 | DEFAULT_MIN_BIN_WIDTH = 1e-3 8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3 9 | DEFAULT_MIN_DERIVATIVE = 1e-3 10 | 11 | 12 | def piecewise_rational_quadratic_transform(inputs, 13 | unnormalized_widths, 14 | unnormalized_heights, 15 | unnormalized_derivatives, 16 | inverse=False, 17 | tails=None, 18 | tail_bound=1., 19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 21 | min_derivative=DEFAULT_MIN_DERIVATIVE): 22 | 23 | if tails is None: 24 | spline_fn = rational_quadratic_spline 25 | spline_kwargs = {} 26 | else: 27 | spline_fn = unconstrained_rational_quadratic_spline 28 | spline_kwargs = { 29 | 'tails': tails, 30 | 'tail_bound': tail_bound 31 | } 32 | 33 | outputs, logabsdet = spline_fn( 34 | inputs=inputs, 35 | unnormalized_widths=unnormalized_widths, 36 | unnormalized_heights=unnormalized_heights, 37 | unnormalized_derivatives=unnormalized_derivatives, 38 | inverse=inverse, 39 | min_bin_width=min_bin_width, 40 | min_bin_height=min_bin_height, 41 | min_derivative=min_derivative, 42 | **spline_kwargs 43 | ) 44 | return outputs, logabsdet 45 | 46 | 47 | def searchsorted(bin_locations, inputs, eps=1e-6): 48 | bin_locations[..., -1] += eps 49 | return torch.sum( 50 | inputs[..., None] >= bin_locations, 51 | dim=-1 52 | ) - 1 53 | 54 | 55 | def unconstrained_rational_quadratic_spline(inputs, 56 | unnormalized_widths, 57 | unnormalized_heights, 58 | unnormalized_derivatives, 59 | inverse=False, 60 | tails='linear', 61 | tail_bound=1., 62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 64 | min_derivative=DEFAULT_MIN_DERIVATIVE): 65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) 66 | outside_interval_mask = ~inside_interval_mask 67 | 68 | outputs = torch.zeros_like(inputs) 69 | logabsdet = torch.zeros_like(inputs) 70 | 71 | if tails == 'linear': 72 | unnormalized_derivatives = 
F.pad(unnormalized_derivatives, pad=(1, 1)) 73 | constant = np.log(np.exp(1 - min_derivative) - 1) 74 | unnormalized_derivatives[..., 0] = constant 75 | unnormalized_derivatives[..., -1] = constant 76 | 77 | outputs[outside_interval_mask] = inputs[outside_interval_mask] 78 | logabsdet[outside_interval_mask] = 0 79 | else: 80 | raise RuntimeError('{} tails are not implemented.'.format(tails)) 81 | 82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( 83 | inputs=inputs[inside_interval_mask], 84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :], 85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :], 86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], 87 | inverse=inverse, 88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, 89 | min_bin_width=min_bin_width, 90 | min_bin_height=min_bin_height, 91 | min_derivative=min_derivative 92 | ) 93 | 94 | return outputs, logabsdet 95 | 96 | def rational_quadratic_spline(inputs, 97 | unnormalized_widths, 98 | unnormalized_heights, 99 | unnormalized_derivatives, 100 | inverse=False, 101 | left=0., right=1., bottom=0., top=1., 102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH, 103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT, 104 | min_derivative=DEFAULT_MIN_DERIVATIVE): 105 | if torch.min(inputs) < left or torch.max(inputs) > right: 106 | raise ValueError('Input to a transform is not within its domain') 107 | 108 | num_bins = unnormalized_widths.shape[-1] 109 | 110 | if min_bin_width * num_bins > 1.0: 111 | raise ValueError('Minimal bin width too large for the number of bins') 112 | if min_bin_height * num_bins > 1.0: 113 | raise ValueError('Minimal bin height too large for the number of bins') 114 | 115 | widths = F.softmax(unnormalized_widths, dim=-1) 116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths 117 | cumwidths = torch.cumsum(widths, dim=-1) 118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) 119 | cumwidths = (right - left) * cumwidths + left 120 | cumwidths[..., 0] = left 121 | cumwidths[..., -1] = right 122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1] 123 | 124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives) 125 | 126 | heights = F.softmax(unnormalized_heights, dim=-1) 127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights 128 | cumheights = torch.cumsum(heights, dim=-1) 129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) 130 | cumheights = (top - bottom) * cumheights + bottom 131 | cumheights[..., 0] = bottom 132 | cumheights[..., -1] = top 133 | heights = cumheights[..., 1:] - cumheights[..., :-1] 134 | 135 | if inverse: 136 | bin_idx = searchsorted(cumheights, inputs)[..., None] 137 | else: 138 | bin_idx = searchsorted(cumwidths, inputs)[..., None] 139 | 140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] 141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0] 142 | 143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] 144 | delta = heights / widths 145 | input_delta = delta.gather(-1, bin_idx)[..., 0] 146 | 147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] 148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] 149 | 150 | input_heights = heights.gather(-1, bin_idx)[..., 0] 151 | 152 | if inverse: 153 | a = (((inputs - input_cumheights) * (input_derivatives 154 | + input_derivatives_plus_one 155 | - 2 * input_delta) 156 | + input_heights * 
(input_delta - input_derivatives))) 157 | b = (input_heights * input_derivatives 158 | - (inputs - input_cumheights) * (input_derivatives 159 | + input_derivatives_plus_one 160 | - 2 * input_delta)) 161 | c = - input_delta * (inputs - input_cumheights) 162 | 163 | discriminant = b.pow(2) - 4 * a * c 164 | assert (discriminant >= 0).all() 165 | 166 | root = (2 * c) / (-b - torch.sqrt(discriminant)) 167 | outputs = root * input_bin_widths + input_cumwidths 168 | 169 | theta_one_minus_theta = root * (1 - root) 170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 171 | * theta_one_minus_theta) 172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) 173 | + 2 * input_delta * theta_one_minus_theta 174 | + input_derivatives * (1 - root).pow(2)) 175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 176 | 177 | return outputs, -logabsdet 178 | else: 179 | theta = (inputs - input_cumwidths) / input_bin_widths 180 | theta_one_minus_theta = theta * (1 - theta) 181 | 182 | numerator = input_heights * (input_delta * theta.pow(2) 183 | + input_derivatives * theta_one_minus_theta) 184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) 185 | * theta_one_minus_theta) 186 | outputs = input_cumheights + numerator / denominator 187 | 188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) 189 | + 2 * input_delta * theta_one_minus_theta 190 | + input_derivatives * (1 - theta).pow(2)) 191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) 192 | 193 | return outputs, logabsdet 194 | -------------------------------------------------------------------------------- /stft.py: -------------------------------------------------------------------------------- 1 | """ 2 | BSD 3-Clause License 3 | Copyright (c) 2017, Prem Seetharaman 4 | All rights reserved. 5 | * Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of the copyright holder nor the names of its 13 | contributors may be used to endorse or promote products derived from this 14 | software without specific prior written permission. 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | """ 26 | 27 | import torch 28 | import numpy as np 29 | import torch.nn.functional as F 30 | from torch.autograd import Variable 31 | from scipy.signal import get_window 32 | from librosa.util import pad_center, tiny 33 | import librosa.util as librosa_util 34 | 35 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800, 36 | n_fft=800, dtype=np.float32, norm=None): 37 | """ 38 | # from librosa 0.6 39 | Compute the sum-square envelope of a window function at a given hop length. 40 | This is used to estimate modulation effects induced by windowing 41 | observations in short-time fourier transforms. 42 | Parameters 43 | ---------- 44 | window : string, tuple, number, callable, or list-like 45 | Window specification, as in `get_window` 46 | n_frames : int > 0 47 | The number of analysis frames 48 | hop_length : int > 0 49 | The number of samples to advance between frames 50 | win_length : [optional] 51 | The length of the window function. By default, this matches `n_fft`. 52 | n_fft : int > 0 53 | The length of each analysis frame. 54 | dtype : np.dtype 55 | The data type of the output 56 | Returns 57 | ------- 58 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 59 | The sum-squared envelope of the window function 60 | """ 61 | if win_length is None: 62 | win_length = n_fft 63 | 64 | n = n_fft + hop_length * (n_frames - 1) 65 | x = np.zeros(n, dtype=dtype) 66 | 67 | # Compute the squared window at the desired length 68 | win_sq = get_window(window, win_length, fftbins=True) 69 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2 70 | win_sq = librosa_util.pad_center(win_sq, n_fft) 71 | 72 | # Fill the envelope 73 | for i in range(n_frames): 74 | sample = i * hop_length 75 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] 76 | return x 77 | 78 | 79 | class STFT(torch.nn.Module): 80 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 81 | def __init__(self, filter_length=800, hop_length=200, win_length=800, 82 | window='hann'): 83 | super(STFT, self).__init__() 84 | self.filter_length = filter_length 85 | self.hop_length = hop_length 86 | self.win_length = win_length 87 | self.window = window 88 | self.forward_transform = None 89 | scale = self.filter_length / self.hop_length 90 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 91 | 92 | cutoff = int((self.filter_length / 2 + 1)) 93 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), 94 | np.imag(fourier_basis[:cutoff, :])]) 95 | 96 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 97 | inverse_basis = torch.FloatTensor( 98 | np.linalg.pinv(scale * fourier_basis).T[:, None, :]) 99 | 100 | if window is not None: 101 | assert(filter_length >= win_length) 102 | # get window and zero center pad it to filter_length 103 | fft_window = get_window(window, win_length, fftbins=True) 104 | fft_window = pad_center(fft_window, filter_length) 105 | fft_window = torch.from_numpy(fft_window).float() 106 | 107 | # window the bases 108 | forward_basis *= fft_window 109 | inverse_basis *= fft_window 110 | 111 | self.register_buffer('forward_basis', forward_basis.float()) 112 | self.register_buffer('inverse_basis', inverse_basis.float()) 113 | 114 | def transform(self, input_data): 115 | num_batches = input_data.size(0) 116 | num_samples = input_data.size(1) 117 | 118 | self.num_samples = num_samples 119 | 120 | # similar to librosa, reflect-pad the input 121 | input_data = input_data.view(num_batches, 1, num_samples) 122 | input_data = 
F.pad(
123 |             input_data.unsqueeze(1),
124 |             (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
125 |             mode='reflect')
126 |         input_data = input_data.squeeze(1)
127 | 
128 |         forward_transform = F.conv1d(
129 |             input_data,
130 |             Variable(self.forward_basis, requires_grad=False),
131 |             stride=self.hop_length,
132 |             padding=0)
133 | 
134 |         cutoff = int((self.filter_length / 2) + 1)
135 |         real_part = forward_transform[:, :cutoff, :]
136 |         imag_part = forward_transform[:, cutoff:, :]
137 | 
138 |         magnitude = torch.sqrt(real_part**2 + imag_part**2)
139 |         phase = torch.autograd.Variable(
140 |             torch.atan2(imag_part.data, real_part.data))
141 | 
142 |         return magnitude, phase
143 | 
144 |     def inverse(self, magnitude, phase):
145 |         recombine_magnitude_phase = torch.cat(
146 |             [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
147 | 
148 |         inverse_transform = F.conv_transpose1d(
149 |             recombine_magnitude_phase,
150 |             Variable(self.inverse_basis, requires_grad=False),
151 |             stride=self.hop_length,
152 |             padding=0)
153 | 
154 |         if self.window is not None:
155 |             window_sum = window_sumsquare(
156 |                 self.window, magnitude.size(-1), hop_length=self.hop_length,
157 |                 win_length=self.win_length, n_fft=self.filter_length,
158 |                 dtype=np.float32)
159 |             # remove modulation effects
160 |             approx_nonzero_indices = torch.from_numpy(
161 |                 np.where(window_sum > tiny(window_sum))[0])
162 |             window_sum = torch.autograd.Variable(
163 |                 torch.from_numpy(window_sum), requires_grad=False)
164 |             window_sum = window_sum.to(inverse_transform.device) if magnitude.is_cuda else window_sum
165 |             inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
166 | 
167 |             # scale by hop ratio
168 |             inverse_transform *= float(self.filter_length) / self.hop_length
169 | 
170 |         inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
171 |         inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
172 | 
173 |         return inverse_transform
174 | 
175 |     def forward(self, input_data):
176 |         self.magnitude, self.phase = self.transform(input_data)
177 |         reconstruction = self.inverse(self.magnitude, self.phase)
178 |         return reconstruction
179 | 
180 | 
181 | class TorchSTFT(torch.nn.Module):
182 |     def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
183 |         super().__init__()
184 |         self.filter_length = filter_length
185 |         self.hop_length = hop_length
186 |         self.win_length = win_length
187 |         self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
188 | 
189 |     def transform(self, input_data):
190 |         forward_transform = torch.stft(
191 |             input_data,
192 |             self.filter_length, self.hop_length, self.win_length, window=self.window,
193 |             return_complex=True)
194 | 
195 |         return torch.abs(forward_transform), torch.angle(forward_transform)
196 | 
197 |     def inverse(self, magnitude, phase):
198 |         inverse_transform = torch.istft(
199 |             magnitude * torch.exp(phase * 1j),
200 |             self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
201 | 
202 |         return inverse_transform.unsqueeze(-2)  # unsqueeze to stay consistent with conv_transpose1d implementation
203 | 
204 |     def forward(self, input_data):
205 |         self.magnitude, self.phase = self.transform(input_data)
206 |         reconstruction = self.inverse(self.magnitude, self.phase)
207 |         return reconstruction
208 | 
209 | 
210 | 
--------------------------------------------------------------------------------
/LICENSE:
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /filelists/ljs_audio_text_val_filelist.txt: -------------------------------------------------------------------------------- 1 | DUMMY1/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read. 2 | DUMMY1/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too. 3 | DUMMY1/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five. 4 | DUMMY1/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect: 5 | DUMMY1/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others. 6 | DUMMY1/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated, 7 | DUMMY1/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others. 8 | DUMMY1/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies 9 | DUMMY1/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery. 10 | DUMMY1/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade. 11 | DUMMY1/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President. 12 | DUMMY1/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four. 13 | DUMMY1/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example." 14 | DUMMY1/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald: 15 | DUMMY1/LJ009-0114.wav|Mr. 
Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here. 16 | DUMMY1/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work. 17 | DUMMY1/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area 18 | DUMMY1/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon 19 | DUMMY1/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote. 20 | DUMMY1/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound, 21 | DUMMY1/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window. 22 | DUMMY1/LJ026-0068.wav|Energy enters the plant, to a small extent, 23 | DUMMY1/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary. 24 | DUMMY1/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized 25 | DUMMY1/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that 26 | DUMMY1/LJ012-0161.wav|he was reported to have fallen away to a shadow. 27 | DUMMY1/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to 28 | DUMMY1/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines. 29 | DUMMY1/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on. 30 | DUMMY1/LJ024-0083.wav|This plan of mine is no attack on the Court; 31 | DUMMY1/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough. 32 | DUMMY1/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup. 33 | DUMMY1/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles. 34 | DUMMY1/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive. 35 | DUMMY1/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen, 36 | DUMMY1/LJ009-0076.wav|We come to the sermon. 37 | DUMMY1/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution. 38 | DUMMY1/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes. 39 | DUMMY1/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art. 40 | DUMMY1/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount 41 | DUMMY1/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy. 42 | DUMMY1/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties, 43 | DUMMY1/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand. 
44 | DUMMY1/LJ012-0235.wav|While they were in a state of insensibility the murder was committed. 45 | DUMMY1/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald. 46 | DUMMY1/LJ014-0030.wav|These were damnatory facts which well supported the prosecution. 47 | DUMMY1/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome? 48 | DUMMY1/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters. 49 | DUMMY1/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London, 50 | DUMMY1/LJ028-0275.wav|At last, in the twentieth month, 51 | DUMMY1/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed. 52 | DUMMY1/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm, 53 | DUMMY1/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County, 54 | DUMMY1/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view. 55 | DUMMY1/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning. 56 | DUMMY1/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words, 57 | DUMMY1/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands. 58 | DUMMY1/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy. 59 | DUMMY1/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace, 60 | DUMMY1/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells. 61 | DUMMY1/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true, 62 | DUMMY1/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him, 63 | DUMMY1/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits 64 | DUMMY1/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail. 65 | DUMMY1/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders. 66 | DUMMY1/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal. 67 | DUMMY1/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there. 68 | DUMMY1/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files 69 | DUMMY1/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator. 70 | DUMMY1/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash. 
71 | DUMMY1/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood; 72 | DUMMY1/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely. 73 | DUMMY1/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present, 74 | DUMMY1/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him. 75 | DUMMY1/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense. 76 | DUMMY1/LJ008-0294.wav|nearly indefinitely deferred. 77 | DUMMY1/LJ047-0148.wav|On October twenty-five, 78 | DUMMY1/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner. 79 | DUMMY1/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old. 80 | DUMMY1/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male, 81 | DUMMY1/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one. 82 | DUMMY1/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved 83 | DUMMY1/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands. 84 | DUMMY1/LJ012-0250.wav|On the seventh July, eighteen thirty-seven, 85 | DUMMY1/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job. 86 | DUMMY1/LJ016-0138.wav|at a distance from the prison. 87 | DUMMY1/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology. 88 | DUMMY1/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally. 89 | DUMMY1/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline. 90 | DUMMY1/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects. 91 | DUMMY1/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle. 92 | DUMMY1/LJ038-0199.wav|eleven. If I am alive and taken prisoner, 93 | DUMMY1/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came. 94 | DUMMY1/LJ033-0047.wav|I noticed when I went out that the light was on, end quote, 95 | DUMMY1/LJ040-0027.wav|He was never satisfied with anything. 96 | DUMMY1/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly. 97 | DUMMY1/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity. 98 | DUMMY1/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days, 99 | DUMMY1/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston. 100 | DUMMY1/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce. 
101 | -------------------------------------------------------------------------------- /train_latest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import itertools 5 | import math 6 | import torch 7 | from torch import nn, optim 8 | from torch.nn import functional as F 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.nn.parallel import DistributedDataParallel as DDP 14 | from torch.cuda.amp import autocast, GradScaler 15 | from pqmf import PQMF 16 | 17 | import commons 18 | import utils 19 | from data_utils import ( 20 | TextAudioLoader, 21 | TextAudioCollate, 22 | DistributedBucketSampler 23 | ) 24 | from models import ( 25 | SynthesizerTrn, 26 | MultiPeriodDiscriminator, 27 | ) 28 | from losses import ( 29 | generator_loss, 30 | discriminator_loss, 31 | feature_loss, 32 | kl_loss, 33 | subband_stft_loss 34 | ) 35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch 36 | from text.symbols import symbols 37 | 38 | torch.autograd.set_detect_anomaly(True) 39 | torch.backends.cudnn.benchmark = True 40 | global_step = 0 41 | 42 | 43 | def main(): 44 | """Assume Single Node Multi GPUs Training Only""" 45 | assert torch.cuda.is_available(), "CPU training is not allowed." 46 | 47 | n_gpus = torch.cuda.device_count() 48 | os.environ['MASTER_ADDR'] = 'localhost' 49 | os.environ['MASTER_PORT'] = '65520' 50 | # n_gpus = 1 51 | 52 | hps = utils.get_hparams() 53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 54 | 55 | 56 | def run(rank, n_gpus, hps): 57 | global global_step 58 | if rank == 0: 59 | logger = utils.get_logger(hps.model_dir) 60 | logger.info(hps) 61 | utils.check_git_hash(hps.model_dir) 62 | writer = SummaryWriter(log_dir=hps.model_dir) 63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 64 | 65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 66 | torch.manual_seed(hps.train.seed) 67 | torch.cuda.set_device(rank) 68 | 69 | train_dataset = TextAudioLoader(hps.data.training_files, hps.data) 70 | train_sampler = DistributedBucketSampler( 71 | train_dataset, 72 | hps.train.batch_size, 73 | [32,300,400,500,600,700,800,900,1000], 74 | num_replicas=n_gpus, 75 | rank=rank, 76 | shuffle=True) 77 | collate_fn = TextAudioCollate() 78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, 79 | collate_fn=collate_fn, batch_sampler=train_sampler) 80 | if rank == 0: 81 | eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data) 82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False, 83 | batch_size=hps.train.batch_size, pin_memory=True, 84 | drop_last=False, collate_fn=collate_fn) 85 | 86 | net_g = SynthesizerTrn( 87 | len(symbols), 88 | hps.data.filter_length // 2 + 1, 89 | hps.train.segment_size // hps.data.hop_length, 90 | **hps.model).cuda(rank) 91 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) 92 | optim_g = torch.optim.AdamW( 93 | net_g.parameters(), 94 | hps.train.learning_rate, 95 | betas=hps.train.betas, 96 | eps=hps.train.eps) 97 | optim_d = torch.optim.AdamW( 98 | net_d.parameters(), 99 | hps.train.learning_rate, 100 | betas=hps.train.betas, 101 | eps=hps.train.eps) 102 | net_g = DDP(net_g, device_ids=[rank]) 103 | net_d = DDP(net_d, device_ids=[rank]) 104 | 105 | try: 106 | _, 
_, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
107 |     _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
108 |     global_step = (epoch_str - 1) * len(train_loader)
109 |   except Exception:  # no checkpoint found; start training from scratch
110 |     epoch_str = 1
111 |     global_step = 0
112 | 
113 |   scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
114 |   scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
115 | 
116 |   scaler = GradScaler(enabled=hps.train.fp16_run)
117 | 
118 |   for epoch in range(epoch_str, hps.train.epochs + 1):
119 |     if rank==0:
120 |       train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
121 |     else:
122 |       train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
123 |     scheduler_g.step()
124 |     scheduler_d.step()
125 | 
126 | 
127 | 
128 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
129 |   net_g, net_d = nets
130 |   optim_g, optim_d = optims
131 |   scheduler_g, scheduler_d = schedulers
132 |   train_loader, eval_loader = loaders
133 |   if writers is not None:
134 |     writer, writer_eval = writers
135 | 
136 |   train_loader.batch_sampler.set_epoch(epoch)
137 |   global global_step
138 | 
139 |   net_g.train()
140 |   net_d.train()
141 |   for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(train_loader):
142 |     x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
143 |     spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
144 |     y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
145 | 
146 |     with autocast(enabled=hps.train.fp16_run):
147 |       y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
148 |       (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths)
149 | 
150 |       mel = spec_to_mel_torch(
151 |           spec,
152 |           hps.data.filter_length,
153 |           hps.data.n_mel_channels,
154 |           hps.data.sampling_rate,
155 |           hps.data.mel_fmin,
156 |           hps.data.mel_fmax)
157 |       y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
158 |       y_hat_mel = mel_spectrogram_torch(
159 |           y_hat.squeeze(1),
160 |           hps.data.filter_length,
161 |           hps.data.n_mel_channels,
162 |           hps.data.sampling_rate,
163 |           hps.data.hop_length,
164 |           hps.data.win_length,
165 |           hps.data.mel_fmin,
166 |           hps.data.mel_fmax
167 |       )
168 | 
169 |       y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
170 | 
171 |       # Discriminator
172 |       y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
173 |       with autocast(enabled=False):
174 |         loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
175 |         loss_disc_all = loss_disc
176 |     optim_d.zero_grad()
177 |     scaler.scale(loss_disc_all).backward()
178 |     scaler.unscale_(optim_d)
179 |     grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
180 |     scaler.step(optim_d)
181 | 
182 | 
183 | 
184 | 
185 |     with autocast(enabled=hps.train.fp16_run):
186 |       # Generator
187 |       y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
188 |       with autocast(enabled=False):
189 |         loss_dur = torch.sum(l_length.float())
190 |         loss_mel = F.l1_loss(y_mel, 
y_hat_mel) * hps.train.c_mel 191 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl 192 | 193 | loss_fm = feature_loss(fmap_r, fmap_g) 194 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 195 | 196 | if hps.model.mb_istft_vits == True: 197 | pqmf = PQMF(y.device) 198 | y_mb = pqmf.analysis(y) 199 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb) 200 | else: 201 | loss_subband = torch.tensor(0.0) 202 | 203 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband 204 | 205 | optim_g.zero_grad() 206 | scaler.scale(loss_gen_all).backward() 207 | scaler.unscale_(optim_g) 208 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 209 | scaler.step(optim_g) 210 | scaler.update() 211 | 212 | if rank==0: 213 | if global_step % hps.train.log_interval == 0: 214 | lr = optim_g.param_groups[0]['lr'] 215 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband] 216 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 217 | epoch, 218 | 100. * batch_idx / len(train_loader))) 219 | logger.info([x.item() for x in losses] + [global_step, lr]) 220 | 221 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} 222 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband}) 223 | 224 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) 225 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) 226 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) 227 | image_dict = { 228 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), 229 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), 230 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 231 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) 232 | } 233 | utils.summarize( 234 | writer=writer, 235 | global_step=global_step, 236 | images=image_dict, 237 | scalars=scalar_dict) 238 | 239 | if global_step % hps.train.eval_interval == 0: 240 | evaluate(hps, net_g, eval_loader, writer_eval) 241 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) 242 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) 243 | global_step += 1 244 | 245 | 246 | if rank == 0: 247 | logger.info('====> Epoch: {}'.format(epoch)) 248 | 249 | 250 | 251 | 252 | def evaluate(hps, generator, eval_loader, writer_eval): 253 | generator.eval() 254 | with torch.no_grad(): 255 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(eval_loader): 256 | x, x_lengths = x.cuda(0), x_lengths.cuda(0) 257 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) 258 | y, y_lengths = y.cuda(0), y_lengths.cuda(0) 259 | 260 | # remove else 261 | x = x[:1] 262 | x_lengths = x_lengths[:1] 263 | spec = spec[:1] 264 | spec_lengths = spec_lengths[:1] 265 | y = y[:1] 266 | y_lengths = y_lengths[:1] 267 | break 268 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, max_len=1000) 269 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length 270 | 271 | mel = spec_to_mel_torch( 272 | spec, 273 | hps.data.filter_length, 274 | 
hps.data.n_mel_channels, 275 | hps.data.sampling_rate, 276 | hps.data.mel_fmin, 277 | hps.data.mel_fmax) 278 | y_hat_mel = mel_spectrogram_torch( 279 | y_hat.squeeze(1).float(), 280 | hps.data.filter_length, 281 | hps.data.n_mel_channels, 282 | hps.data.sampling_rate, 283 | hps.data.hop_length, 284 | hps.data.win_length, 285 | hps.data.mel_fmin, 286 | hps.data.mel_fmax 287 | ) 288 | image_dict = { 289 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) 290 | } 291 | audio_dict = { 292 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]] 293 | } 294 | if global_step == 0: 295 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 296 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]}) 297 | 298 | utils.summarize( 299 | writer=writer_eval, 300 | global_step=global_step, 301 | images=image_dict, 302 | audios=audio_dict, 303 | audio_sampling_rate=hps.data.sampling_rate 304 | ) 305 | generator.train() 306 | 307 | 308 | if __name__ == "__main__": 309 | os.environ[ 310 | "TORCH_DISTRIBUTED_DEBUG" 311 | ] = "DETAIL" 312 | main() 313 | -------------------------------------------------------------------------------- /filelists/filelist_train2.txt.cleaned: -------------------------------------------------------------------------------- 1 | ./tsukuyomi/VOICEACTRESS100_001.wav|ma↑ta, to↓ojino yo↓oni, go↓dai myo↑oo↓oto yo↑bareru, ʃu↑yoona myo↑oo↓ono ʧu↑uo↓oni ha↓isareru ko↑to↓mo o↓oi. 2 | ./tsukuyomi/VOICEACTRESS100_002.wav|nyu↑uiNguraNdo↓fuuwa, gyu↑unyuuo be↓esUto ʃI↑ta, ʃi↑ro↓i ku↑riimusu↓upudeari, bo↑sUtoNkuramuʧa↓udaatomo yo↑bareru. 3 | ./tsukuyomi/VOICEACTRESS100_003.wav|ko↑Npyuutage↓emuno me↓ekaaya, gyo↑okaida↓Ntainadoni ka↑NreN su↑ru ji↓Nbutsuno ka↑te↓gori. 4 | ./tsukuyomi/VOICEACTRESS100_004.wav|sa↑abisumaneejaadoonyuu↓ekino ta↑me, o↑oi↓maʧi e↓kIkara, e↑NkakUka↓Nri ʃI↑te i↑ru. 5 | ./tsukuyomi/VOICEACTRESS100_005.wav|ʃi↓rubaa sa↑afaaʃuugekiji↓keNmadeni, ri↓ʧaazuwa, ʧi↑imu↓meeto to↑moni, ko↑kUsai↓tekini su↑upaahi↓iroo, o↓yobi, yu↑ume↓ejiNto ʃI↑te, ni↓NʧI sa↑rete i↑ru. 6 | ./tsukuyomi/VOICEACTRESS100_006.wav|ts u yu↑reNharuto↓ryoowa, byu↑ruteNberuku↓ryooni he↑Nnyuu sa↑reta. 7 | ./tsukuyomi/VOICEACTRESS100_007.wav|ji↑kaN ryo↑oikIto, ku↑ukaNryo↓oikide kyo↑otsuu su↑ru ʃo↑riʃu↓hoowa, fi↑rutari↓Nguni yo↑ru, nyu↑uryokUʃi↓Ngoono kyo↓okadearu. 8 | ./tsukuyomi/VOICEACTRESS100_008.wav|ʃa↑Nʧiino se↑Ngyoo↓purowa, ʧi↓imukara ʃI↑hara↓wareru kyu↓uryooto, ta↑ikyoku↓hio, o↓mona ʃu↑unyuuto ʃI↑te i↑ru. 9 | ./tsukuyomi/VOICEACTRESS100_009.wav|ma↑ta ne↑jimeʃiwa, ʧu↑usa↓Nnoono o↑same↓ru, ryu↑ukyuuo↓okokUtono ko↑oekinimo sa↑Nka ʃI↑ta. 10 | ./tsukuyomi/VOICEACTRESS100_010.wav|su↑maato↓foNkara, fi↑iʧaafo↓Nmade, ma↑ruʧideba↓isuni ta↑ioo. 11 | ./tsukuyomi/VOICEACTRESS100_011.wav|ke↑emyoo ʃa↑datsuna na↑reeʃoNkara, jo↑oʧo↓kaN a↑fure↓ru ka↑tarima↓de, ha↑bahiro↓i hyo↑ogeNryo↓kuo mo↓tsu. 12 | ./tsukuyomi/VOICEACTRESS100_012.wav|ko↑ozoowa, ha↑ganeseeno ta↑Nitsu a↓aʧide, kyo↑okyakuwa, i↑ʃItsumidearu. 13 | ./tsukuyomi/VOICEACTRESS100_013.wav|so↑koe, o↓onaaga a↑taraʃi↓i ʃe↓fUto ʃI↑te, u↑dekIkino hyo↑nu↓kuo ma↑ne↓ku. 14 | ./tsukuyomi/VOICEACTRESS100_014.wav|ku↑iiNzuabenyuua↓rufani ʃo↑zoku ʃI↑te i↑ru. 15 | ./tsukuyomi/VOICEACTRESS100_015.wav|i↑Qpo↓ode, gyo↓gyooto ʃo↓ogyoode, rya↑nesUko↓owa ha↑Nee ʃI↑te i↑ta. 16 | ./tsukuyomi/VOICEACTRESS100_016.wav|ko↑no, nyu↑usausuweeruzudaihyooʧi↓imuga, wa↑rabi↓izuno ʧu↑ukakUto na↓Qte i↑ku. 17 | ./tsukuyomi/VOICEACTRESS100_017.wav|ta↓daʃi, gya↑NburuizoNʃoono nyu↑uiNʧi↓ryooo i↑Qte i↑ru byo↑oiNwa, wa↓zukadearu. 
18 | ./tsukuyomi/VOICEACTRESS100_018.wav|ta↓no me↓jaana di↑sUtoribyu↓uʃoNni ku↑rabe, se↑kyuritiijoono mo↑Ndaino ʃu↑useega, o↑soi ba↑aimo a↓ru. 19 | ./tsukuyomi/VOICEACTRESS100_019.wav|be↑rugaato↓oa ma↓eno, ve↑digeNuufaapa↓akuniwa, se↑Nsooto da↑Natsuno gi↑seeʃano ta↑me↓no ki↑neN↓higa ta↓Qte i↑ru. 20 | ./tsukuyomi/VOICEACTRESS100_020.wav|ze↑NbeepaburiQʃaazukyo↓okaino, be↑sUtosUtora↓tejiigeemuobuzaiyaao, ni↑QpoN↓jiNto ʃI↑te ju↑ʃoo. 21 | ./tsukuyomi/VOICEACTRESS100_021.wav|i↑tami↓wa, te↑Ntekiyo↓ri ʧi↑Ntsuuya↓kuo, jo↑omyakUto↓oyo su↑ru ko↑to↓de, ʧi↑Ntsuuo o↑konau. 22 | ./tsukuyomi/VOICEACTRESS100_022.wav|ko↑no to↓kini, fu↑yuutairikUpuruva↓mani a↓ru, ʧu↑uritsu↓koku, byu↑eru↓baga, a↓ru jo↑ohoosujikara, ba↑QʃUʃo↓oguNno ʃo↑keeto, ze↑No↓ojo, a↓aʃeno ji↓gaio ha↑Qpyoo. 23 | ./tsukuyomi/VOICEACTRESS100_023.wav|na↑Nsee↓bu wo↓oreNwa, be↑ia↓amaN fa↓amuzuto, fi↑Qtsujera↓rudono ʧi↓kude, ko↑osee sa↑reru. 24 | ./tsukuyomi/VOICEACTRESS100_024.wav|ko↑no ta↑me, pu↑razumaʧuuno i↓oNya, de↓Nʃino mo↓tsu, he↑ekiNuNdooene↓rugiio, o↓Ndode hyo↑oge↓N su↑ru ko↑to↓ga a↓ru. 25 | ./tsukuyomi/VOICEACTRESS100_025.wav|so↑no hyo↑ohyooto ʃI↑ta hI↑togaraga, ro↑onyakuna↓Nnyoni ʃI↑tawarete i↑ru. 26 | ./tsukuyomi/VOICEACTRESS100_026.wav|ge↓Nzai, nyu↑ujaajii↓ʃuu, mu↓ua zu↑ta↓uNni su↓Nde i↑ru. 27 | ./tsukuyomi/VOICEACTRESS100_027.wav|ʧo↑oikini a↓Qta, mi↑tsunesaN↓haNwa, na↑gaoka↓haNni, be↑e hya↑Q↓pyooo o↑kuQta ko↑to↓de yu↑umee. 28 | ./tsukuyomi/VOICEACTRESS100_028.wav|ko↑no to↓ki, pe↑rime↓edeewa, a↑mupIkutoriyu↓ooNni do↑okoo ʃI↑te, te↓ebaini ki↓te i↑ta, r i ky u mu↑ni↓osuni, tsu↓mato ʃI↑te a↑taerareta. 29 | ./tsukuyomi/VOICEACTRESS100_029.wav|ge↓Nzaino ka↑Qsooo mo↑kUtekIto ʃI↑ta, sU↑kiibu↓utsuwa, ka↑tai pu↑rasUʧiQku↓ʃeruto, ya↑waraka↓i i↑Nnaabu↓utsUkara na↓ru. 30 | ./tsukuyomi/VOICEACTRESS100_030.wav|bo↑ogo↓ori bu↓utsuwa, hyo↑ome↓Nni ha↑rareta, go↑museeno u↑sui ma↑ku↓de de↑ki↓te i↑ru. 31 | ./tsukuyomi/VOICEACTRESS100_031.wav|ko↓oʃano da↑ihyooga, we↑Qjiu↓Qdono, ja↑sUpaawe↓adearu. 32 | ./tsukuyomi/VOICEACTRESS100_032.wav|ki↑i↓kyokuga ha↑QʃiN su↑ru, nyu↑usuneQtowaaku↓meeo ka↑NʃIta ta↓itoruno, nyu↑usuba↓Ngumino na↓kadewa, re↑gyuraaho↓osooga, mo↑Qto↓mo o↓oi. 33 | ./tsukuyomi/VOICEACTRESS100_033.wav|ge↑enoopu↓rodakUʃoN, a↑myu↓uzuno gu↑ruupUki↓gyoo. 34 | ./tsukuyomi/VOICEACTRESS100_034.wav|ʧo↑obo↓iNo ʃo↑oryaku ʃI↑te, e↑ryu↓ʃioNtomo hyo↓okI sa↑reru. 35 | ./tsukuyomi/VOICEACTRESS100_035.wav|mo↑rinagano o↑iʃi↓i gyu↑unyuuwa, ko↓i a↑oironi, gyu↑unyuu↓biNo a↑ʃira↓Qta de↑za↓iNno, pa↑Qkugyu↓unyuudearu. 36 | ./tsukuyomi/VOICEACTRESS100_036.wav|ba↑Ngumibo↓otooo, to↑okyoomuubiise↓esakuno a↑nime↓eʃoNde, hyo↑oge↓N su↑ru te↑Nmo, kyo↑otsuu ʃI↑te i↑ta. 37 | ./tsukuyomi/VOICEACTRESS100_037.wav|ko↑myu↓uNwa, se↑enu↓gawato, e↑soNnu↓kawano, go↑oryuuʧi↓teNto na↓Qte i↑ru. 38 | ./tsukuyomi/VOICEACTRESS100_038.wav|do↑ojini, fU↑kuimi↓rakuruerefaNtsuni, ko↑oʧIke↓NniNde, nyu↑udaN su↑ru ko↑to↓ga ha↑Qpyoo sa↑reta. 39 | ./tsukuyomi/VOICEACTRESS100_039.wav|o↑Qtodearu ko↑muroga, kyu↑ukyuuʃa↓o yo↑bi, to↑naibyo↓oiNni, ki↑Nkyuu ha↑Nsoo sa↑reru. 40 | ./tsukuyomi/VOICEACTRESS100_040.wav|gi↑re↓sUpiiwa, ma↓Qgiio tsu↑ujite, i↓nesUto ʃi↑ria↓Qta. 41 | ./tsukuyomi/VOICEACTRESS100_041.wav|fo↑Nteenuburooyo↓oʃIkidewa, gu↑ui↓tekina e↓ga, ʃi↑Qkuino mo↓orudoni tsU↑kawarete i↑ru. 42 | ./tsukuyomi/VOICEACTRESS100_042.wav|sa↑ijiNwa, bi↑ʃunuhano se↓ejiN, su↓waa mi↑inaaraayaN. 43 | ./tsukuyomi/VOICEACTRESS100_043.wav|ha↓adee su↑ga, pe↑ruse↓poneeni ko↓io ʃI↑ta no↑wa, a↑purodi↓iteeno, sa↑kuryakudearuto sa↑rete i↑ru. 
44 | ./tsukuyomi/VOICEACTRESS100_044.wav|ku↓weeNbaaNʧaaNwa, ʧi↓isana ko↑myu↓nitiide, no↓ogyooya, ʃo↓ogyooo ʧu↑uʃiNni, na↑rita↓Qte i↑ta↓to, ka↑Nga↓erarete i↑ru. 45 | ./tsukuyomi/VOICEACTRESS100_045.wav|ve↑ezaajiteNʃa↓dooya, myu↓ureN ru↓utoni ʃI↑taga↓Qta, sa↑ikuriNgutsu↓aawa, pe↓etaasuhaageNo, ke↑eyu su↑ru. 46 | ./tsukuyomi/VOICEACTRESS100_046.wav|fo↑omyura↓kaawa, tsu↑ujoo, o↑opuNhoi↓irude, ʃi↑Nguruʃi↓itaadearu. 47 | ./tsukuyomi/VOICEACTRESS100_047.wav|do↑ojitsu a↓sani, o↑osakana↓Nbade, ʃu↑QpatsUse↓remoniiga ka↑isai sa↑re, e↑egyoou↓NteNni, ju↑utoo sa↑reta. 48 | ./tsukuyomi/VOICEACTRESS100_048.wav|so↑ʃIte, i↑NdepeNdeNto↓ʃino, do↑kUʃato↓ohyoode e↑ra↓bu, pu↑remiariigusaiyuuʃuugooruki↓ipaani e↑ra↓bareta. 49 | ./tsukuyomi/VOICEACTRESS100_049.wav|pu↑reiyaa↓kyarakUtaawa, kyu↑udeNo se↓Nkyo ʃI↑ta, ja↑akuna ku↑ri↓iʧaani so↑oguu su↑ru. 50 | ./tsukuyomi/VOICEACTRESS100_050.wav|fi↑irudomaake↓tiNguwa, re↑kIʃi↓tekiniwa, i↑Qpo↓o tsu↑ukoono ko↑myunikeeʃoNtsu↓uruto ʃI↑te, ka↑Nga↓erarete ki↓ta. 51 | ./tsukuyomi/VOICEACTRESS100_051.wav|de↑byuu↓gono su↑une↓NkaNwa, be↑biife↓isUto ʃI↑te, ho↓Nmyoode ka↑tsudoo. 52 | ./tsukuyomi/VOICEACTRESS100_052.wav|ga↑Qkooya byo↑oiNna↓dono, kyu↑uʃokugyo↓omude, e↑eyo↓osoo ke↑esaN su↑ru jo↑ode, ju↑uyoona ʃi↓ryoono hI↑to↓tsudearu. 53 | ./tsukuyomi/VOICEACTRESS100_053.wav|to↓oji, a↑yaʃii wa↓arudoni jo↑oʧuu ʃI↑te i↑ta gi↑ko↓nekoga, ku↑uhakuni↓te ha↑Qpyoo. 54 | ./tsukuyomi/VOICEACTRESS100_054.wav|yu↓ufUkuna nyu↑uyookaa↓taʧiwa, gu↑re↓evuseNdo, ke↑ebajooya, ʃi↓ipuʃeQdobei, ke↑ebajoona↓doni tsu↑do↓i, u↑mizoino ko↑okyuu re↓sUtoraNya, ho↓teruo ri↑yoo ʃI↑ta. 55 | ./tsukuyomi/VOICEACTRESS100_055.wav|wo↑riaazumiQkusumaaʃaruaatsuakademiiʃo↓zoku. 56 | ./tsukuyomi/VOICEACTRESS100_056.wav|to↑koro↓ga, e↑riyuʃIkuto↓oNwa, nyu↓mupeeno se↑eʃimo kI↑kazuni, de↑emeete↓eruno ka↓ʃio, ki↑ritao↓ʃIta. 57 | ./tsukuyomi/VOICEACTRESS100_057.wav|ko↑no je↑ʃii↓yakude sU↑tei↓mosuwa, e↑mii↓ʃooni no↑mine↓eto sa↑reta ko↑to↓mo a↓ru. 58 | ./tsukuyomi/VOICEACTRESS100_058.wav|su↑weedeNi↓miNno ryo↓oʃiNno mo↑to↓ni, ma↑saʧuuseQtsu↓ʃuu, ke↓NburiQjinite u↑mareru. 59 | ./tsukuyomi/VOICEACTRESS100_059.wav|kyu↑ueNno fa↑Nto↓ohyoodemo, ni↑Nkiga gu↑uzooka ʃI↑te i↑ta, na↑gaʃima ʃi↑geoni ni↑kUhakU su↑ru. 60 | ./tsukuyomi/VOICEACTRESS100_060.wav|ha↓hawa, pi↑itaamariQtsuba↓aguno se↑eʃiNbyo↓oiNni nyu↑uiN ʃI↑te i↑ru to↓kini, be↓Qʃiio u↑mu. 61 | ./tsukuyomi/VOICEACTRESS100_061.wav|po↑iNtoga↓adokara, su↑moorufo↓waadomade ko↑nase↓ru, so↑ogooryo↓kuga ta↑ka↓i yu↑utiritiipu↓reeyaadearu. 62 | ./tsukuyomi/VOICEACTRESS100_062.wav|gu↑re↓Qguwa, mi↑ʃIʃiQpi↓ʃuu, a↑badi↓iNni a↓ru, o↑Qdoferoozu↓boʧini ma↑isoo sa↑reru ko↑to↓ni na↓Qta. 63 | ./tsukuyomi/VOICEACTRESS100_063.wav|o↑oatariʃuuryoo↓gowa, gu↑radieetaaʧa↓Nsuni to↑tsunyuu su↑ru. 64 | ./tsukuyomi/VOICEACTRESS100_064.wav|ko↑no ki↓Nni yo↑ru byo↑okiwa, ha↑iirokabibyooto na↑zuke↓rarete i↑ru mo↑no↓ga o↓oi. 65 | ./tsukuyomi/VOICEACTRESS100_065.wav|re↑gyuraame↓Nbaano ka↑oja↓ʃiNo ku↑ri↓Qku ʃI↑ta a↓toni, mu↑ubiipureiyaa↓fuuni sa↑isee sa↑reruto i↑u, to↑kuina ke↑eʃIkito na↓Qte i↑ru. 66 | ./tsukuyomi/VOICEACTRESS100_066.wav|ka↑Nzooeno sa↑Nsokyo↓okyuuwa, ka↑Ndo↓omyakUto, te↑eatsu↓keeno mo↑N↓myakuo ka↓iʃIte, o↑konawarete i↑ru. 67 | ./tsukuyomi/VOICEACTRESS100_067.wav|de↑Qdo↓kiiwa, ta↑ipura↓itaaya, ko↑Npyu↓utano ki↑ibo↓odoni o↑keru, to↑kUʃuna so↑oʃoku↓kiidearu. 68 | ./tsukuyomi/VOICEACTRESS100_068.wav|ʃa↓NʃaN u↑ma↓wa, u↑dojiNgu↓ue sa↑Npai su↑ru, ʃi↑NkoNfu↓ufuga no↑Qte i↑ta u↑ma↓no ko↑to. 
69 | ./tsukuyomi/VOICEACTRESS100_069.wav|bu↑ruuriQjisa↓Nmyakuno ge↑Nryuukara, ri↑Qʧimo↓Ndomade, o↓okuno ha↑yaseya fU↑ʧi↓ga, tsu↑riya kyu↑uryuuku↓dario ta↑noʃi↓masete ku↑reru. 70 | ./tsukuyomi/VOICEACTRESS100_070.wav|bo↑o↓haNwa, i↑isUtomaN↓ra, gya↓Nguno sU↑piikui↓ijiino a↑garikara, wa↓iroo to↓Qte i↑ta↓tomo u↑wasa sa↑reta. 71 | ./tsukuyomi/VOICEACTRESS100_071.wav|pe↑Nʃirubenia↓ʃuu, fi↑raderu↓fiano ko↓ogai, wi↑Nre↓Qdono re↑Nkinaubyo↓oiNde u↑mareta. 72 | ./tsukuyomi/VOICEACTRESS100_072.wav|bu↑ra↓Qguwa, byu↓u e↑ru↓guNyorimo, re↑QseedaQta ta↑me↓ni, ko↑no ki↓kaio i↑ka↓sU ko↑to↓o ʧu↓uʧo ʃI↑ta. 73 | ./tsukuyomi/VOICEACTRESS100_073.wav|jo↑oiNgi↓iNto ʃI↑te, ba↓aNweruwa, ka↑riforunia↓ʃuuno, re↑Npooka↓nyuuni sa↑Nsee ʃI↑ta. 74 | ./tsukuyomi/VOICEACTRESS100_074.wav|re↑jeNdoʃiri↓izuo be↓esuni, yo↑o fu↑riiki↓kooo so↑nae↓ta, byu↓u ka↓mera. 75 | ./tsukuyomi/VOICEACTRESS100_075.wav|ga↑Qkyokuno se↑Ntaapoji↓ʃoNwa, e↑ikeebiifootii↓eitono, ta↑ka↓haʃi mi↓namiga tsU↑tome↓ta. 76 | ./tsukuyomi/VOICEACTRESS100_076.wav|di↑onyu↓usosuno, ʧo↑oaio u↑ke↓ru, o↑ineusu↓ooto, h i, a↑rutai↓aano a↑idani, ka↓riyu do↑oNno o↓ojoto ʃI↑te, se↓eo u↑ke↓ta. 77 | ./tsukuyomi/VOICEACTRESS100_077.wav|o↑oniʃi yo↑ojoono, ju↑Nkoo ko↓odokara, do↓oryoku na↓ʃide, ʧi↑jooe ka↑Qkuuhi↓koo ʃ i, ki↑Nkyuu ʧa↑kurikuni se↑ekoo ʃI↑ta. 78 | ./tsukuyomi/VOICEACTRESS100_078.wav|hyo↑ogeNgyo↓oretsuno ʃI↑hyoohyooo, bu↓Nʃino ta↑iʃooseeo a↑rawa↓su, te↑N↓guNno ʃI↑hyoohyooo mo↑ʧii↓te, su↑Nde ya↑kUhyooge↓Ne bu↑Nkai su↑ru. 79 | ./tsukuyomi/VOICEACTRESS100_079.wav|ta↑iyoogyogyooo↓onaano, na↓kabe ke↑NkIʧino i↓noʧio u↑ke↓te, pu↑royakyuukyu↓udaNno, ta↑iyoohoe↓eruzuni ka↑kawa↓ru. 80 | ./tsukuyomi/VOICEACTRESS100_080.wav|ka↓sUkani kI↑koete ku↓ru se↓N kyu↓uhyakU sa↓Njuu i↑ʧine↓NbaNno sa↑Nbi↓kaga, ʃi↑daini o↓okIkunaQte i↑ku. 81 | ./tsukuyomi/VOICEACTRESS100_081.wav|mo↓o i↑ideeNgaNpekIʧii↓kino jo↓obuwa, pu↑raasaatopurawihaaNjiiNi↓sekie tsu↑nagaru, ta↑igawa sa↑Ndooni tsu↑zuite i↑ru. 82 | ./tsukuyomi/VOICEACTRESS100_082.wav|ka↑amira↓boʃIto yo↑barete i↑ru wa↑kUseekara, u↑ʧuuseNni no↑Qte, ʧI↑kyuuni ʃi↑Nnyuu ʃI↑ta u↑ʧuu↓jiN. 83 | ./tsukuyomi/VOICEACTRESS100_083.wav|do↑Qgaaba↓Nkuwa, ta↓raya ni↓ʃiNno gyo↑kaku↓ryouga o↓oi, ju↑uyoona, gyo↑joodearu. 84 | ./tsukuyomi/VOICEACTRESS100_084.wav|ʃo↑oneNji↓daiwa, ro↑ʃiate↓ekoku, ʧe↑runiihiu↓keN, pu↑ruiruukui↓guN, to↑rosUʧanuiitsuyamurade su↑go↓ʃIta. 85 | ./tsukuyomi/VOICEACTRESS100_085.wav|i↑haino ho↑to↓Ndowa, su↑weedeNniʃIka↓igaNno, bu↓u hyu↑usureeNʧIho↓ono ko↑jimani a↓ru gyo↑soN, f u y a r u ba↓Qka ʃu↑uheNno u↓mini, sa↑NkotsU sa↑reta. 86 | ./tsukuyomi/VOICEACTRESS100_086.wav|ko↑Qkyooo ko↑ete, re↑Qʃawa, ka↑iryoo sa↑reta za↑iraiseNni so↓Qte, a↑aheNʧuuoo↓ekini mu↑kau. 87 | ./tsukuyomi/VOICEACTRESS100_087.wav|fU↑kuokadaieeho↓okUsudewanaku, ʧo↑okyorihoono ho↑kyooo me↑za↓ʃIte i↑ta, o↑osakakiNtetsuba↓faroozukara, o↓faao u↑ke↓te nyu↑udaN. 88 | ./tsukuyomi/VOICEACTRESS100_088.wav|so↑koniwa, hya↑kudoru↓satsUto, a↑merikani ko↓ito i↑u, mi↑jika↓i me↑QseejidakedaQta. 89 | ./tsukuyomi/VOICEACTRESS100_089.wav|ge↓Nzaiwa, ba↓Qhao mo↑ʧi↓ifUto ʃI↑ta, ha↑apUʃiko↓odono sa↑Qkyoku↓kato ʃI↑te, ki↑okU sa↑rete i↑ru. 90 | ./tsukuyomi/VOICEACTRESS100_090.wav|se↑Ntoo↓fUkuwa, ryo↑o↓udeo ro↑ʃUtsu ʃ i, ryo↑okyakuga, a↑Ndaasu↓utsude o↑owarete i↑ru. 91 | ./tsukuyomi/VOICEACTRESS100_091.wav|do↑obo↓aneni, su↑weedeNo↓ohi, jo↑zefi↓inuga i↑ru. 92 | ./tsukuyomi/VOICEACTRESS100_092.wav|ʃi↑gaiseNwa, hyo↑omeNene↓rugiino, ʧi↑isa↓i po↓rimaao se↑QʧakU su↑ru sa↓ino, ze↑N↓ʃorini ri↑yoo sa↑reru. 
93 | ./tsukuyomi/VOICEACTRESS100_093.wav|ji↓ʃiNno pe↑ejide, me↓Qseejiya, ko↑okaiko↓meNtoo to↑oʃi↓te, re↓byuuo to↑okoo ʃI↑ta yu↓uzaato, ko↑myunike↓eʃoNo to↓ru ko↑to↓ga ka↑noodearu. 94 | ./tsukuyomi/VOICEACTRESS100_094.wav|wa↓kakI hi↑no ha↑Ngyaku↓yueni, u↓ʧuuno ʧu↑uo↓oo tsu↑ihoo sa↑rete, wa↑kUsee, ʧI↑kyuuni ya↑QtekIta ʃu↑ji↓Nkoo, be↓ruzebabuga, u↑ʧuuseN ka↓runaakuno na↓kade, ma↑go↓ni ka↑taru so↑odaina mo↑noga↓tari. 95 | ./tsukuyomi/VOICEACTRESS100_095.wav|ja↓gaatowa ta↑iʃoo↓tekini, bo↑diibi↓rudaao ho↑ofUtsuto sa↑seru, ma↓Qʧona ta↑iikUkai↓keeno ga↑ikeNga to↑kUʧoo. -------------------------------------------------------------------------------- /filelists/ljs_audio_text_val_filelist.txt.cleaned: -------------------------------------------------------------------------------- 1 | DUMMY1/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹɪɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wˌʌt ðeɪ hˈɪɹ ænd wˌʌt ðeɪ ɹˈiːd. 2 | DUMMY1/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː, ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt, tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ, ænd ˈɔːl ðə fˈɜːnɪtʃɚ, ˈaɪ wʊd biː mˈæd æz hˈɛl, tˈuː. 3 | DUMMY1/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹɪpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪnkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn eɪtˈiːn θˈɜːɾifˈaɪv. 4 | DUMMY1/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹɪspˈɛkt: 5 | DUMMY1/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹɪspˈɛkt wʌz tə θɹˈoʊ ðə ɹɪspˌɑːnsəbˈɪlɪɾi ˌɑːn ˈʌðɚz. 6 | DUMMY1/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛlɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌnkənvˈɪktᵻd pɹˈɪzənɚ, ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt, ænd stˈɪl ʌnkəntˈæmᵻnˌeɪɾᵻd, 7 | DUMMY1/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔːstˈeɪʃənɚz. hɪz ɐɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz. 8 | DUMMY1/LJ047-0044.wav|ˈɑːswəld wʌz, haʊˈɛvɚ, wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz. hiː dɪnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz 9 | DUMMY1/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ tʃˈɑːɹlz dʒˈeɪ. kˈæɹɪkˌoʊ, ɐ ɹˈɛzɪdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi. 10 | DUMMY1/LJ048-0194.wav|dˈʊɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛntitˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd. 11 | DUMMY1/LJ049-0026.wav|ˌɑːn əkˈeɪʒən ðə sˈiːkɹət sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt. 12 | DUMMY1/LJ004-0152.wav|ɑːlðˈoʊ æt mˈɪstɚ bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən, ðə fˈɜːst stˈɛp tʊwˈɔːɹdz ɹɪfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˌɛvəntˈiːn sˈɛvəntifˈoːɹ. 13 | DUMMY1/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni, ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsəsɚɹi tuː "mˌeɪk ɐn ɛɡzˈæmpəl." 14 | DUMMY1/LJ043-0002.wav|ðə wˈɔːɹən kəmˈɪʃən ɹɪpˈoːɹt. baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɑːnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi. tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: 15 | DUMMY1/LJ009-0114.wav|mˈɪstɚ wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dɪskɹˈaɪbɪŋ ɐnˈʌðɚ ɹɪlˈɪdʒəs sˈɜːvɪs, wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪətli biː ɪnsˈɜːɾᵻd hˈɪɹ. 16 | DUMMY1/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk. 17 | DUMMY1/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd. 
ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzɪz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə 18 | DUMMY1/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp, hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪlɪˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən 19 | DUMMY1/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl, kwˈoʊt, wiː hɐd ɐ mˈoʊɾɚkˌeɪd wɛɹɹˈɛvɚ wiː wˈɛnt, ˈɛnd kwˈoʊt. 20 | DUMMY1/LJ031-0070.wav|dˈɑːktɚ klˈɑːɹk, hˌuː mˈoʊst klˈoʊsli ɑːbzˈɜːvd ðə hˈɛd wˈuːnd, 21 | DUMMY1/LJ034-0198.wav|jˈuːɪnz, hˌuː wʌz ɑːnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstɪfˌaɪd ðæt hiː kʊd nˌɑːt dɪskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ. 22 | DUMMY1/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt, tʊ ɐ smˈɔːl ɛkstˈɛnt, 23 | DUMMY1/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɑːnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsəsɚɹi. 24 | DUMMY1/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd 25 | DUMMY1/LJ005-0014.wav|spˈiːkɪŋ ˌɑːn ɐ dɪbˈeɪt ˌɑːn pɹˈɪzən mˈæɾɚz, hiː dᵻklˈɛɹd ðˈæt 26 | DUMMY1/LJ012-0161.wav|hiː wʌz ɹɪpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ. 27 | DUMMY1/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹɪpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹɪfˈɜːd tuː 28 | DUMMY1/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛdwˈiːl wʌz ɪn jˈuːs, ðɛɹ sˈɛljʊlɚ kɹˈæŋks, ɔːɹ hˈɑːɹdlˈeɪbɚ məʃˈiːnz. 29 | DUMMY1/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɑːn. 30 | DUMMY1/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɑːnðə kˈoːɹt; 31 | DUMMY1/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz, nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz. ˈaɪ hæv hɐd ɪnˈʌf. 32 | DUMMY1/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp. 33 | DUMMY1/LJ046-0058.wav|dˈʊɹɪŋ hɪz pɹˈɛzɪdənsi, fɹˈæŋklɪn dˈiː. ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹəd dʒˈɜːnɪz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹəd fˈɪfti θˈaʊzənd mˈaɪlz. 34 | DUMMY1/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ, ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv. 35 | DUMMY1/LJ002-0043.wav|lˈɑːŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾisˈɪks fˈiːt, sˈɪks twˈɛntiθɹˈiː fˈiːt, ænd ðɪ ˈeɪtθ eɪtˈiːn, 36 | DUMMY1/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən. 37 | DUMMY1/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hɐd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹɪpɹˈiːv, ænd wɪðˌɪn ɐ fjˈuː ˈaɪʊɹz ʌv ˌɛksɪkjˈuːʃən. 38 | DUMMY1/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹət sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹɪlˈiːsd ɔːɹ ɛskˈeɪps. 39 | DUMMY1/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ, ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt. 40 | DUMMY1/LJ042-0096.wav| ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt 41 | DUMMY1/LJ049-0050.wav|hˈɪl hɐd bˈoʊθ fˈiːt ɑːnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mɪsˈɛs kˈɛnədi. 42 | DUMMY1/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt, nˈuːɡeɪt ɹɪsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntɪz, 43 | DUMMY1/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs, ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsən ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd. 44 | DUMMY1/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd. 45 | DUMMY1/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kənklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɑːnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld. 
46 | DUMMY1/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən. 47 | DUMMY1/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt, ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ɪlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm? 48 | DUMMY1/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪt ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz. 49 | DUMMY1/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪsd ænd ɹɪpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɪz ʌvðə sˈɪɾi ʌv lˈʌndən, 50 | DUMMY1/LJ028-0275.wav|æt lˈæst, ɪnðə twˈɛntiəθ mˈʌnθ, 51 | DUMMY1/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋplˈeɪs wɪð ɐ tɹˈæpdˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd. 52 | DUMMY1/LJ011-0096.wav|hiː mˈæɹɪd ɐ lˈeɪdi ˈɑːlsoʊ bɪlˈɑːŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz, hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃən, wˈɪtʃ, ænd hɪz ˈoʊn mˈʌni, hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm, 53 | DUMMY1/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː. kɹˈeɪɡ, ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti, 54 | DUMMY1/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz, ɡɹˈeɪt lˈɔɪɚz, ɡˈʌvɚnɚz ʌv pɹˈɪzənz, ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː. 55 | DUMMY1/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst, ɐ səspˈɪʃəs sˈɜːkəmstˌæns, æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ. 56 | DUMMY1/LJ027-0141.wav|ɪz klˈoʊsli ɹɪpɹədˈuːst ɪnðə lˈaɪfhˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ. ˈɔːɹ, ɪn ˈʌðɚ wˈɜːdz, 57 | DUMMY1/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi, ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz. 58 | DUMMY1/LJ031-0202.wav|mɪsˈɛs kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hɐd sˈɜːvd ɪnðə nˈeɪvi. 59 | DUMMY1/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊptfɔːɹ pˈiəɹɪəd ʌv pˈiːs, 60 | DUMMY1/LJ016-0288.wav|"mˈʌlɚ, mˈʌlɚ, hiːz ðə mˈæn," tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz, wˌɪtʃ wʌz ɹɪsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz. 61 | DUMMY1/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ, wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdɪli dɪstˈɪŋɡwɪʃ ðə fˈɑːls fɹʌmðə tɹˈuː, 62 | DUMMY1/LJ018-0081.wav|hɪz dɪfˈɛns bˌiːɪŋ ðæt hiː hɐd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd, bˌʌt ðˈæt, ɑːnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hɐd ɹˈɔŋd hˌɪm, 63 | DUMMY1/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪnkɹiːs ɪnðə pˈeɪɹoʊlz, ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts 64 | DUMMY1/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp, bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd, ænd ðə mˈæn wʌz kˈæɹɪd bˈæk tə dʒˈeɪl. 65 | DUMMY1/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz, ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz. 66 | DUMMY1/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən, ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl. 67 | DUMMY1/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs, ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts, ðə hˈaʊskiːpɚ ðˈɛɹ. 68 | DUMMY1/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛntitˈuː, naɪntˈiːn sˈɪkstiθɹˈiː, fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈiəɹɪəl fɚðə pˌiːˌɑːɹˈɛs dʒˈɛnɚɹəl fˈaɪlz 69 | DUMMY1/LJ017-0044.wav|ænd ðə dˈiːpəst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm, ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn, ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ. 70 | DUMMY1/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ, ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn, ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ. 
71 | DUMMY1/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɐɹˈɛstᵻd ˌɑːn səspˈɪʃən, ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd; 72 | DUMMY1/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn, bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd, ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sɪvˈɪɹli. 73 | DUMMY1/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹɪhˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ. ɑːlðˈoʊ ɪt ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt, 74 | DUMMY1/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm. 75 | DUMMY1/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹɪkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɑːŋ ɪn səspˈɛns. 76 | DUMMY1/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dɪfˈɜːd. 77 | DUMMY1/LJ047-0148.wav|ˌɑːn ɑːktˈoʊbɚ twˈɛntifˈaɪv, 78 | DUMMY1/LJ008-0111.wav|ðeɪ ˈɛntɚd ˈeɪ "stˈoʊn kˈoʊld ɹˈuːm," ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ. 79 | DUMMY1/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstɪfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld. 80 | DUMMY1/LJ037-0234.wav|mɪsˈɛs mˈɛɹi bɹˈɑːk, ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən, wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl, 81 | DUMMY1/LJ040-0002.wav|tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: bˈækɡɹaʊnd ænd pˈɑːsəbəl mˈoʊɾɪvz, pˈɑːɹt wˌʌn. 82 | DUMMY1/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstɪfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bɪkˈʌmɪŋ ɪnvˈɑːlvd 83 | DUMMY1/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɑːn wˈɑːtʃᵻz, wɜː kˈɛɹfəli ɹɪmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz. 84 | DUMMY1/LJ012-0250.wav|ɑːnðə sˈɛvənθ dʒuːlˈaɪ, eɪtˈiːn θˈɜːɾisˈɛvən, 85 | DUMMY1/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈɛnɚz tə wˈɜːk baɪ ðə dʒˈɑːb. 86 | DUMMY1/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən. 87 | DUMMY1/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ɪsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi. 88 | DUMMY1/LJ031-0134.wav|ˌɑːn wˈʌn əkˈeɪʒən mɪsˈɛs dʒˈɑːnsən, ɐkˈʌmpənɪd baɪ tˈuː sˈiːkɹət sˈɜːvɪs ˈeɪdʒənts, lˈɛft ðə ɹˈuːm tə sˈiː mɪsˈɛs kˈɛnədi ænd mɪsˈɛs kənˈæli. 89 | DUMMY1/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn. 90 | DUMMY1/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd, ˈoʊpənd, ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts. 91 | DUMMY1/LJ034-0160.wav|ˌɑːn bɹˈɛnənz sˈʌbsɪkwənt sˈɜːtən aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl. 92 | DUMMY1/LJ038-0199.wav|ɪlˈɛvən. ɪf ˈaɪ æm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ, 93 | DUMMY1/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈɐd fɔːɹ hˌɪm, ænd ɹɪmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm. 94 | DUMMY1/LJ033-0047.wav|ˈaɪ nˈoʊɾɪsd wɛn ˈaɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɑːn, ˈɛnd kwˈoʊt, 95 | DUMMY1/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ. 96 | DUMMY1/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli. 97 | DUMMY1/LJ003-0111.wav|hiː wʌz ɪn kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː, ˈɛnd kwˈoʊt. ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɪˈɑːsɪɾi. 
98 | DUMMY1/LJ008-0258.wav|lˈɛt mˌiː ɹɪtɹˈeɪs maɪ stˈɛps, ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz, 99 | DUMMY1/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæn kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt, mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs, fˈɔːɹt wˈɜːθ, sˌæn æntˈoʊnɪˌoʊ, ænd hjˈuːstən. 100 | DUMMY1/LJ004-0045.wav|mˈɪstɚ stˈɜːdʒᵻz bˈoːɹn, sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ, sˌɜː dʒˈeɪmz skˈɑːɹlɪt, ænd wˈɪljəm wˈɪlbɚfˌoːɹs. 101 | -------------------------------------------------------------------------------- /train_latest_ms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import itertools 5 | import math 6 | import torch 7 | from torch import nn, optim 8 | from torch.nn import functional as F 9 | from torch.utils.data import DataLoader 10 | from torch.utils.tensorboard import SummaryWriter 11 | import torch.multiprocessing as mp 12 | import torch.distributed as dist 13 | from torch.nn.parallel import DistributedDataParallel as DDP 14 | from torch.cuda.amp import autocast, GradScaler 15 | from pqmf import PQMF 16 | 17 | import commons 18 | import utils 19 | from data_utils import ( 20 | TextAudioSpeakerLoader, 21 | TextAudioSpeakerCollate, 22 | DistributedBucketSampler 23 | ) 24 | from models import ( 25 | SynthesizerTrn, 26 | MultiPeriodDiscriminator, 27 | ) 28 | from losses import ( 29 | generator_loss, 30 | discriminator_loss, 31 | feature_loss, 32 | kl_loss, 33 | subband_stft_loss 34 | ) 35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch 36 | from text.symbols import symbols 37 | 38 | torch.autograd.set_detect_anomaly(True) 39 | torch.backends.cudnn.benchmark = True 40 | global_step = 0 41 | 42 | 43 | def main(): 44 | """Assume Single Node Multi GPUs Training Only""" 45 | assert torch.cuda.is_available(), "CPU training is not allowed." 
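# NOTE: a minimal launch sketch, assuming the standard VITS-style flags parsed
# by utils.get_hparams() (-c for the config JSON, -m for the model directory):
#
#   CUDA_VISIBLE_DEVICES=0,1 python train_latest_ms.py -c <config.json> -m <model_dir>
#
# main() spawns one run() worker per visible GPU via mp.spawn(), and each
# worker joins the env:// rendezvous defined by the MASTER_ADDR / MASTER_PORT
# values set below; any free port works if 65520 is taken.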
46 | 47 | n_gpus = torch.cuda.device_count() 48 | os.environ['MASTER_ADDR'] = 'localhost' 49 | os.environ['MASTER_PORT'] = '65520' 50 | # n_gpus = 1 51 | 52 | hps = utils.get_hparams() 53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,)) 54 | 55 | 56 | def run(rank, n_gpus, hps): 57 | global global_step 58 | if rank == 0: 59 | logger = utils.get_logger(hps.model_dir) 60 | logger.info(hps) 61 | utils.check_git_hash(hps.model_dir) 62 | writer = SummaryWriter(log_dir=hps.model_dir) 63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) 64 | 65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank) 66 | torch.manual_seed(hps.train.seed) 67 | torch.cuda.set_device(rank) 68 | 69 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) 70 | train_sampler = DistributedBucketSampler( 71 | train_dataset, 72 | hps.train.batch_size, 73 | [32,300,400,500,600,700,800,900,1000], 74 | num_replicas=n_gpus, 75 | rank=rank, 76 | shuffle=True) 77 | collate_fn = TextAudioSpeakerCollate() 78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True, 79 | collate_fn=collate_fn, batch_sampler=train_sampler) 80 | if rank == 0: 81 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) 82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False, 83 | batch_size=hps.train.batch_size, pin_memory=True, 84 | drop_last=False, collate_fn=collate_fn) 85 | 86 | net_g = SynthesizerTrn( 87 | len(symbols), 88 | hps.data.filter_length // 2 + 1, 89 | hps.train.segment_size // hps.data.hop_length, 90 | n_speakers=hps.data.n_speakers, 91 | **hps.model).cuda(rank) 92 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank) 93 | optim_g = torch.optim.AdamW( 94 | net_g.parameters(), 95 | hps.train.learning_rate, 96 | betas=hps.train.betas, 97 | eps=hps.train.eps) 98 | optim_d = torch.optim.AdamW( 99 | net_d.parameters(), 100 | hps.train.learning_rate, 101 | betas=hps.train.betas, 102 | eps=hps.train.eps) 103 | net_g = DDP(net_g, device_ids=[rank]) 104 | net_d = DDP(net_d, device_ids=[rank]) 105 | 106 | try: 107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g) 108 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d) 109 | global_step = (epoch_str - 1) * len(train_loader) 110 | except Exception: # no usable checkpoint found; start training from scratch 111 | epoch_str = 1 112 | global_step = 0 113 | 114 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 115 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2) 116 | 117 | scaler = GradScaler(enabled=hps.train.fp16_run) 118 | 119 | for epoch in range(epoch_str, hps.train.epochs + 1): 120 | if rank==0: 121 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval]) 122 | else: 123 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None) 124 | scheduler_g.step() 125 | scheduler_d.step() 126 | 127 | 128 | 129 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): 130 | net_g, net_d = nets 131 | optim_g, optim_d = optims 132 | scheduler_g, scheduler_d = schedulers 133 | train_loader, eval_loader =
loaders 134 | if writers is not None: 135 | writer, writer_eval = writers 136 | 137 | train_loader.batch_sampler.set_epoch(epoch) 138 | global global_step 139 | 140 | net_g.train() 141 | net_d.train() 142 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(train_loader): 143 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) 144 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) 145 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) 146 | speakers = speakers.cuda(rank, non_blocking=True) 147 | 148 | with autocast(enabled=hps.train.fp16_run): 149 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\ 150 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers) 151 | 152 | mel = spec_to_mel_torch( 153 | spec, 154 | hps.data.filter_length, 155 | hps.data.n_mel_channels, 156 | hps.data.sampling_rate, 157 | hps.data.mel_fmin, 158 | hps.data.mel_fmax) 159 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) 160 | y_hat_mel = mel_spectrogram_torch( 161 | y_hat.squeeze(1), 162 | hps.data.filter_length, 163 | hps.data.n_mel_channels, 164 | hps.data.sampling_rate, 165 | hps.data.hop_length, 166 | hps.data.win_length, 167 | hps.data.mel_fmin, 168 | hps.data.mel_fmax 169 | ) 170 | 171 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice 172 | 173 | # Discriminator 174 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) 175 | with autocast(enabled=False): 176 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) 177 | loss_disc_all = loss_disc 178 | optim_d.zero_grad() 179 | scaler.scale(loss_disc_all).backward() 180 | scaler.unscale_(optim_d) 181 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) 182 | scaler.step(optim_d) 183 | 184 | 185 | 186 | 187 | with autocast(enabled=hps.train.fp16_run): 188 | # Generator 189 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) 190 | with autocast(enabled=False): 191 | loss_dur = torch.sum(l_length.float()) 192 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel 193 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl 194 | 195 | loss_fm = feature_loss(fmap_r, fmap_g) 196 | loss_gen, losses_gen = generator_loss(y_d_hat_g) 197 | 198 | if hps.model.mb_istft_vits == True: 199 | pqmf = PQMF(y.device) 200 | y_mb = pqmf.analysis(y) 201 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb) 202 | else: 203 | loss_subband = torch.tensor(0.0) 204 | 205 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband 206 | 207 | optim_g.zero_grad() 208 | scaler.scale(loss_gen_all).backward() 209 | scaler.unscale_(optim_g) 210 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) 211 | scaler.step(optim_g) 212 | scaler.update() 213 | 214 | if rank==0: 215 | if global_step % hps.train.log_interval == 0: 216 | lr = optim_g.param_groups[0]['lr'] 217 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband] 218 | logger.info('Train Epoch: {} [{:.0f}%]'.format( 219 | epoch, 220 | 100. 
* batch_idx / len(train_loader))) 221 | logger.info([x.item() for x in losses] + [global_step, lr]) 222 | 223 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} 224 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband}) 225 | 226 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) 227 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) 228 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) 229 | image_dict = { 230 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), 231 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), 232 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), 233 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) 234 | } 235 | utils.summarize( 236 | writer=writer, 237 | global_step=global_step, 238 | images=image_dict, 239 | scalars=scalar_dict) 240 | 241 | if global_step % hps.train.eval_interval == 0: 242 | evaluate(hps, net_g, eval_loader, writer_eval) 243 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step))) 244 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step))) 245 | global_step += 1 246 | 247 | 248 | if rank == 0: 249 | logger.info('====> Epoch: {}'.format(epoch)) 250 | 251 | 252 | 253 | 254 | def evaluate(hps, generator, eval_loader, writer_eval): 255 | generator.eval() 256 | with torch.no_grad(): 257 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): 258 | x, x_lengths = x.cuda(0), x_lengths.cuda(0) 259 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) 260 | y, y_lengths = y.cuda(0), y_lengths.cuda(0) 261 | speakers = speakers.cuda(0) 262 | 263 | # remove else 264 | x = x[:1] 265 | x_lengths = x_lengths[:1] 266 | spec = spec[:1] 267 | spec_lengths = spec_lengths[:1] 268 | y = y[:1] 269 | y_lengths = y_lengths[:1] 270 | speakers = speakers[:1] 271 | break 272 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) 273 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length 274 | 275 | mel = spec_to_mel_torch( 276 | spec, 277 | hps.data.filter_length, 278 | hps.data.n_mel_channels, 279 | hps.data.sampling_rate, 280 | hps.data.mel_fmin, 281 | hps.data.mel_fmax) 282 | y_hat_mel = mel_spectrogram_torch( 283 | y_hat.squeeze(1).float(), 284 | hps.data.filter_length, 285 | hps.data.n_mel_channels, 286 | hps.data.sampling_rate, 287 | hps.data.hop_length, 288 | hps.data.win_length, 289 | hps.data.mel_fmin, 290 | hps.data.mel_fmax 291 | ) 292 | image_dict = { 293 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()) 294 | } 295 | audio_dict = { 296 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]] 297 | } 298 | if global_step == 0: 299 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) 300 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]}) 301 | 302 | utils.summarize( 303 | writer=writer_eval, 304 | global_step=global_step, 305 | images=image_dict, 306 | audios=audio_dict, 307 | audio_sampling_rate=hps.data.sampling_rate 308 | ) 309 | generator.train() 310 | 311 
| 312 | if __name__ == "__main__": 313 | os.environ[ 314 | "TORCH_DISTRIBUTED_DEBUG" 315 | ] = "DETAIL" 316 | main() 317 | -------------------------------------------------------------------------------- /attentions.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import torch 5 | from torch import nn 6 | from torch.nn import functional as F 7 | 8 | import commons 9 | import modules 10 | from modules import LayerNorm 11 | 12 | 13 | class Encoder(nn.Module): 14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): 15 | super().__init__() 16 | self.hidden_channels = hidden_channels 17 | self.filter_channels = filter_channels 18 | self.n_heads = n_heads 19 | self.n_layers = n_layers 20 | self.kernel_size = kernel_size 21 | self.p_dropout = p_dropout 22 | self.window_size = window_size 23 | 24 | self.drop = nn.Dropout(p_dropout) 25 | self.attn_layers = nn.ModuleList() 26 | self.norm_layers_1 = nn.ModuleList() 27 | self.ffn_layers = nn.ModuleList() 28 | self.norm_layers_2 = nn.ModuleList() 29 | for i in range(self.n_layers): 30 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) 31 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 32 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) 33 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 34 | 35 | def forward(self, x, x_mask): 36 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 37 | x = x * x_mask 38 | for i in range(self.n_layers): 39 | y = self.attn_layers[i](x, x, attn_mask) 40 | y = self.drop(y) 41 | x = self.norm_layers_1[i](x + y) 42 | 43 | y = self.ffn_layers[i](x, x_mask) 44 | y = self.drop(y) 45 | x = self.norm_layers_2[i](x + y) 46 | x = x * x_mask 47 | return x 48 | 49 | 50 | class Decoder(nn.Module): 51 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): 52 | super().__init__() 53 | self.hidden_channels = hidden_channels 54 | self.filter_channels = filter_channels 55 | self.n_heads = n_heads 56 | self.n_layers = n_layers 57 | self.kernel_size = kernel_size 58 | self.p_dropout = p_dropout 59 | self.proximal_bias = proximal_bias 60 | self.proximal_init = proximal_init 61 | 62 | self.drop = nn.Dropout(p_dropout) 63 | self.self_attn_layers = nn.ModuleList() 64 | self.norm_layers_0 = nn.ModuleList() 65 | self.encdec_attn_layers = nn.ModuleList() 66 | self.norm_layers_1 = nn.ModuleList() 67 | self.ffn_layers = nn.ModuleList() 68 | self.norm_layers_2 = nn.ModuleList() 69 | for i in range(self.n_layers): 70 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) 71 | self.norm_layers_0.append(LayerNorm(hidden_channels)) 72 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) 73 | self.norm_layers_1.append(LayerNorm(hidden_channels)) 74 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) 75 | self.norm_layers_2.append(LayerNorm(hidden_channels)) 76 | 77 | def forward(self, x, x_mask, h, h_mask): 78 | """ 79 | x: decoder input 80 | h: encoder output 81 | """ 82 
| self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) 83 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) 84 | x = x * x_mask 85 | for i in range(self.n_layers): 86 | y = self.self_attn_layers[i](x, x, self_attn_mask) 87 | y = self.drop(y) 88 | x = self.norm_layers_0[i](x + y) 89 | 90 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) 91 | y = self.drop(y) 92 | x = self.norm_layers_1[i](x + y) 93 | 94 | y = self.ffn_layers[i](x, x_mask) 95 | y = self.drop(y) 96 | x = self.norm_layers_2[i](x + y) 97 | x = x * x_mask 98 | return x 99 | 100 | 101 | class MultiHeadAttention(nn.Module): 102 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): 103 | super().__init__() 104 | assert channels % n_heads == 0 105 | 106 | self.channels = channels 107 | self.out_channels = out_channels 108 | self.n_heads = n_heads 109 | self.p_dropout = p_dropout 110 | self.window_size = window_size 111 | self.heads_share = heads_share 112 | self.block_length = block_length 113 | self.proximal_bias = proximal_bias 114 | self.proximal_init = proximal_init 115 | self.attn = None 116 | 117 | self.k_channels = channels // n_heads 118 | self.conv_q = nn.Conv1d(channels, channels, 1) 119 | self.conv_k = nn.Conv1d(channels, channels, 1) 120 | self.conv_v = nn.Conv1d(channels, channels, 1) 121 | self.conv_o = nn.Conv1d(channels, out_channels, 1) 122 | self.drop = nn.Dropout(p_dropout) 123 | 124 | if window_size is not None: 125 | n_heads_rel = 1 if heads_share else n_heads 126 | rel_stddev = self.k_channels**-0.5 127 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 128 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) 129 | 130 | nn.init.xavier_uniform_(self.conv_q.weight) 131 | nn.init.xavier_uniform_(self.conv_k.weight) 132 | nn.init.xavier_uniform_(self.conv_v.weight) 133 | if proximal_init: 134 | with torch.no_grad(): 135 | self.conv_k.weight.copy_(self.conv_q.weight) 136 | self.conv_k.bias.copy_(self.conv_q.bias) 137 | 138 | def forward(self, x, c, attn_mask=None): 139 | q = self.conv_q(x) 140 | k = self.conv_k(c) 141 | v = self.conv_v(c) 142 | 143 | x, self.attn = self.attention(q, k, v, mask=attn_mask) 144 | 145 | x = self.conv_o(x) 146 | return x 147 | 148 | def attention(self, query, key, value, mask=None): 149 | # reshape [b, d, t] -> [b, n_h, t, d_k] 150 | b, d, t_s, t_t = (*key.size(), query.size(2)) 151 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) 152 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 153 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) 154 | 155 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) 156 | if self.window_size is not None: 157 | assert t_s == t_t, "Relative attention is only available for self-attention." 158 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) 159 | rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) 160 | scores_local = self._relative_position_to_absolute_position(rel_logits) 161 | scores = scores + scores_local 162 | if self.proximal_bias: 163 | assert t_s == t_t, "Proximal bias is only available for self-attention." 
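# The next line adds a parameter-free bias of -log(1 + |i - j|) to the
# attention logits (see _attention_bias_proximal below), softly encouraging
# each query position i to attend to nearby key positions j; this only makes
# sense when queries and keys share a time axis, hence the assert above.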
164 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) 165 | if mask is not None: 166 | scores = scores.masked_fill(mask == 0, -1e4) 167 | if self.block_length is not None: 168 | assert t_s == t_t, "Local attention is only available for self-attention." 169 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) 170 | scores = scores.masked_fill(block_mask == 0, -1e4) 171 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] 172 | p_attn = self.drop(p_attn) 173 | output = torch.matmul(p_attn, value) 174 | if self.window_size is not None: 175 | relative_weights = self._absolute_position_to_relative_position(p_attn) 176 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) 177 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) 178 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] 179 | return output, p_attn 180 | 181 | def _matmul_with_relative_values(self, x, y): 182 | """ 183 | x: [b, h, l, m] 184 | y: [h or 1, m, d] 185 | ret: [b, h, l, d] 186 | """ 187 | ret = torch.matmul(x, y.unsqueeze(0)) 188 | return ret 189 | 190 | def _matmul_with_relative_keys(self, x, y): 191 | """ 192 | x: [b, h, l, d] 193 | y: [h or 1, m, d] 194 | ret: [b, h, l, m] 195 | """ 196 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) 197 | return ret 198 | 199 | def _get_relative_embeddings(self, relative_embeddings, length): 200 | max_relative_position = 2 * self.window_size + 1 201 | # Pad first before slice to avoid using cond ops. 202 | pad_length = max(length - (self.window_size + 1), 0) 203 | slice_start_position = max((self.window_size + 1) - length, 0) 204 | slice_end_position = slice_start_position + 2 * length - 1 205 | if pad_length > 0: 206 | padded_relative_embeddings = F.pad( 207 | relative_embeddings, 208 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) 209 | else: 210 | padded_relative_embeddings = relative_embeddings 211 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] 212 | return used_relative_embeddings 213 | 214 | def _relative_position_to_absolute_position(self, x): 215 | """ 216 | x: [b, h, l, 2*l-1] 217 | ret: [b, h, l, l] 218 | """ 219 | batch, heads, length, _ = x.size() 220 | # Concat columns of pad to shift from relative to absolute indexing. 221 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) 222 | 223 | # Concat extra elements so to add up to shape (len+1, 2*len-1). 224 | x_flat = x.view([batch, heads, length * 2 * length]) 225 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) 226 | 227 | # Reshape and slice out the padded elements. 
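# Shape trace for length = 2: the [b, h, 2, 3] input is padded to
# [b, h, 2, 4], flattened to [b, h, 8], padded with length-1 = 1 trailing
# zero to [b, h, 9], viewed as [b, h, 3, 3], and finally sliced down to the
# absolute-position scores of shape [b, h, 2, 2].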
228 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] 229 | return x_final 230 | 231 | def _absolute_position_to_relative_position(self, x): 232 | """ 233 | x: [b, h, l, l] 234 | ret: [b, h, l, 2*l-1] 235 | """ 236 | batch, heads, length, _ = x.size() 237 | # pad along the column dimension 238 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) 239 | x_flat = x.view([batch, heads, length**2 + length*(length -1)]) 240 | # add 0's in the beginning that will skew the elements after reshape 241 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) 242 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] 243 | return x_final 244 | 245 | def _attention_bias_proximal(self, length): 246 | """Bias for self-attention to encourage attention to close positions. 247 | Args: 248 | length: an integer scalar. 249 | Returns: 250 | a Tensor with shape [1, 1, length, length] 251 | """ 252 | r = torch.arange(length, dtype=torch.float32) 253 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) 254 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) 255 | 256 | 257 | class FFN(nn.Module): 258 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): 259 | super().__init__() 260 | self.in_channels = in_channels 261 | self.out_channels = out_channels 262 | self.filter_channels = filter_channels 263 | self.kernel_size = kernel_size 264 | self.p_dropout = p_dropout 265 | self.activation = activation 266 | self.causal = causal 267 | 268 | if causal: 269 | self.padding = self._causal_padding 270 | else: 271 | self.padding = self._same_padding 272 | 273 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) 274 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) 275 | self.drop = nn.Dropout(p_dropout) 276 | 277 | def forward(self, x, x_mask): 278 | x = self.conv_1(self.padding(x * x_mask)) 279 | if self.activation == "gelu": 280 | x = x * torch.sigmoid(1.702 * x) 281 | else: 282 | x = torch.relu(x) 283 | x = self.drop(x) 284 | x = self.conv_2(self.padding(x * x_mask)) 285 | return x * x_mask 286 | 287 | def _causal_padding(self, x): 288 | if self.kernel_size == 1: 289 | return x 290 | pad_l = self.kernel_size - 1 291 | pad_r = 0 292 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 293 | x = F.pad(x, commons.convert_pad_shape(padding)) 294 | return x 295 | 296 | def _same_padding(self, x): 297 | if self.kernel_size == 1: 298 | return x 299 | pad_l = (self.kernel_size - 1) // 2 300 | pad_r = self.kernel_size // 2 301 | padding = [[0, 0], [0, 0], [pad_l, pad_r]] 302 | x = F.pad(x, commons.convert_pad_shape(padding)) 303 | return x 304 | -------------------------------------------------------------------------------- /modules.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | import numpy as np 4 | import scipy 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | 9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 10 | from torch.nn.utils import weight_norm, remove_weight_norm 11 | 12 | import commons 13 | from commons import init_weights, get_padding 14 | from transforms import piecewise_rational_quadratic_transform 15 | 16 | 17 | LRELU_SLOPE = 0.1 18 | 19 | 20 | class LayerNorm(nn.Module): 21 | def __init__(self, channels, eps=1e-5): 22 | super().__init__() 23 | self.channels =
channels 24 | self.eps = eps 25 | 26 | self.gamma = nn.Parameter(torch.ones(channels)) 27 | self.beta = nn.Parameter(torch.zeros(channels)) 28 | 29 | def forward(self, x): 30 | x = x.transpose(1, -1) 31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 32 | return x.transpose(1, -1) 33 | 34 | 35 | class ConvReluNorm(nn.Module): 36 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 37 | super().__init__() 38 | self.in_channels = in_channels 39 | self.hidden_channels = hidden_channels 40 | self.out_channels = out_channels 41 | self.kernel_size = kernel_size 42 | self.n_layers = n_layers 43 | self.p_dropout = p_dropout 44 | assert n_layers > 1, "Number of layers should be larger than 1." 45 | 46 | self.conv_layers = nn.ModuleList() 47 | self.norm_layers = nn.ModuleList() 48 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 49 | self.norm_layers.append(LayerNorm(hidden_channels)) 50 | self.relu_drop = nn.Sequential( 51 | nn.ReLU(), 52 | nn.Dropout(p_dropout)) 53 | for _ in range(n_layers-1): 54 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 55 | self.norm_layers.append(LayerNorm(hidden_channels)) 56 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 57 | self.proj.weight.data.zero_() 58 | self.proj.bias.data.zero_() 59 | 60 | def forward(self, x, x_mask): 61 | x_org = x 62 | for i in range(self.n_layers): 63 | x = self.conv_layers[i](x * x_mask) 64 | x = self.norm_layers[i](x) 65 | x = self.relu_drop(x) 66 | x = x_org + self.proj(x) 67 | return x * x_mask 68 | 69 | 70 | class DDSConv(nn.Module): 71 | """ 72 | Dilated and Depth-Separable Convolution 73 | """ 74 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 75 | super().__init__() 76 | self.channels = channels 77 | self.kernel_size = kernel_size 78 | self.n_layers = n_layers 79 | self.p_dropout = p_dropout 80 | 81 | self.drop = nn.Dropout(p_dropout) 82 | self.convs_sep = nn.ModuleList() 83 | self.convs_1x1 = nn.ModuleList() 84 | self.norms_1 = nn.ModuleList() 85 | self.norms_2 = nn.ModuleList() 86 | for i in range(n_layers): 87 | dilation = kernel_size ** i 88 | padding = (kernel_size * dilation - dilation) // 2 89 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 90 | groups=channels, dilation=dilation, padding=padding 91 | )) 92 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 93 | self.norms_1.append(LayerNorm(channels)) 94 | self.norms_2.append(LayerNorm(channels)) 95 | 96 | def forward(self, x, x_mask, g=None): 97 | if g is not None: 98 | x = x + g 99 | for i in range(self.n_layers): 100 | y = self.convs_sep[i](x * x_mask) 101 | y = self.norms_1[i](y) 102 | y = F.gelu(y) 103 | y = self.convs_1x1[i](y) 104 | y = self.norms_2[i](y) 105 | y = F.gelu(y) 106 | y = self.drop(y) 107 | x = x + y 108 | return x * x_mask 109 | 110 | 111 | class WN(torch.nn.Module): 112 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 113 | super(WN, self).__init__() 114 | assert(kernel_size % 2 == 1) 115 | self.hidden_channels = hidden_channels 116 | self.kernel_size = kernel_size 117 | self.dilation_rate = dilation_rate 118 | self.n_layers = n_layers 119 | self.gin_channels = gin_channels 120 | self.p_dropout = p_dropout 121 | 122 | self.in_layers = torch.nn.ModuleList() 123 | self.res_skip_layers = torch.nn.ModuleList() 124 | self.drop = nn.Dropout(p_dropout) 125 | 126 |
if gin_channels != 0: 127 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) 128 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') 129 | 130 | for i in range(n_layers): 131 | dilation = dilation_rate ** i 132 | padding = int((kernel_size * dilation - dilation) / 2) 133 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 134 | dilation=dilation, padding=padding) 135 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 136 | self.in_layers.append(in_layer) 137 | 138 | # last one is not necessary 139 | if i < n_layers - 1: 140 | res_skip_channels = 2 * hidden_channels 141 | else: 142 | res_skip_channels = hidden_channels 143 | 144 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) 145 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') 146 | self.res_skip_layers.append(res_skip_layer) 147 | 148 | def forward(self, x, x_mask, g=None, **kwargs): 149 | output = torch.zeros_like(x) 150 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 151 | 152 | if g is not None: 153 | g = self.cond_layer(g) 154 | 155 | for i in range(self.n_layers): 156 | x_in = self.in_layers[i](x) 157 | if g is not None: 158 | cond_offset = i * 2 * self.hidden_channels 159 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] 160 | else: 161 | g_l = torch.zeros_like(x_in) 162 | 163 | acts = commons.fused_add_tanh_sigmoid_multiply( 164 | x_in, 165 | g_l, 166 | n_channels_tensor) 167 | acts = self.drop(acts) 168 | 169 | res_skip_acts = self.res_skip_layers[i](acts) 170 | if i < self.n_layers - 1: 171 | res_acts = res_skip_acts[:,:self.hidden_channels,:] 172 | x = (x + res_acts) * x_mask 173 | output = output + res_skip_acts[:,self.hidden_channels:,:] 174 | else: 175 | output = output + res_skip_acts 176 | return output * x_mask 177 | 178 | def remove_weight_norm(self): 179 | if self.gin_channels != 0: 180 | torch.nn.utils.remove_weight_norm(self.cond_layer) 181 | for l in self.in_layers: 182 | torch.nn.utils.remove_weight_norm(l) 183 | for l in self.res_skip_layers: 184 | torch.nn.utils.remove_weight_norm(l) 185 | 186 | 187 | class ResBlock1(torch.nn.Module): 188 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 189 | super(ResBlock1, self).__init__() 190 | self.convs1 = nn.ModuleList([ 191 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 192 | padding=get_padding(kernel_size, dilation[0]))), 193 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 194 | padding=get_padding(kernel_size, dilation[1]))), 195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 196 | padding=get_padding(kernel_size, dilation[2]))) 197 | ]) 198 | self.convs1.apply(init_weights) 199 | 200 | self.convs2 = nn.ModuleList([ 201 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 202 | padding=get_padding(kernel_size, 1))), 203 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 204 | padding=get_padding(kernel_size, 1))), 205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 206 | padding=get_padding(kernel_size, 1))) 207 | ]) 208 | self.convs2.apply(init_weights) 209 | 210 | def forward(self, x, x_mask=None): 211 | for c1, c2 in zip(self.convs1, self.convs2): 212 | xt = F.leaky_relu(x, LRELU_SLOPE) 213 | if x_mask is not None: 214 | xt = xt * x_mask 215 | xt = c1(xt) 216 | xt = F.leaky_relu(xt, LRELU_SLOPE) 217 | if x_mask is not None: 218 | xt = xt * 
x_mask 219 | xt = c2(xt) 220 | x = xt + x 221 | if x_mask is not None: 222 | x = x * x_mask 223 | return x 224 | 225 | def remove_weight_norm(self): 226 | for l in self.convs1: 227 | remove_weight_norm(l) 228 | for l in self.convs2: 229 | remove_weight_norm(l) 230 | 231 | 232 | class ResBlock2(torch.nn.Module): 233 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 234 | super(ResBlock2, self).__init__() 235 | self.convs = nn.ModuleList([ 236 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 237 | padding=get_padding(kernel_size, dilation[0]))), 238 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 239 | padding=get_padding(kernel_size, dilation[1]))) 240 | ]) 241 | self.convs.apply(init_weights) 242 | 243 | def forward(self, x, x_mask=None): 244 | for c in self.convs: 245 | xt = F.leaky_relu(x, LRELU_SLOPE) 246 | if x_mask is not None: 247 | xt = xt * x_mask 248 | xt = c(xt) 249 | x = xt + x 250 | if x_mask is not None: 251 | x = x * x_mask 252 | return x 253 | 254 | def remove_weight_norm(self): 255 | for l in self.convs: 256 | remove_weight_norm(l) 257 | 258 | 259 | class Log(nn.Module): 260 | def forward(self, x, x_mask, reverse=False, **kwargs): 261 | if not reverse: 262 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 263 | logdet = torch.sum(-y, [1, 2]) 264 | return y, logdet 265 | else: 266 | x = torch.exp(x) * x_mask 267 | return x 268 | 269 | 270 | class Flip(nn.Module): 271 | def forward(self, x, *args, reverse=False, **kwargs): 272 | x = torch.flip(x, [1]) 273 | if not reverse: 274 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 275 | return x, logdet 276 | else: 277 | return x 278 | 279 | 280 | class ElementwiseAffine(nn.Module): 281 | def __init__(self, channels): 282 | super().__init__() 283 | self.channels = channels 284 | self.m = nn.Parameter(torch.zeros(channels,1)) 285 | self.logs = nn.Parameter(torch.zeros(channels,1)) 286 | 287 | def forward(self, x, x_mask, reverse=False, **kwargs): 288 | if not reverse: 289 | y = self.m + torch.exp(self.logs) * x 290 | y = y * x_mask 291 | logdet = torch.sum(self.logs * x_mask, [1,2]) 292 | return y, logdet 293 | else: 294 | x = (x - self.m) * torch.exp(-self.logs) * x_mask 295 | return x 296 | 297 | 298 | class ResidualCouplingLayer(nn.Module): 299 | def __init__(self, 300 | channels, 301 | hidden_channels, 302 | kernel_size, 303 | dilation_rate, 304 | n_layers, 305 | p_dropout=0, 306 | gin_channels=0, 307 | mean_only=False): 308 | assert channels % 2 == 0, "channels should be divisible by 2" 309 | super().__init__() 310 | self.channels = channels 311 | self.hidden_channels = hidden_channels 312 | self.kernel_size = kernel_size 313 | self.dilation_rate = dilation_rate 314 | self.n_layers = n_layers 315 | self.half_channels = channels // 2 316 | self.mean_only = mean_only 317 | 318 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 319 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 320 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) 321 | self.post.weight.data.zero_() 322 | self.post.bias.data.zero_() 323 | 324 | def forward(self, x, x_mask, g=None, reverse=False): 325 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 326 | h = self.pre(x0) * x_mask 327 | h = self.enc(h, x_mask, g=g) 328 | stats = self.post(h) * x_mask 329 | if not self.mean_only: 330 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 331 | else: 
332 | m = stats 333 | logs = torch.zeros_like(m) 334 | 335 | if not reverse: 336 | x1 = m + x1 * torch.exp(logs) * x_mask 337 | x = torch.cat([x0, x1], 1) 338 | logdet = torch.sum(logs, [1,2]) 339 | return x, logdet 340 | else: 341 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 342 | x = torch.cat([x0, x1], 1) 343 | return x 344 | 345 | 346 | class ConvFlow(nn.Module): 347 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): 348 | super().__init__() 349 | self.in_channels = in_channels 350 | self.filter_channels = filter_channels 351 | self.kernel_size = kernel_size 352 | self.n_layers = n_layers 353 | self.num_bins = num_bins 354 | self.tail_bound = tail_bound 355 | self.half_channels = in_channels // 2 356 | 357 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) 358 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) 359 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) 360 | self.proj.weight.data.zero_() 361 | self.proj.bias.data.zero_() 362 | 363 | def forward(self, x, x_mask, g=None, reverse=False): 364 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 365 | h = self.pre(x0) 366 | h = self.convs(h, x_mask, g=g) 367 | h = self.proj(h) * x_mask 368 | 369 | b, c, t = x0.shape 370 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 371 | 372 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) 373 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) 374 | unnormalized_derivatives = h[..., 2 * self.num_bins:] 375 | 376 | x1, logabsdet = piecewise_rational_quadratic_transform(x1, 377 | unnormalized_widths, 378 | unnormalized_heights, 379 | unnormalized_derivatives, 380 | inverse=reverse, 381 | tails='linear', 382 | tail_bound=self.tail_bound 383 | ) 384 | 385 | x = torch.cat([x0, x1], 1) * x_mask 386 | logdet = torch.sum(logabsdet * x_mask, [1,2]) 387 | if not reverse: 388 | return x, logdet 389 | else: 390 | return x 391 | --------------------------------------------------------------------------------
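A quick sanity check for the flow blocks above (not part of the repository; a minimal sketch assuming it is run from the repo root with torch installed): applying a ResidualCouplingLayer forward and then with reverse=True should reconstruct its input up to floating-point error, since the reverse pass inverts the affine transform.

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(
    channels=4, hidden_channels=8, kernel_size=5,
    dilation_rate=1, n_layers=2, mean_only=True)
layer.eval()  # ensure dropout is off so the two passes are exact inverses

x = torch.randn(1, 4, 10)      # [batch, channels, time]
x_mask = torch.ones(1, 1, 10)  # all time steps valid
with torch.no_grad():
    y, logdet = layer(x, x_mask)            # forward: output plus log-determinant
    x_rec = layer(y, x_mask, reverse=True)  # reverse: undoes the coupling transform
print(torch.allclose(x, x_rec, atol=1e-5))  # expected: True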