├── fig
│   ├── proposed_model.png
│   └── with_tsukuyomi_chan.png
├── monotonic_align
│   ├── setup.py
│   ├── __init__.py
│   └── core.pyx
├── requirements.txt
├── convert_to_22050.py
├── filelists
│   ├── filelist_val2.txt.cleaned
│   ├── vctk_audio_sid_text_val_filelist.txt
│   ├── vctk_audio_sid_text_val_filelist.txt.cleaned
│   ├── ljs_audio_text_val_filelist.txt
│   ├── filelist_train2.txt.cleaned
│   └── ljs_audio_text_val_filelist.txt.cleaned
├── text
│   ├── symbols.py
│   ├── LICENSE
│   ├── __init__.py
│   ├── py2kn.json
│   ├── japanese.py
│   ├── cleaners.py
│   └── korean.py
├── preprocess.py
├── configs
│   ├── tsukuyomi_chan.json
│   ├── ljs_istft_vits.json
│   ├── ljs_mb_istft_vits.json
│   ├── ljs_mini_istft_vits.json
│   ├── ljs_mini_mb_istft_vits.json
│   └── ljs_ms_istft_vits.json
├── losses.py
├── inference.ipynb
├── README.md
├── mel_processing.py
├── pqmf.py
├── stft_loss.py
├── commons.py
├── utils.py
├── transforms.py
├── stft.py
├── LICENSE
├── train_latest.py
├── train_latest_ms.py
├── attentions.py
└── modules.py
/fig/proposed_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/proposed_model.png
--------------------------------------------------------------------------------
/fig/with_tsukuyomi_chan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/with_tsukuyomi_chan.png
--------------------------------------------------------------------------------
/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | import numpy
4 |
5 | setup(
6 | name = 'monotonic_align',
7 | ext_modules = cythonize("core.pyx"),
8 | include_dirs=[numpy.get_include()]
9 | )
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.29.21
2 | librosa==0.8.0
3 | matplotlib==3.3.1
4 | numpy==1.18.5
5 | phonemizer==2.2.1
6 | scipy==1.5.2
7 | tensorboard==2.3.0
8 | torch==1.6.0
9 | torchvision==0.7.0
10 | Unidecode==1.1.1
11 | pysoundfile==0.9.0.post1
12 | pyopenjtalk==0.2.0
13 | jamo==0.4.1
14 | ko_pron==1.3
15 |
--------------------------------------------------------------------------------
/convert_to_22050.py:
--------------------------------------------------------------------------------
1 | import os
2 | import librosa
3 | import argparse
4 | import soundfile as sf
5 |
6 | if __name__ == '__main__':
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--in_path", default="./tsukuyomi_raw/", required=True)
9 | parser.add_argument("--out_path", default="./tsukuyomi/" ,required=True)
10 |
11 | args = parser.parse_args()
12 |
13 | os.makedirs(args.out_path, exist_ok=True)
14 | filenames = os.listdir(args.in_path)
15 |     for filename in filenames:
16 |         in_file = os.path.join(args.in_path, filename)  # robust to a missing trailing slash
17 |         print(in_file)
18 |         # resample to 22050 Hz mono and write as 16-bit PCM
19 |         y, sr = librosa.core.load(in_file, sr=22050, mono=True)
20 |         sf.write(os.path.join(args.out_path, filename), y, sr, subtype="PCM_16")
21 |
--------------------------------------------------------------------------------
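
A usage sketch for the converter above; the paths mirror the script's own illustrative defaults:

```sh
python convert_to_22050.py --in_path ./tsukuyomi_raw/ --out_path ./tsukuyomi/
```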
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from .monotonic_align.core import maximum_path_c
4 |
5 |
6 | def maximum_path(neg_cent, mask):
7 | """ Cython optimized version.
8 | neg_cent: [b, t_t, t_s]
9 | mask: [b, t_t, t_s]
10 | """
11 | device = neg_cent.device
12 | dtype = neg_cent.dtype
13 | neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
14 | path = np.zeros(neg_cent.shape, dtype=np.int32)
15 |
16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
18 | maximum_path_c(path, neg_cent, t_t_max, t_s_max)
19 | return torch.from_numpy(path).to(device=device, dtype=dtype)
20 |
--------------------------------------------------------------------------------
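
A minimal usage sketch for `maximum_path`, assuming the Cython extension has been built as described in the README (`cd monotonic_align; mkdir monotonic_align; python setup.py build_ext --inplace`); the shapes are illustrative:

```python
import torch
import monotonic_align

b, t_t, t_s = 1, 50, 20              # batch, spectrogram frames, text tokens
neg_cent = torch.randn(b, t_t, t_s)  # stand-in for negative cross-entropies
mask = torch.ones(b, t_t, t_s)       # all positions valid
path = monotonic_align.maximum_path(neg_cent, mask)
print(path.shape)                    # torch.Size([1, 50, 20]); a 0/1 alignment matrix
```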
/filelists/filelist_val2.txt.cleaned:
--------------------------------------------------------------------------------
1 | ./tsukuyomi/VOICEACTRESS100_096.wav|pe↑Nʃirubenia↓ʃuu, pi↑Qtsuba↓aguno, a↑regeeniiko↓okooo so↑tsugyoo ʃ i, ka↑riforuniada↓igaku, ba↑akuree↓kooni nyu↑ugaku.
2 | ./tsukuyomi/VOICEACTRESS100_097.wav|ko↑no ga↓ineNno do↑onyuuniyoQte, sa↑ma↓zamana ba↑rie↓eʃoNno, ryu↑utaino ko↑Npyuutaaʃimyure↓eʃoNga, ta↑ka↓i se↓edode ka↑nooto na↓Qta.
3 | ./tsukuyomi/VOICEACTRESS100_098.wav|i↓nui do↓Qkuni nyu↓ukyo ʃI↑te, o↑obaaho↓oru su↑be↓kIka do↓oka, pa↑fo↓omaNsuga ʧe↓QkU sa↑reta.
4 | ./tsukuyomi/VOICEACTRESS100_099.wav|de↑byuuwe↓etowa, su↑upaabaNtamu↓kyuudewanaku, fe↑zaa↓kyuudaQta.
5 | ./tsukuyomi/VOICEACTRESS100_100.wav|a↓ariiwa, ko↓ouno na↓kao, mi↑namino ba↑ajinia↓ʃuu, wi↑NʧesUtaaʧi↓kakuno, fi↑Qʃaazuhi↓rumade, gu↓No ʃi↑rizo↓ita.
--------------------------------------------------------------------------------
/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 | '''
6 | _pad = '_'
7 | _punctuation = ';:,.!?¡¿—…"«»“” '
8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ'
9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10 |
11 | '''# korean_cleaners
12 | _pad = '_'
13 | _punctuation = ',.!?…~'
14 | _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
15 | '''
16 |
17 | # Export all symbols:
18 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
19 |
20 | # Special symbol ids
21 | SPACE_ID = symbols.index(" ")
22 |
--------------------------------------------------------------------------------
/text/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import text
3 | from utils import load_filepaths_and_text
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--out_extension", default="cleaned")
8 | parser.add_argument("--text_index", default=1, type=int)
9 | parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
10 | parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
11 |
12 | args = parser.parse_args()
13 |
14 |
15 | for filelist in args.filelists:
16 | print("START:", filelist)
17 | filepaths_and_text = load_filepaths_and_text(filelist)
18 | for i in range(len(filepaths_and_text)):
19 | original_text = filepaths_and_text[i][args.text_index]
20 | cleaned_text = text._clean_text(original_text, args.text_cleaners)
21 | filepaths_and_text[i][args.text_index] = cleaned_text
22 |
23 | new_filelist = filelist + "." + args.out_extension
24 | with open(new_filelist, "w", encoding="utf-8") as f:
25 | f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])
26 |
--------------------------------------------------------------------------------
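
A usage sketch for the Japanese tutorial corpus, assuming the uncleaned manifests `filelists/filelist_train2.txt` and `filelists/filelist_val2.txt` exist; the script writes `.cleaned` files next to them:

```sh
python preprocess.py --text_index 1 \
    --filelists filelists/filelist_train2.txt filelists/filelist_val2.txt \
    --text_cleaners 'japanese_cleaners'
```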
/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | from cython.parallel import prange
3 |
4 |
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
8 | cdef int x
9 | cdef int y
10 | cdef float v_prev
11 | cdef float v_cur
12 | cdef float tmp
13 | cdef int index = t_x - 1
14 |
15 | for y in range(t_y):
16 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
17 | if x == y:
18 | v_cur = max_neg_val
19 | else:
20 | v_cur = value[y-1, x]
21 | if x == 0:
22 | if y == 0:
23 | v_prev = 0.
24 | else:
25 | v_prev = max_neg_val
26 | else:
27 | v_prev = value[y-1, x-1]
28 | value[y, x] += max(v_prev, v_cur)
29 |
30 | for y in range(t_y - 1, -1, -1):
31 | path[y, index] = 1
32 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
33 | index = index - 1
34 |
35 |
36 | @cython.boundscheck(False)
37 | @cython.wraparound(False)
38 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
39 | cdef int b = paths.shape[0]
40 | cdef int i
41 | for i in prange(b, nogil=True):
42 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
43 |
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from text import cleaners
3 | from text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 |
11 | def text_to_sequence(text, cleaner_names):
12 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13 | Args:
14 | text: string to convert to a sequence
15 | cleaner_names: names of the cleaner functions to run the text through
16 | Returns:
17 | List of integers corresponding to the symbols in the text
18 | '''
19 | sequence = []
20 |
21 | clean_text = _clean_text(text, cleaner_names)
22 | for symbol in clean_text:
23 | symbol_id = _symbol_to_id[symbol]
24 | sequence += [symbol_id]
25 | return sequence
26 |
27 |
28 | def cleaned_text_to_sequence(cleaned_text):
 29 |   '''Converts a string of already-cleaned text to a sequence of IDs corresponding to the symbols in the text.
 30 |     Args:
 31 |       cleaned_text: cleaned string to convert to a sequence
32 | Returns:
33 | List of integers corresponding to the symbols in the text
34 | '''
35 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
36 | return sequence
37 |
38 |
39 | def sequence_to_text(sequence):
40 | '''Converts a sequence of IDs back to a string'''
41 | result = ''
42 | for symbol_id in sequence:
43 | s = _id_to_symbol[symbol_id]
44 | result += s
45 | return result
46 |
47 |
48 | def _clean_text(text, cleaner_names):
49 | for name in cleaner_names:
50 | cleaner = getattr(cleaners, name)
51 | if not cleaner:
52 | raise Exception('Unknown cleaner: %s' % name)
53 | text = cleaner(text)
54 | return text
55 |
--------------------------------------------------------------------------------
/configs/tsukuyomi_chan.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 1000,
5 | "seed": 1234,
6 | "epochs": 10000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 32,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"./filelists/filelist_train2.txt.cleaned",
25 | "validation_files":"./filelists/filelist_val2.txt.cleaned",
26 | "text_cleaners":["japanese_cleaners"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/configs/ljs_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mini_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mini_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 | from stft_loss import MultiResolutionSTFTLoss
4 |
5 |
6 | import commons
7 |
8 |
9 | def feature_loss(fmap_r, fmap_g):
10 | loss = 0
11 | for dr, dg in zip(fmap_r, fmap_g):
12 | for rl, gl in zip(dr, dg):
13 | rl = rl.float().detach()
14 | gl = gl.float()
15 | loss += torch.mean(torch.abs(rl - gl))
16 |
17 | return loss * 2
18 |
19 |
20 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
21 | loss = 0
22 | r_losses = []
23 | g_losses = []
24 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
25 | dr = dr.float()
26 | dg = dg.float()
27 | r_loss = torch.mean((1-dr)**2)
28 | g_loss = torch.mean(dg**2)
29 | loss += (r_loss + g_loss)
30 | r_losses.append(r_loss.item())
31 | g_losses.append(g_loss.item())
32 |
33 | return loss, r_losses, g_losses
34 |
35 |
36 | def generator_loss(disc_outputs):
37 | loss = 0
38 | gen_losses = []
39 | for dg in disc_outputs:
40 | dg = dg.float()
41 | l = torch.mean((1-dg)**2)
42 | gen_losses.append(l)
43 | loss += l
44 |
45 | return loss, gen_losses
46 |
47 |
48 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
49 | """
50 | z_p, logs_q: [b, h, t_t]
51 | m_p, logs_p: [b, h, t_t]
52 | """
53 | z_p = z_p.float()
54 | logs_q = logs_q.float()
55 | m_p = m_p.float()
56 | logs_p = logs_p.float()
57 | z_mask = z_mask.float()
58 |
59 | kl = logs_p - logs_q - 0.5
60 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
61 | kl = torch.sum(kl * z_mask)
62 | l = kl / torch.sum(z_mask)
63 | return l
64 |
65 | def subband_stft_loss(h, y_mb, y_hat_mb):
66 | sub_stft_loss = MultiResolutionSTFTLoss(h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths)
67 | y_mb = y_mb.view(-1, y_mb.size(2))
68 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2))
69 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, :y_mb.size(-1)], y_mb)
70 | return sub_sc_loss+sub_mag_loss
71 |
72 |
--------------------------------------------------------------------------------
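
A minimal sketch of `kl_loss` with random tensors shaped as its docstring describes; the mask broadcasts over the channel dimension:

```python
import torch
from losses import kl_loss

b, h, t_t = 2, 192, 50
z_p, logs_q = torch.randn(b, h, t_t), torch.randn(b, h, t_t)
m_p, logs_p = torch.randn(b, h, t_t), torch.randn(b, h, t_t)
z_mask = torch.ones(b, 1, t_t)   # 1 = valid frame
print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))  # scalar KL term
```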
/configs/ljs_ms_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": true,
41 | "mb_istft_vits": false,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%matplotlib inline\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "import IPython.display as ipd\n",
12 | "\n",
13 | "import os\n",
14 | "import json\n",
15 | "import math\n",
16 | "import torch\n",
17 | "from torch import nn\n",
18 | "from torch.nn import functional as F\n",
19 | "from torch.utils.data import DataLoader\n",
20 | "\n",
21 | "import commons\n",
22 | "import utils\n",
23 | "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
24 | "from models import SynthesizerTrn\n",
25 | "from text.symbols import symbols\n",
26 | "from text import text_to_sequence\n",
27 | "\n",
28 | "from scipy.io.wavfile import write\n",
29 | "\n",
30 | "\n",
31 | "def get_text(text, hps):\n",
32 | " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
33 | " if hps.data.add_blank:\n",
34 | " text_norm = commons.intersperse(text_norm, 0)\n",
35 | " text_norm = torch.LongTensor(text_norm)\n",
36 | " return text_norm"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## MB-iSTFT-VITS"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "hps = utils.get_hparams_from_file(\"./configs/tsukuyomi_chan.json\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "net_g = SynthesizerTrn(\n",
62 | " len(symbols),\n",
63 | " hps.data.filter_length // 2 + 1,\n",
64 | " hps.train.segment_size // hps.data.hop_length,\n",
65 | " **hps.model).cuda()\n",
66 | "_ = net_g.eval()\n",
67 | "\n",
68 | "_ = utils.load_checkpoint(\"./logs/tsukuyomi/G_100000.pth\", net_g, None)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "stn_tst = get_text(\"こんにちは。\", hps)\n",
78 | "with torch.no_grad():\n",
79 | " x_tst = stn_tst.cuda().unsqueeze(0)\n",
80 | " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
81 | " audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
82 | "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.8.13"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 4
107 | }
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MB-iSTFT-VITS with Multilingual Implementations
2 |
3 |
 4 | This is a multilingual implementation of [MB-iSTFT-VITS](https://github.com/MasayaKawamura/MB-iSTFT-VITS), extended to support training in various languages. MB-iSTFT-VITS achieves about 4.1 times faster inference than the original VITS!
 5 | A preprocessed Japanese single-speaker training setup is provided for the [つくよみちゃんコーパス (tsukuyomi-chan corpus)](https://tyc.rei-yumesaki.net/material/corpus/). You need to download the corpus and place its 100 `.wav` files in `./tsukuyomi_raw`.
6 |
7 |
 8 | - Currently supported: Japanese / Korean
 9 | - Chinese / CJKE / other languages will be added soon!
10 |
11 |
12 | # How to use
13 | Python >= 3.6 (Python == 3.7 is suggested)
14 |
15 | ## Clone this repository
16 | ```sh
17 | git clone https://github.com/misakiudon/MB-iSTFT-VITS-multilingual.git
18 | ```
19 |
20 | ## Install requirements
21 | ```sh
22 | pip install -r requirements.txt
23 | ```
24 | You may need to install espeak first: `apt-get install espeak`
25 |
26 | ## Create manifest data
27 | ### Single speaker
28 | "n_speakers" should be 0 in config.json
29 | ```
30 | path/to/XXX.wav|transcript
31 | ```
32 | - Example
33 | ```
34 | dataset/001.wav|こんにちは。
35 | ```
36 |
37 | ### Multiple speakers
38 | Speaker IDs should start from 0.
39 | ```
40 | path/to/XXX.wav|speaker id|transcript
41 | ```
42 | - Example
43 | ```
44 | dataset/001.wav|0|こんにちは。
45 | ```
46 |
47 | ## Preprocess
48 | Preprocessed Japanese manifests are provided as `filelists/filelist_train2.txt.cleaned` and `filelists/filelist_val2.txt.cleaned`.
49 | ```sh
50 | # Single speaker
51 | python preprocess.py --text_index 1 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners'
52 |
53 | # Multiple speakers
54 | python preprocess.py --text_index 2 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners'
55 | ```
56 |
57 | If your speech files are not `22050 Hz / mono / PCM-16`, you should resample the `.wav` files first.
58 | ```sh
59 | python convert_to_22050.py --in_path path/to/original_wav_dir/ --out_path path/to/output_wav_dir/
60 | ```
61 |
62 | ## Build monotonic alignment search
63 | ```sh
64 | # Cython-version Monotonic Alignment Search
65 | cd monotonic_align
66 | mkdir monotonic_align
67 | python setup.py build_ext --inplace
68 | ```
69 |
70 | ## Setting json file in [configs](configs)
71 |
72 | | Model | How to set up json file in [configs](configs) | Sample of json file configuration|
73 | | :---: | :---: | :---: |
 74 | | iSTFT-VITS | `"istft_vits": true,`<br>`"upsample_rates": [8,8],` | ljs_istft_vits.json |
 75 | | MB-iSTFT-VITS | `"subbands": 4,`<br>`"mb_istft_vits": true,`<br>`"upsample_rates": [4,4],` | ljs_mb_istft_vits.json |
 76 | | MS-iSTFT-VITS | `"subbands": 4,`<br>`"ms_istft_vits": true,`<br>`"upsample_rates": [4,4],` | ljs_ms_istft_vits.json |
77 |
78 | For a tutorial, see `configs/tsukuyomi_chan.json` as a complete example.
79 | - If you have done preprocessing, set "cleaned_text" to true.
80 | - Change `training_files` and `validation_files` to the path of preprocessed manifest files.
81 | - Select the same `text_cleaners` you used in the preprocessing step.
82 |
83 | ## Train
84 | ```sh
85 | # Single speaker
86 | python train_latest.py -c <config json path> -m <model folder name>
87 |
88 | # Multiple speakers
89 | python train_latest_ms.py -c <config json path> -m <model folder name>
90 | ```
91 | To train MB-iSTFT-VITS on the Japanese tutorial corpus, run the following script. Resuming from the latest checkpoint is automatic.
92 | ```sh
93 | python train_latest.py -c configs/tsukuyomi_chan.json -m tsukuyomi
94 | ```
95 |
96 | After training, you can check the synthesized audio using [inference.ipynb](inference.ipynb).
97 |
98 | ## References
99 | - https://github.com/MasayaKawamura/MB-iSTFT-VITS
100 | - https://github.com/CjangCjengh/vits
101 | - https://github.com/Francis-Komizu/VITS
102 |
--------------------------------------------------------------------------------
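
As a cross-check against the table in the README, the MB-iSTFT-VITS sample configs above select the decoder variant with this fragment of their `model` block:

```json
"ms_istft_vits": false,
"mb_istft_vits": true,
"istft_vits": false,
"subbands": 4,
"upsample_rates": [4,4]
```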
/mel_processing.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import random
4 | import torch
5 | from torch import nn
6 | import torch.nn.functional as F
7 | import torch.utils.data
8 | import numpy as np
9 | import librosa
10 | import librosa.util as librosa_util
11 | from librosa.util import normalize, pad_center, tiny
12 | from scipy.signal import get_window
13 | from scipy.io.wavfile import read
14 | from librosa.filters import mel as librosa_mel_fn
15 |
16 | MAX_WAV_VALUE = 32768.0
17 |
18 |
19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20 | """
21 | PARAMS
22 | ------
23 | C: compression factor
24 | """
25 | return torch.log(torch.clamp(x, min=clip_val) * C)
26 |
27 |
28 | def dynamic_range_decompression_torch(x, C=1):
29 | """
30 | PARAMS
31 | ------
32 | C: compression factor used to compress
33 | """
34 | return torch.exp(x) / C
35 |
36 |
37 | def spectral_normalize_torch(magnitudes):
38 | output = dynamic_range_compression_torch(magnitudes)
39 | return output
40 |
41 |
42 | def spectral_de_normalize_torch(magnitudes):
43 | output = dynamic_range_decompression_torch(magnitudes)
44 | return output
45 |
46 |
47 | mel_basis = {}
48 | hann_window = {}
49 |
50 |
51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52 | if torch.min(y) < -1.:
53 | print('min value is ', torch.min(y))
54 | if torch.max(y) > 1.:
55 | print('max value is ', torch.max(y))
56 |
57 | global hann_window
58 | dtype_device = str(y.dtype) + '_' + str(y.device)
59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
60 | if wnsize_dtype_device not in hann_window:
61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62 |
63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64 | y = y.squeeze(1)
65 |
66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67 | center=center, pad_mode='reflect', normalized=False, onesided=True)
68 |
69 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70 | return spec
71 |
72 |
73 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74 | global mel_basis
75 | dtype_device = str(spec.dtype) + '_' + str(spec.device)
76 | fmax_dtype_device = str(fmax) + '_' + dtype_device
77 | if fmax_dtype_device not in mel_basis:
78 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
79 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81 | spec = spectral_normalize_torch(spec)
82 | return spec
83 |
84 |
85 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86 | if torch.min(y) < -1.:
87 | print('min value is ', torch.min(y))
88 | if torch.max(y) > 1.:
89 | print('max value is ', torch.max(y))
90 |
91 | global mel_basis, hann_window
92 | dtype_device = str(y.dtype) + '_' + str(y.device)
93 | fmax_dtype_device = str(fmax) + '_' + dtype_device
94 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
95 | if fmax_dtype_device not in mel_basis:
96 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
97 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98 | if wnsize_dtype_device not in hann_window:
99 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100 |
101 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102 | y = y.squeeze(1)
103 |
104 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105 | center=center, pad_mode='reflect', normalized=False, onesided=True)
106 |
107 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108 |
109 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110 | spec = spectral_normalize_torch(spec)
111 |
112 | return spec
113 |
--------------------------------------------------------------------------------
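
A minimal sketch of `mel_spectrogram_torch` with the hyperparameters used in the sample configs; the waveform is a random stand-in scaled to [-1, 1]:

```python
import torch
from mel_processing import mel_spectrogram_torch

y = torch.rand(1, 8192) * 2 - 1   # (B, T) waveform segment in [-1, 1]
mel = mel_spectrogram_torch(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                            hop_size=256, win_size=1024, fmin=0.0, fmax=None)
print(mel.shape)                  # torch.Size([1, 80, 32])
```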
/pqmf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2020 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """Pseudo QMF modules."""
7 |
8 | import numpy as np
9 | import torch
10 | import torch.nn.functional as F
11 |
12 | from scipy.signal import kaiser
13 |
14 |
15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
16 | """Design prototype filter for PQMF.
17 | This method is based on `A Kaiser window approach for the design of prototype
18 | filters of cosine modulated filterbanks`_.
19 | Args:
20 | taps (int): The number of filter taps.
21 | cutoff_ratio (float): Cut-off frequency ratio.
22 | beta (float): Beta coefficient for kaiser window.
23 | Returns:
 24 |         ndarray: Impulse response of prototype filter (taps + 1,).
25 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
26 | https://ieeexplore.ieee.org/abstract/document/681427
27 | """
28 | # check the arguments are valid
 29 |     assert taps % 2 == 0, "The number of taps must be an even number."
30 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
31 |
32 | # make initial filter
33 | omega_c = np.pi * cutoff_ratio
34 | with np.errstate(invalid='ignore'):
35 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \
36 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps))
37 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
38 |
39 | # apply kaiser window
40 | w = kaiser(taps + 1, beta)
41 | h = h_i * w
42 |
43 | return h
44 |
45 |
46 | class PQMF(torch.nn.Module):
47 | """PQMF module.
48 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
49 | .. _`Near-perfect-reconstruction pseudo-QMF banks`:
50 | https://ieeexplore.ieee.org/document/258122
51 | """
52 |
53 | def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
54 | """Initilize PQMF module.
55 | Args:
56 | subbands (int): The number of subbands.
57 | taps (int): The number of filter taps.
58 | cutoff_ratio (float): Cut-off frequency ratio.
59 | beta (float): Beta coefficient for kaiser window.
60 | """
61 | super(PQMF, self).__init__()
62 |
63 | # define filter coefficient
64 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
65 | h_analysis = np.zeros((subbands, len(h_proto)))
66 | h_synthesis = np.zeros((subbands, len(h_proto)))
67 | for k in range(subbands):
68 | h_analysis[k] = 2 * h_proto * np.cos(
69 | (2 * k + 1) * (np.pi / (2 * subbands)) *
70 | (np.arange(taps + 1) - ((taps - 1) / 2)) +
71 | (-1) ** k * np.pi / 4)
72 | h_synthesis[k] = 2 * h_proto * np.cos(
73 | (2 * k + 1) * (np.pi / (2 * subbands)) *
74 | (np.arange(taps + 1) - ((taps - 1) / 2)) -
75 | (-1) ** k * np.pi / 4)
76 |
77 | # convert to tensor
78 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
79 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
80 |
 81 |         # register coefficients as buffers
82 | self.register_buffer("analysis_filter", analysis_filter)
83 | self.register_buffer("synthesis_filter", synthesis_filter)
84 |
85 | # filter for downsampling & upsampling
86 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
87 | for k in range(subbands):
88 | updown_filter[k, k, 0] = 1.0
89 | self.register_buffer("updown_filter", updown_filter)
90 | self.subbands = subbands
91 |
92 | # keep padding info
93 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
94 |
95 | def analysis(self, x):
96 | """Analysis with PQMF.
97 | Args:
98 | x (Tensor): Input tensor (B, 1, T).
99 | Returns:
100 | Tensor: Output tensor (B, subbands, T // subbands).
101 | """
102 | x = F.conv1d(self.pad_fn(x), self.analysis_filter)
103 | return F.conv1d(x, self.updown_filter, stride=self.subbands)
104 |
105 | def synthesis(self, x):
106 | """Synthesis with PQMF.
107 | Args:
108 | x (Tensor): Input tensor (B, subbands, T // subbands).
109 | Returns:
110 | Tensor: Output tensor (B, 1, T).
111 | """
112 |         # NOTE(kan-bayashi): Power will be decreased, so multiply by # subbands here.
113 | # Not sure this is the correct way, it is better to check again.
114 | # TODO(kan-bayashi): Understand the reconstruction procedure
115 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
116 | return F.conv1d(self.pad_fn(x), self.synthesis_filter)
--------------------------------------------------------------------------------
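
A minimal round-trip sketch for the PQMF module above; note the constructor takes a device and moves its filters to CUDA, so this assumes a GPU is available:

```python
import torch
from pqmf import PQMF

device = torch.device("cuda:0")
pqmf = PQMF(device, subbands=4)
y = torch.rand(1, 1, 8192, device=device) * 2 - 1  # (B, 1, T) waveform
y_sub = pqmf.analysis(y)        # (1, 4, 2048): four subbands at T // 4
y_rec = pqmf.synthesis(y_sub)   # (1, 1, 8192): near-perfect reconstruction
print((y - y_rec).abs().max())  # small, but not exactly zero
```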
/text/py2kn.json:
--------------------------------------------------------------------------------
1 | {"a": "アー", "ai": "アイ", "an": "アン", "ang": "アン", "ao": "アオ", "ba": "バー", "bai": "バイ", "ban": "バン", "bang": "バン", "bao": "バオ", "bei": "ベイ", "ben": "ベン", "beng": "ボン", "bi": "ビー", "bian": "ビィェン", "biao": "ビィャォ", "bie": "ビィェ", "bin": "ビン", "bing": "ビン", "bo": "ブォ", "bu": "ブー", "ca": "ツァ", "cai": "ツァィ", "can": "ツァン", "cang": "ツァン", "cao": "ツァォ", "ce": "ツェ", "cen": "ツェン", "ceng": "ツォン", "cha": "チャ", "chai": "チャイ", "chan": "チャン", "chang": "チャン", "chao": "チャオ", "che": "チェ", "chen": "チェン", "cheng": "チォン", "chi": "チー", "chong": "チョン", "chou": "チョウ", "chu": "チュ", "chuan": "チュァン", "chuai": "チュァイ", "chuang": "チュゥァン", "chui": "チュイ", "chun": "チュン", "chuo": "チャオ", "ci": "ツー", "cong": "ツォン", "cou": "ツォゥ ", "cu": "ツゥ", "cuan": "ツァン", "cui": "ツイ", "cun": "ツン", "cuo": "ツゥォ", "da": "ダー", "dai": "ダイ", "dan": "ダン", "dang": "ダン", "dao": "ダオ", "de": "デェ", "dei": "デイ", "dun": "ドゥン", "deng": "ドン", "di": "ディ", "dian": "ディェン", "diao": "ディァォ", "die": "ディェ", "ding": "ディン", "diu": "ディゥ", "dong": "ドン", "dou": "ドウ", "du": "ドゥ", "duan": "ドゥァン", "dui": "ドゥイ", "duo": "ドゥォ", "e": "ェ", "ei": "ェイ", "en": "エン", "eng": "鞥", "er": "ェ", "fa": "ファ", "fan": "ファン", "fang": "ファン", "fei": "フェイ", "fen": "フェン", "feng": "フォン", "fuo": "フォ", "fou": "フォウ", "fu": "フー", "ga": "ガー", "gai": "ガイ", "gan": "ガン", "gang": "ガン", "gao": "ガオ", "ge": "グェ", "gei": "ゲイ", "gen": "ゲン", "geng": "ゴン", "gong": "ゴン", "gou": "ゴウ", "gu": "グー", "gua": "グァ", "guai": "グゥァイ", "guan": "グァン", "guang": "グゥァン", "gui": "グゥイ", "gun": "ガン", "guo": "グゥォ", "ha": "ハー", "hai": "ハイ", "han": "ハン", "hang": "ハン", "hao": "ハオ", "he": "フェ゛ァ", "hei": "ヘイ", "hen": "ヘン", "heng": "ホン", "hong": "ホン", "hou": "ホウ", "hu": "フー", "hua": "ファ", "huai": "フゥァイ", "huan": "ファン", "huang": "フゥァン", "hui": "フゥイ", "hun": "フン", "huo": "フォ", "ji": "ジー", "jia": "ジャ", "jian": "ジィェン", "jiang": "ジィァン", "jiao": "ジャオ", "jie": "ジェ", "jin": "ジン", "jing": "ジン", "jiong": "ジィォン", "jiu": "ジゥ", "ju": "ジュ", "juan": "ジュェン", "jue": "ジュェ", "jun": "ジュン", "ka": "カー", "kai": "カイ", "kan": "カン", "kang": "カン", "kao": "カオ", "ke": "クェ゛ァ", "ken": "ケン", "keng": "コン", "kong": "コン", "kou": "コウ", "ku": "クー", "kua": "クァ", "kuai": "クァィ", "kuan": "クァン", "kuang": "クゥァン", "kui": "クゥイ", "kun": "クン", "kuo": "クォ", "la": "ラー", "lai": "ライ", "lan": "ラン", "lang": "ラン", "lao": "ラオ", "le": "ラ", "lei": "レイ", "leng": "ラン", "li": "リー", "liang": "リィァン", "lian": "リィェン", "liao": "リィァォ", "lie": "リィェ", "lin": "リン", "ling": "リン", "liu": "リィゥ", "long": "ロン", "lou": "ロウ", "lu": "ルー", "lv": "リュ", "luan": "ルゥァン", "lue": "リュェ", "lun": "ルゥン", "luo": "ルゥォ", "ma": "マー", "mai": "マイ", "man": "マン", "mang": "マン", "mao": "マオ", "me": "ムェ", "mei": "メイ", "men": "メン", "meng": "モン", "mi": "ミィ", "mian": "ミィェン", "miao": "ミィァォ", "mie": "ミィェ", "min": "ミン", "ming": "ミン", "miu": "ミィゥ", "mo": "ムォ", "mou": "モウ", "mu": "ムー", "na": "ナー", "nai": "ナイ", "nan": "ナン", "nang": "ナン", "nao": "ナオ", "ne": "ヌェ゛ァ", "nei": "ネイ", "nen": "ネン", "neng": "ノン", "ni": "ニー", "nian": "ニィェン", "niang": "ニィァン", "niao": "ニィァォ", "nie": "ニィェ", "nin": "ニン", "ning": "ニン", "niu": "ニュェ", "nong": "ノン", "nou": "ノウ", "nu": "ヌー", "nv": "ニュ", "nuan": "ヌァン", "nuo": "ヌオ", "o": "オ", "ou": "オウ", "pa": "パー", "pai": "パイ", "pan": "パン", "pang": "パン", "pao": "パオ", "pei": "ペイ", "pen": "ペン", "peng": "ポン", "pi": "ピー", "pian": "ピィェン", "piao": "ピィァオ", "pie": "ピェ", "pin": "ピン", "ping": "ピン", "po": "ポォ", "pou": "ポウ", "pu": "プー", "qi": "チー", "qia": "チィァ", "qian": "チィェン", "qiang": "チィァン", "qiao": "チィァォ", "qie": "チィェ", "qin": "チン", "qing": "チン", "qiong": "チォン", "qiu": "チィゥ", "qu": "チュ", "quan": "チュェン", "que": "チュェ", "qun": "チュン", "ran": 
"ラン", "rang": "ラン", "rao": "ラオ", "re": "レ", "ren": "レン", "reng": "ロン", "ri": "リ", "rong": "ロン", "rou": "ロウ", "ru": "ルー", "ruan": "ルァン", "rui": "ルイ", "run": "ルン", "ruo": "ルォ", "sa": "サー", "sai": "サオ", "san": "サン", "sang": "サン", "se": "スェ", "sen": "セン", "seng": "ソン", "sha": "シャ", "shai": "シャイ", "shan": "シャン", "shang": "シャン", "shao": "シャオ", "she": "シェ", "shen": "シェン", "sheng": "シォン", "shi": "シー", "shou": "ショウ", "shu": "シュ", "shua": "シュァ", "shuai": "シュァイ", "shuan": "シュァン", "shuang": "シュゥァン", "shui": "シュイ", "shun": "シュン", "shuo": "シュォ", "si": "スー", "song": "ソン", "sou": "ソウ", "su": "スー", "suan": "スゥァン", "sui": "スイ", "sun": "スン", "suo": "スォ", "ta": "ター", "tai": "タイ", "tan": "タン", "tang": "タン", "tao": "タオ", "te": "テェ", "teng": "トン", "ti": "ティ", "tian": "ティェン", "tiao": "ティァォ", "tie": "ティェ", "ting": "ティン", "tong": "トン", "tou": "トウ", "tu": "トゥ", "tuan": "トゥァン", "tui": "トゥイ", "tun": "トゥン", "tuo": "トゥォ", "wa": "ウァ", "wai": "ワィ", "wan": "ワン", "wang": "ワン", "wei": "ウェイ", "wen": "ウェン", "weng": "ウォン", "wo": "ウォ", "wu": "ウー", "xi": "シー", "xia": "シァ", "xian": "シィェン", "xiang": "シィァン", "xiao": "シァォ", "xie": "シェ", "xin": "シン", "xing": "シン", "xiong": "シィォン", "xiu": "シゥ", "xu": "シュ", "xuan": "シュェン", "xue": "シュェ", "xun": "シュン", "ya": "ヤー", "yan": "イェン", "yang": "ヤン", "yao": "イャォ", "ye": "イェ", "yi": "イー", "yin": "イン", "ying": "イン", "yong": "ヨン", "you": "ヨウ", "yu": "ユー", "yuan": "ユェン", "yue": "ユェ", "yun": "ユン", "za": "ザー", "zai": "ヂャイ", "zan": "ザン", "zang": "ザン", "zao": "ザオ", "ze": "ゼェ", "zei": "ゼイ", "zen": "ゼン", "zeng": "ゾン", "zhan": "ヂャン", "zhang": "ヂャン", "zhao": "ヂャオ", "zhe": "ヂェ゛ァ", "zhen": "ヂェン", "zheng": "ヂォン", "zhi": "ヂー", "zhong": "ヂョン", "zhou": "ヂョウ", "zhu": "ヂュ", "zhua": "ヂュア", "zhuai": "ヂュァイ", "zhuan": "ヂュァン", "zhuang": "ヂュゥァン", "zhui": "ヂュイ", "zhun": "ヂュン", "zhuo": "ヂュオ", "zi": "ズー", "zong": "ゾン", "zou": "ゾウ", "zu": "ズー", "zuan": "ズァン", "zui": "ズイ", "zun": "ズン", "zuo": "ズゥォ", ",": "、", "。": "。", "!": "!", "?": "?", "……": "。"}
2 |
--------------------------------------------------------------------------------
/stft_loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2019 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """STFT-based Loss modules."""
7 |
8 | import torch
9 | import torch.nn.functional as F
10 |
11 |
12 | def stft(x, fft_size, hop_size, win_length, window):
13 | """Perform STFT and convert to magnitude spectrogram.
14 | Args:
15 | x (Tensor): Input signal tensor (B, T).
16 | fft_size (int): FFT size.
17 | hop_size (int): Hop size.
18 | win_length (int): Window length.
19 | window (str): Window function type.
20 | Returns:
21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
22 | """
23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device))
24 | real = x_stft[..., 0]
25 | imag = x_stft[..., 1]
26 |
27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
29 |
30 |
31 | class SpectralConvergengeLoss(torch.nn.Module):
32 | """Spectral convergence loss module."""
33 |
34 | def __init__(self):
35 | """Initilize spectral convergence loss module."""
36 | super(SpectralConvergengeLoss, self).__init__()
37 |
38 | def forward(self, x_mag, y_mag):
39 | """Calculate forward propagation.
40 | Args:
41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
43 | Returns:
44 | Tensor: Spectral convergence loss value.
45 | """
46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
47 |
48 |
49 | class LogSTFTMagnitudeLoss(torch.nn.Module):
50 | """Log STFT magnitude loss module."""
51 |
52 | def __init__(self):
53 | """Initilize los STFT magnitude loss module."""
54 | super(LogSTFTMagnitudeLoss, self).__init__()
55 |
56 | def forward(self, x_mag, y_mag):
57 | """Calculate forward propagation.
58 | Args:
59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
61 | Returns:
62 | Tensor: Log STFT magnitude loss value.
63 | """
64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
65 |
66 |
67 | class STFTLoss(torch.nn.Module):
68 | """STFT loss module."""
69 |
70 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
71 | """Initialize STFT loss module."""
72 | super(STFTLoss, self).__init__()
73 | self.fft_size = fft_size
74 | self.shift_size = shift_size
75 | self.win_length = win_length
76 | self.window = getattr(torch, window)(win_length)
77 | self.spectral_convergenge_loss = SpectralConvergengeLoss()
78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
79 |
80 | def forward(self, x, y):
81 | """Calculate forward propagation.
82 | Args:
83 | x (Tensor): Predicted signal (B, T).
84 | y (Tensor): Groundtruth signal (B, T).
85 | Returns:
86 | Tensor: Spectral convergence loss value.
87 | Tensor: Log STFT magnitude loss value.
88 | """
89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
93 |
94 | return sc_loss, mag_loss
95 |
96 |
97 | class MultiResolutionSTFTLoss(torch.nn.Module):
98 | """Multi resolution STFT loss module."""
99 |
100 | def __init__(self,
101 | fft_sizes=[1024, 2048, 512],
102 | hop_sizes=[120, 240, 50],
103 | win_lengths=[600, 1200, 240],
104 | window="hann_window"):
105 | """Initialize Multi resolution STFT loss module.
106 | Args:
107 | fft_sizes (list): List of FFT sizes.
108 | hop_sizes (list): List of hop sizes.
109 | win_lengths (list): List of window lengths.
110 | window (str): Window function type.
111 | """
112 | super(MultiResolutionSTFTLoss, self).__init__()
113 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
114 | self.stft_losses = torch.nn.ModuleList()
115 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
116 | self.stft_losses += [STFTLoss(fs, ss, wl, window)]
117 |
118 | def forward(self, x, y):
119 | """Calculate forward propagation.
120 | Args:
121 | x (Tensor): Predicted signal (B, T).
122 | y (Tensor): Groundtruth signal (B, T).
123 | Returns:
124 | Tensor: Multi resolution spectral convergence loss value.
125 | Tensor: Multi resolution log STFT magnitude loss value.
126 | """
127 | sc_loss = 0.0
128 | mag_loss = 0.0
129 | for f in self.stft_losses:
130 | sc_l, mag_l = f(x, y)
131 | sc_loss += sc_l
132 | mag_loss += mag_l
133 | sc_loss /= len(self.stft_losses)
134 | mag_loss /= len(self.stft_losses)
135 |
136 | return sc_loss, mag_loss
--------------------------------------------------------------------------------
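
A minimal sketch of `MultiResolutionSTFTLoss` using the resolutions from the sample configs; random tensors stand in for predicted and ground-truth audio:

```python
import torch
from stft_loss import MultiResolutionSTFTLoss

loss_fn = MultiResolutionSTFTLoss(fft_sizes=[384, 683, 171],
                                  hop_sizes=[30, 60, 10],
                                  win_lengths=[150, 300, 60])
y_hat = torch.rand(4, 8192) * 2 - 1   # (B, T) predicted signal
y = torch.rand(4, 8192) * 2 - 1       # (B, T) ground-truth signal
sc_loss, mag_loss = loss_fn(y_hat, y)
print(sc_loss.item(), mag_loss.item())
```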
/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size*dilation - dilation)/2)
16 |
17 |
18 | def convert_pad_shape(pad_shape):
19 | l = pad_shape[::-1]
20 | pad_shape = [item for sublist in l for item in sublist]
21 | return pad_shape
22 |
23 |
24 | def intersperse(lst, item):
25 | result = [item] * (len(lst) * 2 + 1)
26 | result[1::2] = lst
27 | return result
28 |
29 |
30 | def kl_divergence(m_p, logs_p, m_q, logs_q):
31 | """KL(P||Q)"""
32 | kl = (logs_q - logs_p) - 0.5
33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34 | return kl
35 |
36 |
37 | def rand_gumbel(shape):
38 | """Sample from the Gumbel distribution, protect from overflows."""
39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40 | return -torch.log(-torch.log(uniform_samples))
41 |
42 |
43 | def rand_gumbel_like(x):
44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45 | return g
46 |
47 |
48 | def slice_segments(x, ids_str, segment_size=4):
49 | ret = torch.zeros_like(x[:, :, :segment_size])
50 | for i in range(x.size(0)):
51 | idx_str = ids_str[i]
52 | idx_end = idx_str + segment_size
53 | ret[i] = x[i, :, idx_str:idx_end]
54 | return ret
55 |
56 |
57 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
58 | b, d, t = x.size()
59 | if x_lengths is None:
60 | x_lengths = t
61 | ids_str_max = x_lengths - segment_size + 1
62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63 | ret = slice_segments(x, ids_str, segment_size)
64 | return ret, ids_str
65 |
66 |
67 | def get_timing_signal_1d(
68 | length, channels, min_timescale=1.0, max_timescale=1.0e4):
69 | position = torch.arange(length, dtype=torch.float)
70 | num_timescales = channels // 2
71 | log_timescale_increment = (
72 | math.log(float(max_timescale) / float(min_timescale)) /
73 | (num_timescales - 1))
74 | inv_timescales = min_timescale * torch.exp(
75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78 | signal = F.pad(signal, [0, 0, 0, channels % 2])
79 | signal = signal.view(1, channels, length)
80 | return signal
81 |
82 |
83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84 | b, channels, length = x.size()
85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86 | return x + signal.to(dtype=x.dtype, device=x.device)
87 |
88 |
89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90 | b, channels, length = x.size()
91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93 |
94 |
95 | def subsequent_mask(length):
96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97 | return mask
98 |
99 |
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 | n_channels_int = n_channels[0]
103 | in_act = input_a + input_b
104 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 | acts = t_act * s_act
107 | return acts
108 |
109 |
110 | def convert_pad_shape(pad_shape):
111 | l = pad_shape[::-1]
112 | pad_shape = [item for sublist in l for item in sublist]
113 | return pad_shape
114 |
115 |
116 | def shift_1d(x):
117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118 | return x
119 |
120 |
121 | def sequence_mask(length, max_length=None):
122 | if max_length is None:
123 | max_length = length.max()
124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125 | return x.unsqueeze(0) < length.unsqueeze(1)
126 |
127 |
128 | def generate_path(duration, mask):
129 | """
130 | duration: [b, 1, t_x]
131 | mask: [b, 1, t_y, t_x]
132 | """
133 | device = duration.device
134 |
135 | b, _, t_y, t_x = mask.shape
136 | cum_duration = torch.cumsum(duration, -1)
137 |
138 | cum_duration_flat = cum_duration.view(b * t_x)
139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140 | path = path.view(b, t_x, t_y)
141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142 | path = path.unsqueeze(1).transpose(2,3) * mask
143 | return path
144 |
145 |
146 | def clip_grad_value_(parameters, clip_value, norm_type=2):
147 | if isinstance(parameters, torch.Tensor):
148 | parameters = [parameters]
149 | parameters = list(filter(lambda p: p.grad is not None, parameters))
150 | norm_type = float(norm_type)
151 | if clip_value is not None:
152 | clip_value = float(clip_value)
153 |
154 | total_norm = 0
155 | for p in parameters:
156 | param_norm = p.grad.data.norm(norm_type)
157 | total_norm += param_norm.item() ** norm_type
158 | if clip_value is not None:
159 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
160 | total_norm = total_norm ** (1. / norm_type)
161 | return total_norm
162 |
--------------------------------------------------------------------------------
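A tiny worked example makes the alignment helper above concrete. The sketch below is an addition, assuming the repository root is on PYTHONPATH so this file imports as `commons`: it expands per-token durations into the hard monotonic attention map that `generate_path` produces.

import torch
import commons  # assumption: run from the repository root

# One sentence, three input tokens with durations 2, 1 and 3 frames.
duration = torch.tensor([[[2., 1., 3.]]])          # [b=1, 1, t_x=3]
t_y = int(duration.sum().item())                   # 6 output frames in total
mask = torch.ones(1, 1, t_y, duration.size(-1))    # [b, 1, t_y, t_x]

path = commons.generate_path(duration, mask)
print(path.squeeze())
# tensor([[1., 0., 0.],
#         [1., 0., 0.],
#         [0., 1., 0.],
#         [0., 0., 1.],
#         [0., 0., 1.],
#         [0., 0., 1.]])  # each output frame attends to exactly one token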
/text/japanese.py:
--------------------------------------------------------------------------------
1 | import re
2 | from unidecode import unidecode
3 | import pyopenjtalk
4 |
5 |
6 | # Regular expression matching Japanese without punctuation marks:
7 | _japanese_characters = re.compile(
8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
9 |
10 | # Regular expression matching non-Japanese characters or punctuation marks:
11 | _japanese_marks = re.compile(
12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
13 |
14 | # List of (symbol, Japanese) pairs for marks:
15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
16 | ('%', 'パーセント')
17 | ]]
18 |
19 | # List of (romaji, ipa) pairs for marks:
20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
21 | ('ts', 'ʦ'),
22 | ('u', 'ɯ'),
23 | ('j', 'ʥ'),
24 | ('y', 'j'),
25 | ('ni', 'n^i'),
26 | ('nj', 'n^'),
27 | ('hi', 'çi'),
28 | ('hj', 'ç'),
29 | ('f', 'ɸ'),
30 | ('I', 'i*'),
31 | ('U', 'ɯ*'),
32 | ('r', 'ɾ')
33 | ]]
34 |
35 | # List of (romaji, ipa2) pairs for marks:
36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
37 | ('u', 'ɯ'),
38 | ('ʧ', 'tʃ'),
39 | ('j', 'dʑ'),
40 | ('y', 'j'),
41 | ('ni', 'n^i'),
42 | ('nj', 'n^'),
43 | ('hi', 'çi'),
44 | ('hj', 'ç'),
45 | ('f', 'ɸ'),
46 | ('I', 'i*'),
47 | ('U', 'ɯ*'),
48 | ('r', 'ɾ')
49 | ]]
50 |
51 | # List of (consonant, sokuon) pairs:
52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
53 | (r'Q([↑↓]*[kg])', r'k#\1'),
54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'),
55 | (r'Q([↑↓]*[sʃ])', r's\1'),
56 | (r'Q([↑↓]*[pb])', r'p#\1')
57 | ]]
58 |
59 | # List of (consonant, hatsuon) pairs:
60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
61 | (r'N([↑↓]*[pbm])', r'm\1'),
62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'),
63 | (r'N([↑↓]*[tdn])', r'n\1'),
64 | (r'N([↑↓]*[kg])', r'ŋ\1')
65 | ]]
66 |
67 |
68 | def symbols_to_japanese(text):
69 | for regex, replacement in _symbols_to_japanese:
70 | text = re.sub(regex, replacement, text)
71 | return text
72 |
73 |
74 | def japanese_to_romaji_with_accent(text):
75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
76 | text = symbols_to_japanese(text)
77 | sentences = re.split(_japanese_marks, text)
78 | marks = re.findall(_japanese_marks, text)
79 | text = ''
80 | for i, sentence in enumerate(sentences):
81 | if re.match(_japanese_characters, sentence):
82 | if text != '':
83 | text += ' '
84 | labels = pyopenjtalk.extract_fullcontext(sentence)
85 | for n, label in enumerate(labels):
86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
87 | if phoneme not in ['sil', 'pau']:
88 | text += phoneme.replace('ch', 'ʧ').replace('sh',
89 | 'ʃ').replace('cl', 'Q')
90 | else:
91 | continue
92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1))
95 | a3 = int(re.search(r"\+(\d+)/", label).group(1))
96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
97 | a2_next = -1
98 | else:
99 | a2_next = int(
100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
101 | # Accent phrase boundary
102 | if a3 == 1 and a2_next == 1:
103 | text += ' '
104 | # Falling
105 | elif a1 == 0 and a2_next == a2 + 1:
106 | text += '↓'
107 | # Rising
108 | elif a2 == 1 and a2_next == 2:
109 | text += '↑'
110 | if i < len(marks):
111 | text += unidecode(marks[i]).replace(' ', '')
112 | return text
113 |
114 |
115 | def get_real_sokuon(text):
116 | for regex, replacement in _real_sokuon:
117 | text = re.sub(regex, replacement, text)
118 | return text
119 |
120 |
121 | def get_real_hatsuon(text):
122 | for regex, replacement in _real_hatsuon:
123 | text = re.sub(regex, replacement, text)
124 | return text
125 |
126 |
127 | def japanese_to_ipa(text):
128 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
129 | text = re.sub(
130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
131 | text = get_real_sokuon(text)
132 | text = get_real_hatsuon(text)
133 | for regex, replacement in _romaji_to_ipa:
134 | text = re.sub(regex, replacement, text)
135 | return text
136 |
137 |
138 | def japanese_to_ipa2(text):
139 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
140 | text = get_real_sokuon(text)
141 | text = get_real_hatsuon(text)
142 | for regex, replacement in _romaji_to_ipa2:
143 | text = re.sub(regex, replacement, text)
144 | return text
145 |
146 |
147 | def japanese_to_ipa3(text):
148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace(
149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
150 | text = re.sub(
151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
153 | return text
154 |
--------------------------------------------------------------------------------
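As a quick sanity check of the accent-aware G2P above, the following sketch runs a short phrase through the two main entry points. It assumes a working `pyopenjtalk` install (the dictionary is downloaded on first use), and the exact romaji depends on the pyopenjtalk version, so the output is only indicative.

from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa

text = 'こんにちは、世界。'
print(japanese_to_romaji_with_accent(text))  # romaji with ↑/↓ pitch-accent marks
print(japanese_to_ipa(text))                 # the same string after the IPA substitutions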
/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from phonemizer import phonemize
18 | import pyopenjtalk
19 | from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
20 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
21 |
22 | # Regular expression matching whitespace:
23 | _whitespace_re = re.compile(r'\s+')
24 |
25 | # Regular expression matching Japanese without punctuation marks:
26 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
27 |
28 | # Regular expression matching non-Japanese characters or punctuation marks:
29 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
30 |
31 | # List of (regular expression, replacement) pairs for abbreviations:
32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33 | ('mrs', 'misess'),
34 | ('mr', 'mister'),
35 | ('dr', 'doctor'),
36 | ('st', 'saint'),
37 | ('co', 'company'),
38 | ('jr', 'junior'),
39 | ('maj', 'major'),
40 | ('gen', 'general'),
41 | ('drs', 'doctors'),
42 | ('rev', 'reverend'),
43 | ('lt', 'lieutenant'),
44 | ('hon', 'honorable'),
45 | ('sgt', 'sergeant'),
46 | ('capt', 'captain'),
47 | ('esq', 'esquire'),
48 | ('ltd', 'limited'),
49 | ('col', 'colonel'),
50 | ('ft', 'fort'),
51 | ]]
52 |
53 |
54 | def expand_abbreviations(text):
55 | for regex, replacement in _abbreviations:
56 | text = re.sub(regex, replacement, text)
57 | return text
58 |
59 |
60 | def expand_numbers(text):
61 | return normalize_numbers(text)  # NOTE: normalize_numbers is neither defined nor imported in this repo (there is no text/numbers.py); this helper is unused by the cleaners below and would raise NameError if called
62 |
63 |
64 | def lowercase(text):
65 | return text.lower()
66 |
67 |
68 | def collapse_whitespace(text):
69 | return re.sub(_whitespace_re, ' ', text)
70 |
71 |
72 | def convert_to_ascii(text):
73 | return unidecode(text)
74 |
75 |
76 | def basic_cleaners(text):
77 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
78 | text = lowercase(text)
79 | text = collapse_whitespace(text)
80 | return text
81 |
82 |
83 | def transliteration_cleaners(text):
84 | '''Pipeline for non-English text that transliterates to ASCII.'''
85 | text = convert_to_ascii(text)
86 | text = lowercase(text)
87 | text = collapse_whitespace(text)
88 | return text
89 |
90 |
91 | def english_cleaners(text):
92 | '''Pipeline for English text, including abbreviation expansion.'''
93 | text = convert_to_ascii(text)
94 | text = lowercase(text)
95 | text = expand_abbreviations(text)
96 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
97 | phonemes = collapse_whitespace(phonemes)
98 | return phonemes
99 |
100 |
101 | def english_cleaners2(text):
102 | '''Pipeline for English text, including abbreviation expansion, with punctuation and stress preserved.'''
103 | text = convert_to_ascii(text)
104 | text = lowercase(text)
105 | text = expand_abbreviations(text)
106 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
107 | phonemes = collapse_whitespace(phonemes)
108 | return phonemes
109 |
110 |
111 | def japanese_cleaners(text):
112 | text = japanese_to_romaji_with_accent(text)
113 | text = re.sub(r'([A-Za-z])$', r'\1.', text)
114 | return text
115 |
116 |
117 | def japanese_cleaners2(text):
118 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
119 |
120 |
121 | def korean_cleaners(text):
122 | '''Pipeline for Korean text'''
123 | text = latin_to_hangul(text)
124 | text = number_to_hangul(text)
125 | text = divide_hangul(text)
126 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
127 | return text
128 |
129 |
130 | def japanese_triphone_cleaners(text):
131 | sentences = re.split(_japanese_marks, text)
132 | marks = re.findall(_japanese_marks, text)
133 | text = ''
134 | for i, sentence in enumerate(sentences):
135 | phones = pyopenjtalk.g2p(sentence, kana=False)
136 | phones = phones.replace(' ', '')
137 | phones = phones.replace('A', 'a').replace('I', 'i').replace('U', 'u').replace('E', 'e').replace('O', 'o')
138 | phones = phones.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')
139 | triphones = []
140 | length = len(phones)
141 | for j, phone in enumerate(phones):
142 | if length == 1:
143 | triphone = phone
144 | else:
145 | if j == 0:
146 | triphone = f'{phone}+{phones[j+1]}'
147 | elif j == length - 1:
148 | triphone = f'{phones[j-1]}-{phone}'
149 | else:
150 | triphone = f'{phones[j-1]}-{phone}+{phones[j+1]}'
151 | triphones.append(triphone)
152 | subtext = ' '.join(triphones)
153 | text += subtext
154 | if i < len(marks):
155 | text += unidecode(marks[i]).replace(' ', '')
156 | if len(text) > 0 and re.match('[A-Za-z]', text[-1]):
157 | text += '.'
158 |
159 | return text
160 |
--------------------------------------------------------------------------------
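The module docstring says cleaners are selected by a comma-delimited list of names; the lookup itself lives in text/__init__.py rather than in this file. A minimal sketch of that dispatch, assuming the function names defined above (and, for `english_cleaners2`, a working phonemizer/espeak backend):

from text import cleaners

def clean_text(text, cleaner_names):
    # Resolve each name on the module and apply the cleaners in order.
    for name in cleaner_names:
        cleaner = getattr(cleaners, name, None)
        if cleaner is None:
            raise ValueError(f'Unknown cleaner: {name}')
        text = cleaner(text)
    return text

print(clean_text('Dr. Smith and Mr. Jones.', ['english_cleaners2']))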
/text/korean.py:
--------------------------------------------------------------------------------
1 | import re
2 | from jamo import h2j, j2hcj
3 | import ko_pron
4 |
5 |
6 | # This is a list of Korean classifiers preceded by pure Korean numerals.
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
8 |
9 | # List of (hangul, hangul divided) pairs:
10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
11 | ('ㄳ', 'ㄱㅅ'),
12 | ('ㄵ', 'ㄴㅈ'),
13 | ('ㄶ', 'ㄴㅎ'),
14 | ('ㄺ', 'ㄹㄱ'),
15 | ('ㄻ', 'ㄹㅁ'),
16 | ('ㄼ', 'ㄹㅂ'),
17 | ('ㄽ', 'ㄹㅅ'),
18 | ('ㄾ', 'ㄹㅌ'),
19 | ('ㄿ', 'ㄹㅍ'),
20 | ('ㅀ', 'ㄹㅎ'),
21 | ('ㅄ', 'ㅂㅅ'),
22 | ('ㅘ', 'ㅗㅏ'),
23 | ('ㅙ', 'ㅗㅐ'),
24 | ('ㅚ', 'ㅗㅣ'),
25 | ('ㅝ', 'ㅜㅓ'),
26 | ('ㅞ', 'ㅜㅔ'),
27 | ('ㅟ', 'ㅜㅣ'),
28 | ('ㅢ', 'ㅡㅣ'),
29 | ('ㅑ', 'ㅣㅏ'),
30 | ('ㅒ', 'ㅣㅐ'),
31 | ('ㅕ', 'ㅣㅓ'),
32 | ('ㅖ', 'ㅣㅔ'),
33 | ('ㅛ', 'ㅣㅗ'),
34 | ('ㅠ', 'ㅣㅜ')
35 | ]]
36 |
37 | # List of (Latin alphabet, hangul) pairs:
38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
39 | ('a', '에이'),
40 | ('b', '비'),
41 | ('c', '시'),
42 | ('d', '디'),
43 | ('e', '이'),
44 | ('f', '에프'),
45 | ('g', '지'),
46 | ('h', '에이치'),
47 | ('i', '아이'),
48 | ('j', '제이'),
49 | ('k', '케이'),
50 | ('l', '엘'),
51 | ('m', '엠'),
52 | ('n', '엔'),
53 | ('o', '오'),
54 | ('p', '피'),
55 | ('q', '큐'),
56 | ('r', '아르'),
57 | ('s', '에스'),
58 | ('t', '티'),
59 | ('u', '유'),
60 | ('v', '브이'),
61 | ('w', '더블유'),
62 | ('x', '엑스'),
63 | ('y', '와이'),
64 | ('z', '제트')
65 | ]]
66 |
67 | # List of (ipa, lazy ipa) pairs:
68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
69 | ('t͡ɕ','ʧ'),
70 | ('d͡ʑ','ʥ'),
71 | ('ɲ','n^'),
72 | ('ɕ','ʃ'),
73 | ('ʷ','w'),
74 | ('ɭ','l`'),
75 | ('ʎ','ɾ'),
76 | ('ɣ','ŋ'),
77 | ('ɰ','ɯ'),
78 | ('ʝ','j'),
79 | ('ʌ','ə'),
80 | ('ɡ','g'),
81 | ('\u031a','#'),
82 | ('\u0348','='),
83 | ('\u031e',''),
84 | ('\u0320',''),
85 | ('\u0339','')
86 | ]]
87 |
88 |
89 | def latin_to_hangul(text):
90 | for regex, replacement in _latin_to_hangul:
91 | text = re.sub(regex, replacement, text)
92 | return text
93 |
94 |
95 | def divide_hangul(text):
96 | text = j2hcj(h2j(text))
97 | for regex, replacement in _hangul_divided:
98 | text = re.sub(regex, replacement, text)
99 | return text
100 |
101 |
102 | def hangul_number(num, sino=True):
103 | '''Reference https://github.com/Kyubyong/g2pK'''
104 | num = re.sub(',', '', num)
105 |
106 | if num == '0':
107 | return '영'
108 | if not sino and num == '20':
109 | return '스무'
110 |
111 | digits = '123456789'
112 | names = '일이삼사오육칠팔구'
113 | digit2name = {d: n for d, n in zip(digits, names)}
114 |
115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
119 |
120 | spelledout = []
121 | for i, digit in enumerate(num):
122 | i = len(num) - i - 1
123 | if sino:
124 | if i == 0:
125 | name = digit2name.get(digit, '')
126 | elif i == 1:
127 | name = digit2name.get(digit, '') + '십'
128 | name = name.replace('일십', '십')
129 | else:
130 | if i == 0:
131 | name = digit2mod.get(digit, '')
132 | elif i == 1:
133 | name = digit2dec.get(digit, '')
134 | if digit == '0':
135 | if i % 4 == 0:
136 | last_three = spelledout[-min(3, len(spelledout)):]
137 | if ''.join(last_three) == '':
138 | spelledout.append('')
139 | continue
140 | else:
141 | spelledout.append('')
142 | continue
143 | if i == 2:
144 | name = digit2name.get(digit, '') + '백'
145 | name = name.replace('일백', '백')
146 | elif i == 3:
147 | name = digit2name.get(digit, '') + '천'
148 | name = name.replace('일천', '천')
149 | elif i == 4:
150 | name = digit2name.get(digit, '') + '만'
151 | name = name.replace('일만', '만')
152 | elif i == 5:
153 | name = digit2name.get(digit, '') + '십'
154 | name = name.replace('일십', '십')
155 | elif i == 6:
156 | name = digit2name.get(digit, '') + '백'
157 | name = name.replace('일백', '백')
158 | elif i == 7:
159 | name = digit2name.get(digit, '') + '천'
160 | name = name.replace('일천', '천')
161 | elif i == 8:
162 | name = digit2name.get(digit, '') + '억'
163 | elif i == 9:
164 | name = digit2name.get(digit, '') + '십'
165 | elif i == 10:
166 | name = digit2name.get(digit, '') + '백'
167 | elif i == 11:
168 | name = digit2name.get(digit, '') + '천'
169 | elif i == 12:
170 | name = digit2name.get(digit, '') + '조'
171 | elif i == 13:
172 | name = digit2name.get(digit, '') + '십'
173 | elif i == 14:
174 | name = digit2name.get(digit, '') + '백'
175 | elif i == 15:
176 | name = digit2name.get(digit, '') + '천'
177 | spelledout.append(name)
178 | return ''.join(elem for elem in spelledout)
179 |
180 |
181 | def number_to_hangul(text):
182 | '''Reference https://github.com/Kyubyong/g2pK'''
183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
184 | for token in tokens:
185 | num, classifier = token
186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
187 | spelledout = hangul_number(num, sino=False)
188 | else:
189 | spelledout = hangul_number(num, sino=True)
190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
191 | # digit by digit for remaining digits
192 | digits = '0123456789'
193 | names = '영일이삼사오육칠팔구'
194 | for d, n in zip(digits, names):
195 | text = text.replace(d, n)
196 | return text
197 |
198 |
199 | def korean_to_lazy_ipa(text):
200 | text = latin_to_hangul(text)
201 | text = number_to_hangul(text)
202 | text = re.sub('[\uac00-\ud7af]+', lambda x: ko_pron.romanise(x.group(0), 'ipa').split('] ~ [')[0], text)
203 | for regex, replacement in _ipa_to_lazy_ipa:
204 | text = re.sub(regex, replacement, text)
205 | return text
206 |
207 |
208 | def korean_to_ipa(text):
209 | text = korean_to_lazy_ipa(text)
210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
211 |
--------------------------------------------------------------------------------
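Two quick examples of the number spelling above, as a sketch that assumes `jamo` and `ko_pron` are installed (they are imported at module load). Numerals followed by a recognized classifier take the native Korean reading; everything else takes the Sino-Korean reading.

from text.korean import number_to_hangul

print(number_to_hangul('3마리'))   # native reading before a classifier: 세마리
print(number_to_hangul('2005년'))  # Sino-Korean reading elsewhere: 이천오년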
/filelists/vctk_audio_sid_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|It had happened to him.
2 | DUMMY2/p280/p280_148.wav|52|It is open season on the Old Firm.
3 | DUMMY2/p231/p231_320.wav|50|However, he is a coach, and he remains a coach at heart.
4 | DUMMY2/p282/p282_129.wav|83|It is not a U-turn.
5 | DUMMY2/p254/p254_015.wav|41|The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain.
6 | DUMMY2/p228/p228_285.wav|57|The songs are just so good.
7 | DUMMY2/p334/p334_307.wav|38|If they don't, they can expect their funding to be cut.
8 | DUMMY2/p287/p287_081.wav|77|I've never seen anything like it.
9 | DUMMY2/p247/p247_083.wav|14|It is a job creation scheme.)
10 | DUMMY2/p264/p264_051.wav|65|We were leading by two goals.)
11 | DUMMY2/p335/p335_058.wav|49|Let's see that increase over the years.
12 | DUMMY2/p236/p236_225.wav|75|There is no quick fix.
13 | DUMMY2/p374/p374_353.wav|11|And that brings us to the point.
14 | DUMMY2/p272/p272_076.wav|69|Sounds like The Sixth Sense?
15 | DUMMY2/p271/p271_152.wav|27|The petition was formally presented at Downing Street yesterday.
16 | DUMMY2/p228/p228_127.wav|57|They've got to account for it.
17 | DUMMY2/p276/p276_223.wav|106|It's been a humbling year.
18 | DUMMY2/p262/p262_248.wav|45|The project has already secured the support of Sir Sean Connery.
19 | DUMMY2/p314/p314_086.wav|51|The team this year is going places.
20 | DUMMY2/p225/p225_038.wav|101|Diving is no part of football.
21 | DUMMY2/p279/p279_088.wav|25|The shareholders will vote to wind up the company on Friday morning.
22 | DUMMY2/p272/p272_018.wav|69|Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain.
23 | DUMMY2/p256/p256_098.wav|90|She told The Herald.
24 | DUMMY2/p261/p261_218.wav|100|All will be revealed in due course.
25 | DUMMY2/p265/p265_063.wav|73|IT shouldn't come as a surprise, but it does.
26 | DUMMY2/p314/p314_042.wav|51|It is all about people being assaulted, abused.
27 | DUMMY2/p241/p241_188.wav|86|I wish I could say something.
28 | DUMMY2/p283/p283_111.wav|95|It's good to have a voice.
29 | DUMMY2/p275/p275_006.wav|40|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
30 | DUMMY2/p228/p228_092.wav|57|Today I couldn't run on it.
31 | DUMMY2/p295/p295_343.wav|92|The atmosphere is businesslike.
32 | DUMMY2/p228/p228_187.wav|57|They will run a mile.
33 | DUMMY2/p294/p294_317.wav|104|It didn't put me off.
34 | DUMMY2/p231/p231_445.wav|50|It sounded like a bomb.
35 | DUMMY2/p272/p272_086.wav|69|Today she has been released.
36 | DUMMY2/p255/p255_210.wav|31|It was worth a photograph.
37 | DUMMY2/p229/p229_060.wav|67|And a film maker was born.
38 | DUMMY2/p260/p260_232.wav|81|The Home Office would not release any further details about the group.
39 | DUMMY2/p245/p245_025.wav|59|Johnson was pretty low.
40 | DUMMY2/p333/p333_185.wav|64|This area is perfect for children.
41 | DUMMY2/p244/p244_242.wav|78|He is a man of the people.
42 | DUMMY2/p376/p376_187.wav|71|"It is a terrible loss."
43 | DUMMY2/p239/p239_156.wav|48|It is a good lifestyle.
44 | DUMMY2/p307/p307_037.wav|22|He released a half-dozen solo albums.
45 | DUMMY2/p305/p305_185.wav|54|I am not even thinking about that.
46 | DUMMY2/p272/p272_081.wav|69|It was magic.
47 | DUMMY2/p302/p302_297.wav|30|I'm trying to stay open on that.
48 | DUMMY2/p275/p275_320.wav|40|We are in the end game.
49 | DUMMY2/p239/p239_231.wav|48|Then we will face the Danish champions.
50 | DUMMY2/p268/p268_301.wav|87|It was only later that the condition was diagnosed.
51 | DUMMY2/p336/p336_088.wav|98|They failed to reach agreement yesterday.
52 | DUMMY2/p278/p278_255.wav|10|They made such decisions in London.
53 | DUMMY2/p361/p361_132.wav|79|That got me out.
54 | DUMMY2/p307/p307_146.wav|22|You hope he prevails.
55 | DUMMY2/p244/p244_147.wav|78|They could not ignore the will of parliament, he claimed.
56 | DUMMY2/p294/p294_283.wav|104|This is our unfinished business.
57 | DUMMY2/p283/p283_300.wav|95|I would have the hammer in the crowd.
58 | DUMMY2/p239/p239_079.wav|48|I can understand the frustrations of our fans.
59 | DUMMY2/p264/p264_009.wav|65|There is , according to legend, a boiling pot of gold at one end. )
60 | DUMMY2/p307/p307_348.wav|22|He did not oppose the divorce.
61 | DUMMY2/p304/p304_308.wav|72|We are the gateway to justice.
62 | DUMMY2/p281/p281_056.wav|36|None has ever been found.
63 | DUMMY2/p267/p267_158.wav|0|We were given a warm and friendly reception.
64 | DUMMY2/p300/p300_169.wav|102|Who do these people think they are?
65 | DUMMY2/p276/p276_177.wav|106|They exist in name alone.
66 | DUMMY2/p228/p228_245.wav|57|It is a policy which has the full support of the minister.
67 | DUMMY2/p300/p300_303.wav|102|I'm wondering what you feel about the youngest.
68 | DUMMY2/p362/p362_247.wav|15|This would give Scotland around eight members.
69 | DUMMY2/p326/p326_031.wav|28|United were in control without always being dominant.
70 | DUMMY2/p361/p361_288.wav|79|I did not think it was very proper.
71 | DUMMY2/p286/p286_145.wav|63|Tiger is not the norm.
72 | DUMMY2/p234/p234_071.wav|3|She did that for the rest of her life.
73 | DUMMY2/p263/p263_296.wav|39|The decision was announced at its annual conference in Dunfermline.
74 | DUMMY2/p323/p323_228.wav|34|She became a heroine of my childhood.
75 | DUMMY2/p280/p280_346.wav|52|It was a bit like having children.
76 | DUMMY2/p333/p333_080.wav|64|But the tragedy did not stop there.
77 | DUMMY2/p226/p226_268.wav|43|That decision is for the British Parliament and people.
78 | DUMMY2/p362/p362_314.wav|15|Is that right?
79 | DUMMY2/p240/p240_047.wav|93|It is so sad.
80 | DUMMY2/p250/p250_207.wav|24|You could feel the heat.
81 | DUMMY2/p273/p273_176.wav|56|Neither side would reveal the details of the offer.
82 | DUMMY2/p316/p316_147.wav|85|And frankly, it's been a while.
83 | DUMMY2/p265/p265_047.wav|73|It is unique.
84 | DUMMY2/p336/p336_353.wav|98|Sometimes you get them, sometimes you don't.
85 | DUMMY2/p230/p230_376.wav|35|This hasn't happened in a vacuum.
86 | DUMMY2/p308/p308_209.wav|107|There is great potential on this river.
87 | DUMMY2/p250/p250_442.wav|24|We have not yet received a letter from the Irish.
88 | DUMMY2/p260/p260_037.wav|81|It's a fact.
89 | DUMMY2/p299/p299_345.wav|58|We're very excited and challenged by the project.
90 | DUMMY2/p269/p269_218.wav|94|A Grampian Police spokesman said.
91 | DUMMY2/p306/p306_014.wav|12|To the Hebrews it was a token that there would be no more universal floods.
92 | DUMMY2/p271/p271_292.wav|27|It's a record label, not a form of music.
93 | DUMMY2/p247/p247_225.wav|14|I am considered a teenager.)
94 | DUMMY2/p294/p294_094.wav|104|It should be a condition of employment.
95 | DUMMY2/p269/p269_031.wav|94|Is this accurate?
96 | DUMMY2/p275/p275_116.wav|40|It's not fair.
97 | DUMMY2/p265/p265_006.wav|73|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
98 | DUMMY2/p285/p285_072.wav|2|Mr Irvine said Mr Rafferty was now in good spirits.
99 | DUMMY2/p270/p270_167.wav|8|We did what we had to do.
100 | DUMMY2/p360/p360_397.wav|60|It is a relief.
101 |
--------------------------------------------------------------------------------
/filelists/vctk_audio_sid_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|ɪt hɐd hˈæpənd tə hˌɪm.
2 | DUMMY2/p280/p280_148.wav|52|ɪt ɪz ˈoʊpən sˈiːzən ɑːnðɪ ˈoʊld fˈɜːm.
3 | DUMMY2/p231/p231_320.wav|50|haʊˈɛvɚ, hiː ɪz ɐ kˈoʊtʃ, ænd hiː ɹɪmˈeɪnz ɐ kˈoʊtʃ æt hˈɑːɹt.
4 | DUMMY2/p282/p282_129.wav|83|ɪt ɪz nˌɑːɾə jˈuːtˈɜːn.
5 | DUMMY2/p254/p254_015.wav|41|ðə ɡɹˈiːks jˈuːzd tʊ ɪmˈædʒɪn ðˌɐɾɪt wʌzɐ sˈaɪn fɹʌmðə ɡˈɑːdz tə foːɹtˈɛl wˈɔːɹ ɔːɹ hˈɛvi ɹˈeɪn.
6 | DUMMY2/p228/p228_285.wav|57|ðə sˈɔŋz ɑːɹ dʒˈʌst sˌoʊ ɡˈʊd.
7 | DUMMY2/p334/p334_307.wav|38|ɪf ðeɪ dˈoʊnt, ðeɪ kæn ɛkspˈɛkt ðɛɹ fˈʌndɪŋ təbi kˈʌt.
8 | DUMMY2/p287/p287_081.wav|77|aɪv nˈɛvɚ sˈiːn ˈɛnɪθˌɪŋ lˈaɪk ɪt.
9 | DUMMY2/p247/p247_083.wav|14|ɪt ɪz ɐ dʒˈɑːb kɹiːˈeɪʃən skˈiːm.
10 | DUMMY2/p264/p264_051.wav|65|wiː wɜː lˈiːdɪŋ baɪ tˈuː ɡˈoʊlz.
11 | DUMMY2/p335/p335_058.wav|49|lˈɛts sˈiː ðæt ˈɪnkɹiːs ˌoʊvɚ ðə jˈɪɹz.
12 | DUMMY2/p236/p236_225.wav|75|ðɛɹ ɪz nˈoʊ kwˈɪk fˈɪks.
13 | DUMMY2/p374/p374_353.wav|11|ænd ðæt bɹˈɪŋz ˌʌs tə ðə pˈɔɪnt.
14 | DUMMY2/p272/p272_076.wav|69|sˈaʊndz lˈaɪk ðə sˈɪksθ sˈɛns?
15 | DUMMY2/p271/p271_152.wav|27|ðə pətˈɪʃən wʌz fˈɔːɹməli pɹɪzˈɛntᵻd æt dˈaʊnɪŋ stɹˈiːt jˈɛstɚdˌeɪ.
16 | DUMMY2/p228/p228_127.wav|57|ðeɪv ɡɑːt tʊ ɐkˈaʊnt fɔːɹ ɪt.
17 | DUMMY2/p276/p276_223.wav|106|ɪts bˌɪn ɐ hˈʌmblɪŋ jˈɪɹ.
18 | DUMMY2/p262/p262_248.wav|45|ðə pɹˈɑːdʒɛkt hɐz ɔːlɹˌɛdi sɪkjˈʊɹd ðə səpˈoːɹt ʌv sˌɜː ʃˈɔːn kɑːnɚɹi.
19 | DUMMY2/p314/p314_086.wav|51|ðə tˈiːm ðɪs jˈɪɹ ɪz ɡˌoʊɪŋ plˈeɪsᵻz.
20 | DUMMY2/p225/p225_038.wav|101|dˈaɪvɪŋ ɪz nˈoʊ pˈɑːɹt ʌv fˈʊtbɔːl.
21 | DUMMY2/p279/p279_088.wav|25|ðə ʃˈɛɹhoʊldɚz wɪl vˈoʊt tə wˈaɪnd ˈʌp ðə kˈʌmpəni ˌɑːn fɹˈaɪdeɪ mˈɔːɹnɪŋ.
22 | DUMMY2/p272/p272_018.wav|69|ˈæɹɪstˌɑːɾəl θˈɔːt ðætðə ɹˈeɪnboʊ wʌz kˈɔːzd baɪ ɹɪflˈɛkʃən ʌvðə sˈʌnz ɹˈeɪz baɪ ðə ɹˈeɪn.
23 | DUMMY2/p256/p256_098.wav|90|ʃiː tˈoʊld ðə hˈɛɹəld.
24 | DUMMY2/p261/p261_218.wav|100|ˈɔːl wɪl biː ɹɪvˈiːld ɪn dˈuː kˈoːɹs.
25 | DUMMY2/p265/p265_063.wav|73|ɪt ʃˌʊdənt kˈʌm æz ɐ sɚpɹˈaɪz, bˌʌt ɪt dˈʌz.
26 | DUMMY2/p314/p314_042.wav|51|ɪt ɪz ˈɔːl ɐbˌaʊt pˈiːpəl bˌiːɪŋ ɐsˈɑːltᵻd, ɐbjˈuːsd.
27 | DUMMY2/p241/p241_188.wav|86|ˈaɪ wˈɪʃ ˈaɪ kʊd sˈeɪ sˈʌmθɪŋ.
28 | DUMMY2/p283/p283_111.wav|95|ɪts ɡˈʊd tə hæv ɐ vˈɔɪs.
29 | DUMMY2/p275/p275_006.wav|40|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
30 | DUMMY2/p228/p228_092.wav|57|tədˈeɪ ˈaɪ kˌʊdənt ɹˈʌn ˈɑːn ɪt.
31 | DUMMY2/p295/p295_343.wav|92|ðɪ ˈætməsfˌɪɹ ɪz bˈɪznəslˌaɪk.
32 | DUMMY2/p228/p228_187.wav|57|ðeɪ wɪl ɹˈʌn ɐ mˈaɪl.
33 | DUMMY2/p294/p294_317.wav|104|ɪt dˈɪdnt pˌʊt mˌiː ˈɔf.
34 | DUMMY2/p231/p231_445.wav|50|ɪt sˈaʊndᵻd lˈaɪk ɐ bˈɑːm.
35 | DUMMY2/p272/p272_086.wav|69|tədˈeɪ ʃiː hɐzbɪn ɹɪlˈiːsd.
36 | DUMMY2/p255/p255_210.wav|31|ɪt wʌz wˈɜːθ ɐ fˈoʊɾəɡɹˌæf.
37 | DUMMY2/p229/p229_060.wav|67|ænd ɐ fˈɪlm mˈeɪkɚ wʌz bˈɔːɹn.
38 | DUMMY2/p260/p260_232.wav|81|ðə hˈoʊm ˈɑːfɪs wʊd nˌɑːt ɹɪlˈiːs ˌɛni fˈɜːðɚ diːtˈeɪlz ɐbˌaʊt ðə ɡɹˈuːp.
39 | DUMMY2/p245/p245_025.wav|59|dʒˈɑːnsən wʌz pɹˈɪɾi lˈoʊ.
40 | DUMMY2/p333/p333_185.wav|64|ðɪs ˈɛɹiə ɪz pˈɜːfɛkt fɔːɹ tʃˈɪldɹən.
41 | DUMMY2/p244/p244_242.wav|78|hiː ɪz ɐ mˈæn ʌvðə pˈiːpəl.
42 | DUMMY2/p376/p376_187.wav|71|"ɪt ɪz ɐ tˈɛɹəbəl lˈɔs."
43 | DUMMY2/p239/p239_156.wav|48|ɪt ɪz ɐ ɡˈʊd lˈaɪfstaɪl.
44 | DUMMY2/p307/p307_037.wav|22|hiː ɹɪlˈiːsd ɐ hˈæfdˈʌzən sˈoʊloʊ ˈælbəmz.
45 | DUMMY2/p305/p305_185.wav|54|ˈaɪ æm nˌɑːt ˈiːvən θˈɪŋkɪŋ ɐbˌaʊt ðˈæt.
46 | DUMMY2/p272/p272_081.wav|69|ɪt wʌz mˈædʒɪk.
47 | DUMMY2/p302/p302_297.wav|30|aɪm tɹˈaɪɪŋ tə stˈeɪ ˈoʊpən ˌɑːn ðˈæt.
48 | DUMMY2/p275/p275_320.wav|40|wiː ɑːɹ ɪnðɪ ˈɛnd ɡˈeɪm.
49 | DUMMY2/p239/p239_231.wav|48|ðˈɛn wiː wɪl fˈeɪs ðə dˈeɪnɪʃ tʃˈæmpiənz.
50 | DUMMY2/p268/p268_301.wav|87|ɪt wʌz ˈoʊnli lˈeɪɾɚ ðætðə kəndˈɪʃən wʌz dˌaɪəɡnˈoʊzd.
51 | DUMMY2/p336/p336_088.wav|98|ðeɪ fˈeɪld tə ɹˈiːtʃ ɐɡɹˈiːmənt jˈɛstɚdˌeɪ.
52 | DUMMY2/p278/p278_255.wav|10|ðeɪ mˌeɪd sˈʌtʃ dᵻsˈɪʒənz ɪn lˈʌndən.
53 | DUMMY2/p361/p361_132.wav|79|ðæt ɡɑːt mˌiː ˈaʊt.
54 | DUMMY2/p307/p307_146.wav|22|juː hˈoʊp hiː pɹɪvˈeɪlz.
55 | DUMMY2/p244/p244_147.wav|78|ðeɪ kʊd nˌɑːt ɪɡnˈoːɹ ðə wɪl ʌv pˈɑːɹləmənt, hiː klˈeɪmd.
56 | DUMMY2/p294/p294_283.wav|104|ðɪs ɪz ˌaʊɚɹ ʌnfˈɪnɪʃt bˈɪznəs.
57 | DUMMY2/p283/p283_300.wav|95|ˈaɪ wʊdhɐv ðə hˈæmɚɹ ɪnðə kɹˈaʊd.
58 | DUMMY2/p239/p239_079.wav|48|ˈaɪ kæn ˌʌndɚstˈænd ðə fɹʌstɹˈeɪʃənz ʌv ˌaʊɚ fˈænz.
59 | DUMMY2/p264/p264_009.wav|65|ðɛɹˈɪz , ɐkˈoːɹdɪŋ tə lˈɛdʒənd, ɐ bˈɔɪlɪŋ pˈɑːt ʌv ɡˈoʊld æt wˈʌn ˈɛnd.
60 | DUMMY2/p307/p307_348.wav|22|hiː dɪdnˌɑːt əpˈoʊz ðə dɪvˈoːɹs.
61 | DUMMY2/p304/p304_308.wav|72|wiː ɑːɹ ðə ɡˈeɪtweɪ tə dʒˈʌstɪs.
62 | DUMMY2/p281/p281_056.wav|36|nˈʌn hɐz ˈɛvɚ bˌɪn fˈaʊnd.
63 | DUMMY2/p267/p267_158.wav|0|wiː wɜː ɡˈɪvən ɐ wˈɔːɹm ænd fɹˈɛndli ɹɪsˈɛpʃən.
64 | DUMMY2/p300/p300_169.wav|102|hˌuː dˈuː ðiːz pˈiːpəl θˈɪŋk ðeɪ ɑːɹ?
65 | DUMMY2/p276/p276_177.wav|106|ðeɪ ɛɡzˈɪst ɪn nˈeɪm ɐlˈoʊn.
66 | DUMMY2/p228/p228_245.wav|57|ɪt ɪz ɐ pˈɑːlɪsi wˌɪtʃ hɐz ðə fˈʊl səpˈoːɹt ʌvðə mˈɪnɪstɚ.
67 | DUMMY2/p300/p300_303.wav|102|aɪm wˈʌndɚɹɪŋ wˌʌt juː fˈiːl ɐbˌaʊt ðə jˈʌŋɡəst.
68 | DUMMY2/p362/p362_247.wav|15|ðɪs wʊd ɡˈɪv skˈɑːtlənd ɐɹˈaʊnd ˈeɪt mˈɛmbɚz.
69 | DUMMY2/p326/p326_031.wav|28|juːnˈaɪɾᵻd wɜːɹ ɪn kəntɹˈoʊl wɪðˌaʊt ˈɔːlweɪz bˌiːɪŋ dˈɑːmɪnənt.
70 | DUMMY2/p361/p361_288.wav|79|ˈaɪ dɪdnˌɑːt θˈɪŋk ɪt wʌz vˈɛɹi pɹˈɑːpɚ.
71 | DUMMY2/p286/p286_145.wav|63|tˈaɪɡɚɹ ɪz nˌɑːt ðə nˈɔːɹm.
72 | DUMMY2/p234/p234_071.wav|3|ʃiː dˈɪd ðæt fɚðə ɹˈɛst ʌv hɜː lˈaɪf.
73 | DUMMY2/p263/p263_296.wav|39|ðə dᵻsˈɪʒən wʌz ɐnˈaʊnst æt ɪts ˈænjuːəl kˈɑːnfɹəns ɪn dˈʌnfɚmlˌaɪn.
74 | DUMMY2/p323/p323_228.wav|34|ʃiː bɪkˌeɪm ɐ hˈɛɹoʊˌɪn ʌv maɪ tʃˈaɪldhʊd.
75 | DUMMY2/p280/p280_346.wav|52|ɪt wʌzɐ bˈɪt lˈaɪk hˌævɪŋ tʃˈɪldɹən.
76 | DUMMY2/p333/p333_080.wav|64|bˌʌt ðə tɹˈædʒədi dɪdnˌɑːt stˈɑːp ðˈɛɹ.
77 | DUMMY2/p226/p226_268.wav|43|ðæt dᵻsˈɪʒən ɪz fɚðə bɹˈɪɾɪʃ pˈɑːɹləmənt ænd pˈiːpəl.
78 | DUMMY2/p362/p362_314.wav|15|ɪz ðæt ɹˈaɪt?
79 | DUMMY2/p240/p240_047.wav|93|ɪt ɪz sˌoʊ sˈæd.
80 | DUMMY2/p250/p250_207.wav|24|juː kʊd fˈiːl ðə hˈiːt.
81 | DUMMY2/p273/p273_176.wav|56|nˈiːðɚ sˈaɪd wʊd ɹɪvˈiːl ðə diːtˈeɪlz ʌvðɪ ˈɑːfɚ.
82 | DUMMY2/p316/p316_147.wav|85|ænd fɹˈæŋkli, ɪts bˌɪn ɐ wˈaɪl.
83 | DUMMY2/p265/p265_047.wav|73|ɪt ɪz juːnˈiːk.
84 | DUMMY2/p336/p336_353.wav|98|sˈʌmtaɪmz juː ɡˈɛt ðˌɛm, sˈʌmtaɪmz juː dˈoʊnt.
85 | DUMMY2/p230/p230_376.wav|35|ðɪs hˈæzənt hˈæpənd ɪn ɐ vˈækjuːm.
86 | DUMMY2/p308/p308_209.wav|107|ðɛɹ ɪz ɡɹˈeɪt pətˈɛnʃəl ˌɑːn ðɪs ɹˈɪvɚ.
87 | DUMMY2/p250/p250_442.wav|24|wiː hɐvnˌɑːt jˈɛt ɹɪsˈiːvd ɐ lˈɛɾɚ fɹʌmðɪ ˈaɪɹɪʃ.
88 | DUMMY2/p260/p260_037.wav|81|ɪts ɐ fˈækt.
89 | DUMMY2/p299/p299_345.wav|58|wɪɹ vˈɛɹi ɛksˈaɪɾᵻd ænd tʃˈælɪndʒd baɪ ðə pɹˈɑːdʒɛkt.
90 | DUMMY2/p269/p269_218.wav|94|ɐ ɡɹˈæmpiən pəlˈiːs spˈoʊksmən sˈɛd.
91 | DUMMY2/p306/p306_014.wav|12|tə ðə hˈiːbɹuːz ɪt wʌzɐ tˈoʊkən ðæt ðɛɹ wʊd biː nˈoʊmˌoːɹ jˌuːnɪvˈɜːsəl flˈʌdz.
92 | DUMMY2/p271/p271_292.wav|27|ɪts ɐ ɹˈɛkɚd lˈeɪbəl, nˌɑːɾə fˈɔːɹm ʌv mjˈuːzɪk.
93 | DUMMY2/p247/p247_225.wav|14|ˈaɪ æm kənsˈɪdɚd ɐ tˈiːneɪdʒɚ.
94 | DUMMY2/p294/p294_094.wav|104|ɪt ʃˌʊd biː ɐ kəndˈɪʃən ʌv ɛmplˈɔɪmənt.
95 | DUMMY2/p269/p269_031.wav|94|ɪz ðɪs ˈækjʊɹət?
96 | DUMMY2/p275/p275_116.wav|40|ɪts nˌɑːt fˈɛɹ.
97 | DUMMY2/p265/p265_006.wav|73|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
98 | DUMMY2/p285/p285_072.wav|2|mˈɪstɚɹ ˈɜːvaɪn sˈɛd mˈɪstɚ ɹˈæfɚɾi wʌz nˈaʊ ɪn ɡˈʊd spˈɪɹɪts.
99 | DUMMY2/p270/p270_167.wav|8|wiː dˈɪd wˌʌt wiː hædtə dˈuː.
100 | DUMMY2/p360/p360_397.wav|60|ɪt ɪz ɐ ɹɪlˈiːf.
101 |
--------------------------------------------------------------------------------
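Both validation filelists use the same pipe-separated layout, `path|speaker_id|text`, with DUMMY2 standing in for the real VCTK audio root. They are parsed by `load_filepaths_and_text` from utils.py (shown next in this dump); a minimal usage sketch, assuming it is run from the repository root:

from utils import load_filepaths_and_text

rows = load_filepaths_and_text('filelists/vctk_audio_sid_text_val_filelist.txt.cleaned')
path, sid, text = rows[0]
print(path, int(sid), text)  # DUMMY2/p364/p364_240.wav 88 ɪt hɐd hˈæpənd tə hˌɪm.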
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import sys
4 | import argparse
5 | import logging
6 | import json
7 | import subprocess
8 | import numpy as np
9 | from scipy.io.wavfile import read
10 | import torch
11 |
12 | MATPLOTLIB_FLAG = False
13 |
14 | logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
15 | logger = logging
16 |
17 |
18 | def load_checkpoint(checkpoint_path, model, optimizer=None):
19 | assert os.path.isfile(checkpoint_path)
20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
21 | iteration = checkpoint_dict['iteration']
22 | learning_rate = checkpoint_dict['learning_rate']
23 | if optimizer is not None:
24 | optimizer.load_state_dict(checkpoint_dict['optimizer'])
25 | saved_state_dict = checkpoint_dict['model']
26 | if hasattr(model, 'module'):
27 | state_dict = model.module.state_dict()
28 | else:
29 | state_dict = model.state_dict()
30 | new_state_dict = {}
31 | for k, v in state_dict.items():
32 | try:
33 | new_state_dict[k] = saved_state_dict[k]
34 | except KeyError:
35 | logger.info("%s is not in the checkpoint" % k)
36 | new_state_dict[k] = v
37 | if hasattr(model, 'module'):
38 | model.module.load_state_dict(new_state_dict)
39 | else:
40 | model.load_state_dict(new_state_dict)
41 | logger.info("Loaded checkpoint '{}' (iteration {})" .format(
42 | checkpoint_path, iteration))
43 | return model, optimizer, learning_rate, iteration
44 |
45 |
46 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
47 | logger.info("Saving model and optimizer state at iteration {} to {}".format(
48 | iteration, checkpoint_path))
49 | if hasattr(model, 'module'):
50 | state_dict = model.module.state_dict()
51 | else:
52 | state_dict = model.state_dict()
53 | torch.save({'model': state_dict,
54 | 'iteration': iteration,
55 | 'optimizer': optimizer.state_dict(),
56 | 'learning_rate': learning_rate}, checkpoint_path)
57 |
58 |
59 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
60 | for k, v in scalars.items():
61 | writer.add_scalar(k, v, global_step)
62 | for k, v in histograms.items():
63 | writer.add_histogram(k, v, global_step)
64 | for k, v in images.items():
65 | writer.add_image(k, v, global_step, dataformats='HWC')
66 | for k, v in audios.items():
67 | writer.add_audio(k, v, global_step, audio_sampling_rate)
68 |
69 |
70 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
71 | f_list = glob.glob(os.path.join(dir_path, regex))
72 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
73 | x = f_list[-1]
74 | print(x)
75 | return x
76 |
77 |
78 | def plot_spectrogram_to_numpy(spectrogram):
79 | global MATPLOTLIB_FLAG
80 | if not MATPLOTLIB_FLAG:
81 | import matplotlib
82 | matplotlib.use("Agg")
83 | MATPLOTLIB_FLAG = True
84 | mpl_logger = logging.getLogger('matplotlib')
85 | mpl_logger.setLevel(logging.WARNING)
86 | import matplotlib.pylab as plt
87 | import numpy as np
88 |
89 | fig, ax = plt.subplots(figsize=(10,2))
90 | im = ax.imshow(spectrogram, aspect="auto", origin="lower",
91 | interpolation='none')
92 | plt.colorbar(im, ax=ax)
93 | plt.xlabel("Frames")
94 | plt.ylabel("Channels")
95 | plt.tight_layout()
96 |
97 | fig.canvas.draw()
98 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
99 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
100 | plt.close()
101 | return data
102 |
103 |
104 | def plot_alignment_to_numpy(alignment, info=None):
105 | global MATPLOTLIB_FLAG
106 | if not MATPLOTLIB_FLAG:
107 | import matplotlib
108 | matplotlib.use("Agg")
109 | MATPLOTLIB_FLAG = True
110 | mpl_logger = logging.getLogger('matplotlib')
111 | mpl_logger.setLevel(logging.WARNING)
112 | import matplotlib.pylab as plt
113 | import numpy as np
114 |
115 | fig, ax = plt.subplots(figsize=(6, 4))
116 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
117 | interpolation='none')
118 | fig.colorbar(im, ax=ax)
119 | xlabel = 'Decoder timestep'
120 | if info is not None:
121 | xlabel += '\n\n' + info
122 | plt.xlabel(xlabel)
123 | plt.ylabel('Encoder timestep')
124 | plt.tight_layout()
125 |
126 | fig.canvas.draw()
127 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
128 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
129 | plt.close()
130 | return data
131 |
132 |
133 | def load_wav_to_torch(full_path):
134 | sampling_rate, data = read(full_path)
135 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate
136 |
137 |
138 | def load_filepaths_and_text(filename, split="|"):
139 | with open(filename, encoding='utf-8') as f:
140 | filepaths_and_text = [line.strip().split(split) for line in f]
141 | return filepaths_and_text
142 |
143 |
144 | def get_hparams(init=True):
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
147 | help='JSON file for configuration')
148 | parser.add_argument('-m', '--model', type=str, required=True,
149 | help='Model name')
150 |
151 | args = parser.parse_args()
152 | model_dir = os.path.join("./logs", args.model)
153 |
154 | if not os.path.exists(model_dir):
155 | os.makedirs(model_dir)
156 |
157 | config_path = args.config
158 | config_save_path = os.path.join(model_dir, "config.json")
159 | if init:
160 | with open(config_path, "r") as f:
161 | data = f.read()
162 | with open(config_save_path, "w") as f:
163 | f.write(data)
164 | else:
165 | with open(config_save_path, "r") as f:
166 | data = f.read()
167 | config = json.loads(data)
168 |
169 | hparams = HParams(**config)
170 | hparams.model_dir = model_dir
171 | return hparams
172 |
173 |
174 | def get_hparams_from_dir(model_dir):
175 | config_save_path = os.path.join(model_dir, "config.json")
176 | with open(config_save_path, "r") as f:
177 | data = f.read()
178 | config = json.loads(data)
179 |
180 | hparams = HParams(**config)
181 | hparams.model_dir = model_dir
182 | return hparams
183 |
184 |
185 | def get_hparams_from_file(config_path):
186 | with open(config_path, "r") as f:
187 | data = f.read()
188 | config = json.loads(data)
189 |
190 | hparams = HParams(**config)
191 | return hparams
192 |
193 |
194 | def check_git_hash(model_dir):
195 | source_dir = os.path.dirname(os.path.realpath(__file__))
196 | if not os.path.exists(os.path.join(source_dir, ".git")):
197 | logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
198 | source_dir
199 | ))
200 | return
201 |
202 | cur_hash = subprocess.getoutput("git rev-parse HEAD")
203 |
204 | path = os.path.join(model_dir, "githash")
205 | if os.path.exists(path):
206 | saved_hash = open(path).read()
207 | if saved_hash != cur_hash:
208 | logger.warning("git hash values are different. {}(saved) != {}(current)".format(
209 | saved_hash[:8], cur_hash[:8]))
210 | else:
211 | open(path, "w").write(cur_hash)
212 |
213 |
214 | def get_logger(model_dir, filename="train.log"):
215 | global logger
216 | logger = logging.getLogger(os.path.basename(model_dir))
217 | logger.setLevel(logging.DEBUG)
218 |
219 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
220 | if not os.path.exists(model_dir):
221 | os.makedirs(model_dir)
222 | h = logging.FileHandler(os.path.join(model_dir, filename))
223 | h.setLevel(logging.DEBUG)
224 | h.setFormatter(formatter)
225 | logger.addHandler(h)
226 | return logger
227 |
228 |
229 | class HParams():
230 | def __init__(self, **kwargs):
231 | for k, v in kwargs.items():
232 | if isinstance(v, dict):
233 | v = HParams(**v)
234 | self[k] = v
235 |
236 | def keys(self):
237 | return self.__dict__.keys()
238 |
239 | def items(self):
240 | return self.__dict__.items()
241 |
242 | def values(self):
243 | return self.__dict__.values()
244 |
245 | def __len__(self):
246 | return len(self.__dict__)
247 |
248 | def __getitem__(self, key):
249 | return getattr(self, key)
250 |
251 | def __setitem__(self, key, value):
252 | return setattr(self, key, value)
253 |
254 | def __contains__(self, key):
255 | return key in self.__dict__
256 |
257 | def __repr__(self):
258 | return self.__dict__.__repr__()
259 |
--------------------------------------------------------------------------------
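A short sketch of the HParams plumbing above: nested config dicts become attribute-accessible objects while keeping the dict protocol. The config path is one that exists in this repository; the `train.batch_size` key is assumed to follow the usual VITS config layout.

from utils import get_hparams_from_file

hps = get_hparams_from_file('./configs/ljs_mb_istft_vits.json')
print(hps.train.batch_size)       # nested dicts become nested HParams
print('model' in hps, len(hps))   # __contains__ and __len__ still behave like a dict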
/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 | outside_interval_mask = ~inside_interval_mask
67 |
68 | outputs = torch.zeros_like(inputs)
69 | logabsdet = torch.zeros_like(inputs)
70 |
71 | if tails == 'linear':
72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 | constant = np.log(np.exp(1 - min_derivative) - 1)
74 | unnormalized_derivatives[..., 0] = constant
75 | unnormalized_derivatives[..., -1] = constant
76 |
77 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 | logabsdet[outside_interval_mask] = 0
79 | else:
80 | raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 | min_bin_width=min_bin_width,
90 | min_bin_height=min_bin_height,
91 | min_derivative=min_derivative
92 | )
93 |
94 | return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 | raise ValueError('Minimal bin width too large for the number of bins')
112 | if min_bin_height * num_bins > 1.0:
113 | raise ValueError('Minimal bin height too large for the number of bins')
114 |
115 | widths = F.softmax(unnormalized_widths, dim=-1)
116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
117 | cumwidths = torch.cumsum(widths, dim=-1)
118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
119 | cumwidths = (right - left) * cumwidths + left
120 | cumwidths[..., 0] = left
121 | cumwidths[..., -1] = right
122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
123 |
124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
125 |
126 | heights = F.softmax(unnormalized_heights, dim=-1)
127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
128 | cumheights = torch.cumsum(heights, dim=-1)
129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
130 | cumheights = (top - bottom) * cumheights + bottom
131 | cumheights[..., 0] = bottom
132 | cumheights[..., -1] = top
133 | heights = cumheights[..., 1:] - cumheights[..., :-1]
134 |
135 | if inverse:
136 | bin_idx = searchsorted(cumheights, inputs)[..., None]
137 | else:
138 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
139 |
140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
142 |
143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
144 | delta = heights / widths
145 | input_delta = delta.gather(-1, bin_idx)[..., 0]
146 |
147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
149 |
150 | input_heights = heights.gather(-1, bin_idx)[..., 0]
151 |
152 | if inverse:
153 | a = (((inputs - input_cumheights) * (input_derivatives
154 | + input_derivatives_plus_one
155 | - 2 * input_delta)
156 | + input_heights * (input_delta - input_derivatives)))
157 | b = (input_heights * input_derivatives
158 | - (inputs - input_cumheights) * (input_derivatives
159 | + input_derivatives_plus_one
160 | - 2 * input_delta))
161 | c = - input_delta * (inputs - input_cumheights)
162 |
163 | discriminant = b.pow(2) - 4 * a * c
164 | assert (discriminant >= 0).all()
165 |
166 | root = (2 * c) / (-b - torch.sqrt(discriminant))
167 | outputs = root * input_bin_widths + input_cumwidths
168 |
169 | theta_one_minus_theta = root * (1 - root)
170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
171 | * theta_one_minus_theta)
172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
173 | + 2 * input_delta * theta_one_minus_theta
174 | + input_derivatives * (1 - root).pow(2))
175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
176 |
177 | return outputs, -logabsdet
178 | else:
179 | theta = (inputs - input_cumwidths) / input_bin_widths
180 | theta_one_minus_theta = theta * (1 - theta)
181 |
182 | numerator = input_heights * (input_delta * theta.pow(2)
183 | + input_derivatives * theta_one_minus_theta)
184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
185 | * theta_one_minus_theta)
186 | outputs = input_cumheights + numerator / denominator
187 |
188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
189 | + 2 * input_delta * theta_one_minus_theta
190 | + input_derivatives * (1 - theta).pow(2))
191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
--------------------------------------------------------------------------------
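The spline above is analytically invertible, which is what makes it usable as a flow layer. The round-trip check below is a sketch: shapes follow the VITS convention where, with linear tails, the derivative tensor has one entry per interior knot (num_bins - 1).

import torch
from transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
num_bins = 8
x = torch.rand(4, 10) * 2 - 1            # inputs inside (-tail_bound, tail_bound)
w = torch.randn(4, 10, num_bins)         # unnormalized bin widths
h = torch.randn(4, 10, num_bins)         # unnormalized bin heights
d = torch.randn(4, 10, num_bins - 1)     # one derivative per interior knot

y, _ = piecewise_rational_quadratic_transform(
    x, w, h, d, inverse=False, tails='linear', tail_bound=1.0)
x_rec, _ = piecewise_rational_quadratic_transform(
    y, w, h, d, inverse=True, tails='linear', tail_bound=1.0)
print(torch.allclose(x, x_rec, atol=1e-4))  # True: forward/inverse round-trip, up to float error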
/stft.py:
--------------------------------------------------------------------------------
1 | """
2 | BSD 3-Clause License
3 | Copyright (c) 2017, Prem Seetharaman
4 | All rights reserved.
5 | * Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 | * Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 | * Neither the name of the copyright holder nor the names of its
13 | contributors may be used to endorse or promote products derived from this
14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import torch
28 | import numpy as np
29 | import torch.nn.functional as F
30 | from torch.autograd import Variable
31 | from scipy.signal import get_window
32 | from librosa.util import pad_center, tiny
33 | import librosa.util as librosa_util
34 |
35 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
36 | n_fft=800, dtype=np.float32, norm=None):
37 | """
38 | # from librosa 0.6
39 | Compute the sum-square envelope of a window function at a given hop length.
40 | This is used to estimate modulation effects induced by windowing
41 | observations in short-time Fourier transforms.
42 | Parameters
43 | ----------
44 | window : string, tuple, number, callable, or list-like
45 | Window specification, as in `get_window`
46 | n_frames : int > 0
47 | The number of analysis frames
48 | hop_length : int > 0
49 | The number of samples to advance between frames
50 | win_length : [optional]
51 | The length of the window function. By default, this matches `n_fft`.
52 | n_fft : int > 0
53 | The length of each analysis frame.
54 | dtype : np.dtype
55 | The data type of the output
56 | Returns
57 | -------
58 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
59 | The sum-squared envelope of the window function
60 | """
61 | if win_length is None:
62 | win_length = n_fft
63 |
64 | n = n_fft + hop_length * (n_frames - 1)
65 | x = np.zeros(n, dtype=dtype)
66 |
67 | # Compute the squared window at the desired length
68 | win_sq = get_window(window, win_length, fftbins=True)
69 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2
70 | win_sq = librosa_util.pad_center(win_sq, n_fft)
71 |
72 | # Fill the envelope
73 | for i in range(n_frames):
74 | sample = i * hop_length
75 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
76 | return x
77 |
78 |
79 | class STFT(torch.nn.Module):
80 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
81 | def __init__(self, filter_length=800, hop_length=200, win_length=800,
82 | window='hann'):
83 | super(STFT, self).__init__()
84 | self.filter_length = filter_length
85 | self.hop_length = hop_length
86 | self.win_length = win_length
87 | self.window = window
88 | self.forward_transform = None
89 | scale = self.filter_length / self.hop_length
90 | fourier_basis = np.fft.fft(np.eye(self.filter_length))
91 |
92 | cutoff = int((self.filter_length / 2 + 1))
93 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
94 | np.imag(fourier_basis[:cutoff, :])])
95 |
96 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
97 | inverse_basis = torch.FloatTensor(
98 | np.linalg.pinv(scale * fourier_basis).T[:, None, :])
99 |
100 | if window is not None:
101 | assert(filter_length >= win_length)
102 | # get window and zero center pad it to filter_length
103 | fft_window = get_window(window, win_length, fftbins=True)
104 | fft_window = pad_center(fft_window, filter_length)
105 | fft_window = torch.from_numpy(fft_window).float()
106 |
107 | # window the bases
108 | forward_basis *= fft_window
109 | inverse_basis *= fft_window
110 |
111 | self.register_buffer('forward_basis', forward_basis.float())
112 | self.register_buffer('inverse_basis', inverse_basis.float())
113 |
114 | def transform(self, input_data):
115 | num_batches = input_data.size(0)
116 | num_samples = input_data.size(1)
117 |
118 | self.num_samples = num_samples
119 |
120 | # similar to librosa, reflect-pad the input
121 | input_data = input_data.view(num_batches, 1, num_samples)
122 | input_data = F.pad(
123 | input_data.unsqueeze(1),
124 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
125 | mode='reflect')
126 | input_data = input_data.squeeze(1)
127 |
128 | forward_transform = F.conv1d(
129 | input_data,
130 | Variable(self.forward_basis, requires_grad=False),
131 | stride=self.hop_length,
132 | padding=0)
133 |
134 | cutoff = int((self.filter_length / 2) + 1)
135 | real_part = forward_transform[:, :cutoff, :]
136 | imag_part = forward_transform[:, cutoff:, :]
137 |
138 | magnitude = torch.sqrt(real_part**2 + imag_part**2)
139 | phase = torch.autograd.Variable(
140 | torch.atan2(imag_part.data, real_part.data))
141 |
142 | return magnitude, phase
143 |
144 | def inverse(self, magnitude, phase):
145 | recombine_magnitude_phase = torch.cat(
146 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
147 |
148 | inverse_transform = F.conv_transpose1d(
149 | recombine_magnitude_phase,
150 | Variable(self.inverse_basis, requires_grad=False),
151 | stride=self.hop_length,
152 | padding=0)
153 |
154 | if self.window is not None:
155 | window_sum = window_sumsquare(
156 | self.window, magnitude.size(-1), hop_length=self.hop_length,
157 | win_length=self.win_length, n_fft=self.filter_length,
158 | dtype=np.float32)
159 | # remove modulation effects
160 | approx_nonzero_indices = torch.from_numpy(
161 | np.where(window_sum > tiny(window_sum))[0])
162 | window_sum = torch.autograd.Variable(
163 | torch.from_numpy(window_sum), requires_grad=False)
164 | window_sum = window_sum.to(inverse_transform.device) if magnitude.is_cuda else window_sum
165 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
166 |
167 | # scale by hop ratio
168 | inverse_transform *= float(self.filter_length) / self.hop_length
169 |
170 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
171 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
172 |
173 | return inverse_transform
174 |
175 | def forward(self, input_data):
176 | self.magnitude, self.phase = self.transform(input_data)
177 | reconstruction = self.inverse(self.magnitude, self.phase)
178 | return reconstruction
179 |
180 |
181 | class TorchSTFT(torch.nn.Module):
182 | def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
183 | super().__init__()
184 | self.filter_length = filter_length
185 | self.hop_length = hop_length
186 | self.win_length = win_length
187 | self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
188 |
189 | def transform(self, input_data):
190 | forward_transform = torch.stft(
191 | input_data,
192 | self.filter_length, self.hop_length, self.win_length, window=self.window,
193 | return_complex=True)
194 |
195 | return torch.abs(forward_transform), torch.angle(forward_transform)
196 |
197 | def inverse(self, magnitude, phase):
198 | inverse_transform = torch.istft(
199 | magnitude * torch.exp(phase * 1j),
200 | self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
201 |
202 | return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
203 |
204 | def forward(self, input_data):
205 | self.magnitude, self.phase = self.transform(input_data)
206 | reconstruction = self.inverse(self.magnitude, self.phase)
207 | return reconstruction
208 |
209 |
210 |
--------------------------------------------------------------------------------
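A smoke test for the torch-native STFT wrapper above. Note that `return_complex=True` requires a newer PyTorch than the 1.6.0 pinned in requirements.txt (complex STFT support landed in 1.7), so treat this as a sketch for a current install.

import torch
from stft import TorchSTFT

stft = TorchSTFT()              # defaults: filter_length=800, hop_length=200
audio = torch.randn(2, 8000)    # [batch, samples]
mag, phase = stft.transform(audio)
recon = stft(audio)             # transform followed by inverse
print(mag.shape)                # torch.Size([2, 401, 41])
print(recon.shape)              # torch.Size([2, 1, 8000]); matches the conv_transpose1d layout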
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read.
2 | DUMMY1/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too.
3 | DUMMY1/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five.
4 | DUMMY1/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect:
5 | DUMMY1/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others.
6 | DUMMY1/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated,
7 | DUMMY1/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others.
8 | DUMMY1/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies
9 | DUMMY1/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery.
10 | DUMMY1/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade.
11 | DUMMY1/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President.
12 | DUMMY1/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four.
13 | DUMMY1/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example."
14 | DUMMY1/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald:
15 | DUMMY1/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here.
16 | DUMMY1/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work.
17 | DUMMY1/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area
18 | DUMMY1/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon
19 | DUMMY1/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote.
20 | DUMMY1/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound,
21 | DUMMY1/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window.
22 | DUMMY1/LJ026-0068.wav|Energy enters the plant, to a small extent,
23 | DUMMY1/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary.
24 | DUMMY1/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized
25 | DUMMY1/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that
26 | DUMMY1/LJ012-0161.wav|he was reported to have fallen away to a shadow.
27 | DUMMY1/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to
28 | DUMMY1/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines.
29 | DUMMY1/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on.
30 | DUMMY1/LJ024-0083.wav|This plan of mine is no attack on the Court;
31 | DUMMY1/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough.
32 | DUMMY1/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup.
33 | DUMMY1/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles.
34 | DUMMY1/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive.
35 | DUMMY1/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen,
36 | DUMMY1/LJ009-0076.wav|We come to the sermon.
37 | DUMMY1/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution.
38 | DUMMY1/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes.
39 | DUMMY1/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
40 | DUMMY1/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount
41 | DUMMY1/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy.
42 | DUMMY1/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties,
43 | DUMMY1/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand.
44 | DUMMY1/LJ012-0235.wav|While they were in a state of insensibility the murder was committed.
45 | DUMMY1/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald.
46 | DUMMY1/LJ014-0030.wav|These were damnatory facts which well supported the prosecution.
47 | DUMMY1/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome?
48 | DUMMY1/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters.
49 | DUMMY1/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London,
50 | DUMMY1/LJ028-0275.wav|At last, in the twentieth month,
51 | DUMMY1/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed.
52 | DUMMY1/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm,
53 | DUMMY1/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County,
54 | DUMMY1/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view.
55 | DUMMY1/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning.
56 | DUMMY1/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words,
57 | DUMMY1/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands.
58 | DUMMY1/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy.
59 | DUMMY1/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace,
60 | DUMMY1/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
61 | DUMMY1/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true,
62 | DUMMY1/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him,
63 | DUMMY1/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits
64 | DUMMY1/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail.
65 | DUMMY1/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders.
66 | DUMMY1/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal.
67 | DUMMY1/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there.
68 | DUMMY1/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files
69 | DUMMY1/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator.
70 | DUMMY1/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash.
71 | DUMMY1/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
72 | DUMMY1/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely.
73 | DUMMY1/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present,
74 | DUMMY1/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him.
75 | DUMMY1/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense.
76 | DUMMY1/LJ008-0294.wav|nearly indefinitely deferred.
77 | DUMMY1/LJ047-0148.wav|On October twenty-five,
78 | DUMMY1/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner.
79 | DUMMY1/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old.
80 | DUMMY1/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male,
81 | DUMMY1/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one.
82 | DUMMY1/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved
83 | DUMMY1/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands.
84 | DUMMY1/LJ012-0250.wav|On the seventh July, eighteen thirty-seven,
85 | DUMMY1/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job.
86 | DUMMY1/LJ016-0138.wav|at a distance from the prison.
87 | DUMMY1/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology.
88 | DUMMY1/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally.
89 | DUMMY1/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.
90 | DUMMY1/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects.
91 | DUMMY1/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle.
92 | DUMMY1/LJ038-0199.wav|eleven. If I am alive and taken prisoner,
93 | DUMMY1/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came.
94 | DUMMY1/LJ033-0047.wav|I noticed when I went out that the light was on, end quote,
95 | DUMMY1/LJ040-0027.wav|He was never satisfied with anything.
96 | DUMMY1/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly.
97 | DUMMY1/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity.
98 | DUMMY1/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days,
99 | DUMMY1/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston.
100 | DUMMY1/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce.
101 |
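Each entry pairs a wav path with its transcript, separated by "|" (DUMMY1 is a placeholder, typically symlinked to the LJ Speech wav directory). A minimal parsing sketch of the kind the data loader performs; the helper name here is illustrative:

def load_filepaths_and_text(filename, split="|"):
    # Each non-empty line: <wav path><split><transcript>.
    with open(filename, encoding="utf-8") as f:
        return [line.strip().split(split) for line in f if line.strip()]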
--------------------------------------------------------------------------------
/train_latest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import itertools
5 | import math
6 | import torch
7 | from torch import nn, optim
8 | from torch.nn import functional as F
9 | from torch.utils.data import DataLoader
10 | from torch.utils.tensorboard import SummaryWriter
11 | import torch.multiprocessing as mp
12 | import torch.distributed as dist
13 | from torch.nn.parallel import DistributedDataParallel as DDP
14 | from torch.cuda.amp import autocast, GradScaler
15 | from pqmf import PQMF
16 |
17 | import commons
18 | import utils
19 | from data_utils import (
20 | TextAudioLoader,
21 | TextAudioCollate,
22 | DistributedBucketSampler
23 | )
24 | from models import (
25 | SynthesizerTrn,
26 | MultiPeriodDiscriminator,
27 | )
28 | from losses import (
29 | generator_loss,
30 | discriminator_loss,
31 | feature_loss,
32 | kl_loss,
33 | subband_stft_loss
34 | )
35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
36 | from text.symbols import symbols
37 |
38 | torch.autograd.set_detect_anomaly(True)  # aids debugging but slows training considerably
39 | torch.backends.cudnn.benchmark = True
40 | global_step = 0
41 |
42 |
43 | def main():
44 | """Assume Single Node Multi GPUs Training Only"""
45 | assert torch.cuda.is_available(), "CPU training is not allowed."
46 |
47 | n_gpus = torch.cuda.device_count()
48 | os.environ['MASTER_ADDR'] = 'localhost'
49 | os.environ['MASTER_PORT'] = '65520'
50 | # n_gpus = 1
51 |
52 | hps = utils.get_hparams()
53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
54 |
55 |
56 | def run(rank, n_gpus, hps):
57 | global global_step
58 | if rank == 0:
59 | logger = utils.get_logger(hps.model_dir)
60 | logger.info(hps)
61 | utils.check_git_hash(hps.model_dir)
62 | writer = SummaryWriter(log_dir=hps.model_dir)
63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
64 |
65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
66 | torch.manual_seed(hps.train.seed)
67 | torch.cuda.set_device(rank)
68 |
69 | train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
70 | train_sampler = DistributedBucketSampler(
71 | train_dataset,
72 | hps.train.batch_size,
73 | [32,300,400,500,600,700,800,900,1000],
74 | num_replicas=n_gpus,
75 | rank=rank,
76 | shuffle=True)
77 | collate_fn = TextAudioCollate()
78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
79 | collate_fn=collate_fn, batch_sampler=train_sampler)
80 | if rank == 0:
81 | eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data)
82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
83 | batch_size=hps.train.batch_size, pin_memory=True,
84 | drop_last=False, collate_fn=collate_fn)
85 |
86 | net_g = SynthesizerTrn(
87 | len(symbols),
88 | hps.data.filter_length // 2 + 1,
89 | hps.train.segment_size // hps.data.hop_length,
90 | **hps.model).cuda(rank)
91 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
92 | optim_g = torch.optim.AdamW(
93 | net_g.parameters(),
94 | hps.train.learning_rate,
95 | betas=hps.train.betas,
96 | eps=hps.train.eps)
97 | optim_d = torch.optim.AdamW(
98 | net_d.parameters(),
99 | hps.train.learning_rate,
100 | betas=hps.train.betas,
101 | eps=hps.train.eps)
102 | net_g = DDP(net_g, device_ids=[rank])
103 | net_d = DDP(net_d, device_ids=[rank])
104 |
105 | try:
106 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
108 | global_step = (epoch_str - 1) * len(train_loader)
109 | except Exception:  # no checkpoint found; start from scratch
110 | epoch_str = 1
111 | global_step = 0
112 |
113 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
114 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
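    # On resume, last_epoch=epoch_str-2 restores the exponential decay to the
    # checkpointed epoch (the scheduler steps once during construction, landing
    # on lr * lr_decay**(epoch_str-1)); a fresh run passes the default -1.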
115 |
116 | scaler = GradScaler(enabled=hps.train.fp16_run)
117 |
118 | for epoch in range(epoch_str, hps.train.epochs + 1):
119 | if rank==0:
120 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
121 | else:
122 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
123 | scheduler_g.step()
124 | scheduler_d.step()
125 |
126 |
127 |
128 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
129 | net_g, net_d = nets
130 | optim_g, optim_d = optims
131 | scheduler_g, scheduler_d = schedulers
132 | train_loader, eval_loader = loaders
133 | if writers is not None:
134 | writer, writer_eval = writers
135 |
136 | train_loader.batch_sampler.set_epoch(epoch)
137 | global global_step
138 |
139 | net_g.train()
140 | net_d.train()
141 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(train_loader):
142 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
143 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
144 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
145 |
146 | with autocast(enabled=hps.train.fp16_run):
147 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
148 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths)
149 |
150 | mel = spec_to_mel_torch(
151 | spec,
152 | hps.data.filter_length,
153 | hps.data.n_mel_channels,
154 | hps.data.sampling_rate,
155 | hps.data.mel_fmin,
156 | hps.data.mel_fmax)
157 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
158 | y_hat_mel = mel_spectrogram_torch(
159 | y_hat.squeeze(1),
160 | hps.data.filter_length,
161 | hps.data.n_mel_channels,
162 | hps.data.sampling_rate,
163 | hps.data.hop_length,
164 | hps.data.win_length,
165 | hps.data.mel_fmin,
166 | hps.data.mel_fmax
167 | )
168 |
169 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
170 |
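        # Alternating GAN update: the discriminator is trained first on real
        # audio vs. the detached generator output; the generator is then updated
        # through the discriminator together with its auxiliary losses (mel,
        # duration, KL, feature matching, and optional sub-band STFT).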
171 | # Discriminator
172 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
173 | with autocast(enabled=False):
174 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
175 | loss_disc_all = loss_disc
176 | optim_d.zero_grad()
177 | scaler.scale(loss_disc_all).backward()
178 | scaler.unscale_(optim_d)
179 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
180 | scaler.step(optim_d)
181 |
182 |
183 |
184 |
185 | with autocast(enabled=hps.train.fp16_run):
186 | # Generator
187 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
188 | with autocast(enabled=False):
189 | loss_dur = torch.sum(l_length.float())
190 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
191 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
192 |
193 | loss_fm = feature_loss(fmap_r, fmap_g)
194 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
195 |
196 | if hps.model.mb_istft_vits:
197 | pqmf = PQMF(y.device)
198 | y_mb = pqmf.analysis(y)
199 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
200 | else:
201 | loss_subband = torch.tensor(0.0)
202 |
203 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband
204 |
205 | optim_g.zero_grad()
206 | scaler.scale(loss_gen_all).backward()
207 | scaler.unscale_(optim_g)
208 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
209 | scaler.step(optim_g)
210 | scaler.update()
211 |
212 | if rank==0:
213 | if global_step % hps.train.log_interval == 0:
214 | lr = optim_g.param_groups[0]['lr']
215 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband]
216 | logger.info('Train Epoch: {} [{:.0f}%]'.format(
217 | epoch,
218 | 100. * batch_idx / len(train_loader)))
219 | logger.info([x.item() for x in losses] + [global_step, lr])
220 |
221 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
222 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband})
223 |
224 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
225 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
226 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
227 | image_dict = {
228 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
229 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
230 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
231 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
232 | }
233 | utils.summarize(
234 | writer=writer,
235 | global_step=global_step,
236 | images=image_dict,
237 | scalars=scalar_dict)
238 |
239 | if global_step % hps.train.eval_interval == 0:
240 | evaluate(hps, net_g, eval_loader, writer_eval)
241 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
242 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
243 | global_step += 1
244 |
245 |
246 | if rank == 0:
247 | logger.info('====> Epoch: {}'.format(epoch))
248 |
249 |
250 |
251 |
252 | def evaluate(hps, generator, eval_loader, writer_eval):
253 | generator.eval()
254 | with torch.no_grad():
255 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(eval_loader):
256 | x, x_lengths = x.cuda(0), x_lengths.cuda(0)
257 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
258 | y, y_lengths = y.cuda(0), y_lengths.cuda(0)
259 |
260 | # keep only the first utterance of the first batch for a quick qualitative check
261 | x = x[:1]
262 | x_lengths = x_lengths[:1]
263 | spec = spec[:1]
264 | spec_lengths = spec_lengths[:1]
265 | y = y[:1]
266 | y_lengths = y_lengths[:1]
267 | break
268 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, max_len=1000)
269 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
270 |
271 | mel = spec_to_mel_torch(
272 | spec,
273 | hps.data.filter_length,
274 | hps.data.n_mel_channels,
275 | hps.data.sampling_rate,
276 | hps.data.mel_fmin,
277 | hps.data.mel_fmax)
278 | y_hat_mel = mel_spectrogram_torch(
279 | y_hat.squeeze(1).float(),
280 | hps.data.filter_length,
281 | hps.data.n_mel_channels,
282 | hps.data.sampling_rate,
283 | hps.data.hop_length,
284 | hps.data.win_length,
285 | hps.data.mel_fmin,
286 | hps.data.mel_fmax
287 | )
288 | image_dict = {
289 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
290 | }
291 | audio_dict = {
292 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
293 | }
294 | if global_step == 0:
295 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
296 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
297 |
298 | utils.summarize(
299 | writer=writer_eval,
300 | global_step=global_step,
301 | images=image_dict,
302 | audios=audio_dict,
303 | audio_sampling_rate=hps.data.sampling_rate
304 | )
305 | generator.train()
306 |
307 |
308 | if __name__ == "__main__":
309 | os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
312 | main()
313 |
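# Launch sketch (single node, all visible GPUs). This assumes utils.get_hparams
# follows the standard VITS CLI of -c <config path> and -m <run name>:
#
#   python train_latest.py -c configs/ljs_mb_istft_vits.json -m ljs_mb_istft_vits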
--------------------------------------------------------------------------------
/filelists/filelist_train2.txt.cleaned:
--------------------------------------------------------------------------------
1 | ./tsukuyomi/VOICEACTRESS100_001.wav|ma↑ta, to↓ojino yo↓oni, go↓dai myo↑oo↓oto yo↑bareru, ʃu↑yoona myo↑oo↓ono ʧu↑uo↓oni ha↓isareru ko↑to↓mo o↓oi.
2 | ./tsukuyomi/VOICEACTRESS100_002.wav|nyu↑uiNguraNdo↓fuuwa, gyu↑unyuuo be↓esUto ʃI↑ta, ʃi↑ro↓i ku↑riimusu↓upudeari, bo↑sUtoNkuramuʧa↓udaatomo yo↑bareru.
3 | ./tsukuyomi/VOICEACTRESS100_003.wav|ko↑Npyuutage↓emuno me↓ekaaya, gyo↑okaida↓Ntainadoni ka↑NreN su↑ru ji↓Nbutsuno ka↑te↓gori.
4 | ./tsukuyomi/VOICEACTRESS100_004.wav|sa↑abisumaneejaadoonyuu↓ekino ta↑me, o↑oi↓maʧi e↓kIkara, e↑NkakUka↓Nri ʃI↑te i↑ru.
5 | ./tsukuyomi/VOICEACTRESS100_005.wav|ʃi↓rubaa sa↑afaaʃuugekiji↓keNmadeni, ri↓ʧaazuwa, ʧi↑imu↓meeto to↑moni, ko↑kUsai↓tekini su↑upaahi↓iroo, o↓yobi, yu↑ume↓ejiNto ʃI↑te, ni↓NʧI sa↑rete i↑ru.
6 | ./tsukuyomi/VOICEACTRESS100_006.wav|ts u yu↑reNharuto↓ryoowa, byu↑ruteNberuku↓ryooni he↑Nnyuu sa↑reta.
7 | ./tsukuyomi/VOICEACTRESS100_007.wav|ji↑kaN ryo↑oikIto, ku↑ukaNryo↓oikide kyo↑otsuu su↑ru ʃo↑riʃu↓hoowa, fi↑rutari↓Nguni yo↑ru, nyu↑uryokUʃi↓Ngoono kyo↓okadearu.
8 | ./tsukuyomi/VOICEACTRESS100_008.wav|ʃa↑Nʧiino se↑Ngyoo↓purowa, ʧi↓imukara ʃI↑hara↓wareru kyu↓uryooto, ta↑ikyoku↓hio, o↓mona ʃu↑unyuuto ʃI↑te i↑ru.
9 | ./tsukuyomi/VOICEACTRESS100_009.wav|ma↑ta ne↑jimeʃiwa, ʧu↑usa↓Nnoono o↑same↓ru, ryu↑ukyuuo↓okokUtono ko↑oekinimo sa↑Nka ʃI↑ta.
10 | ./tsukuyomi/VOICEACTRESS100_010.wav|su↑maato↓foNkara, fi↑iʧaafo↓Nmade, ma↑ruʧideba↓isuni ta↑ioo.
11 | ./tsukuyomi/VOICEACTRESS100_011.wav|ke↑emyoo ʃa↑datsuna na↑reeʃoNkara, jo↑oʧo↓kaN a↑fure↓ru ka↑tarima↓de, ha↑bahiro↓i hyo↑ogeNryo↓kuo mo↓tsu.
12 | ./tsukuyomi/VOICEACTRESS100_012.wav|ko↑ozoowa, ha↑ganeseeno ta↑Nitsu a↓aʧide, kyo↑okyakuwa, i↑ʃItsumidearu.
13 | ./tsukuyomi/VOICEACTRESS100_013.wav|so↑koe, o↓onaaga a↑taraʃi↓i ʃe↓fUto ʃI↑te, u↑dekIkino hyo↑nu↓kuo ma↑ne↓ku.
14 | ./tsukuyomi/VOICEACTRESS100_014.wav|ku↑iiNzuabenyuua↓rufani ʃo↑zoku ʃI↑te i↑ru.
15 | ./tsukuyomi/VOICEACTRESS100_015.wav|i↑Qpo↓ode, gyo↓gyooto ʃo↓ogyoode, rya↑nesUko↓owa ha↑Nee ʃI↑te i↑ta.
16 | ./tsukuyomi/VOICEACTRESS100_016.wav|ko↑no, nyu↑usausuweeruzudaihyooʧi↓imuga, wa↑rabi↓izuno ʧu↑ukakUto na↓Qte i↑ku.
17 | ./tsukuyomi/VOICEACTRESS100_017.wav|ta↓daʃi, gya↑NburuizoNʃoono nyu↑uiNʧi↓ryooo i↑Qte i↑ru byo↑oiNwa, wa↓zukadearu.
18 | ./tsukuyomi/VOICEACTRESS100_018.wav|ta↓no me↓jaana di↑sUtoribyu↓uʃoNni ku↑rabe, se↑kyuritiijoono mo↑Ndaino ʃu↑useega, o↑soi ba↑aimo a↓ru.
19 | ./tsukuyomi/VOICEACTRESS100_019.wav|be↑rugaato↓oa ma↓eno, ve↑digeNuufaapa↓akuniwa, se↑Nsooto da↑Natsuno gi↑seeʃano ta↑me↓no ki↑neN↓higa ta↓Qte i↑ru.
20 | ./tsukuyomi/VOICEACTRESS100_020.wav|ze↑NbeepaburiQʃaazukyo↓okaino, be↑sUtosUtora↓tejiigeemuobuzaiyaao, ni↑QpoN↓jiNto ʃI↑te ju↑ʃoo.
21 | ./tsukuyomi/VOICEACTRESS100_021.wav|i↑tami↓wa, te↑Ntekiyo↓ri ʧi↑Ntsuuya↓kuo, jo↑omyakUto↓oyo su↑ru ko↑to↓de, ʧi↑Ntsuuo o↑konau.
22 | ./tsukuyomi/VOICEACTRESS100_022.wav|ko↑no to↓kini, fu↑yuutairikUpuruva↓mani a↓ru, ʧu↑uritsu↓koku, byu↑eru↓baga, a↓ru jo↑ohoosujikara, ba↑QʃUʃo↓oguNno ʃo↑keeto, ze↑No↓ojo, a↓aʃeno ji↓gaio ha↑Qpyoo.
23 | ./tsukuyomi/VOICEACTRESS100_023.wav|na↑Nsee↓bu wo↓oreNwa, be↑ia↓amaN fa↓amuzuto, fi↑Qtsujera↓rudono ʧi↓kude, ko↑osee sa↑reru.
24 | ./tsukuyomi/VOICEACTRESS100_024.wav|ko↑no ta↑me, pu↑razumaʧuuno i↓oNya, de↓Nʃino mo↓tsu, he↑ekiNuNdooene↓rugiio, o↓Ndode hyo↑oge↓N su↑ru ko↑to↓ga a↓ru.
25 | ./tsukuyomi/VOICEACTRESS100_025.wav|so↑no hyo↑ohyooto ʃI↑ta hI↑togaraga, ro↑onyakuna↓Nnyoni ʃI↑tawarete i↑ru.
26 | ./tsukuyomi/VOICEACTRESS100_026.wav|ge↓Nzai, nyu↑ujaajii↓ʃuu, mu↓ua zu↑ta↓uNni su↓Nde i↑ru.
27 | ./tsukuyomi/VOICEACTRESS100_027.wav|ʧo↑oikini a↓Qta, mi↑tsunesaN↓haNwa, na↑gaoka↓haNni, be↑e hya↑Q↓pyooo o↑kuQta ko↑to↓de yu↑umee.
28 | ./tsukuyomi/VOICEACTRESS100_028.wav|ko↑no to↓ki, pe↑rime↓edeewa, a↑mupIkutoriyu↓ooNni do↑okoo ʃI↑te, te↓ebaini ki↓te i↑ta, r i ky u mu↑ni↓osuni, tsu↓mato ʃI↑te a↑taerareta.
29 | ./tsukuyomi/VOICEACTRESS100_029.wav|ge↓Nzaino ka↑Qsooo mo↑kUtekIto ʃI↑ta, sU↑kiibu↓utsuwa, ka↑tai pu↑rasUʧiQku↓ʃeruto, ya↑waraka↓i i↑Nnaabu↓utsUkara na↓ru.
30 | ./tsukuyomi/VOICEACTRESS100_030.wav|bo↑ogo↓ori bu↓utsuwa, hyo↑ome↓Nni ha↑rareta, go↑museeno u↑sui ma↑ku↓de de↑ki↓te i↑ru.
31 | ./tsukuyomi/VOICEACTRESS100_031.wav|ko↓oʃano da↑ihyooga, we↑Qjiu↓Qdono, ja↑sUpaawe↓adearu.
32 | ./tsukuyomi/VOICEACTRESS100_032.wav|ki↑i↓kyokuga ha↑QʃiN su↑ru, nyu↑usuneQtowaaku↓meeo ka↑NʃIta ta↓itoruno, nyu↑usuba↓Ngumino na↓kadewa, re↑gyuraaho↓osooga, mo↑Qto↓mo o↓oi.
33 | ./tsukuyomi/VOICEACTRESS100_033.wav|ge↑enoopu↓rodakUʃoN, a↑myu↓uzuno gu↑ruupUki↓gyoo.
34 | ./tsukuyomi/VOICEACTRESS100_034.wav|ʧo↑obo↓iNo ʃo↑oryaku ʃI↑te, e↑ryu↓ʃioNtomo hyo↓okI sa↑reru.
35 | ./tsukuyomi/VOICEACTRESS100_035.wav|mo↑rinagano o↑iʃi↓i gyu↑unyuuwa, ko↓i a↑oironi, gyu↑unyuu↓biNo a↑ʃira↓Qta de↑za↓iNno, pa↑Qkugyu↓unyuudearu.
36 | ./tsukuyomi/VOICEACTRESS100_036.wav|ba↑Ngumibo↓otooo, to↑okyoomuubiise↓esakuno a↑nime↓eʃoNde, hyo↑oge↓N su↑ru te↑Nmo, kyo↑otsuu ʃI↑te i↑ta.
37 | ./tsukuyomi/VOICEACTRESS100_037.wav|ko↑myu↓uNwa, se↑enu↓gawato, e↑soNnu↓kawano, go↑oryuuʧi↓teNto na↓Qte i↑ru.
38 | ./tsukuyomi/VOICEACTRESS100_038.wav|do↑ojini, fU↑kuimi↓rakuruerefaNtsuni, ko↑oʧIke↓NniNde, nyu↑udaN su↑ru ko↑to↓ga ha↑Qpyoo sa↑reta.
39 | ./tsukuyomi/VOICEACTRESS100_039.wav|o↑Qtodearu ko↑muroga, kyu↑ukyuuʃa↓o yo↑bi, to↑naibyo↓oiNni, ki↑Nkyuu ha↑Nsoo sa↑reru.
40 | ./tsukuyomi/VOICEACTRESS100_040.wav|gi↑re↓sUpiiwa, ma↓Qgiio tsu↑ujite, i↓nesUto ʃi↑ria↓Qta.
41 | ./tsukuyomi/VOICEACTRESS100_041.wav|fo↑Nteenuburooyo↓oʃIkidewa, gu↑ui↓tekina e↓ga, ʃi↑Qkuino mo↓orudoni tsU↑kawarete i↑ru.
42 | ./tsukuyomi/VOICEACTRESS100_042.wav|sa↑ijiNwa, bi↑ʃunuhano se↓ejiN, su↓waa mi↑inaaraayaN.
43 | ./tsukuyomi/VOICEACTRESS100_043.wav|ha↓adee su↑ga, pe↑ruse↓poneeni ko↓io ʃI↑ta no↑wa, a↑purodi↓iteeno, sa↑kuryakudearuto sa↑rete i↑ru.
44 | ./tsukuyomi/VOICEACTRESS100_044.wav|ku↓weeNbaaNʧaaNwa, ʧi↓isana ko↑myu↓nitiide, no↓ogyooya, ʃo↓ogyooo ʧu↑uʃiNni, na↑rita↓Qte i↑ta↓to, ka↑Nga↓erarete i↑ru.
45 | ./tsukuyomi/VOICEACTRESS100_045.wav|ve↑ezaajiteNʃa↓dooya, myu↓ureN ru↓utoni ʃI↑taga↓Qta, sa↑ikuriNgutsu↓aawa, pe↓etaasuhaageNo, ke↑eyu su↑ru.
46 | ./tsukuyomi/VOICEACTRESS100_046.wav|fo↑omyura↓kaawa, tsu↑ujoo, o↑opuNhoi↓irude, ʃi↑Nguruʃi↓itaadearu.
47 | ./tsukuyomi/VOICEACTRESS100_047.wav|do↑ojitsu a↓sani, o↑osakana↓Nbade, ʃu↑QpatsUse↓remoniiga ka↑isai sa↑re, e↑egyoou↓NteNni, ju↑utoo sa↑reta.
48 | ./tsukuyomi/VOICEACTRESS100_048.wav|so↑ʃIte, i↑NdepeNdeNto↓ʃino, do↑kUʃato↓ohyoode e↑ra↓bu, pu↑remiariigusaiyuuʃuugooruki↓ipaani e↑ra↓bareta.
49 | ./tsukuyomi/VOICEACTRESS100_049.wav|pu↑reiyaa↓kyarakUtaawa, kyu↑udeNo se↓Nkyo ʃI↑ta, ja↑akuna ku↑ri↓iʧaani so↑oguu su↑ru.
50 | ./tsukuyomi/VOICEACTRESS100_050.wav|fi↑irudomaake↓tiNguwa, re↑kIʃi↓tekiniwa, i↑Qpo↓o tsu↑ukoono ko↑myunikeeʃoNtsu↓uruto ʃI↑te, ka↑Nga↓erarete ki↓ta.
51 | ./tsukuyomi/VOICEACTRESS100_051.wav|de↑byuu↓gono su↑une↓NkaNwa, be↑biife↓isUto ʃI↑te, ho↓Nmyoode ka↑tsudoo.
52 | ./tsukuyomi/VOICEACTRESS100_052.wav|ga↑Qkooya byo↑oiNna↓dono, kyu↑uʃokugyo↓omude, e↑eyo↓osoo ke↑esaN su↑ru jo↑ode, ju↑uyoona ʃi↓ryoono hI↑to↓tsudearu.
53 | ./tsukuyomi/VOICEACTRESS100_053.wav|to↓oji, a↑yaʃii wa↓arudoni jo↑oʧuu ʃI↑te i↑ta gi↑ko↓nekoga, ku↑uhakuni↓te ha↑Qpyoo.
54 | ./tsukuyomi/VOICEACTRESS100_054.wav|yu↓ufUkuna nyu↑uyookaa↓taʧiwa, gu↑re↓evuseNdo, ke↑ebajooya, ʃi↓ipuʃeQdobei, ke↑ebajoona↓doni tsu↑do↓i, u↑mizoino ko↑okyuu re↓sUtoraNya, ho↓teruo ri↑yoo ʃI↑ta.
55 | ./tsukuyomi/VOICEACTRESS100_055.wav|wo↑riaazumiQkusumaaʃaruaatsuakademiiʃo↓zoku.
56 | ./tsukuyomi/VOICEACTRESS100_056.wav|to↑koro↓ga, e↑riyuʃIkuto↓oNwa, nyu↓mupeeno se↑eʃimo kI↑kazuni, de↑emeete↓eruno ka↓ʃio, ki↑ritao↓ʃIta.
57 | ./tsukuyomi/VOICEACTRESS100_057.wav|ko↑no je↑ʃii↓yakude sU↑tei↓mosuwa, e↑mii↓ʃooni no↑mine↓eto sa↑reta ko↑to↓mo a↓ru.
58 | ./tsukuyomi/VOICEACTRESS100_058.wav|su↑weedeNi↓miNno ryo↓oʃiNno mo↑to↓ni, ma↑saʧuuseQtsu↓ʃuu, ke↓NburiQjinite u↑mareru.
59 | ./tsukuyomi/VOICEACTRESS100_059.wav|kyu↑ueNno fa↑Nto↓ohyoodemo, ni↑Nkiga gu↑uzooka ʃI↑te i↑ta, na↑gaʃima ʃi↑geoni ni↑kUhakU su↑ru.
60 | ./tsukuyomi/VOICEACTRESS100_060.wav|ha↓hawa, pi↑itaamariQtsuba↓aguno se↑eʃiNbyo↓oiNni nyu↑uiN ʃI↑te i↑ru to↓kini, be↓Qʃiio u↑mu.
61 | ./tsukuyomi/VOICEACTRESS100_061.wav|po↑iNtoga↓adokara, su↑moorufo↓waadomade ko↑nase↓ru, so↑ogooryo↓kuga ta↑ka↓i yu↑utiritiipu↓reeyaadearu.
62 | ./tsukuyomi/VOICEACTRESS100_062.wav|gu↑re↓Qguwa, mi↑ʃIʃiQpi↓ʃuu, a↑badi↓iNni a↓ru, o↑Qdoferoozu↓boʧini ma↑isoo sa↑reru ko↑to↓ni na↓Qta.
63 | ./tsukuyomi/VOICEACTRESS100_063.wav|o↑oatariʃuuryoo↓gowa, gu↑radieetaaʧa↓Nsuni to↑tsunyuu su↑ru.
64 | ./tsukuyomi/VOICEACTRESS100_064.wav|ko↑no ki↓Nni yo↑ru byo↑okiwa, ha↑iirokabibyooto na↑zuke↓rarete i↑ru mo↑no↓ga o↓oi.
65 | ./tsukuyomi/VOICEACTRESS100_065.wav|re↑gyuraame↓Nbaano ka↑oja↓ʃiNo ku↑ri↓Qku ʃI↑ta a↓toni, mu↑ubiipureiyaa↓fuuni sa↑isee sa↑reruto i↑u, to↑kuina ke↑eʃIkito na↓Qte i↑ru.
66 | ./tsukuyomi/VOICEACTRESS100_066.wav|ka↑Nzooeno sa↑Nsokyo↓okyuuwa, ka↑Ndo↓omyakUto, te↑eatsu↓keeno mo↑N↓myakuo ka↓iʃIte, o↑konawarete i↑ru.
67 | ./tsukuyomi/VOICEACTRESS100_067.wav|de↑Qdo↓kiiwa, ta↑ipura↓itaaya, ko↑Npyu↓utano ki↑ibo↓odoni o↑keru, to↑kUʃuna so↑oʃoku↓kiidearu.
68 | ./tsukuyomi/VOICEACTRESS100_068.wav|ʃa↓NʃaN u↑ma↓wa, u↑dojiNgu↓ue sa↑Npai su↑ru, ʃi↑NkoNfu↓ufuga no↑Qte i↑ta u↑ma↓no ko↑to.
69 | ./tsukuyomi/VOICEACTRESS100_069.wav|bu↑ruuriQjisa↓Nmyakuno ge↑Nryuukara, ri↑Qʧimo↓Ndomade, o↓okuno ha↑yaseya fU↑ʧi↓ga, tsu↑riya kyu↑uryuuku↓dario ta↑noʃi↓masete ku↑reru.
70 | ./tsukuyomi/VOICEACTRESS100_070.wav|bo↑o↓haNwa, i↑isUtomaN↓ra, gya↓Nguno sU↑piikui↓ijiino a↑garikara, wa↓iroo to↓Qte i↑ta↓tomo u↑wasa sa↑reta.
71 | ./tsukuyomi/VOICEACTRESS100_071.wav|pe↑Nʃirubenia↓ʃuu, fi↑raderu↓fiano ko↓ogai, wi↑Nre↓Qdono re↑Nkinaubyo↓oiNde u↑mareta.
72 | ./tsukuyomi/VOICEACTRESS100_072.wav|bu↑ra↓Qguwa, byu↓u e↑ru↓guNyorimo, re↑QseedaQta ta↑me↓ni, ko↑no ki↓kaio i↑ka↓sU ko↑to↓o ʧu↓uʧo ʃI↑ta.
73 | ./tsukuyomi/VOICEACTRESS100_073.wav|jo↑oiNgi↓iNto ʃI↑te, ba↓aNweruwa, ka↑riforunia↓ʃuuno, re↑Npooka↓nyuuni sa↑Nsee ʃI↑ta.
74 | ./tsukuyomi/VOICEACTRESS100_074.wav|re↑jeNdoʃiri↓izuo be↓esuni, yo↑o fu↑riiki↓kooo so↑nae↓ta, byu↓u ka↓mera.
75 | ./tsukuyomi/VOICEACTRESS100_075.wav|ga↑Qkyokuno se↑Ntaapoji↓ʃoNwa, e↑ikeebiifootii↓eitono, ta↑ka↓haʃi mi↓namiga tsU↑tome↓ta.
76 | ./tsukuyomi/VOICEACTRESS100_076.wav|di↑onyu↓usosuno, ʧo↑oaio u↑ke↓ru, o↑ineusu↓ooto, h i, a↑rutai↓aano a↑idani, ka↓riyu do↑oNno o↓ojoto ʃI↑te, se↓eo u↑ke↓ta.
77 | ./tsukuyomi/VOICEACTRESS100_077.wav|o↑oniʃi yo↑ojoono, ju↑Nkoo ko↓odokara, do↓oryoku na↓ʃide, ʧi↑jooe ka↑Qkuuhi↓koo ʃ i, ki↑Nkyuu ʧa↑kurikuni se↑ekoo ʃI↑ta.
78 | ./tsukuyomi/VOICEACTRESS100_078.wav|hyo↑ogeNgyo↓oretsuno ʃI↑hyoohyooo, bu↓Nʃino ta↑iʃooseeo a↑rawa↓su, te↑N↓guNno ʃI↑hyoohyooo mo↑ʧii↓te, su↑Nde ya↑kUhyooge↓Ne bu↑Nkai su↑ru.
79 | ./tsukuyomi/VOICEACTRESS100_079.wav|ta↑iyoogyogyooo↓onaano, na↓kabe ke↑NkIʧino i↓noʧio u↑ke↓te, pu↑royakyuukyu↓udaNno, ta↑iyoohoe↓eruzuni ka↑kawa↓ru.
80 | ./tsukuyomi/VOICEACTRESS100_080.wav|ka↓sUkani kI↑koete ku↓ru se↓N kyu↓uhyakU sa↓Njuu i↑ʧine↓NbaNno sa↑Nbi↓kaga, ʃi↑daini o↓okIkunaQte i↑ku.
81 | ./tsukuyomi/VOICEACTRESS100_081.wav|mo↓o i↑ideeNgaNpekIʧii↓kino jo↓obuwa, pu↑raasaatopurawihaaNjiiNi↓sekie tsu↑nagaru, ta↑igawa sa↑Ndooni tsu↑zuite i↑ru.
82 | ./tsukuyomi/VOICEACTRESS100_082.wav|ka↑amira↓boʃIto yo↑barete i↑ru wa↑kUseekara, u↑ʧuuseNni no↑Qte, ʧI↑kyuuni ʃi↑Nnyuu ʃI↑ta u↑ʧuu↓jiN.
83 | ./tsukuyomi/VOICEACTRESS100_083.wav|do↑Qgaaba↓Nkuwa, ta↓raya ni↓ʃiNno gyo↑kaku↓ryouga o↓oi, ju↑uyoona, gyo↑joodearu.
84 | ./tsukuyomi/VOICEACTRESS100_084.wav|ʃo↑oneNji↓daiwa, ro↑ʃiate↓ekoku, ʧe↑runiihiu↓keN, pu↑ruiruukui↓guN, to↑rosUʧanuiitsuyamurade su↑go↓ʃIta.
85 | ./tsukuyomi/VOICEACTRESS100_085.wav|i↑haino ho↑to↓Ndowa, su↑weedeNniʃIka↓igaNno, bu↓u hyu↑usureeNʧIho↓ono ko↑jimani a↓ru gyo↑soN, f u y a r u ba↓Qka ʃu↑uheNno u↓mini, sa↑NkotsU sa↑reta.
86 | ./tsukuyomi/VOICEACTRESS100_086.wav|ko↑Qkyooo ko↑ete, re↑Qʃawa, ka↑iryoo sa↑reta za↑iraiseNni so↓Qte, a↑aheNʧuuoo↓ekini mu↑kau.
87 | ./tsukuyomi/VOICEACTRESS100_087.wav|fU↑kuokadaieeho↓okUsudewanaku, ʧo↑okyorihoono ho↑kyooo me↑za↓ʃIte i↑ta, o↑osakakiNtetsuba↓faroozukara, o↓faao u↑ke↓te nyu↑udaN.
88 | ./tsukuyomi/VOICEACTRESS100_088.wav|so↑koniwa, hya↑kudoru↓satsUto, a↑merikani ko↓ito i↑u, mi↑jika↓i me↑QseejidakedaQta.
89 | ./tsukuyomi/VOICEACTRESS100_089.wav|ge↓Nzaiwa, ba↓Qhao mo↑ʧi↓ifUto ʃI↑ta, ha↑apUʃiko↓odono sa↑Qkyoku↓kato ʃI↑te, ki↑okU sa↑rete i↑ru.
90 | ./tsukuyomi/VOICEACTRESS100_090.wav|se↑Ntoo↓fUkuwa, ryo↑o↓udeo ro↑ʃUtsu ʃ i, ryo↑okyakuga, a↑Ndaasu↓utsude o↑owarete i↑ru.
91 | ./tsukuyomi/VOICEACTRESS100_091.wav|do↑obo↓aneni, su↑weedeNo↓ohi, jo↑zefi↓inuga i↑ru.
92 | ./tsukuyomi/VOICEACTRESS100_092.wav|ʃi↑gaiseNwa, hyo↑omeNene↓rugiino, ʧi↑isa↓i po↓rimaao se↑QʧakU su↑ru sa↓ino, ze↑N↓ʃorini ri↑yoo sa↑reru.
93 | ./tsukuyomi/VOICEACTRESS100_093.wav|ji↓ʃiNno pe↑ejide, me↓Qseejiya, ko↑okaiko↓meNtoo to↑oʃi↓te, re↓byuuo to↑okoo ʃI↑ta yu↓uzaato, ko↑myunike↓eʃoNo to↓ru ko↑to↓ga ka↑noodearu.
94 | ./tsukuyomi/VOICEACTRESS100_094.wav|wa↓kakI hi↑no ha↑Ngyaku↓yueni, u↓ʧuuno ʧu↑uo↓oo tsu↑ihoo sa↑rete, wa↑kUsee, ʧI↑kyuuni ya↑QtekIta ʃu↑ji↓Nkoo, be↓ruzebabuga, u↑ʧuuseN ka↓runaakuno na↓kade, ma↑go↓ni ka↑taru so↑odaina mo↑noga↓tari.
95 | ./tsukuyomi/VOICEACTRESS100_095.wav|ja↓gaatowa ta↑iʃoo↓tekini, bo↑diibi↓rudaao ho↑ofUtsuto sa↑seru, ma↓Qʧona ta↑iikUkai↓keeno ga↑ikeNga to↑kUʧoo.
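These transcripts are pre-phonemized Japanese: romanized phonemes with ↑/↓ pitch-accent marks, the output style of a pyopenjtalk-based cleaner. A minimal sketch of the underlying grapheme-to-phoneme step (the accent arrows are layered on top of this by the repository's Japanese cleaner):

import pyopenjtalk

# Plain g2p; the comma "、" becomes the pause token "pau" in the output.
print(pyopenjtalk.g2p("こんにちは、世界"))  # "k o N n i ch i w a pau s e k a i"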
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹɪɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wˌʌt ðeɪ hˈɪɹ ænd wˌʌt ðeɪ ɹˈiːd.
2 | DUMMY1/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː, ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt, tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ, ænd ˈɔːl ðə fˈɜːnɪtʃɚ, ˈaɪ wʊd biː mˈæd æz hˈɛl, tˈuː.
3 | DUMMY1/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹɪpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪnkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn eɪtˈiːn θˈɜːɾifˈaɪv.
4 | DUMMY1/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹɪspˈɛkt:
5 | DUMMY1/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹɪspˈɛkt wʌz tə θɹˈoʊ ðə ɹɪspˌɑːnsəbˈɪlɪɾi ˌɑːn ˈʌðɚz.
6 | DUMMY1/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛlɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌnkənvˈɪktᵻd pɹˈɪzənɚ, ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt, ænd stˈɪl ʌnkəntˈæmᵻnˌeɪɾᵻd,
7 | DUMMY1/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔːstˈeɪʃənɚz. hɪz ɐɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz.
8 | DUMMY1/LJ047-0044.wav|ˈɑːswəld wʌz, haʊˈɛvɚ, wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz. hiː dɪnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz
9 | DUMMY1/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ tʃˈɑːɹlz dʒˈeɪ. kˈæɹɪkˌoʊ, ɐ ɹˈɛzɪdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi.
10 | DUMMY1/LJ048-0194.wav|dˈʊɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛntitˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd.
11 | DUMMY1/LJ049-0026.wav|ˌɑːn əkˈeɪʒən ðə sˈiːkɹət sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt.
12 | DUMMY1/LJ004-0152.wav|ɑːlðˈoʊ æt mˈɪstɚ bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən, ðə fˈɜːst stˈɛp tʊwˈɔːɹdz ɹɪfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˌɛvəntˈiːn sˈɛvəntifˈoːɹ.
13 | DUMMY1/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni, ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsəsɚɹi tuː "mˌeɪk ɐn ɛɡzˈæmpəl."
14 | DUMMY1/LJ043-0002.wav|ðə wˈɔːɹən kəmˈɪʃən ɹɪpˈoːɹt. baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɑːnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi. tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld:
15 | DUMMY1/LJ009-0114.wav|mˈɪstɚ wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dɪskɹˈaɪbɪŋ ɐnˈʌðɚ ɹɪlˈɪdʒəs sˈɜːvɪs, wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪətli biː ɪnsˈɜːɾᵻd hˈɪɹ.
16 | DUMMY1/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk.
17 | DUMMY1/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd. ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzɪz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə
18 | DUMMY1/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp, hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪlɪˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən
19 | DUMMY1/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl, kwˈoʊt, wiː hɐd ɐ mˈoʊɾɚkˌeɪd wɛɹɹˈɛvɚ wiː wˈɛnt, ˈɛnd kwˈoʊt.
20 | DUMMY1/LJ031-0070.wav|dˈɑːktɚ klˈɑːɹk, hˌuː mˈoʊst klˈoʊsli ɑːbzˈɜːvd ðə hˈɛd wˈuːnd,
21 | DUMMY1/LJ034-0198.wav|jˈuːɪnz, hˌuː wʌz ɑːnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstɪfˌaɪd ðæt hiː kʊd nˌɑːt dɪskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ.
22 | DUMMY1/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt, tʊ ɐ smˈɔːl ɛkstˈɛnt,
23 | DUMMY1/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɑːnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsəsɚɹi.
24 | DUMMY1/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd
25 | DUMMY1/LJ005-0014.wav|spˈiːkɪŋ ˌɑːn ɐ dɪbˈeɪt ˌɑːn pɹˈɪzən mˈæɾɚz, hiː dᵻklˈɛɹd ðˈæt
26 | DUMMY1/LJ012-0161.wav|hiː wʌz ɹɪpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ.
27 | DUMMY1/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹɪpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹɪfˈɜːd tuː
28 | DUMMY1/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛdwˈiːl wʌz ɪn jˈuːs, ðɛɹ sˈɛljʊlɚ kɹˈæŋks, ɔːɹ hˈɑːɹdlˈeɪbɚ məʃˈiːnz.
29 | DUMMY1/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɑːn.
30 | DUMMY1/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɑːnðə kˈoːɹt;
31 | DUMMY1/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz, nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz. ˈaɪ hæv hɐd ɪnˈʌf.
32 | DUMMY1/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp.
33 | DUMMY1/LJ046-0058.wav|dˈʊɹɪŋ hɪz pɹˈɛzɪdənsi, fɹˈæŋklɪn dˈiː. ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹəd dʒˈɜːnɪz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹəd fˈɪfti θˈaʊzənd mˈaɪlz.
34 | DUMMY1/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ, ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv.
35 | DUMMY1/LJ002-0043.wav|lˈɑːŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾisˈɪks fˈiːt, sˈɪks twˈɛntiθɹˈiː fˈiːt, ænd ðɪ ˈeɪtθ eɪtˈiːn,
36 | DUMMY1/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən.
37 | DUMMY1/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hɐd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹɪpɹˈiːv, ænd wɪðˌɪn ɐ fjˈuː ˈaɪʊɹz ʌv ˌɛksɪkjˈuːʃən.
38 | DUMMY1/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹət sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹɪlˈiːsd ɔːɹ ɛskˈeɪps.
39 | DUMMY1/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ, ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt.
40 | DUMMY1/LJ042-0096.wav| ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt
41 | DUMMY1/LJ049-0050.wav|hˈɪl hɐd bˈoʊθ fˈiːt ɑːnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mɪsˈɛs kˈɛnədi.
42 | DUMMY1/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt, nˈuːɡeɪt ɹɪsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntɪz,
43 | DUMMY1/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs, ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsən ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd.
44 | DUMMY1/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd.
45 | DUMMY1/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kənklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɑːnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld.
46 | DUMMY1/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən.
47 | DUMMY1/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt, ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ɪlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm?
48 | DUMMY1/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪt ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz.
49 | DUMMY1/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪsd ænd ɹɪpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɪz ʌvðə sˈɪɾi ʌv lˈʌndən,
50 | DUMMY1/LJ028-0275.wav|æt lˈæst, ɪnðə twˈɛntiəθ mˈʌnθ,
51 | DUMMY1/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋplˈeɪs wɪð ɐ tɹˈæpdˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd.
52 | DUMMY1/LJ011-0096.wav|hiː mˈæɹɪd ɐ lˈeɪdi ˈɑːlsoʊ bɪlˈɑːŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz, hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃən, wˈɪtʃ, ænd hɪz ˈoʊn mˈʌni, hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm,
53 | DUMMY1/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː. kɹˈeɪɡ, ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti,
54 | DUMMY1/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz, ɡɹˈeɪt lˈɔɪɚz, ɡˈʌvɚnɚz ʌv pɹˈɪzənz, ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː.
55 | DUMMY1/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst, ɐ səspˈɪʃəs sˈɜːkəmstˌæns, æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ.
56 | DUMMY1/LJ027-0141.wav|ɪz klˈoʊsli ɹɪpɹədˈuːst ɪnðə lˈaɪfhˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ. ˈɔːɹ, ɪn ˈʌðɚ wˈɜːdz,
57 | DUMMY1/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi, ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz.
58 | DUMMY1/LJ031-0202.wav|mɪsˈɛs kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hɐd sˈɜːvd ɪnðə nˈeɪvi.
59 | DUMMY1/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊptfɔːɹ pˈiəɹɪəd ʌv pˈiːs,
60 | DUMMY1/LJ016-0288.wav|"mˈʌlɚ, mˈʌlɚ, hiːz ðə mˈæn," tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz, wˌɪtʃ wʌz ɹɪsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz.
61 | DUMMY1/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ, wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdɪli dɪstˈɪŋɡwɪʃ ðə fˈɑːls fɹʌmðə tɹˈuː,
62 | DUMMY1/LJ018-0081.wav|hɪz dɪfˈɛns bˌiːɪŋ ðæt hiː hɐd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd, bˌʌt ðˈæt, ɑːnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hɐd ɹˈɔŋd hˌɪm,
63 | DUMMY1/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪnkɹiːs ɪnðə pˈeɪɹoʊlz, ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts
64 | DUMMY1/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp, bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd, ænd ðə mˈæn wʌz kˈæɹɪd bˈæk tə dʒˈeɪl.
65 | DUMMY1/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz, ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz.
66 | DUMMY1/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən, ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl.
67 | DUMMY1/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs, ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts, ðə hˈaʊskiːpɚ ðˈɛɹ.
68 | DUMMY1/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛntitˈuː, naɪntˈiːn sˈɪkstiθɹˈiː, fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈiəɹɪəl fɚðə pˌiːˌɑːɹˈɛs dʒˈɛnɚɹəl fˈaɪlz
69 | DUMMY1/LJ017-0044.wav|ænd ðə dˈiːpəst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm, ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn, ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ.
70 | DUMMY1/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ, ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn, ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ.
71 | DUMMY1/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɐɹˈɛstᵻd ˌɑːn səspˈɪʃən, ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd;
72 | DUMMY1/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn, bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd, ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sɪvˈɪɹli.
73 | DUMMY1/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹɪhˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ. ɑːlðˈoʊ ɪt ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt,
74 | DUMMY1/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm.
75 | DUMMY1/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹɪkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɑːŋ ɪn səspˈɛns.
76 | DUMMY1/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dɪfˈɜːd.
77 | DUMMY1/LJ047-0148.wav|ˌɑːn ɑːktˈoʊbɚ twˈɛntifˈaɪv,
78 | DUMMY1/LJ008-0111.wav|ðeɪ ˈɛntɚd ˈeɪ "stˈoʊn kˈoʊld ɹˈuːm," ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ.
79 | DUMMY1/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstɪfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld.
80 | DUMMY1/LJ037-0234.wav|mɪsˈɛs mˈɛɹi bɹˈɑːk, ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən, wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl,
81 | DUMMY1/LJ040-0002.wav|tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: bˈækɡɹaʊnd ænd pˈɑːsəbəl mˈoʊɾɪvz, pˈɑːɹt wˌʌn.
82 | DUMMY1/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstɪfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bɪkˈʌmɪŋ ɪnvˈɑːlvd
83 | DUMMY1/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɑːn wˈɑːtʃᵻz, wɜː kˈɛɹfəli ɹɪmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz.
84 | DUMMY1/LJ012-0250.wav|ɑːnðə sˈɛvənθ dʒuːlˈaɪ, eɪtˈiːn θˈɜːɾisˈɛvən,
85 | DUMMY1/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈɛnɚz tə wˈɜːk baɪ ðə dʒˈɑːb.
86 | DUMMY1/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən.
87 | DUMMY1/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ɪsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi.
88 | DUMMY1/LJ031-0134.wav|ˌɑːn wˈʌn əkˈeɪʒən mɪsˈɛs dʒˈɑːnsən, ɐkˈʌmpənɪd baɪ tˈuː sˈiːkɹət sˈɜːvɪs ˈeɪdʒənts, lˈɛft ðə ɹˈuːm tə sˈiː mɪsˈɛs kˈɛnədi ænd mɪsˈɛs kənˈæli.
89 | DUMMY1/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn.
90 | DUMMY1/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd, ˈoʊpənd, ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts.
91 | DUMMY1/LJ034-0160.wav|ˌɑːn bɹˈɛnənz sˈʌbsɪkwənt sˈɜːtən aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl.
92 | DUMMY1/LJ038-0199.wav|ɪlˈɛvən. ɪf ˈaɪ æm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ,
93 | DUMMY1/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈɐd fɔːɹ hˌɪm, ænd ɹɪmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm.
94 | DUMMY1/LJ033-0047.wav|ˈaɪ nˈoʊɾɪsd wɛn ˈaɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɑːn, ˈɛnd kwˈoʊt,
95 | DUMMY1/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ.
96 | DUMMY1/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli.
97 | DUMMY1/LJ003-0111.wav|hiː wʌz ɪn kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː, ˈɛnd kwˈoʊt. ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɪˈɑːsɪɾi.
98 | DUMMY1/LJ008-0258.wav|lˈɛt mˌiː ɹɪtɹˈeɪs maɪ stˈɛps, ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz,
99 | DUMMY1/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæn kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt, mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs, fˈɔːɹt wˈɜːθ, sˌæn æntˈoʊnɪˌoʊ, ænd hjˈuːstən.
100 | DUMMY1/LJ004-0045.wav|mˈɪstɚ stˈɜːdʒᵻz bˈoːɹn, sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ, sˌɜː dʒˈeɪmz skˈɑːɹlɪt, ænd wˈɪljəm wˈɪlbɚfˌoːɹs.
101 |
--------------------------------------------------------------------------------
/train_latest_ms.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import itertools
5 | import math
6 | import torch
7 | from torch import nn, optim
8 | from torch.nn import functional as F
9 | from torch.utils.data import DataLoader
10 | from torch.utils.tensorboard import SummaryWriter
11 | import torch.multiprocessing as mp
12 | import torch.distributed as dist
13 | from torch.nn.parallel import DistributedDataParallel as DDP
14 | from torch.cuda.amp import autocast, GradScaler
15 | from pqmf import PQMF
16 |
17 | import commons
18 | import utils
19 | from data_utils import (
20 | TextAudioSpeakerLoader,
21 | TextAudioSpeakerCollate,
22 | DistributedBucketSampler
23 | )
24 | from models import (
25 | SynthesizerTrn,
26 | MultiPeriodDiscriminator,
27 | )
28 | from losses import (
29 | generator_loss,
30 | discriminator_loss,
31 | feature_loss,
32 | kl_loss,
33 | subband_stft_loss
34 | )
35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
36 | from text.symbols import symbols
37 |
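    | # Note: anomaly detection makes autograd check every op for NaNs/Infs,
    | # which helps when debugging diverging losses but noticeably slows training.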
38 | torch.autograd.set_detect_anomaly(True)
39 | torch.backends.cudnn.benchmark = True
40 | global_step = 0
41 |
42 |
43 | def main():
44 | """Assume Single Node Multi GPUs Training Only"""
45 | assert torch.cuda.is_available(), "CPU training is not allowed."
46 |
47 | n_gpus = torch.cuda.device_count()
48 | os.environ['MASTER_ADDR'] = 'localhost'
49 | os.environ['MASTER_PORT'] = '65520'
50 | # n_gpus = 1
51 |
52 | hps = utils.get_hparams()
53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
54 |
55 |
56 | def run(rank, n_gpus, hps):
57 | global global_step
58 | if rank == 0:
59 | logger = utils.get_logger(hps.model_dir)
60 | logger.info(hps)
61 | utils.check_git_hash(hps.model_dir)
62 | writer = SummaryWriter(log_dir=hps.model_dir)
63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
64 |
65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
66 | torch.manual_seed(hps.train.seed)
67 | torch.cuda.set_device(rank)
68 |
69 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data)
70 | train_sampler = DistributedBucketSampler(
71 | train_dataset,
72 | hps.train.batch_size,
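    | # length-bucket boundaries (likely in spectrogram frames); batching
    | # similarly sized utterances together reduces padding waste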
73 | [32,300,400,500,600,700,800,900,1000],
74 | num_replicas=n_gpus,
75 | rank=rank,
76 | shuffle=True)
77 | collate_fn = TextAudioSpeakerCollate()
78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
79 | collate_fn=collate_fn, batch_sampler=train_sampler)
80 | if rank == 0:
81 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
83 | batch_size=hps.train.batch_size, pin_memory=True,
84 | drop_last=False, collate_fn=collate_fn)
85 |
86 | net_g = SynthesizerTrn(
87 | len(symbols),
88 | hps.data.filter_length // 2 + 1,
89 | hps.train.segment_size // hps.data.hop_length,
90 | n_speakers=hps.data.n_speakers,
91 | **hps.model).cuda(rank)
92 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
93 | optim_g = torch.optim.AdamW(
94 | net_g.parameters(),
95 | hps.train.learning_rate,
96 | betas=hps.train.betas,
97 | eps=hps.train.eps)
98 | optim_d = torch.optim.AdamW(
99 | net_d.parameters(),
100 | hps.train.learning_rate,
101 | betas=hps.train.betas,
102 | eps=hps.train.eps)
103 | net_g = DDP(net_g, device_ids=[rank])
104 | net_d = DDP(net_d, device_ids=[rank])
105 |
106 | try:
107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
108 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
109 | global_step = (epoch_str - 1) * len(train_loader)
110 | except Exception:
111 | epoch_str = 1
112 | global_step = 0
113 |
114 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
115 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
116 |
117 | scaler = GradScaler(enabled=hps.train.fp16_run)
118 |
119 | for epoch in range(epoch_str, hps.train.epochs + 1):
120 | if rank==0:
121 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
122 | else:
123 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
124 | scheduler_g.step()
125 | scheduler_d.step()
126 |
127 |
128 |
129 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
130 | net_g, net_d = nets
131 | optim_g, optim_d = optims
132 | scheduler_g, scheduler_d = schedulers
133 | train_loader, eval_loader = loaders
134 | if writers is not None:
135 | writer, writer_eval = writers
136 |
137 | train_loader.batch_sampler.set_epoch(epoch)
138 | global global_step
139 |
140 | net_g.train()
141 | net_d.train()
142 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(train_loader):
143 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
144 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
145 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
146 | speakers = speakers.cuda(rank, non_blocking=True)
147 |
148 | with autocast(enabled=hps.train.fp16_run):
149 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
150 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers)
151 |
152 | mel = spec_to_mel_torch(
153 | spec,
154 | hps.data.filter_length,
155 | hps.data.n_mel_channels,
156 | hps.data.sampling_rate,
157 | hps.data.mel_fmin,
158 | hps.data.mel_fmax)
159 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
160 | y_hat_mel = mel_spectrogram_torch(
161 | y_hat.squeeze(1),
162 | hps.data.filter_length,
163 | hps.data.n_mel_channels,
164 | hps.data.sampling_rate,
165 | hps.data.hop_length,
166 | hps.data.win_length,
167 | hps.data.mel_fmin,
168 | hps.data.mel_fmax
169 | )
170 |
171 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
172 |
173 | # Discriminator
174 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
175 | with autocast(enabled=False):
176 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
177 | loss_disc_all = loss_disc
178 | optim_d.zero_grad()
179 | scaler.scale(loss_disc_all).backward()
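    | # unscale gradients before clipping so clip_grad_value_ sees true magnitudes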
180 | scaler.unscale_(optim_d)
181 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
182 | scaler.step(optim_d)
183 |
184 |
185 |
186 |
187 | with autocast(enabled=hps.train.fp16_run):
188 | # Generator
189 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
190 | with autocast(enabled=False):
191 | loss_dur = torch.sum(l_length.float())
192 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
193 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
194 |
195 | loss_fm = feature_loss(fmap_r, fmap_g)
196 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
197 |
198 | if hps.model.mb_istft_vits:
199 | pqmf = PQMF(y.device)
200 | y_mb = pqmf.analysis(y)
201 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
202 | else:
203 | loss_subband = torch.tensor(0.0)
204 |
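    | # total generator objective: adversarial + feature matching + mel
    | # reconstruction + duration + KL + (optional) subband STFT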
205 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband
206 |
207 | optim_g.zero_grad()
208 | scaler.scale(loss_gen_all).backward()
209 | scaler.unscale_(optim_g)
210 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
211 | scaler.step(optim_g)
212 | scaler.update()
213 |
214 | if rank==0:
215 | if global_step % hps.train.log_interval == 0:
216 | lr = optim_g.param_groups[0]['lr']
217 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband]
218 | logger.info('Train Epoch: {} [{:.0f}%]'.format(
219 | epoch,
220 | 100. * batch_idx / len(train_loader)))
221 | logger.info([x.item() for x in losses] + [global_step, lr])
222 |
223 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
224 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband})
225 |
226 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
227 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
228 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
229 | image_dict = {
230 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
231 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
232 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
233 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
234 | }
235 | utils.summarize(
236 | writer=writer,
237 | global_step=global_step,
238 | images=image_dict,
239 | scalars=scalar_dict)
240 |
241 | if global_step % hps.train.eval_interval == 0:
242 | evaluate(hps, net_g, eval_loader, writer_eval)
243 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
244 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
245 | global_step += 1
246 |
247 |
248 | if rank == 0:
249 | logger.info('====> Epoch: {}'.format(epoch))
250 |
251 |
252 |
253 |
254 | def evaluate(hps, generator, eval_loader, writer_eval):
255 | generator.eval()
256 | with torch.no_grad():
257 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader):
258 | x, x_lengths = x.cuda(0), x_lengths.cuda(0)
259 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
260 | y, y_lengths = y.cuda(0), y_lengths.cuda(0)
261 | speakers = speakers.cuda(0)
262 |
263 | # evaluate only the first utterance of the first batch
264 | x = x[:1]
265 | x_lengths = x_lengths[:1]
266 | spec = spec[:1]
267 | spec_lengths = spec_lengths[:1]
268 | y = y[:1]
269 | y_lengths = y_lengths[:1]
270 | speakers = speakers[:1]
271 | break
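    | # the generator is DDP-wrapped, so infer() lives on the underlying .module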
272 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000)
273 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
274 |
275 | mel = spec_to_mel_torch(
276 | spec,
277 | hps.data.filter_length,
278 | hps.data.n_mel_channels,
279 | hps.data.sampling_rate,
280 | hps.data.mel_fmin,
281 | hps.data.mel_fmax)
282 | y_hat_mel = mel_spectrogram_torch(
283 | y_hat.squeeze(1).float(),
284 | hps.data.filter_length,
285 | hps.data.n_mel_channels,
286 | hps.data.sampling_rate,
287 | hps.data.hop_length,
288 | hps.data.win_length,
289 | hps.data.mel_fmin,
290 | hps.data.mel_fmax
291 | )
292 | image_dict = {
293 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
294 | }
295 | audio_dict = {
296 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
297 | }
298 | if global_step == 0:
299 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
300 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
301 |
302 | utils.summarize(
303 | writer=writer_eval,
304 | global_step=global_step,
305 | images=image_dict,
306 | audios=audio_dict,
307 | audio_sampling_rate=hps.data.sampling_rate
308 | )
309 | generator.train()
310 |
311 |
312 | if __name__ == "__main__":
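    | # DETAIL-level distributed debug logging helps diagnose DDP hangs and
    | # parameter desyncs across ranks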
313 | os.environ[
314 | "TORCH_DISTRIBUTED_DEBUG"
315 | ] = "DETAIL"
316 | main()
317 |
--------------------------------------------------------------------------------
/attentions.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 |
8 | import commons
9 | import modules
10 | from modules import LayerNorm
11 |
12 |
13 | class Encoder(nn.Module):
14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15 | super().__init__()
16 | self.hidden_channels = hidden_channels
17 | self.filter_channels = filter_channels
18 | self.n_heads = n_heads
19 | self.n_layers = n_layers
20 | self.kernel_size = kernel_size
21 | self.p_dropout = p_dropout
22 | self.window_size = window_size
23 |
24 | self.drop = nn.Dropout(p_dropout)
25 | self.attn_layers = nn.ModuleList()
26 | self.norm_layers_1 = nn.ModuleList()
27 | self.ffn_layers = nn.ModuleList()
28 | self.norm_layers_2 = nn.ModuleList()
29 | for i in range(self.n_layers):
30 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31 | self.norm_layers_1.append(LayerNorm(hidden_channels))
32 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33 | self.norm_layers_2.append(LayerNorm(hidden_channels))
34 |
35 | def forward(self, x, x_mask):
36 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37 | x = x * x_mask
38 | for i in range(self.n_layers):
39 | y = self.attn_layers[i](x, x, attn_mask)
40 | y = self.drop(y)
41 | x = self.norm_layers_1[i](x + y)
42 |
43 | y = self.ffn_layers[i](x, x_mask)
44 | y = self.drop(y)
45 | x = self.norm_layers_2[i](x + y)
46 | x = x * x_mask
47 | return x
48 |
49 |
50 | class Decoder(nn.Module):
51 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52 | super().__init__()
53 | self.hidden_channels = hidden_channels
54 | self.filter_channels = filter_channels
55 | self.n_heads = n_heads
56 | self.n_layers = n_layers
57 | self.kernel_size = kernel_size
58 | self.p_dropout = p_dropout
59 | self.proximal_bias = proximal_bias
60 | self.proximal_init = proximal_init
61 |
62 | self.drop = nn.Dropout(p_dropout)
63 | self.self_attn_layers = nn.ModuleList()
64 | self.norm_layers_0 = nn.ModuleList()
65 | self.encdec_attn_layers = nn.ModuleList()
66 | self.norm_layers_1 = nn.ModuleList()
67 | self.ffn_layers = nn.ModuleList()
68 | self.norm_layers_2 = nn.ModuleList()
69 | for i in range(self.n_layers):
70 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71 | self.norm_layers_0.append(LayerNorm(hidden_channels))
72 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73 | self.norm_layers_1.append(LayerNorm(hidden_channels))
74 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75 | self.norm_layers_2.append(LayerNorm(hidden_channels))
76 |
77 | def forward(self, x, x_mask, h, h_mask):
78 | """
79 | x: decoder input
80 | h: encoder output
81 | """
82 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84 | x = x * x_mask
85 | for i in range(self.n_layers):
86 | y = self.self_attn_layers[i](x, x, self_attn_mask)
87 | y = self.drop(y)
88 | x = self.norm_layers_0[i](x + y)
89 |
90 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91 | y = self.drop(y)
92 | x = self.norm_layers_1[i](x + y)
93 |
94 | y = self.ffn_layers[i](x, x_mask)
95 | y = self.drop(y)
96 | x = self.norm_layers_2[i](x + y)
97 | x = x * x_mask
98 | return x
99 |
100 |
101 | class MultiHeadAttention(nn.Module):
102 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103 | super().__init__()
104 | assert channels % n_heads == 0
105 |
106 | self.channels = channels
107 | self.out_channels = out_channels
108 | self.n_heads = n_heads
109 | self.p_dropout = p_dropout
110 | self.window_size = window_size
111 | self.heads_share = heads_share
112 | self.block_length = block_length
113 | self.proximal_bias = proximal_bias
114 | self.proximal_init = proximal_init
115 | self.attn = None
116 |
117 | self.k_channels = channels // n_heads
118 | self.conv_q = nn.Conv1d(channels, channels, 1)
119 | self.conv_k = nn.Conv1d(channels, channels, 1)
120 | self.conv_v = nn.Conv1d(channels, channels, 1)
121 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
122 | self.drop = nn.Dropout(p_dropout)
123 |
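    | # learnable relative positional embeddings over a +/- window_size window,
    | # in the style of Shaw et al. (2018)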
124 | if window_size is not None:
125 | n_heads_rel = 1 if heads_share else n_heads
126 | rel_stddev = self.k_channels**-0.5
127 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129 |
130 | nn.init.xavier_uniform_(self.conv_q.weight)
131 | nn.init.xavier_uniform_(self.conv_k.weight)
132 | nn.init.xavier_uniform_(self.conv_v.weight)
133 | if proximal_init:
134 | with torch.no_grad():
135 | self.conv_k.weight.copy_(self.conv_q.weight)
136 | self.conv_k.bias.copy_(self.conv_q.bias)
137 |
138 | def forward(self, x, c, attn_mask=None):
139 | q = self.conv_q(x)
140 | k = self.conv_k(c)
141 | v = self.conv_v(c)
142 |
143 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
144 |
145 | x = self.conv_o(x)
146 | return x
147 |
148 | def attention(self, query, key, value, mask=None):
149 | # reshape [b, d, t] -> [b, n_h, t, d_k]
150 | b, d, t_s, t_t = (*key.size(), query.size(2))
151 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154 |
155 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156 | if self.window_size is not None:
157 | assert t_s == t_t, "Relative attention is only available for self-attention."
158 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159 | rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
160 | scores_local = self._relative_position_to_absolute_position(rel_logits)
161 | scores = scores + scores_local
162 | if self.proximal_bias:
163 | assert t_s == t_t, "Proximal bias is only available for self-attention."
164 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165 | if mask is not None:
166 | scores = scores.masked_fill(mask == 0, -1e4)
167 | if self.block_length is not None:
168 | assert t_s == t_t, "Local attention is only available for self-attention."
169 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170 | scores = scores.masked_fill(block_mask == 0, -1e4)
171 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172 | p_attn = self.drop(p_attn)
173 | output = torch.matmul(p_attn, value)
174 | if self.window_size is not None:
175 | relative_weights = self._absolute_position_to_relative_position(p_attn)
176 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179 | return output, p_attn
180 |
181 | def _matmul_with_relative_values(self, x, y):
182 | """
183 | x: [b, h, l, m]
184 | y: [h or 1, m, d]
185 | ret: [b, h, l, d]
186 | """
187 | ret = torch.matmul(x, y.unsqueeze(0))
188 | return ret
189 |
190 | def _matmul_with_relative_keys(self, x, y):
191 | """
192 | x: [b, h, l, d]
193 | y: [h or 1, m, d]
194 | ret: [b, h, l, m]
195 | """
196 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197 | return ret
198 |
199 | def _get_relative_embeddings(self, relative_embeddings, length):
200 | max_relative_position = 2 * self.window_size + 1
201 | # Pad first before slice to avoid using cond ops.
202 | pad_length = max(length - (self.window_size + 1), 0)
203 | slice_start_position = max((self.window_size + 1) - length, 0)
204 | slice_end_position = slice_start_position + 2 * length - 1
205 | if pad_length > 0:
206 | padded_relative_embeddings = F.pad(
207 | relative_embeddings,
208 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209 | else:
210 | padded_relative_embeddings = relative_embeddings
211 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212 | return used_relative_embeddings
213 |
214 | def _relative_position_to_absolute_position(self, x):
215 | """
216 | x: [b, h, l, 2*l-1]
217 | ret: [b, h, l, l]
218 | """
219 | batch, heads, length, _ = x.size()
220 | # Concat columns of pad to shift from relative to absolute indexing.
221 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222 |
223 | # Concat extra elements so as to add up to shape (len+1, 2*len-1).
224 | x_flat = x.view([batch, heads, length * 2 * length])
225 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226 |
227 | # Reshape and slice out the padded elements.
228 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229 | return x_final
230 |
231 | def _absolute_position_to_relative_position(self, x):
232 | """
233 | x: [b, h, l, l]
234 | ret: [b, h, l, 2*l-1]
235 | """
236 | batch, heads, length, _ = x.size()
237 | # pad along the column dimension
238 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239 | x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240 | # prepend zeros that skew the elements into place after the reshape
241 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243 | return x_final
244 |
245 | def _attention_bias_proximal(self, length):
246 | """Bias for self-attention to encourage attention to close positions.
247 | Args:
248 | length: an integer scalar.
249 | Returns:
250 | a Tensor with shape [1, 1, length, length]
251 | """
252 | r = torch.arange(length, dtype=torch.float32)
253 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255 |
256 |
257 | class FFN(nn.Module):
258 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259 | super().__init__()
260 | self.in_channels = in_channels
261 | self.out_channels = out_channels
262 | self.filter_channels = filter_channels
263 | self.kernel_size = kernel_size
264 | self.p_dropout = p_dropout
265 | self.activation = activation
266 | self.causal = causal
267 |
268 | if causal:
269 | self.padding = self._causal_padding
270 | else:
271 | self.padding = self._same_padding
272 |
273 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275 | self.drop = nn.Dropout(p_dropout)
276 |
277 | def forward(self, x, x_mask):
278 | x = self.conv_1(self.padding(x * x_mask))
279 | if self.activation == "gelu":
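    | # fast sigmoid approximation of GELU: x * sigmoid(1.702 * x)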
280 | x = x * torch.sigmoid(1.702 * x)
281 | else:
282 | x = torch.relu(x)
283 | x = self.drop(x)
284 | x = self.conv_2(self.padding(x * x_mask))
285 | return x * x_mask
286 |
287 | def _causal_padding(self, x):
288 | if self.kernel_size == 1:
289 | return x
290 | pad_l = self.kernel_size - 1
291 | pad_r = 0
292 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293 | x = F.pad(x, commons.convert_pad_shape(padding))
294 | return x
295 |
296 | def _same_padding(self, x):
297 | if self.kernel_size == 1:
298 | return x
299 | pad_l = (self.kernel_size - 1) // 2
300 | pad_r = self.kernel_size // 2
301 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302 | x = F.pad(x, commons.convert_pad_shape(padding))
303 | return x
304 |
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import scipy
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10 | from torch.nn.utils import weight_norm, remove_weight_norm
11 |
12 | import commons
13 | from commons import init_weights, get_padding
14 | from transforms import piecewise_rational_quadratic_transform
15 |
16 |
17 | LRELU_SLOPE = 0.1
18 |
19 |
20 | class LayerNorm(nn.Module):
21 | def __init__(self, channels, eps=1e-5):
22 | super().__init__()
23 | self.channels = channels
24 | self.eps = eps
25 |
26 | self.gamma = nn.Parameter(torch.ones(channels))
27 | self.beta = nn.Parameter(torch.zeros(channels))
28 |
29 | def forward(self, x):
30 | x = x.transpose(1, -1)
31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32 | return x.transpose(1, -1)
33 |
34 |
35 | class ConvReluNorm(nn.Module):
36 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37 | super().__init__()
38 | self.in_channels = in_channels
39 | self.hidden_channels = hidden_channels
40 | self.out_channels = out_channels
41 | self.kernel_size = kernel_size
42 | self.n_layers = n_layers
43 | self.p_dropout = p_dropout
44 | assert n_layers > 1, "Number of layers should be larger than 1."
45 |
46 | self.conv_layers = nn.ModuleList()
47 | self.norm_layers = nn.ModuleList()
48 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49 | self.norm_layers.append(LayerNorm(hidden_channels))
50 | self.relu_drop = nn.Sequential(
51 | nn.ReLU(),
52 | nn.Dropout(p_dropout))
53 | for _ in range(n_layers-1):
54 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55 | self.norm_layers.append(LayerNorm(hidden_channels))
56 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57 | self.proj.weight.data.zero_()
58 | self.proj.bias.data.zero_()
59 |
60 | def forward(self, x, x_mask):
61 | x_org = x
62 | for i in range(self.n_layers):
63 | x = self.conv_layers[i](x * x_mask)
64 | x = self.norm_layers[i](x)
65 | x = self.relu_drop(x)
66 | x = x_org + self.proj(x)
67 | return x * x_mask
68 |
69 |
70 | class DDSConv(nn.Module):
71 | """
72 | Dilated and Depth-Separable Convolution
73 | """
74 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75 | super().__init__()
76 | self.channels = channels
77 | self.kernel_size = kernel_size
78 | self.n_layers = n_layers
79 | self.p_dropout = p_dropout
80 |
81 | self.drop = nn.Dropout(p_dropout)
82 | self.convs_sep = nn.ModuleList()
83 | self.convs_1x1 = nn.ModuleList()
84 | self.norms_1 = nn.ModuleList()
85 | self.norms_2 = nn.ModuleList()
86 | for i in range(n_layers):
87 | dilation = kernel_size ** i
88 | padding = (kernel_size * dilation - dilation) // 2
89 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90 | groups=channels, dilation=dilation, padding=padding
91 | ))
92 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93 | self.norms_1.append(LayerNorm(channels))
94 | self.norms_2.append(LayerNorm(channels))
95 |
96 | def forward(self, x, x_mask, g=None):
97 | if g is not None:
98 | x = x + g
99 | for i in range(self.n_layers):
100 | y = self.convs_sep[i](x * x_mask)
101 | y = self.norms_1[i](y)
102 | y = F.gelu(y)
103 | y = self.convs_1x1[i](y)
104 | y = self.norms_2[i](y)
105 | y = F.gelu(y)
106 | y = self.drop(y)
107 | x = x + y
108 | return x * x_mask
109 |
110 |
111 | class WN(torch.nn.Module):
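    | # WaveNet-style stack of dilated convolutions with gated tanh/sigmoid
    | # activations; optional global conditioning g (e.g. a speaker embedding)
    | # is injected through cond_layer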
112 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113 | super(WN, self).__init__()
114 | assert kernel_size % 2 == 1
115 | self.hidden_channels = hidden_channels
116 | self.kernel_size = kernel_size
117 | self.dilation_rate = dilation_rate
118 | self.n_layers = n_layers
119 | self.gin_channels = gin_channels
120 | self.p_dropout = p_dropout
121 |
122 | self.in_layers = torch.nn.ModuleList()
123 | self.res_skip_layers = torch.nn.ModuleList()
124 | self.drop = nn.Dropout(p_dropout)
125 |
126 | if gin_channels != 0:
127 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129 |
130 | for i in range(n_layers):
131 | dilation = dilation_rate ** i
132 | padding = int((kernel_size * dilation - dilation) / 2)
133 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134 | dilation=dilation, padding=padding)
135 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136 | self.in_layers.append(in_layer)
137 |
138 | # the last layer outputs only the skip connection; no residual is needed
139 | if i < n_layers - 1:
140 | res_skip_channels = 2 * hidden_channels
141 | else:
142 | res_skip_channels = hidden_channels
143 |
144 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146 | self.res_skip_layers.append(res_skip_layer)
147 |
148 | def forward(self, x, x_mask, g=None, **kwargs):
149 | output = torch.zeros_like(x)
150 | n_channels_tensor = torch.IntTensor([self.hidden_channels])
151 |
152 | if g is not None:
153 | g = self.cond_layer(g)
154 |
155 | for i in range(self.n_layers):
156 | x_in = self.in_layers[i](x)
157 | if g is not None:
158 | cond_offset = i * 2 * self.hidden_channels
159 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160 | else:
161 | g_l = torch.zeros_like(x_in)
162 |
163 | acts = commons.fused_add_tanh_sigmoid_multiply(
164 | x_in,
165 | g_l,
166 | n_channels_tensor)
167 | acts = self.drop(acts)
168 |
169 | res_skip_acts = self.res_skip_layers[i](acts)
170 | if i < self.n_layers - 1:
171 | res_acts = res_skip_acts[:,:self.hidden_channels,:]
172 | x = (x + res_acts) * x_mask
173 | output = output + res_skip_acts[:,self.hidden_channels:,:]
174 | else:
175 | output = output + res_skip_acts
176 | return output * x_mask
177 |
178 | def remove_weight_norm(self):
179 | if self.gin_channels != 0:
180 | torch.nn.utils.remove_weight_norm(self.cond_layer)
181 | for l in self.in_layers:
182 | torch.nn.utils.remove_weight_norm(l)
183 | for l in self.res_skip_layers:
184 | torch.nn.utils.remove_weight_norm(l)
185 |
186 |
187 | class ResBlock1(torch.nn.Module):
188 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
189 | super(ResBlock1, self).__init__()
190 | self.convs1 = nn.ModuleList([
191 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
192 | padding=get_padding(kernel_size, dilation[0]))),
193 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
194 | padding=get_padding(kernel_size, dilation[1]))),
195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
196 | padding=get_padding(kernel_size, dilation[2])))
197 | ])
198 | self.convs1.apply(init_weights)
199 |
200 | self.convs2 = nn.ModuleList([
201 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
202 | padding=get_padding(kernel_size, 1))),
203 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
204 | padding=get_padding(kernel_size, 1))),
205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
206 | padding=get_padding(kernel_size, 1)))
207 | ])
208 | self.convs2.apply(init_weights)
209 |
210 | def forward(self, x, x_mask=None):
211 | for c1, c2 in zip(self.convs1, self.convs2):
212 | xt = F.leaky_relu(x, LRELU_SLOPE)
213 | if x_mask is not None:
214 | xt = xt * x_mask
215 | xt = c1(xt)
216 | xt = F.leaky_relu(xt, LRELU_SLOPE)
217 | if x_mask is not None:
218 | xt = xt * x_mask
219 | xt = c2(xt)
220 | x = xt + x
221 | if x_mask is not None:
222 | x = x * x_mask
223 | return x
224 |
225 | def remove_weight_norm(self):
226 | for l in self.convs1:
227 | remove_weight_norm(l)
228 | for l in self.convs2:
229 | remove_weight_norm(l)
230 |
231 |
232 | class ResBlock2(torch.nn.Module):
233 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
234 | super(ResBlock2, self).__init__()
235 | self.convs = nn.ModuleList([
236 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
237 | padding=get_padding(kernel_size, dilation[0]))),
238 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
239 | padding=get_padding(kernel_size, dilation[1])))
240 | ])
241 | self.convs.apply(init_weights)
242 |
243 | def forward(self, x, x_mask=None):
244 | for c in self.convs:
245 | xt = F.leaky_relu(x, LRELU_SLOPE)
246 | if x_mask is not None:
247 | xt = xt * x_mask
248 | xt = c(xt)
249 | x = xt + x
250 | if x_mask is not None:
251 | x = x * x_mask
252 | return x
253 |
254 | def remove_weight_norm(self):
255 | for l in self.convs:
256 | remove_weight_norm(l)
257 |
258 |
259 | class Log(nn.Module):
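    | # invertible log transform: y = log(x), with log|det J| = sum(-y)
    | # since d(log x)/dx = 1/x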
260 | def forward(self, x, x_mask, reverse=False, **kwargs):
261 | if not reverse:
262 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263 | logdet = torch.sum(-y, [1, 2])
264 | return y, logdet
265 | else:
266 | x = torch.exp(x) * x_mask
267 | return x
268 |
269 |
270 | class Flip(nn.Module):
271 | def forward(self, x, *args, reverse=False, **kwargs):
272 | x = torch.flip(x, [1])
273 | if not reverse:
274 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275 | return x, logdet
276 | else:
277 | return x
278 |
279 |
280 | class ElementwiseAffine(nn.Module):
281 | def __init__(self, channels):
282 | super().__init__()
283 | self.channels = channels
284 | self.m = nn.Parameter(torch.zeros(channels,1))
285 | self.logs = nn.Parameter(torch.zeros(channels,1))
286 |
287 | def forward(self, x, x_mask, reverse=False, **kwargs):
288 | if not reverse:
289 | y = self.m + torch.exp(self.logs) * x
290 | y = y * x_mask
291 | logdet = torch.sum(self.logs * x_mask, [1,2])
292 | return y, logdet
293 | else:
294 | x = (x - self.m) * torch.exp(-self.logs) * x_mask
295 | return x
296 |
297 |
298 | class ResidualCouplingLayer(nn.Module):
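    | # affine coupling layer: the first half of the channels passes through
    | # unchanged and predicts a shift (and, unless mean_only, a log-scale)
    | # for the second half, so the inverse is cheap to compute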
299 | def __init__(self,
300 | channels,
301 | hidden_channels,
302 | kernel_size,
303 | dilation_rate,
304 | n_layers,
305 | p_dropout=0,
306 | gin_channels=0,
307 | mean_only=False):
308 | assert channels % 2 == 0, "channels should be divisible by 2"
309 | super().__init__()
310 | self.channels = channels
311 | self.hidden_channels = hidden_channels
312 | self.kernel_size = kernel_size
313 | self.dilation_rate = dilation_rate
314 | self.n_layers = n_layers
315 | self.half_channels = channels // 2
316 | self.mean_only = mean_only
317 |
318 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321 | self.post.weight.data.zero_()
322 | self.post.bias.data.zero_()
323 |
324 | def forward(self, x, x_mask, g=None, reverse=False):
325 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326 | h = self.pre(x0) * x_mask
327 | h = self.enc(h, x_mask, g=g)
328 | stats = self.post(h) * x_mask
329 | if not self.mean_only:
330 | m, logs = torch.split(stats, [self.half_channels]*2, 1)
331 | else:
332 | m = stats
333 | logs = torch.zeros_like(m)
334 |
335 | if not reverse:
336 | x1 = m + x1 * torch.exp(logs) * x_mask
337 | x = torch.cat([x0, x1], 1)
338 | logdet = torch.sum(logs, [1,2])
339 | return x, logdet
340 | else:
341 | x1 = (x1 - m) * torch.exp(-logs) * x_mask
342 | x = torch.cat([x0, x1], 1)
343 | return x
344 |
345 |
346 | class ConvFlow(nn.Module):
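    | # convolutional neural spline flow: predicts piecewise rational-quadratic
    | # spline parameters (widths, heights, derivatives) for half the channels,
    | # as in Durkan et al. (2019)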
347 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348 | super().__init__()
349 | self.in_channels = in_channels
350 | self.filter_channels = filter_channels
351 | self.kernel_size = kernel_size
352 | self.n_layers = n_layers
353 | self.num_bins = num_bins
354 | self.tail_bound = tail_bound
355 | self.half_channels = in_channels // 2
356 |
357 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360 | self.proj.weight.data.zero_()
361 | self.proj.bias.data.zero_()
362 |
363 | def forward(self, x, x_mask, g=None, reverse=False):
364 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365 | h = self.pre(x0)
366 | h = self.convs(h, x_mask, g=g)
367 | h = self.proj(h) * x_mask
368 |
369 | b, c, t = x0.shape
370 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, c*n_params, t] -> [b, c, t, n_params]
371 |
372 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374 | unnormalized_derivatives = h[..., 2 * self.num_bins:]
375 |
376 | x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377 | unnormalized_widths,
378 | unnormalized_heights,
379 | unnormalized_derivatives,
380 | inverse=reverse,
381 | tails='linear',
382 | tail_bound=self.tail_bound
383 | )
384 |
385 | x = torch.cat([x0, x1], 1) * x_mask
386 | logdet = torch.sum(logabsdet * x_mask, [1,2])
387 | if not reverse:
388 | return x, logdet
389 | else:
390 | return x
391 |
--------------------------------------------------------------------------------