├── fig
│   ├── proposed_model.png
│   └── with_tsukuyomi_chan.png
├── monotonic_align
│   ├── setup.py
│   ├── __init__.py
│   └── core.pyx
├── requirements.txt
├── convert_to_22050.py
├── filelists
│   ├── filelist_val2.txt.cleaned
│   ├── vctk_audio_sid_text_val_filelist.txt
│   ├── vctk_audio_sid_text_val_filelist.txt.cleaned
│   ├── ljs_audio_text_val_filelist.txt
│   ├── filelist_train2.txt.cleaned
│   └── ljs_audio_text_val_filelist.txt.cleaned
├── text
│   ├── symbols.py
│   ├── LICENSE
│   ├── __init__.py
│   ├── py2kn.json
│   ├── japanese.py
│   ├── cleaners.py
│   └── korean.py
├── preprocess.py
├── configs
│   ├── tsukuyomi_chan.json
│   ├── ljs_istft_vits.json
│   ├── ljs_mb_istft_vits.json
│   ├── ljs_mini_istft_vits.json
│   ├── ljs_mini_mb_istft_vits.json
│   └── ljs_ms_istft_vits.json
├── losses.py
├── inference.ipynb
├── README.md
├── mel_processing.py
├── pqmf.py
├── stft_loss.py
├── commons.py
├── utils.py
├── transforms.py
├── stft.py
├── LICENSE
├── train_latest.py
├── train_latest_ms.py
├── attentions.py
└── modules.py
/fig/proposed_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/proposed_model.png
--------------------------------------------------------------------------------
/fig/with_tsukuyomi_chan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/misakiudon/MB-iSTFT-VITS-multilingual/HEAD/fig/with_tsukuyomi_chan.png
--------------------------------------------------------------------------------
/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | import numpy
4 |
5 | setup(
6 | name = 'monotonic_align',
7 | ext_modules = cythonize("core.pyx"),
8 | include_dirs=[numpy.get_include()]
9 | )
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.29.21
2 | librosa==0.8.0
3 | matplotlib==3.3.1
4 | numpy==1.18.5
5 | phonemizer==2.2.1
6 | scipy==1.5.2
7 | tensorboard==2.3.0
8 | torch==1.6.0
9 | torchvision==0.7.0
10 | Unidecode==1.1.1
11 | pysoundfile==0.9.0.post1
12 | pyopenjtalk==0.2.0
13 | jamo==0.4.1
14 | ko_pron==1.3
15 |
--------------------------------------------------------------------------------
/convert_to_22050.py:
--------------------------------------------------------------------------------
1 | import os
2 | import librosa
3 | import argparse
4 | import soundfile as sf
5 |
6 | if __name__ == '__main__':
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--in_path", default="./tsukuyomi_raw/", required=True)
9 | parser.add_argument("--out_path", default="./tsukuyomi/" ,required=True)
10 |
11 | args = parser.parse_args()
12 |
13 | os.makedirs(args.out_path, exist_ok=True)
14 | filenames = os.listdir(args.in_path)
15 |     for filename in filenames:
16 |         in_file = os.path.join(args.in_path, filename)  # robust to a missing trailing slash
17 |         print(in_file)
18 |         # resample to 22050 Hz mono and write as 16-bit PCM
19 |         y, sr = librosa.core.load(in_file, sr=22050, mono=True)
20 |         sf.write(os.path.join(args.out_path, filename), y, sr, subtype="PCM_16")
21 |
--------------------------------------------------------------------------------
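
A usage sketch for the converter above; the paths mirror the script's own illustrative defaults:

```sh
python convert_to_22050.py --in_path ./tsukuyomi_raw/ --out_path ./tsukuyomi/
```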
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from .monotonic_align.core import maximum_path_c
4 |
5 |
6 | def maximum_path(neg_cent, mask):
7 | """ Cython optimized version.
8 | neg_cent: [b, t_t, t_s]
9 | mask: [b, t_t, t_s]
10 | """
11 | device = neg_cent.device
12 | dtype = neg_cent.dtype
13 | neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
14 | path = np.zeros(neg_cent.shape, dtype=np.int32)
15 |
16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
18 | maximum_path_c(path, neg_cent, t_t_max, t_s_max)
19 | return torch.from_numpy(path).to(device=device, dtype=dtype)
20 |
--------------------------------------------------------------------------------
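
A minimal usage sketch for `maximum_path`, assuming the Cython extension has been built as described in the README (`cd monotonic_align; mkdir monotonic_align; python setup.py build_ext --inplace`); the shapes are illustrative:

```python
import torch
import monotonic_align

b, t_t, t_s = 1, 50, 20              # batch, spectrogram frames, text tokens
neg_cent = torch.randn(b, t_t, t_s)  # stand-in for negative cross-entropies
mask = torch.ones(b, t_t, t_s)       # all positions valid
path = monotonic_align.maximum_path(neg_cent, mask)
print(path.shape)                    # torch.Size([1, 50, 20]); a 0/1 alignment matrix
```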
/filelists/filelist_val2.txt.cleaned:
--------------------------------------------------------------------------------
1 | ./tsukuyomi/VOICEACTRESS100_096.wav|pe↑Nʃirubenia↓ʃuu, pi↑Qtsuba↓aguno, a↑regeeniiko↓okooo so↑tsugyoo ʃ i, ka↑riforuniada↓igaku, ba↑akuree↓kooni nyu↑ugaku.
2 | ./tsukuyomi/VOICEACTRESS100_097.wav|ko↑no ga↓ineNno do↑onyuuniyoQte, sa↑ma↓zamana ba↑rie↓eʃoNno, ryu↑utaino ko↑Npyuutaaʃimyure↓eʃoNga, ta↑ka↓i se↓edode ka↑nooto na↓Qta.
3 | ./tsukuyomi/VOICEACTRESS100_098.wav|i↓nui do↓Qkuni nyu↓ukyo ʃI↑te, o↑obaaho↓oru su↑be↓kIka do↓oka, pa↑fo↓omaNsuga ʧe↓QkU sa↑reta.
4 | ./tsukuyomi/VOICEACTRESS100_099.wav|de↑byuuwe↓etowa, su↑upaabaNtamu↓kyuudewanaku, fe↑zaa↓kyuudaQta.
5 | ./tsukuyomi/VOICEACTRESS100_100.wav|a↓ariiwa, ko↓ouno na↓kao, mi↑namino ba↑ajinia↓ʃuu, wi↑NʧesUtaaʧi↓kakuno, fi↑Qʃaazuhi↓rumade, gu↓No ʃi↑rizo↓ita.
--------------------------------------------------------------------------------
/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 | '''
6 | _pad = '_'
7 | _punctuation = ';:,.!?¡¿—…"«»“” '
8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ'
9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10 |
11 | '''# korean_cleaners
12 | _pad = '_'
13 | _punctuation = ',.!?…~'
14 | _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
15 | '''
16 |
17 | # Export all symbols:
18 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
19 |
20 | # Special symbol ids
21 | SPACE_ID = symbols.index(" ")
22 |
--------------------------------------------------------------------------------
/text/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import text
3 | from utils import load_filepaths_and_text
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--out_extension", default="cleaned")
8 | parser.add_argument("--text_index", default=1, type=int)
9 | parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
10 | parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
11 |
12 | args = parser.parse_args()
13 |
14 |
15 | for filelist in args.filelists:
16 | print("START:", filelist)
17 | filepaths_and_text = load_filepaths_and_text(filelist)
18 | for i in range(len(filepaths_and_text)):
19 | original_text = filepaths_and_text[i][args.text_index]
20 | cleaned_text = text._clean_text(original_text, args.text_cleaners)
21 | filepaths_and_text[i][args.text_index] = cleaned_text
22 |
23 | new_filelist = filelist + "." + args.out_extension
24 | with open(new_filelist, "w", encoding="utf-8") as f:
25 | f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])
26 |
--------------------------------------------------------------------------------
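
A usage sketch for the Japanese tutorial corpus, assuming the uncleaned manifests `filelists/filelist_train2.txt` and `filelists/filelist_val2.txt` exist; the script writes `.cleaned` files next to them:

```sh
python preprocess.py --text_index 1 \
    --filelists filelists/filelist_train2.txt filelists/filelist_val2.txt \
    --text_cleaners 'japanese_cleaners'
```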
/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | from cython.parallel import prange
3 |
4 |
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
8 | cdef int x
9 | cdef int y
10 | cdef float v_prev
11 | cdef float v_cur
12 | cdef float tmp
13 | cdef int index = t_x - 1
14 |
15 | for y in range(t_y):
16 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
17 | if x == y:
18 | v_cur = max_neg_val
19 | else:
20 | v_cur = value[y-1, x]
21 | if x == 0:
22 | if y == 0:
23 | v_prev = 0.
24 | else:
25 | v_prev = max_neg_val
26 | else:
27 | v_prev = value[y-1, x-1]
28 | value[y, x] += max(v_prev, v_cur)
29 |
30 | for y in range(t_y - 1, -1, -1):
31 | path[y, index] = 1
32 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
33 | index = index - 1
34 |
35 |
36 | @cython.boundscheck(False)
37 | @cython.wraparound(False)
38 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
39 | cdef int b = paths.shape[0]
40 | cdef int i
41 | for i in prange(b, nogil=True):
42 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
43 |
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from text import cleaners
3 | from text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 |
11 | def text_to_sequence(text, cleaner_names):
12 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13 | Args:
14 | text: string to convert to a sequence
15 | cleaner_names: names of the cleaner functions to run the text through
16 | Returns:
17 | List of integers corresponding to the symbols in the text
18 | '''
19 | sequence = []
20 |
21 | clean_text = _clean_text(text, cleaner_names)
22 | for symbol in clean_text:
23 | symbol_id = _symbol_to_id[symbol]
24 | sequence += [symbol_id]
25 | return sequence
26 |
27 |
28 | def cleaned_text_to_sequence(cleaned_text):
 29 |   '''Converts a string of already-cleaned text to a sequence of IDs corresponding to the symbols in the text.
 30 |     Args:
 31 |       cleaned_text: cleaned string to convert to a sequence
32 | Returns:
33 | List of integers corresponding to the symbols in the text
34 | '''
35 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
36 | return sequence
37 |
38 |
39 | def sequence_to_text(sequence):
40 | '''Converts a sequence of IDs back to a string'''
41 | result = ''
42 | for symbol_id in sequence:
43 | s = _id_to_symbol[symbol_id]
44 | result += s
45 | return result
46 |
47 |
48 | def _clean_text(text, cleaner_names):
49 | for name in cleaner_names:
50 | cleaner = getattr(cleaners, name)
51 | if not cleaner:
52 | raise Exception('Unknown cleaner: %s' % name)
53 | text = cleaner(text)
54 | return text
55 |
--------------------------------------------------------------------------------
/configs/tsukuyomi_chan.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 1000,
5 | "seed": 1234,
6 | "epochs": 10000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 32,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"./filelists/filelist_train2.txt.cleaned",
25 | "validation_files":"./filelists/filelist_val2.txt.cleaned",
26 | "text_cleaners":["japanese_cleaners"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/configs/ljs_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mini_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mini_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 | from stft_loss import MultiResolutionSTFTLoss
4 |
5 |
6 | import commons
7 |
8 |
9 | def feature_loss(fmap_r, fmap_g):
10 | loss = 0
11 | for dr, dg in zip(fmap_r, fmap_g):
12 | for rl, gl in zip(dr, dg):
13 | rl = rl.float().detach()
14 | gl = gl.float()
15 | loss += torch.mean(torch.abs(rl - gl))
16 |
17 | return loss * 2
18 |
19 |
20 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
21 | loss = 0
22 | r_losses = []
23 | g_losses = []
24 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
25 | dr = dr.float()
26 | dg = dg.float()
27 | r_loss = torch.mean((1-dr)**2)
28 | g_loss = torch.mean(dg**2)
29 | loss += (r_loss + g_loss)
30 | r_losses.append(r_loss.item())
31 | g_losses.append(g_loss.item())
32 |
33 | return loss, r_losses, g_losses
34 |
35 |
36 | def generator_loss(disc_outputs):
37 | loss = 0
38 | gen_losses = []
39 | for dg in disc_outputs:
40 | dg = dg.float()
41 | l = torch.mean((1-dg)**2)
42 | gen_losses.append(l)
43 | loss += l
44 |
45 | return loss, gen_losses
46 |
47 |
48 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
49 | """
50 | z_p, logs_q: [b, h, t_t]
51 | m_p, logs_p: [b, h, t_t]
52 | """
53 | z_p = z_p.float()
54 | logs_q = logs_q.float()
55 | m_p = m_p.float()
56 | logs_p = logs_p.float()
57 | z_mask = z_mask.float()
58 |
59 | kl = logs_p - logs_q - 0.5
60 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
61 | kl = torch.sum(kl * z_mask)
62 | l = kl / torch.sum(z_mask)
63 | return l
64 |
65 | def subband_stft_loss(h, y_mb, y_hat_mb):
66 | sub_stft_loss = MultiResolutionSTFTLoss(h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths)
67 | y_mb = y_mb.view(-1, y_mb.size(2))
68 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2))
69 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, :y_mb.size(-1)], y_mb)
70 | return sub_sc_loss+sub_mag_loss
71 |
72 |
--------------------------------------------------------------------------------
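
A minimal sketch of `kl_loss` with random tensors shaped as its docstring describes; the mask broadcasts over the channel dimension:

```python
import torch
from losses import kl_loss

b, h, t_t = 2, 192, 50
z_p, logs_q = torch.randn(b, h, t_t), torch.randn(b, h, t_t)
m_p, logs_p = torch.randn(b, h, t_t), torch.randn(b, h, t_t)
z_mask = torch.ones(b, 1, t_t)   # 1 = valid frame
print(kl_loss(z_p, logs_q, m_p, logs_p, z_mask))  # scalar KL term
```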
/configs/ljs_ms_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": true,
41 | "mb_istft_vits": false,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%matplotlib inline\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "import IPython.display as ipd\n",
12 | "\n",
13 | "import os\n",
14 | "import json\n",
15 | "import math\n",
16 | "import torch\n",
17 | "from torch import nn\n",
18 | "from torch.nn import functional as F\n",
19 | "from torch.utils.data import DataLoader\n",
20 | "\n",
21 | "import commons\n",
22 | "import utils\n",
23 | "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
24 | "from models import SynthesizerTrn\n",
25 | "from text.symbols import symbols\n",
26 | "from text import text_to_sequence\n",
27 | "\n",
28 | "from scipy.io.wavfile import write\n",
29 | "\n",
30 | "\n",
31 | "def get_text(text, hps):\n",
32 | " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
33 | " if hps.data.add_blank:\n",
34 | " text_norm = commons.intersperse(text_norm, 0)\n",
35 | " text_norm = torch.LongTensor(text_norm)\n",
36 | " return text_norm"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## MB-iSTFT-VITS"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "hps = utils.get_hparams_from_file(\"./configs/tsukuyomi_chan.json\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "net_g = SynthesizerTrn(\n",
62 | " len(symbols),\n",
63 | " hps.data.filter_length // 2 + 1,\n",
64 | " hps.train.segment_size // hps.data.hop_length,\n",
65 | " **hps.model).cuda()\n",
66 | "_ = net_g.eval()\n",
67 | "\n",
68 | "_ = utils.load_checkpoint(\"./logs/tsukuyomi/G_100000.pth\", net_g, None)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "stn_tst = get_text(\"こんにちは。\", hps)\n",
78 | "with torch.no_grad():\n",
79 | " x_tst = stn_tst.cuda().unsqueeze(0)\n",
80 | " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
81 | " audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
82 | "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.8.13"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 4
107 | }
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MB-iSTFT-VITS with Multilingual Implementations
2 |
3 |
 4 | This is a multilingual implementation of [MB-iSTFT-VITS](https://github.com/MasayaKawamura/MB-iSTFT-VITS), extended to support training in various languages. MB-iSTFT-VITS achieves about 4.1 times faster inference than the original VITS!
 5 | A preprocessed Japanese single-speaker training setup is provided for the [つくよみちゃんコーパス (tsukuyomi-chan corpus)](https://tyc.rei-yumesaki.net/material/corpus/). You need to download the corpus and place its 100 `.wav` files in `./tsukuyomi_raw`.
6 |
7 |
 8 | - Currently supported: Japanese / Korean
 9 | - Chinese / CJKE / other languages will be added soon!
10 |
11 |
12 | # How to use
13 | Python >= 3.6 (Python == 3.7 is suggested)
14 |
15 | ## Clone this repository
16 | ```sh
17 | git clone https://github.com/misakiudon/MB-iSTFT-VITS-multilingual.git
18 | ```
19 |
20 | ## Install requirements
21 | ```sh
22 | pip install -r requirements.txt
23 | ```
24 | You may need to install espeak first: `apt-get install espeak`
25 |
26 | ## Create manifest data
27 | ### Single speaker
28 | "n_speakers" should be 0 in config.json
29 | ```
30 | path/to/XXX.wav|transcript
31 | ```
32 | - Example
33 | ```
34 | dataset/001.wav|こんにちは。
35 | ```
36 |
37 | ### Multiple speakers
38 | Speaker IDs should start from 0.
39 | ```
40 | path/to/XXX.wav|speaker id|transcript
41 | ```
42 | - Example
43 | ```
44 | dataset/001.wav|0|こんにちは。
45 | ```
46 |
47 | ## Preprocess
48 | Preprocessed Japanese manifests are provided as `filelists/filelist_train2.txt.cleaned` and `filelists/filelist_val2.txt.cleaned`.
49 | ```sh
50 | # Single speaker
51 | python preprocess.py --text_index 1 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners'
52 |
53 | # Multiple speakers
54 | python preprocess.py --text_index 2 --filelists path/to/filelist_train.txt path/to/filelist_val.txt --text_cleaners 'japanese_cleaners'
55 | ```
56 |
57 | If your speech files are not `22050 Hz / mono / PCM-16`, you should resample the `.wav` files first.
58 | ```sh
59 | python convert_to_22050.py --in_path path/to/original_wav_dir/ --out_path path/to/output_wav_dir/
60 | ```
61 |
62 | ## Build monotonic alignment search
63 | ```sh
64 | # Cython-version Monotonic Alignment Search
65 | cd monotonic_align
66 | mkdir monotonic_align
67 | python setup.py build_ext --inplace
68 | ```
69 |
70 | ## Setting json file in [configs](configs)
71 |
72 | | Model | How to set up json file in [configs](configs) | Sample of json file configuration|
73 | | :---: | :---: | :---: |
 74 | | iSTFT-VITS | `"istft_vits": true,`<br>`"upsample_rates": [8,8],` | ljs_istft_vits.json |
 75 | | MB-iSTFT-VITS | `"subbands": 4,`<br>`"mb_istft_vits": true,`<br>`"upsample_rates": [4,4],` | ljs_mb_istft_vits.json |
 76 | | MS-iSTFT-VITS | `"subbands": 4,`<br>`"ms_istft_vits": true,`<br>`"upsample_rates": [4,4],` | ljs_ms_istft_vits.json |
77 |
78 | For a tutorial, see `configs/tsukuyomi_chan.json` as a complete example.
79 | - If you have done preprocessing, set "cleaned_text" to true.
80 | - Change `training_files` and `validation_files` to the path of preprocessed manifest files.
81 | - Select the same `text_cleaners` you used in the preprocessing step.
82 |
83 | ## Train
84 | ```sh
85 | # Single speaker
86 | python train_latest.py -c <config json path> -m <model folder name>
87 |
88 | # Multiple speakers
89 | python train_latest_ms.py -c <config json path> -m <model folder name>
90 | ```
91 | To train MB-iSTFT-VITS on the Japanese tutorial corpus, run the following script. Resuming from the latest checkpoint is automatic.
92 | ```sh
93 | python train_latest.py -c configs/tsukuyomi_chan.json -m tsukuyomi
94 | ```
95 |
96 | After training, you can check the synthesized audio using [inference.ipynb](inference.ipynb).
97 |
98 | ## References
99 | - https://github.com/MasayaKawamura/MB-iSTFT-VITS
100 | - https://github.com/CjangCjengh/vits
101 | - https://github.com/Francis-Komizu/VITS
102 |
--------------------------------------------------------------------------------
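
As a cross-check against the table in the README, the MB-iSTFT-VITS sample configs above select the decoder variant with this fragment of their `model` block:

```json
"ms_istft_vits": false,
"mb_istft_vits": true,
"istft_vits": false,
"subbands": 4,
"upsample_rates": [4,4]
```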
/mel_processing.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import random
4 | import torch
5 | from torch import nn
6 | import torch.nn.functional as F
7 | import torch.utils.data
8 | import numpy as np
9 | import librosa
10 | import librosa.util as librosa_util
11 | from librosa.util import normalize, pad_center, tiny
12 | from scipy.signal import get_window
13 | from scipy.io.wavfile import read
14 | from librosa.filters import mel as librosa_mel_fn
15 |
16 | MAX_WAV_VALUE = 32768.0
17 |
18 |
19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20 | """
21 | PARAMS
22 | ------
23 | C: compression factor
24 | """
25 | return torch.log(torch.clamp(x, min=clip_val) * C)
26 |
27 |
28 | def dynamic_range_decompression_torch(x, C=1):
29 | """
30 | PARAMS
31 | ------
32 | C: compression factor used to compress
33 | """
34 | return torch.exp(x) / C
35 |
36 |
37 | def spectral_normalize_torch(magnitudes):
38 | output = dynamic_range_compression_torch(magnitudes)
39 | return output
40 |
41 |
42 | def spectral_de_normalize_torch(magnitudes):
43 | output = dynamic_range_decompression_torch(magnitudes)
44 | return output
45 |
46 |
47 | mel_basis = {}
48 | hann_window = {}
49 |
50 |
51 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52 | if torch.min(y) < -1.:
53 | print('min value is ', torch.min(y))
54 | if torch.max(y) > 1.:
55 | print('max value is ', torch.max(y))
56 |
57 | global hann_window
58 | dtype_device = str(y.dtype) + '_' + str(y.device)
59 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
60 | if wnsize_dtype_device not in hann_window:
61 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62 |
63 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64 | y = y.squeeze(1)
65 |
66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67 | center=center, pad_mode='reflect', normalized=False, onesided=True)
68 |
69 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70 | return spec
71 |
72 |
73 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74 | global mel_basis
75 | dtype_device = str(spec.dtype) + '_' + str(spec.device)
76 | fmax_dtype_device = str(fmax) + '_' + dtype_device
77 | if fmax_dtype_device not in mel_basis:
78 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
79 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81 | spec = spectral_normalize_torch(spec)
82 | return spec
83 |
84 |
85 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86 | if torch.min(y) < -1.:
87 | print('min value is ', torch.min(y))
88 | if torch.max(y) > 1.:
89 | print('max value is ', torch.max(y))
90 |
91 | global mel_basis, hann_window
92 | dtype_device = str(y.dtype) + '_' + str(y.device)
93 | fmax_dtype_device = str(fmax) + '_' + dtype_device
94 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
95 | if fmax_dtype_device not in mel_basis:
96 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
97 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98 | if wnsize_dtype_device not in hann_window:
99 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100 |
101 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102 | y = y.squeeze(1)
103 |
104 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105 | center=center, pad_mode='reflect', normalized=False, onesided=True)
106 |
107 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108 |
109 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110 | spec = spectral_normalize_torch(spec)
111 |
112 | return spec
113 |
--------------------------------------------------------------------------------
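
A minimal sketch of `mel_spectrogram_torch` with the hyperparameters used in the sample configs; the waveform is a random stand-in scaled to [-1, 1]:

```python
import torch
from mel_processing import mel_spectrogram_torch

y = torch.rand(1, 8192) * 2 - 1   # (B, T) waveform segment in [-1, 1]
mel = mel_spectrogram_torch(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                            hop_size=256, win_size=1024, fmin=0.0, fmax=None)
print(mel.shape)                  # torch.Size([1, 80, 32])
```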
/pqmf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2020 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """Pseudo QMF modules."""
7 |
8 | import numpy as np
9 | import torch
10 | import torch.nn.functional as F
11 |
12 | from scipy.signal import kaiser
13 |
14 |
15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
16 | """Design prototype filter for PQMF.
17 | This method is based on `A Kaiser window approach for the design of prototype
18 | filters of cosine modulated filterbanks`_.
19 | Args:
20 | taps (int): The number of filter taps.
21 | cutoff_ratio (float): Cut-off frequency ratio.
22 | beta (float): Beta coefficient for kaiser window.
23 | Returns:
 24 |         ndarray: Impulse response of prototype filter (taps + 1,).
25 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
26 | https://ieeexplore.ieee.org/abstract/document/681427
27 | """
28 | # check the arguments are valid
 29 |     assert taps % 2 == 0, "The number of taps must be an even number."
30 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
31 |
32 | # make initial filter
33 | omega_c = np.pi * cutoff_ratio
34 | with np.errstate(invalid='ignore'):
35 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \
36 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps))
37 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
38 |
39 | # apply kaiser window
40 | w = kaiser(taps + 1, beta)
41 | h = h_i * w
42 |
43 | return h
44 |
45 |
46 | class PQMF(torch.nn.Module):
47 | """PQMF module.
48 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
49 | .. _`Near-perfect-reconstruction pseudo-QMF banks`:
50 | https://ieeexplore.ieee.org/document/258122
51 | """
52 |
53 | def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
54 | """Initilize PQMF module.
55 | Args:
56 | subbands (int): The number of subbands.
57 | taps (int): The number of filter taps.
58 | cutoff_ratio (float): Cut-off frequency ratio.
59 | beta (float): Beta coefficient for kaiser window.
60 | """
61 | super(PQMF, self).__init__()
62 |
63 | # define filter coefficient
64 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
65 | h_analysis = np.zeros((subbands, len(h_proto)))
66 | h_synthesis = np.zeros((subbands, len(h_proto)))
67 | for k in range(subbands):
68 | h_analysis[k] = 2 * h_proto * np.cos(
69 | (2 * k + 1) * (np.pi / (2 * subbands)) *
70 | (np.arange(taps + 1) - ((taps - 1) / 2)) +
71 | (-1) ** k * np.pi / 4)
72 | h_synthesis[k] = 2 * h_proto * np.cos(
73 | (2 * k + 1) * (np.pi / (2 * subbands)) *
74 | (np.arange(taps + 1) - ((taps - 1) / 2)) -
75 | (-1) ** k * np.pi / 4)
76 |
77 | # convert to tensor
78 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
79 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
80 |
 81 |         # register coefficients as buffers
82 | self.register_buffer("analysis_filter", analysis_filter)
83 | self.register_buffer("synthesis_filter", synthesis_filter)
84 |
85 | # filter for downsampling & upsampling
86 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
87 | for k in range(subbands):
88 | updown_filter[k, k, 0] = 1.0
89 | self.register_buffer("updown_filter", updown_filter)
90 | self.subbands = subbands
91 |
92 | # keep padding info
93 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
94 |
95 | def analysis(self, x):
96 | """Analysis with PQMF.
97 | Args:
98 | x (Tensor): Input tensor (B, 1, T).
99 | Returns:
100 | Tensor: Output tensor (B, subbands, T // subbands).
101 | """
102 | x = F.conv1d(self.pad_fn(x), self.analysis_filter)
103 | return F.conv1d(x, self.updown_filter, stride=self.subbands)
104 |
105 | def synthesis(self, x):
106 | """Synthesis with PQMF.
107 | Args:
108 | x (Tensor): Input tensor (B, subbands, T // subbands).
109 | Returns:
110 | Tensor: Output tensor (B, 1, T).
111 | """
112 |         # NOTE(kan-bayashi): Power will be decreased, so multiply by # subbands here.
113 | # Not sure this is the correct way, it is better to check again.
114 | # TODO(kan-bayashi): Understand the reconstruction procedure
115 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
116 | return F.conv1d(self.pad_fn(x), self.synthesis_filter)
--------------------------------------------------------------------------------
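
A minimal round-trip sketch for the PQMF module above; note the constructor takes a device and moves its filters to CUDA, so this assumes a GPU is available:

```python
import torch
from pqmf import PQMF

device = torch.device("cuda:0")
pqmf = PQMF(device, subbands=4)
y = torch.rand(1, 1, 8192, device=device) * 2 - 1  # (B, 1, T) waveform
y_sub = pqmf.analysis(y)        # (1, 4, 2048): four subbands at T // 4
y_rec = pqmf.synthesis(y_sub)   # (1, 1, 8192): near-perfect reconstruction
print((y - y_rec).abs().max())  # small, but not exactly zero
```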
/text/py2kn.json:
--------------------------------------------------------------------------------
1 | {"a": "アー", "ai": "アイ", "an": "アン", "ang": "アン", "ao": "アオ", "ba": "バー", "bai": "バイ", "ban": "バン", "bang": "バン", "bao": "バオ", "bei": "ベイ", "ben": "ベン", "beng": "ボン", "bi": "ビー", "bian": "ビィェン", "biao": "ビィャォ", "bie": "ビィェ", "bin": "ビン", "bing": "ビン", "bo": "ブォ", "bu": "ブー", "ca": "ツァ", "cai": "ツァィ", "can": "ツァン", "cang": "ツァン", "cao": "ツァォ", "ce": "ツェ", "cen": "ツェン", "ceng": "ツォン", "cha": "チャ", "chai": "チャイ", "chan": "チャン", "chang": "チャン", "chao": "チャオ", "che": "チェ", "chen": "チェン", "cheng": "チォン", "chi": "チー", "chong": "チョン", "chou": "チョウ", "chu": "チュ", "chuan": "チュァン", "chuai": "チュァイ", "chuang": "チュゥァン", "chui": "チュイ", "chun": "チュン", "chuo": "チャオ", "ci": "ツー", "cong": "ツォン", "cou": "ツォゥ ", "cu": "ツゥ", "cuan": "ツァン", "cui": "ツイ", "cun": "ツン", "cuo": "ツゥォ", "da": "ダー", "dai": "ダイ", "dan": "ダン", "dang": "ダン", "dao": "ダオ", "de": "デェ", "dei": "デイ", "dun": "ドゥン", "deng": "ドン", "di": "ディ", "dian": "ディェン", "diao": "ディァォ", "die": "ディェ", "ding": "ディン", "diu": "ディゥ", "dong": "ドン", "dou": "ドウ", "du": "ドゥ", "duan": "ドゥァン", "dui": "ドゥイ", "duo": "ドゥォ", "e": "ェ", "ei": "ェイ", "en": "エン", "eng": "鞥", "er": "ェ", "fa": "ファ", "fan": "ファン", "fang": "ファン", "fei": "フェイ", "fen": "フェン", "feng": "フォン", "fuo": "フォ", "fou": "フォウ", "fu": "フー", "ga": "ガー", "gai": "ガイ", "gan": "ガン", "gang": "ガン", "gao": "ガオ", "ge": "グェ", "gei": "ゲイ", "gen": "ゲン", "geng": "ゴン", "gong": "ゴン", "gou": "ゴウ", "gu": "グー", "gua": "グァ", "guai": "グゥァイ", "guan": "グァン", "guang": "グゥァン", "gui": "グゥイ", "gun": "ガン", "guo": "グゥォ", "ha": "ハー", "hai": "ハイ", "han": "ハン", "hang": "ハン", "hao": "ハオ", "he": "フェ゛ァ", "hei": "ヘイ", "hen": "ヘン", "heng": "ホン", "hong": "ホン", "hou": "ホウ", "hu": "フー", "hua": "ファ", "huai": "フゥァイ", "huan": "ファン", "huang": "フゥァン", "hui": "フゥイ", "hun": "フン", "huo": "フォ", "ji": "ジー", "jia": "ジャ", "jian": "ジィェン", "jiang": "ジィァン", "jiao": "ジャオ", "jie": "ジェ", "jin": "ジン", "jing": "ジン", "jiong": "ジィォン", "jiu": "ジゥ", "ju": "ジュ", "juan": "ジュェン", "jue": "ジュェ", "jun": "ジュン", "ka": "カー", "kai": "カイ", "kan": "カン", "kang": "カン", "kao": "カオ", "ke": "クェ゛ァ", "ken": "ケン", "keng": "コン", "kong": "コン", "kou": "コウ", "ku": "クー", "kua": "クァ", "kuai": "クァィ", "kuan": "クァン", "kuang": "クゥァン", "kui": "クゥイ", "kun": "クン", "kuo": "クォ", "la": "ラー", "lai": "ライ", "lan": "ラン", "lang": "ラン", "lao": "ラオ", "le": "ラ", "lei": "レイ", "leng": "ラン", "li": "リー", "liang": "リィァン", "lian": "リィェン", "liao": "リィァォ", "lie": "リィェ", "lin": "リン", "ling": "リン", "liu": "リィゥ", "long": "ロン", "lou": "ロウ", "lu": "ルー", "lv": "リュ", "luan": "ルゥァン", "lue": "リュェ", "lun": "ルゥン", "luo": "ルゥォ", "ma": "マー", "mai": "マイ", "man": "マン", "mang": "マン", "mao": "マオ", "me": "ムェ", "mei": "メイ", "men": "メン", "meng": "モン", "mi": "ミィ", "mian": "ミィェン", "miao": "ミィァォ", "mie": "ミィェ", "min": "ミン", "ming": "ミン", "miu": "ミィゥ", "mo": "ムォ", "mou": "モウ", "mu": "ムー", "na": "ナー", "nai": "ナイ", "nan": "ナン", "nang": "ナン", "nao": "ナオ", "ne": "ヌェ゛ァ", "nei": "ネイ", "nen": "ネン", "neng": "ノン", "ni": "ニー", "nian": "ニィェン", "niang": "ニィァン", "niao": "ニィァォ", "nie": "ニィェ", "nin": "ニン", "ning": "ニン", "niu": "ニュェ", "nong": "ノン", "nou": "ノウ", "nu": "ヌー", "nv": "ニュ", "nuan": "ヌァン", "nuo": "ヌオ", "o": "オ", "ou": "オウ", "pa": "パー", "pai": "パイ", "pan": "パン", "pang": "パン", "pao": "パオ", "pei": "ペイ", "pen": "ペン", "peng": "ポン", "pi": "ピー", "pian": "ピィェン", "piao": "ピィァオ", "pie": "ピェ", "pin": "ピン", "ping": "ピン", "po": "ポォ", "pou": "ポウ", "pu": "プー", "qi": "チー", "qia": "チィァ", "qian": "チィェン", "qiang": "チィァン", "qiao": "チィァォ", "qie": "チィェ", "qin": "チン", "qing": "チン", "qiong": "チォン", "qiu": "チィゥ", "qu": "チュ", "quan": "チュェン", "que": "チュェ", "qun": "チュン", "ran": 
"ラン", "rang": "ラン", "rao": "ラオ", "re": "レ", "ren": "レン", "reng": "ロン", "ri": "リ", "rong": "ロン", "rou": "ロウ", "ru": "ルー", "ruan": "ルァン", "rui": "ルイ", "run": "ルン", "ruo": "ルォ", "sa": "サー", "sai": "サオ", "san": "サン", "sang": "サン", "se": "スェ", "sen": "セン", "seng": "ソン", "sha": "シャ", "shai": "シャイ", "shan": "シャン", "shang": "シャン", "shao": "シャオ", "she": "シェ", "shen": "シェン", "sheng": "シォン", "shi": "シー", "shou": "ショウ", "shu": "シュ", "shua": "シュァ", "shuai": "シュァイ", "shuan": "シュァン", "shuang": "シュゥァン", "shui": "シュイ", "shun": "シュン", "shuo": "シュォ", "si": "スー", "song": "ソン", "sou": "ソウ", "su": "スー", "suan": "スゥァン", "sui": "スイ", "sun": "スン", "suo": "スォ", "ta": "ター", "tai": "タイ", "tan": "タン", "tang": "タン", "tao": "タオ", "te": "テェ", "teng": "トン", "ti": "ティ", "tian": "ティェン", "tiao": "ティァォ", "tie": "ティェ", "ting": "ティン", "tong": "トン", "tou": "トウ", "tu": "トゥ", "tuan": "トゥァン", "tui": "トゥイ", "tun": "トゥン", "tuo": "トゥォ", "wa": "ウァ", "wai": "ワィ", "wan": "ワン", "wang": "ワン", "wei": "ウェイ", "wen": "ウェン", "weng": "ウォン", "wo": "ウォ", "wu": "ウー", "xi": "シー", "xia": "シァ", "xian": "シィェン", "xiang": "シィァン", "xiao": "シァォ", "xie": "シェ", "xin": "シン", "xing": "シン", "xiong": "シィォン", "xiu": "シゥ", "xu": "シュ", "xuan": "シュェン", "xue": "シュェ", "xun": "シュン", "ya": "ヤー", "yan": "イェン", "yang": "ヤン", "yao": "イャォ", "ye": "イェ", "yi": "イー", "yin": "イン", "ying": "イン", "yong": "ヨン", "you": "ヨウ", "yu": "ユー", "yuan": "ユェン", "yue": "ユェ", "yun": "ユン", "za": "ザー", "zai": "ヂャイ", "zan": "ザン", "zang": "ザン", "zao": "ザオ", "ze": "ゼェ", "zei": "ゼイ", "zen": "ゼン", "zeng": "ゾン", "zhan": "ヂャン", "zhang": "ヂャン", "zhao": "ヂャオ", "zhe": "ヂェ゛ァ", "zhen": "ヂェン", "zheng": "ヂォン", "zhi": "ヂー", "zhong": "ヂョン", "zhou": "ヂョウ", "zhu": "ヂュ", "zhua": "ヂュア", "zhuai": "ヂュァイ", "zhuan": "ヂュァン", "zhuang": "ヂュゥァン", "zhui": "ヂュイ", "zhun": "ヂュン", "zhuo": "ヂュオ", "zi": "ズー", "zong": "ゾン", "zou": "ゾウ", "zu": "ズー", "zuan": "ズァン", "zui": "ズイ", "zun": "ズン", "zuo": "ズゥォ", ",": "、", "。": "。", "!": "!", "?": "?", "……": "。"}
2 |
--------------------------------------------------------------------------------
/stft_loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2019 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """STFT-based Loss modules."""
7 |
8 | import torch
9 | import torch.nn.functional as F
10 |
11 |
12 | def stft(x, fft_size, hop_size, win_length, window):
13 | """Perform STFT and convert to magnitude spectrogram.
14 | Args:
15 | x (Tensor): Input signal tensor (B, T).
16 | fft_size (int): FFT size.
17 | hop_size (int): Hop size.
18 | win_length (int): Window length.
19 | window (str): Window function type.
20 | Returns:
21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
22 | """
23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device))
24 | real = x_stft[..., 0]
25 | imag = x_stft[..., 1]
26 |
27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
29 |
30 |
31 | class SpectralConvergengeLoss(torch.nn.Module):
32 | """Spectral convergence loss module."""
33 |
34 | def __init__(self):
35 | """Initilize spectral convergence loss module."""
36 | super(SpectralConvergengeLoss, self).__init__()
37 |
38 | def forward(self, x_mag, y_mag):
39 | """Calculate forward propagation.
40 | Args:
41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
43 | Returns:
44 | Tensor: Spectral convergence loss value.
45 | """
46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
47 |
48 |
49 | class LogSTFTMagnitudeLoss(torch.nn.Module):
50 | """Log STFT magnitude loss module."""
51 |
52 | def __init__(self):
53 | """Initilize los STFT magnitude loss module."""
54 | super(LogSTFTMagnitudeLoss, self).__init__()
55 |
56 | def forward(self, x_mag, y_mag):
57 | """Calculate forward propagation.
58 | Args:
59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
61 | Returns:
62 | Tensor: Log STFT magnitude loss value.
63 | """
64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
65 |
66 |
67 | class STFTLoss(torch.nn.Module):
68 | """STFT loss module."""
69 |
70 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
71 | """Initialize STFT loss module."""
72 | super(STFTLoss, self).__init__()
73 | self.fft_size = fft_size
74 | self.shift_size = shift_size
75 | self.win_length = win_length
76 | self.window = getattr(torch, window)(win_length)
77 | self.spectral_convergenge_loss = SpectralConvergengeLoss()
78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
79 |
80 | def forward(self, x, y):
81 | """Calculate forward propagation.
82 | Args:
83 | x (Tensor): Predicted signal (B, T).
84 | y (Tensor): Groundtruth signal (B, T).
85 | Returns:
86 | Tensor: Spectral convergence loss value.
87 | Tensor: Log STFT magnitude loss value.
88 | """
89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
93 |
94 | return sc_loss, mag_loss
95 |
96 |
97 | class MultiResolutionSTFTLoss(torch.nn.Module):
98 | """Multi resolution STFT loss module."""
99 |
100 | def __init__(self,
101 | fft_sizes=[1024, 2048, 512],
102 | hop_sizes=[120, 240, 50],
103 | win_lengths=[600, 1200, 240],
104 | window="hann_window"):
105 | """Initialize Multi resolution STFT loss module.
106 | Args:
107 | fft_sizes (list): List of FFT sizes.
108 | hop_sizes (list): List of hop sizes.
109 | win_lengths (list): List of window lengths.
110 | window (str): Window function type.
111 | """
112 | super(MultiResolutionSTFTLoss, self).__init__()
113 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
114 | self.stft_losses = torch.nn.ModuleList()
115 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
116 | self.stft_losses += [STFTLoss(fs, ss, wl, window)]
117 |
118 | def forward(self, x, y):
119 | """Calculate forward propagation.
120 | Args:
121 | x (Tensor): Predicted signal (B, T).
122 | y (Tensor): Groundtruth signal (B, T).
123 | Returns:
124 | Tensor: Multi resolution spectral convergence loss value.
125 | Tensor: Multi resolution log STFT magnitude loss value.
126 | """
127 | sc_loss = 0.0
128 | mag_loss = 0.0
129 | for f in self.stft_losses:
130 | sc_l, mag_l = f(x, y)
131 | sc_loss += sc_l
132 | mag_loss += mag_l
133 | sc_loss /= len(self.stft_losses)
134 | mag_loss /= len(self.stft_losses)
135 |
136 | return sc_loss, mag_loss
--------------------------------------------------------------------------------
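
A minimal sketch of `MultiResolutionSTFTLoss` using the resolutions from the sample configs; random tensors stand in for predicted and ground-truth audio:

```python
import torch
from stft_loss import MultiResolutionSTFTLoss

loss_fn = MultiResolutionSTFTLoss(fft_sizes=[384, 683, 171],
                                  hop_sizes=[30, 60, 10],
                                  win_lengths=[150, 300, 60])
y_hat = torch.rand(4, 8192) * 2 - 1   # (B, T) predicted signal
y = torch.rand(4, 8192) * 2 - 1       # (B, T) ground-truth signal
sc_loss, mag_loss = loss_fn(y_hat, y)
print(sc_loss.item(), mag_loss.item())
```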
/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size*dilation - dilation)/2)
16 |
17 |
18 | def convert_pad_shape(pad_shape):
19 | l = pad_shape[::-1]
20 | pad_shape = [item for sublist in l for item in sublist]
21 | return pad_shape
22 |
23 |
24 | def intersperse(lst, item):
25 | result = [item] * (len(lst) * 2 + 1)
26 | result[1::2] = lst
27 | return result
28 |
29 |
30 | def kl_divergence(m_p, logs_p, m_q, logs_q):
31 | """KL(P||Q)"""
32 | kl = (logs_q - logs_p) - 0.5
33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34 | return kl
35 |
36 |
37 | def rand_gumbel(shape):
38 | """Sample from the Gumbel distribution, protect from overflows."""
39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40 | return -torch.log(-torch.log(uniform_samples))
41 |
42 |
43 | def rand_gumbel_like(x):
44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45 | return g
46 |
47 |
48 | def slice_segments(x, ids_str, segment_size=4):
49 | ret = torch.zeros_like(x[:, :, :segment_size])
50 | for i in range(x.size(0)):
51 | idx_str = ids_str[i]
52 | idx_end = idx_str + segment_size
53 | ret[i] = x[i, :, idx_str:idx_end]
54 | return ret
55 |
56 |
57 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
58 | b, d, t = x.size()
59 | if x_lengths is None:
60 | x_lengths = t
61 | ids_str_max = x_lengths - segment_size + 1
62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63 | ret = slice_segments(x, ids_str, segment_size)
64 | return ret, ids_str
65 |
66 |
67 | def get_timing_signal_1d(
68 | length, channels, min_timescale=1.0, max_timescale=1.0e4):
69 | position = torch.arange(length, dtype=torch.float)
70 | num_timescales = channels // 2
71 | log_timescale_increment = (
72 | math.log(float(max_timescale) / float(min_timescale)) /
73 | (num_timescales - 1))
74 | inv_timescales = min_timescale * torch.exp(
75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78 | signal = F.pad(signal, [0, 0, 0, channels % 2])
79 | signal = signal.view(1, channels, length)
80 | return signal
81 |
82 |
83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84 | b, channels, length = x.size()
85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86 | return x + signal.to(dtype=x.dtype, device=x.device)
87 |
88 |
89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90 | b, channels, length = x.size()
91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93 |
94 |
95 | def subsequent_mask(length):
96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97 | return mask
98 |
99 |
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 | n_channels_int = n_channels[0]
103 | in_act = input_a + input_b
104 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 | acts = t_act * s_act
107 | return acts
108 |
109 |
110 | def convert_pad_shape(pad_shape):
111 | l = pad_shape[::-1]
112 | pad_shape = [item for sublist in l for item in sublist]
113 | return pad_shape
114 |
115 |
116 | def shift_1d(x):
117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118 | return x
119 |
120 |
121 | def sequence_mask(length, max_length=None):
122 | if max_length is None:
123 | max_length = length.max()
124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125 | return x.unsqueeze(0) < length.unsqueeze(1)
126 |
127 |
128 | def generate_path(duration, mask):
129 | """
130 | duration: [b, 1, t_x]
131 | mask: [b, 1, t_y, t_x]
132 | """
133 | device = duration.device
134 |
135 | b, _, t_y, t_x = mask.shape
136 | cum_duration = torch.cumsum(duration, -1)
137 |
138 | cum_duration_flat = cum_duration.view(b * t_x)
139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140 | path = path.view(b, t_x, t_y)
141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142 | path = path.unsqueeze(1).transpose(2,3) * mask
143 | return path
144 |
145 |
146 | def clip_grad_value_(parameters, clip_value, norm_type=2):
147 | if isinstance(parameters, torch.Tensor):
148 | parameters = [parameters]
149 | parameters = list(filter(lambda p: p.grad is not None, parameters))
150 | norm_type = float(norm_type)
151 | if clip_value is not None:
152 | clip_value = float(clip_value)
153 |
154 | total_norm = 0
155 | for p in parameters:
156 | param_norm = p.grad.data.norm(norm_type)
157 | total_norm += param_norm.item() ** norm_type
158 | if clip_value is not None:
159 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
160 | total_norm = total_norm ** (1. / norm_type)
161 | return total_norm
162 |
--------------------------------------------------------------------------------
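A tiny worked example makes the alignment helper above concrete. The sketch below is an addition, assuming the repository root is on PYTHONPATH so this file imports as `commons`: it expands per-token durations into the hard monotonic attention map that `generate_path` produces.

import torch
import commons  # assumption: run from the repository root

# One sentence, three input tokens with durations 2, 1 and 3 frames.
duration = torch.tensor([[[2., 1., 3.]]])          # [b=1, 1, t_x=3]
t_y = int(duration.sum().item())                   # 6 output frames in total
mask = torch.ones(1, 1, t_y, duration.size(-1))    # [b, 1, t_y, t_x]

path = commons.generate_path(duration, mask)
print(path.squeeze())
# tensor([[1., 0., 0.],
#         [1., 0., 0.],
#         [0., 1., 0.],
#         [0., 0., 1.],
#         [0., 0., 1.],
#         [0., 0., 1.]])  # each output frame attends to exactly one token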
/text/japanese.py:
--------------------------------------------------------------------------------
1 | import re
2 | from unidecode import unidecode
3 | import pyopenjtalk
4 |
5 |
6 | # Regular expression matching Japanese without punctuation marks:
7 | _japanese_characters = re.compile(
8 | r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
9 |
10 | # Regular expression matching non-Japanese characters or punctuation marks:
11 | _japanese_marks = re.compile(
12 | r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
13 |
14 | # List of (symbol, Japanese) pairs for marks:
15 | _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
16 | ('%', 'パーセント')
17 | ]]
18 |
19 | # List of (romaji, ipa) pairs for marks:
20 | _romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
21 | ('ts', 'ʦ'),
22 | ('u', 'ɯ'),
23 | ('j', 'ʥ'),
24 | ('y', 'j'),
25 | ('ni', 'n^i'),
26 | ('nj', 'n^'),
27 | ('hi', 'çi'),
28 | ('hj', 'ç'),
29 | ('f', 'ɸ'),
30 | ('I', 'i*'),
31 | ('U', 'ɯ*'),
32 | ('r', 'ɾ')
33 | ]]
34 |
35 | # List of (romaji, ipa2) pairs for marks:
36 | _romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
37 | ('u', 'ɯ'),
38 | ('ʧ', 'tʃ'),
39 | ('j', 'dʑ'),
40 | ('y', 'j'),
41 | ('ni', 'n^i'),
42 | ('nj', 'n^'),
43 | ('hi', 'çi'),
44 | ('hj', 'ç'),
45 | ('f', 'ɸ'),
46 | ('I', 'i*'),
47 | ('U', 'ɯ*'),
48 | ('r', 'ɾ')
49 | ]]
50 |
51 | # List of (consonant, sokuon) pairs:
52 | _real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
53 | (r'Q([↑↓]*[kg])', r'k#\1'),
54 | (r'Q([↑↓]*[tdjʧ])', r't#\1'),
55 | (r'Q([↑↓]*[sʃ])', r's\1'),
56 | (r'Q([↑↓]*[pb])', r'p#\1')
57 | ]]
58 |
59 | # List of (consonant, hatsuon) pairs:
60 | _real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
61 | (r'N([↑↓]*[pbm])', r'm\1'),
62 | (r'N([↑↓]*[ʧʥj])', r'n^\1'),
63 | (r'N([↑↓]*[tdn])', r'n\1'),
64 | (r'N([↑↓]*[kg])', r'ŋ\1')
65 | ]]
66 |
67 |
68 | def symbols_to_japanese(text):
69 | for regex, replacement in _symbols_to_japanese:
70 | text = re.sub(regex, replacement, text)
71 | return text
72 |
73 |
74 | def japanese_to_romaji_with_accent(text):
75 | '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
76 | text = symbols_to_japanese(text)
77 | sentences = re.split(_japanese_marks, text)
78 | marks = re.findall(_japanese_marks, text)
79 | text = ''
80 | for i, sentence in enumerate(sentences):
81 | if re.match(_japanese_characters, sentence):
82 | if text != '':
83 | text += ' '
84 | labels = pyopenjtalk.extract_fullcontext(sentence)
85 | for n, label in enumerate(labels):
86 | phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
87 | if phoneme not in ['sil', 'pau']:
88 | text += phoneme.replace('ch', 'ʧ').replace('sh',
89 | 'ʃ').replace('cl', 'Q')
90 | else:
91 | continue
92 | # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
93 | a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
94 | a2 = int(re.search(r"\+(\d+)\+", label).group(1))
95 | a3 = int(re.search(r"\+(\d+)/", label).group(1))
96 | if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
97 | a2_next = -1
98 | else:
99 | a2_next = int(
100 | re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
101 | # Accent phrase boundary
102 | if a3 == 1 and a2_next == 1:
103 | text += ' '
104 | # Falling
105 | elif a1 == 0 and a2_next == a2 + 1:
106 | text += '↓'
107 | # Rising
108 | elif a2 == 1 and a2_next == 2:
109 | text += '↑'
110 | if i < len(marks):
111 | text += unidecode(marks[i]).replace(' ', '')
112 | return text
113 |
114 |
115 | def get_real_sokuon(text):
116 | for regex, replacement in _real_sokuon:
117 | text = re.sub(regex, replacement, text)
118 | return text
119 |
120 |
121 | def get_real_hatsuon(text):
122 | for regex, replacement in _real_hatsuon:
123 | text = re.sub(regex, replacement, text)
124 | return text
125 |
126 |
127 | def japanese_to_ipa(text):
128 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
129 | text = re.sub(
130 | r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
131 | text = get_real_sokuon(text)
132 | text = get_real_hatsuon(text)
133 | for regex, replacement in _romaji_to_ipa:
134 | text = re.sub(regex, replacement, text)
135 | return text
136 |
137 |
138 | def japanese_to_ipa2(text):
139 | text = japanese_to_romaji_with_accent(text).replace('...', '…')
140 | text = get_real_sokuon(text)
141 | text = get_real_hatsuon(text)
142 | for regex, replacement in _romaji_to_ipa2:
143 | text = re.sub(regex, replacement, text)
144 | return text
145 |
146 |
147 | def japanese_to_ipa3(text):
148 | text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace(
149 | 'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
150 | text = re.sub(
151 | r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
152 | text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
153 | return text
154 |
--------------------------------------------------------------------------------
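As a quick sanity check of the accent-aware G2P above, the following sketch runs a short phrase through the two main entry points. It assumes a working `pyopenjtalk` install (the dictionary is downloaded on first use), and the exact romaji depends on the pyopenjtalk version, so the output is only indicative.

from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa

text = 'こんにちは、世界。'
print(japanese_to_romaji_with_accent(text))  # romaji with ↑/↓ pitch-accent marks
print(japanese_to_ipa(text))                 # the same string after the IPA substitutions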
/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from phonemizer import phonemize
18 | import pyopenjtalk
19 | from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
20 | from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
21 |
22 | # Regular expression matching whitespace:
23 | _whitespace_re = re.compile(r'\s+')
24 |
25 | # Regular expression matching Japanese without punctuation marks:
26 | _japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
27 |
28 | # Regular expression matching non-Japanese characters or punctuation marks:
29 | _japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
30 |
31 | # List of (regular expression, replacement) pairs for abbreviations:
32 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33 | ('mrs', 'misess'),
34 | ('mr', 'mister'),
35 | ('dr', 'doctor'),
36 | ('st', 'saint'),
37 | ('co', 'company'),
38 | ('jr', 'junior'),
39 | ('maj', 'major'),
40 | ('gen', 'general'),
41 | ('drs', 'doctors'),
42 | ('rev', 'reverend'),
43 | ('lt', 'lieutenant'),
44 | ('hon', 'honorable'),
45 | ('sgt', 'sergeant'),
46 | ('capt', 'captain'),
47 | ('esq', 'esquire'),
48 | ('ltd', 'limited'),
49 | ('col', 'colonel'),
50 | ('ft', 'fort'),
51 | ]]
52 |
53 |
54 | def expand_abbreviations(text):
55 | for regex, replacement in _abbreviations:
56 | text = re.sub(regex, replacement, text)
57 | return text
58 |
59 |
60 | def expand_numbers(text):
61 | return normalize_numbers(text)  # NOTE: normalize_numbers is neither defined nor imported in this repo (there is no text/numbers.py); this helper is unused by the cleaners below and would raise NameError if called
62 |
63 |
64 | def lowercase(text):
65 | return text.lower()
66 |
67 |
68 | def collapse_whitespace(text):
69 | return re.sub(_whitespace_re, ' ', text)
70 |
71 |
72 | def convert_to_ascii(text):
73 | return unidecode(text)
74 |
75 |
76 | def basic_cleaners(text):
77 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
78 | text = lowercase(text)
79 | text = collapse_whitespace(text)
80 | return text
81 |
82 |
83 | def transliteration_cleaners(text):
84 | '''Pipeline for non-English text that transliterates to ASCII.'''
85 | text = convert_to_ascii(text)
86 | text = lowercase(text)
87 | text = collapse_whitespace(text)
88 | return text
89 |
90 |
91 | def english_cleaners(text):
92 | '''Pipeline for English text, including abbreviation expansion.'''
93 | text = convert_to_ascii(text)
94 | text = lowercase(text)
95 | text = expand_abbreviations(text)
96 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
97 | phonemes = collapse_whitespace(phonemes)
98 | return phonemes
99 |
100 |
101 | def english_cleaners2(text):
102 | '''Pipeline for English text, including abbreviation expansion, with punctuation and stress preserved.'''
103 | text = convert_to_ascii(text)
104 | text = lowercase(text)
105 | text = expand_abbreviations(text)
106 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
107 | phonemes = collapse_whitespace(phonemes)
108 | return phonemes
109 |
110 |
111 | def japanese_cleaners(text):
112 | text = japanese_to_romaji_with_accent(text)
113 | text = re.sub(r'([A-Za-z])$', r'\1.', text)
114 | return text
115 |
116 |
117 | def japanese_cleaners2(text):
118 | return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
119 |
120 |
121 | def korean_cleaners(text):
122 | '''Pipeline for Korean text'''
123 | text = latin_to_hangul(text)
124 | text = number_to_hangul(text)
125 | text = divide_hangul(text)
126 | text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
127 | return text
128 |
129 |
130 | def japanese_triphone_cleaners(text):
131 | sentences = re.split(_japanese_marks, text)
132 | marks = re.findall(_japanese_marks, text)
133 | text = ''
134 | for i, sentence in enumerate(sentences):
135 | phones = pyopenjtalk.g2p(sentence, kana=False)
136 | phones = phones.replace(' ', '')
137 | phones = phones.replace('A', 'a').replace('I', 'i').replace('U', 'u').replace('E', 'e').replace('O', 'o')
138 | phones = phones.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')
139 | triphones = []
140 | length = len(phones)
141 | for j, phone in enumerate(phones):
142 | if length == 1:
143 | triphone = phone
144 | else:
145 | if j == 0:
146 | triphone = f'{phone}+{phones[j+1]}'
147 | elif j == length - 1:
148 | triphone = f'{phones[j-1]}-{phone}'
149 | else:
150 | triphone = f'{phones[j-1]}-{phone}+{phones[j+1]}'
151 | triphones.append(triphone)
152 | subtext = ' '.join(triphones)
153 | text += subtext
154 | if i < len(marks):
155 | text += unidecode(marks[i]).replace(' ', '')
156 | if len(text) > 0 and re.match('[A-Za-z]', text[-1]):
157 | text += '.'
158 |
159 | return text
160 |
--------------------------------------------------------------------------------
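The module docstring says cleaners are selected by a comma-delimited list of names; the lookup itself lives in text/__init__.py rather than in this file. A minimal sketch of that dispatch, assuming the function names defined above (and, for `english_cleaners2`, a working phonemizer/espeak backend):

from text import cleaners

def clean_text(text, cleaner_names):
    # Resolve each name on the module and apply the cleaners in order.
    for name in cleaner_names:
        cleaner = getattr(cleaners, name, None)
        if cleaner is None:
            raise ValueError(f'Unknown cleaner: {name}')
        text = cleaner(text)
    return text

print(clean_text('Dr. Smith and Mr. Jones.', ['english_cleaners2']))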
/text/korean.py:
--------------------------------------------------------------------------------
1 | import re
2 | from jamo import h2j, j2hcj
3 | import ko_pron
4 |
5 |
6 | # This is a list of Korean classifiers preceded by pure Korean numerals.
7 | _korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
8 |
9 | # List of (hangul, hangul divided) pairs:
10 | _hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
11 | ('ㄳ', 'ㄱㅅ'),
12 | ('ㄵ', 'ㄴㅈ'),
13 | ('ㄶ', 'ㄴㅎ'),
14 | ('ㄺ', 'ㄹㄱ'),
15 | ('ㄻ', 'ㄹㅁ'),
16 | ('ㄼ', 'ㄹㅂ'),
17 | ('ㄽ', 'ㄹㅅ'),
18 | ('ㄾ', 'ㄹㅌ'),
19 | ('ㄿ', 'ㄹㅍ'),
20 | ('ㅀ', 'ㄹㅎ'),
21 | ('ㅄ', 'ㅂㅅ'),
22 | ('ㅘ', 'ㅗㅏ'),
23 | ('ㅙ', 'ㅗㅐ'),
24 | ('ㅚ', 'ㅗㅣ'),
25 | ('ㅝ', 'ㅜㅓ'),
26 | ('ㅞ', 'ㅜㅔ'),
27 | ('ㅟ', 'ㅜㅣ'),
28 | ('ㅢ', 'ㅡㅣ'),
29 | ('ㅑ', 'ㅣㅏ'),
30 | ('ㅒ', 'ㅣㅐ'),
31 | ('ㅕ', 'ㅣㅓ'),
32 | ('ㅖ', 'ㅣㅔ'),
33 | ('ㅛ', 'ㅣㅗ'),
34 | ('ㅠ', 'ㅣㅜ')
35 | ]]
36 |
37 | # List of (Latin alphabet, hangul) pairs:
38 | _latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
39 | ('a', '에이'),
40 | ('b', '비'),
41 | ('c', '시'),
42 | ('d', '디'),
43 | ('e', '이'),
44 | ('f', '에프'),
45 | ('g', '지'),
46 | ('h', '에이치'),
47 | ('i', '아이'),
48 | ('j', '제이'),
49 | ('k', '케이'),
50 | ('l', '엘'),
51 | ('m', '엠'),
52 | ('n', '엔'),
53 | ('o', '오'),
54 | ('p', '피'),
55 | ('q', '큐'),
56 | ('r', '아르'),
57 | ('s', '에스'),
58 | ('t', '티'),
59 | ('u', '유'),
60 | ('v', '브이'),
61 | ('w', '더블유'),
62 | ('x', '엑스'),
63 | ('y', '와이'),
64 | ('z', '제트')
65 | ]]
66 |
67 | # List of (ipa, lazy ipa) pairs:
68 | _ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
69 | ('t͡ɕ','ʧ'),
70 | ('d͡ʑ','ʥ'),
71 | ('ɲ','n^'),
72 | ('ɕ','ʃ'),
73 | ('ʷ','w'),
74 | ('ɭ','l`'),
75 | ('ʎ','ɾ'),
76 | ('ɣ','ŋ'),
77 | ('ɰ','ɯ'),
78 | ('ʝ','j'),
79 | ('ʌ','ə'),
80 | ('ɡ','g'),
81 | ('\u031a','#'),
82 | ('\u0348','='),
83 | ('\u031e',''),
84 | ('\u0320',''),
85 | ('\u0339','')
86 | ]]
87 |
88 |
89 | def latin_to_hangul(text):
90 | for regex, replacement in _latin_to_hangul:
91 | text = re.sub(regex, replacement, text)
92 | return text
93 |
94 |
95 | def divide_hangul(text):
96 | text = j2hcj(h2j(text))
97 | for regex, replacement in _hangul_divided:
98 | text = re.sub(regex, replacement, text)
99 | return text
100 |
101 |
102 | def hangul_number(num, sino=True):
103 | '''Reference https://github.com/Kyubyong/g2pK'''
104 | num = re.sub(',', '', num)
105 |
106 | if num == '0':
107 | return '영'
108 | if not sino and num == '20':
109 | return '스무'
110 |
111 | digits = '123456789'
112 | names = '일이삼사오육칠팔구'
113 | digit2name = {d: n for d, n in zip(digits, names)}
114 |
115 | modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
116 | decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
117 | digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
118 | digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
119 |
120 | spelledout = []
121 | for i, digit in enumerate(num):
122 | i = len(num) - i - 1
123 | if sino:
124 | if i == 0:
125 | name = digit2name.get(digit, '')
126 | elif i == 1:
127 | name = digit2name.get(digit, '') + '십'
128 | name = name.replace('일십', '십')
129 | else:
130 | if i == 0:
131 | name = digit2mod.get(digit, '')
132 | elif i == 1:
133 | name = digit2dec.get(digit, '')
134 | if digit == '0':
135 | if i % 4 == 0:
136 | last_three = spelledout[-min(3, len(spelledout)):]
137 | if ''.join(last_three) == '':
138 | spelledout.append('')
139 | continue
140 | else:
141 | spelledout.append('')
142 | continue
143 | if i == 2:
144 | name = digit2name.get(digit, '') + '백'
145 | name = name.replace('일백', '백')
146 | elif i == 3:
147 | name = digit2name.get(digit, '') + '천'
148 | name = name.replace('일천', '천')
149 | elif i == 4:
150 | name = digit2name.get(digit, '') + '만'
151 | name = name.replace('일만', '만')
152 | elif i == 5:
153 | name = digit2name.get(digit, '') + '십'
154 | name = name.replace('일십', '십')
155 | elif i == 6:
156 | name = digit2name.get(digit, '') + '백'
157 | name = name.replace('일백', '백')
158 | elif i == 7:
159 | name = digit2name.get(digit, '') + '천'
160 | name = name.replace('일천', '천')
161 | elif i == 8:
162 | name = digit2name.get(digit, '') + '억'
163 | elif i == 9:
164 | name = digit2name.get(digit, '') + '십'
165 | elif i == 10:
166 | name = digit2name.get(digit, '') + '백'
167 | elif i == 11:
168 | name = digit2name.get(digit, '') + '천'
169 | elif i == 12:
170 | name = digit2name.get(digit, '') + '조'
171 | elif i == 13:
172 | name = digit2name.get(digit, '') + '십'
173 | elif i == 14:
174 | name = digit2name.get(digit, '') + '백'
175 | elif i == 15:
176 | name = digit2name.get(digit, '') + '천'
177 | spelledout.append(name)
178 | return ''.join(elem for elem in spelledout)
179 |
180 |
181 | def number_to_hangul(text):
182 | '''Reference https://github.com/Kyubyong/g2pK'''
183 | tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
184 | for token in tokens:
185 | num, classifier = token
186 | if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
187 | spelledout = hangul_number(num, sino=False)
188 | else:
189 | spelledout = hangul_number(num, sino=True)
190 | text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
191 | # digit by digit for remaining digits
192 | digits = '0123456789'
193 | names = '영일이삼사오육칠팔구'
194 | for d, n in zip(digits, names):
195 | text = text.replace(d, n)
196 | return text
197 |
198 |
199 | def korean_to_lazy_ipa(text):
200 | text = latin_to_hangul(text)
201 | text = number_to_hangul(text)
202 | text = re.sub('[\uac00-\ud7af]+', lambda x: ko_pron.romanise(x.group(0), 'ipa').split('] ~ [')[0], text)
203 | for regex, replacement in _ipa_to_lazy_ipa:
204 | text = re.sub(regex, replacement, text)
205 | return text
206 |
207 |
208 | def korean_to_ipa(text):
209 | text = korean_to_lazy_ipa(text)
210 | return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
211 |
--------------------------------------------------------------------------------
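Two quick examples of the number spelling above, as a sketch that assumes `jamo` and `ko_pron` are installed (they are imported at module load). Numerals followed by a recognized classifier take the native Korean reading; everything else takes the Sino-Korean reading.

from text.korean import number_to_hangul

print(number_to_hangul('3마리'))   # native reading before a classifier: 세마리
print(number_to_hangul('2005년'))  # Sino-Korean reading elsewhere: 이천오년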
/filelists/vctk_audio_sid_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|It had happened to him.
2 | DUMMY2/p280/p280_148.wav|52|It is open season on the Old Firm.
3 | DUMMY2/p231/p231_320.wav|50|However, he is a coach, and he remains a coach at heart.
4 | DUMMY2/p282/p282_129.wav|83|It is not a U-turn.
5 | DUMMY2/p254/p254_015.wav|41|The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain.
6 | DUMMY2/p228/p228_285.wav|57|The songs are just so good.
7 | DUMMY2/p334/p334_307.wav|38|If they don't, they can expect their funding to be cut.
8 | DUMMY2/p287/p287_081.wav|77|I've never seen anything like it.
9 | DUMMY2/p247/p247_083.wav|14|It is a job creation scheme.)
10 | DUMMY2/p264/p264_051.wav|65|We were leading by two goals.)
11 | DUMMY2/p335/p335_058.wav|49|Let's see that increase over the years.
12 | DUMMY2/p236/p236_225.wav|75|There is no quick fix.
13 | DUMMY2/p374/p374_353.wav|11|And that brings us to the point.
14 | DUMMY2/p272/p272_076.wav|69|Sounds like The Sixth Sense?
15 | DUMMY2/p271/p271_152.wav|27|The petition was formally presented at Downing Street yesterday.
16 | DUMMY2/p228/p228_127.wav|57|They've got to account for it.
17 | DUMMY2/p276/p276_223.wav|106|It's been a humbling year.
18 | DUMMY2/p262/p262_248.wav|45|The project has already secured the support of Sir Sean Connery.
19 | DUMMY2/p314/p314_086.wav|51|The team this year is going places.
20 | DUMMY2/p225/p225_038.wav|101|Diving is no part of football.
21 | DUMMY2/p279/p279_088.wav|25|The shareholders will vote to wind up the company on Friday morning.
22 | DUMMY2/p272/p272_018.wav|69|Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain.
23 | DUMMY2/p256/p256_098.wav|90|She told The Herald.
24 | DUMMY2/p261/p261_218.wav|100|All will be revealed in due course.
25 | DUMMY2/p265/p265_063.wav|73|IT shouldn't come as a surprise, but it does.
26 | DUMMY2/p314/p314_042.wav|51|It is all about people being assaulted, abused.
27 | DUMMY2/p241/p241_188.wav|86|I wish I could say something.
28 | DUMMY2/p283/p283_111.wav|95|It's good to have a voice.
29 | DUMMY2/p275/p275_006.wav|40|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
30 | DUMMY2/p228/p228_092.wav|57|Today I couldn't run on it.
31 | DUMMY2/p295/p295_343.wav|92|The atmosphere is businesslike.
32 | DUMMY2/p228/p228_187.wav|57|They will run a mile.
33 | DUMMY2/p294/p294_317.wav|104|It didn't put me off.
34 | DUMMY2/p231/p231_445.wav|50|It sounded like a bomb.
35 | DUMMY2/p272/p272_086.wav|69|Today she has been released.
36 | DUMMY2/p255/p255_210.wav|31|It was worth a photograph.
37 | DUMMY2/p229/p229_060.wav|67|And a film maker was born.
38 | DUMMY2/p260/p260_232.wav|81|The Home Office would not release any further details about the group.
39 | DUMMY2/p245/p245_025.wav|59|Johnson was pretty low.
40 | DUMMY2/p333/p333_185.wav|64|This area is perfect for children.
41 | DUMMY2/p244/p244_242.wav|78|He is a man of the people.
42 | DUMMY2/p376/p376_187.wav|71|"It is a terrible loss."
43 | DUMMY2/p239/p239_156.wav|48|It is a good lifestyle.
44 | DUMMY2/p307/p307_037.wav|22|He released a half-dozen solo albums.
45 | DUMMY2/p305/p305_185.wav|54|I am not even thinking about that.
46 | DUMMY2/p272/p272_081.wav|69|It was magic.
47 | DUMMY2/p302/p302_297.wav|30|I'm trying to stay open on that.
48 | DUMMY2/p275/p275_320.wav|40|We are in the end game.
49 | DUMMY2/p239/p239_231.wav|48|Then we will face the Danish champions.
50 | DUMMY2/p268/p268_301.wav|87|It was only later that the condition was diagnosed.
51 | DUMMY2/p336/p336_088.wav|98|They failed to reach agreement yesterday.
52 | DUMMY2/p278/p278_255.wav|10|They made such decisions in London.
53 | DUMMY2/p361/p361_132.wav|79|That got me out.
54 | DUMMY2/p307/p307_146.wav|22|You hope he prevails.
55 | DUMMY2/p244/p244_147.wav|78|They could not ignore the will of parliament, he claimed.
56 | DUMMY2/p294/p294_283.wav|104|This is our unfinished business.
57 | DUMMY2/p283/p283_300.wav|95|I would have the hammer in the crowd.
58 | DUMMY2/p239/p239_079.wav|48|I can understand the frustrations of our fans.
59 | DUMMY2/p264/p264_009.wav|65|There is , according to legend, a boiling pot of gold at one end. )
60 | DUMMY2/p307/p307_348.wav|22|He did not oppose the divorce.
61 | DUMMY2/p304/p304_308.wav|72|We are the gateway to justice.
62 | DUMMY2/p281/p281_056.wav|36|None has ever been found.
63 | DUMMY2/p267/p267_158.wav|0|We were given a warm and friendly reception.
64 | DUMMY2/p300/p300_169.wav|102|Who do these people think they are?
65 | DUMMY2/p276/p276_177.wav|106|They exist in name alone.
66 | DUMMY2/p228/p228_245.wav|57|It is a policy which has the full support of the minister.
67 | DUMMY2/p300/p300_303.wav|102|I'm wondering what you feel about the youngest.
68 | DUMMY2/p362/p362_247.wav|15|This would give Scotland around eight members.
69 | DUMMY2/p326/p326_031.wav|28|United were in control without always being dominant.
70 | DUMMY2/p361/p361_288.wav|79|I did not think it was very proper.
71 | DUMMY2/p286/p286_145.wav|63|Tiger is not the norm.
72 | DUMMY2/p234/p234_071.wav|3|She did that for the rest of her life.
73 | DUMMY2/p263/p263_296.wav|39|The decision was announced at its annual conference in Dunfermline.
74 | DUMMY2/p323/p323_228.wav|34|She became a heroine of my childhood.
75 | DUMMY2/p280/p280_346.wav|52|It was a bit like having children.
76 | DUMMY2/p333/p333_080.wav|64|But the tragedy did not stop there.
77 | DUMMY2/p226/p226_268.wav|43|That decision is for the British Parliament and people.
78 | DUMMY2/p362/p362_314.wav|15|Is that right?
79 | DUMMY2/p240/p240_047.wav|93|It is so sad.
80 | DUMMY2/p250/p250_207.wav|24|You could feel the heat.
81 | DUMMY2/p273/p273_176.wav|56|Neither side would reveal the details of the offer.
82 | DUMMY2/p316/p316_147.wav|85|And frankly, it's been a while.
83 | DUMMY2/p265/p265_047.wav|73|It is unique.
84 | DUMMY2/p336/p336_353.wav|98|Sometimes you get them, sometimes you don't.
85 | DUMMY2/p230/p230_376.wav|35|This hasn't happened in a vacuum.
86 | DUMMY2/p308/p308_209.wav|107|There is great potential on this river.
87 | DUMMY2/p250/p250_442.wav|24|We have not yet received a letter from the Irish.
88 | DUMMY2/p260/p260_037.wav|81|It's a fact.
89 | DUMMY2/p299/p299_345.wav|58|We're very excited and challenged by the project.
90 | DUMMY2/p269/p269_218.wav|94|A Grampian Police spokesman said.
91 | DUMMY2/p306/p306_014.wav|12|To the Hebrews it was a token that there would be no more universal floods.
92 | DUMMY2/p271/p271_292.wav|27|It's a record label, not a form of music.
93 | DUMMY2/p247/p247_225.wav|14|I am considered a teenager.)
94 | DUMMY2/p294/p294_094.wav|104|It should be a condition of employment.
95 | DUMMY2/p269/p269_031.wav|94|Is this accurate?
96 | DUMMY2/p275/p275_116.wav|40|It's not fair.
97 | DUMMY2/p265/p265_006.wav|73|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
98 | DUMMY2/p285/p285_072.wav|2|Mr Irvine said Mr Rafferty was now in good spirits.
99 | DUMMY2/p270/p270_167.wav|8|We did what we had to do.
100 | DUMMY2/p360/p360_397.wav|60|It is a relief.
101 |
--------------------------------------------------------------------------------
/filelists/vctk_audio_sid_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|ɪt hɐd hˈæpənd tə hˌɪm.
2 | DUMMY2/p280/p280_148.wav|52|ɪt ɪz ˈoʊpən sˈiːzən ɑːnðɪ ˈoʊld fˈɜːm.
3 | DUMMY2/p231/p231_320.wav|50|haʊˈɛvɚ, hiː ɪz ɐ kˈoʊtʃ, ænd hiː ɹɪmˈeɪnz ɐ kˈoʊtʃ æt hˈɑːɹt.
4 | DUMMY2/p282/p282_129.wav|83|ɪt ɪz nˌɑːɾə jˈuːtˈɜːn.
5 | DUMMY2/p254/p254_015.wav|41|ðə ɡɹˈiːks jˈuːzd tʊ ɪmˈædʒɪn ðˌɐɾɪt wʌzɐ sˈaɪn fɹʌmðə ɡˈɑːdz tə foːɹtˈɛl wˈɔːɹ ɔːɹ hˈɛvi ɹˈeɪn.
6 | DUMMY2/p228/p228_285.wav|57|ðə sˈɔŋz ɑːɹ dʒˈʌst sˌoʊ ɡˈʊd.
7 | DUMMY2/p334/p334_307.wav|38|ɪf ðeɪ dˈoʊnt, ðeɪ kæn ɛkspˈɛkt ðɛɹ fˈʌndɪŋ təbi kˈʌt.
8 | DUMMY2/p287/p287_081.wav|77|aɪv nˈɛvɚ sˈiːn ˈɛnɪθˌɪŋ lˈaɪk ɪt.
9 | DUMMY2/p247/p247_083.wav|14|ɪt ɪz ɐ dʒˈɑːb kɹiːˈeɪʃən skˈiːm.
10 | DUMMY2/p264/p264_051.wav|65|wiː wɜː lˈiːdɪŋ baɪ tˈuː ɡˈoʊlz.
11 | DUMMY2/p335/p335_058.wav|49|lˈɛts sˈiː ðæt ˈɪnkɹiːs ˌoʊvɚ ðə jˈɪɹz.
12 | DUMMY2/p236/p236_225.wav|75|ðɛɹ ɪz nˈoʊ kwˈɪk fˈɪks.
13 | DUMMY2/p374/p374_353.wav|11|ænd ðæt bɹˈɪŋz ˌʌs tə ðə pˈɔɪnt.
14 | DUMMY2/p272/p272_076.wav|69|sˈaʊndz lˈaɪk ðə sˈɪksθ sˈɛns?
15 | DUMMY2/p271/p271_152.wav|27|ðə pətˈɪʃən wʌz fˈɔːɹməli pɹɪzˈɛntᵻd æt dˈaʊnɪŋ stɹˈiːt jˈɛstɚdˌeɪ.
16 | DUMMY2/p228/p228_127.wav|57|ðeɪv ɡɑːt tʊ ɐkˈaʊnt fɔːɹ ɪt.
17 | DUMMY2/p276/p276_223.wav|106|ɪts bˌɪn ɐ hˈʌmblɪŋ jˈɪɹ.
18 | DUMMY2/p262/p262_248.wav|45|ðə pɹˈɑːdʒɛkt hɐz ɔːlɹˌɛdi sɪkjˈʊɹd ðə səpˈoːɹt ʌv sˌɜː ʃˈɔːn kɑːnɚɹi.
19 | DUMMY2/p314/p314_086.wav|51|ðə tˈiːm ðɪs jˈɪɹ ɪz ɡˌoʊɪŋ plˈeɪsᵻz.
20 | DUMMY2/p225/p225_038.wav|101|dˈaɪvɪŋ ɪz nˈoʊ pˈɑːɹt ʌv fˈʊtbɔːl.
21 | DUMMY2/p279/p279_088.wav|25|ðə ʃˈɛɹhoʊldɚz wɪl vˈoʊt tə wˈaɪnd ˈʌp ðə kˈʌmpəni ˌɑːn fɹˈaɪdeɪ mˈɔːɹnɪŋ.
22 | DUMMY2/p272/p272_018.wav|69|ˈæɹɪstˌɑːɾəl θˈɔːt ðætðə ɹˈeɪnboʊ wʌz kˈɔːzd baɪ ɹɪflˈɛkʃən ʌvðə sˈʌnz ɹˈeɪz baɪ ðə ɹˈeɪn.
23 | DUMMY2/p256/p256_098.wav|90|ʃiː tˈoʊld ðə hˈɛɹəld.
24 | DUMMY2/p261/p261_218.wav|100|ˈɔːl wɪl biː ɹɪvˈiːld ɪn dˈuː kˈoːɹs.
25 | DUMMY2/p265/p265_063.wav|73|ɪt ʃˌʊdənt kˈʌm æz ɐ sɚpɹˈaɪz, bˌʌt ɪt dˈʌz.
26 | DUMMY2/p314/p314_042.wav|51|ɪt ɪz ˈɔːl ɐbˌaʊt pˈiːpəl bˌiːɪŋ ɐsˈɑːltᵻd, ɐbjˈuːsd.
27 | DUMMY2/p241/p241_188.wav|86|ˈaɪ wˈɪʃ ˈaɪ kʊd sˈeɪ sˈʌmθɪŋ.
28 | DUMMY2/p283/p283_111.wav|95|ɪts ɡˈʊd tə hæv ɐ vˈɔɪs.
29 | DUMMY2/p275/p275_006.wav|40|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
30 | DUMMY2/p228/p228_092.wav|57|tədˈeɪ ˈaɪ kˌʊdənt ɹˈʌn ˈɑːn ɪt.
31 | DUMMY2/p295/p295_343.wav|92|ðɪ ˈætməsfˌɪɹ ɪz bˈɪznəslˌaɪk.
32 | DUMMY2/p228/p228_187.wav|57|ðeɪ wɪl ɹˈʌn ɐ mˈaɪl.
33 | DUMMY2/p294/p294_317.wav|104|ɪt dˈɪdnt pˌʊt mˌiː ˈɔf.
34 | DUMMY2/p231/p231_445.wav|50|ɪt sˈaʊndᵻd lˈaɪk ɐ bˈɑːm.
35 | DUMMY2/p272/p272_086.wav|69|tədˈeɪ ʃiː hɐzbɪn ɹɪlˈiːsd.
36 | DUMMY2/p255/p255_210.wav|31|ɪt wʌz wˈɜːθ ɐ fˈoʊɾəɡɹˌæf.
37 | DUMMY2/p229/p229_060.wav|67|ænd ɐ fˈɪlm mˈeɪkɚ wʌz bˈɔːɹn.
38 | DUMMY2/p260/p260_232.wav|81|ðə hˈoʊm ˈɑːfɪs wʊd nˌɑːt ɹɪlˈiːs ˌɛni fˈɜːðɚ diːtˈeɪlz ɐbˌaʊt ðə ɡɹˈuːp.
39 | DUMMY2/p245/p245_025.wav|59|dʒˈɑːnsən wʌz pɹˈɪɾi lˈoʊ.
40 | DUMMY2/p333/p333_185.wav|64|ðɪs ˈɛɹiə ɪz pˈɜːfɛkt fɔːɹ tʃˈɪldɹən.
41 | DUMMY2/p244/p244_242.wav|78|hiː ɪz ɐ mˈæn ʌvðə pˈiːpəl.
42 | DUMMY2/p376/p376_187.wav|71|"ɪt ɪz ɐ tˈɛɹəbəl lˈɔs."
43 | DUMMY2/p239/p239_156.wav|48|ɪt ɪz ɐ ɡˈʊd lˈaɪfstaɪl.
44 | DUMMY2/p307/p307_037.wav|22|hiː ɹɪlˈiːsd ɐ hˈæfdˈʌzən sˈoʊloʊ ˈælbəmz.
45 | DUMMY2/p305/p305_185.wav|54|ˈaɪ æm nˌɑːt ˈiːvən θˈɪŋkɪŋ ɐbˌaʊt ðˈæt.
46 | DUMMY2/p272/p272_081.wav|69|ɪt wʌz mˈædʒɪk.
47 | DUMMY2/p302/p302_297.wav|30|aɪm tɹˈaɪɪŋ tə stˈeɪ ˈoʊpən ˌɑːn ðˈæt.
48 | DUMMY2/p275/p275_320.wav|40|wiː ɑːɹ ɪnðɪ ˈɛnd ɡˈeɪm.
49 | DUMMY2/p239/p239_231.wav|48|ðˈɛn wiː wɪl fˈeɪs ðə dˈeɪnɪʃ tʃˈæmpiənz.
50 | DUMMY2/p268/p268_301.wav|87|ɪt wʌz ˈoʊnli lˈeɪɾɚ ðætðə kəndˈɪʃən wʌz dˌaɪəɡnˈoʊzd.
51 | DUMMY2/p336/p336_088.wav|98|ðeɪ fˈeɪld tə ɹˈiːtʃ ɐɡɹˈiːmənt jˈɛstɚdˌeɪ.
52 | DUMMY2/p278/p278_255.wav|10|ðeɪ mˌeɪd sˈʌtʃ dᵻsˈɪʒənz ɪn lˈʌndən.
53 | DUMMY2/p361/p361_132.wav|79|ðæt ɡɑːt mˌiː ˈaʊt.
54 | DUMMY2/p307/p307_146.wav|22|juː hˈoʊp hiː pɹɪvˈeɪlz.
55 | DUMMY2/p244/p244_147.wav|78|ðeɪ kʊd nˌɑːt ɪɡnˈoːɹ ðə wɪl ʌv pˈɑːɹləmənt, hiː klˈeɪmd.
56 | DUMMY2/p294/p294_283.wav|104|ðɪs ɪz ˌaʊɚɹ ʌnfˈɪnɪʃt bˈɪznəs.
57 | DUMMY2/p283/p283_300.wav|95|ˈaɪ wʊdhɐv ðə hˈæmɚɹ ɪnðə kɹˈaʊd.
58 | DUMMY2/p239/p239_079.wav|48|ˈaɪ kæn ˌʌndɚstˈænd ðə fɹʌstɹˈeɪʃənz ʌv ˌaʊɚ fˈænz.
59 | DUMMY2/p264/p264_009.wav|65|ðɛɹˈɪz , ɐkˈoːɹdɪŋ tə lˈɛdʒənd, ɐ bˈɔɪlɪŋ pˈɑːt ʌv ɡˈoʊld æt wˈʌn ˈɛnd.
60 | DUMMY2/p307/p307_348.wav|22|hiː dɪdnˌɑːt əpˈoʊz ðə dɪvˈoːɹs.
61 | DUMMY2/p304/p304_308.wav|72|wiː ɑːɹ ðə ɡˈeɪtweɪ tə dʒˈʌstɪs.
62 | DUMMY2/p281/p281_056.wav|36|nˈʌn hɐz ˈɛvɚ bˌɪn fˈaʊnd.
63 | DUMMY2/p267/p267_158.wav|0|wiː wɜː ɡˈɪvən ɐ wˈɔːɹm ænd fɹˈɛndli ɹɪsˈɛpʃən.
64 | DUMMY2/p300/p300_169.wav|102|hˌuː dˈuː ðiːz pˈiːpəl θˈɪŋk ðeɪ ɑːɹ?
65 | DUMMY2/p276/p276_177.wav|106|ðeɪ ɛɡzˈɪst ɪn nˈeɪm ɐlˈoʊn.
66 | DUMMY2/p228/p228_245.wav|57|ɪt ɪz ɐ pˈɑːlɪsi wˌɪtʃ hɐz ðə fˈʊl səpˈoːɹt ʌvðə mˈɪnɪstɚ.
67 | DUMMY2/p300/p300_303.wav|102|aɪm wˈʌndɚɹɪŋ wˌʌt juː fˈiːl ɐbˌaʊt ðə jˈʌŋɡəst.
68 | DUMMY2/p362/p362_247.wav|15|ðɪs wʊd ɡˈɪv skˈɑːtlənd ɐɹˈaʊnd ˈeɪt mˈɛmbɚz.
69 | DUMMY2/p326/p326_031.wav|28|juːnˈaɪɾᵻd wɜːɹ ɪn kəntɹˈoʊl wɪðˌaʊt ˈɔːlweɪz bˌiːɪŋ dˈɑːmɪnənt.
70 | DUMMY2/p361/p361_288.wav|79|ˈaɪ dɪdnˌɑːt θˈɪŋk ɪt wʌz vˈɛɹi pɹˈɑːpɚ.
71 | DUMMY2/p286/p286_145.wav|63|tˈaɪɡɚɹ ɪz nˌɑːt ðə nˈɔːɹm.
72 | DUMMY2/p234/p234_071.wav|3|ʃiː dˈɪd ðæt fɚðə ɹˈɛst ʌv hɜː lˈaɪf.
73 | DUMMY2/p263/p263_296.wav|39|ðə dᵻsˈɪʒən wʌz ɐnˈaʊnst æt ɪts ˈænjuːəl kˈɑːnfɹəns ɪn dˈʌnfɚmlˌaɪn.
74 | DUMMY2/p323/p323_228.wav|34|ʃiː bɪkˌeɪm ɐ hˈɛɹoʊˌɪn ʌv maɪ tʃˈaɪldhʊd.
75 | DUMMY2/p280/p280_346.wav|52|ɪt wʌzɐ bˈɪt lˈaɪk hˌævɪŋ tʃˈɪldɹən.
76 | DUMMY2/p333/p333_080.wav|64|bˌʌt ðə tɹˈædʒədi dɪdnˌɑːt stˈɑːp ðˈɛɹ.
77 | DUMMY2/p226/p226_268.wav|43|ðæt dᵻsˈɪʒən ɪz fɚðə bɹˈɪɾɪʃ pˈɑːɹləmənt ænd pˈiːpəl.
78 | DUMMY2/p362/p362_314.wav|15|ɪz ðæt ɹˈaɪt?
79 | DUMMY2/p240/p240_047.wav|93|ɪt ɪz sˌoʊ sˈæd.
80 | DUMMY2/p250/p250_207.wav|24|juː kʊd fˈiːl ðə hˈiːt.
81 | DUMMY2/p273/p273_176.wav|56|nˈiːðɚ sˈaɪd wʊd ɹɪvˈiːl ðə diːtˈeɪlz ʌvðɪ ˈɑːfɚ.
82 | DUMMY2/p316/p316_147.wav|85|ænd fɹˈæŋkli, ɪts bˌɪn ɐ wˈaɪl.
83 | DUMMY2/p265/p265_047.wav|73|ɪt ɪz juːnˈiːk.
84 | DUMMY2/p336/p336_353.wav|98|sˈʌmtaɪmz juː ɡˈɛt ðˌɛm, sˈʌmtaɪmz juː dˈoʊnt.
85 | DUMMY2/p230/p230_376.wav|35|ðɪs hˈæzənt hˈæpənd ɪn ɐ vˈækjuːm.
86 | DUMMY2/p308/p308_209.wav|107|ðɛɹ ɪz ɡɹˈeɪt pətˈɛnʃəl ˌɑːn ðɪs ɹˈɪvɚ.
87 | DUMMY2/p250/p250_442.wav|24|wiː hɐvnˌɑːt jˈɛt ɹɪsˈiːvd ɐ lˈɛɾɚ fɹʌmðɪ ˈaɪɹɪʃ.
88 | DUMMY2/p260/p260_037.wav|81|ɪts ɐ fˈækt.
89 | DUMMY2/p299/p299_345.wav|58|wɪɹ vˈɛɹi ɛksˈaɪɾᵻd ænd tʃˈælɪndʒd baɪ ðə pɹˈɑːdʒɛkt.
90 | DUMMY2/p269/p269_218.wav|94|ɐ ɡɹˈæmpiən pəlˈiːs spˈoʊksmən sˈɛd.
91 | DUMMY2/p306/p306_014.wav|12|tə ðə hˈiːbɹuːz ɪt wʌzɐ tˈoʊkən ðæt ðɛɹ wʊd biː nˈoʊmˌoːɹ jˌuːnɪvˈɜːsəl flˈʌdz.
92 | DUMMY2/p271/p271_292.wav|27|ɪts ɐ ɹˈɛkɚd lˈeɪbəl, nˌɑːɾə fˈɔːɹm ʌv mjˈuːzɪk.
93 | DUMMY2/p247/p247_225.wav|14|ˈaɪ æm kənsˈɪdɚd ɐ tˈiːneɪdʒɚ.
94 | DUMMY2/p294/p294_094.wav|104|ɪt ʃˌʊd biː ɐ kəndˈɪʃən ʌv ɛmplˈɔɪmənt.
95 | DUMMY2/p269/p269_031.wav|94|ɪz ðɪs ˈækjʊɹət?
96 | DUMMY2/p275/p275_116.wav|40|ɪts nˌɑːt fˈɛɹ.
97 | DUMMY2/p265/p265_006.wav|73|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
98 | DUMMY2/p285/p285_072.wav|2|mˈɪstɚɹ ˈɜːvaɪn sˈɛd mˈɪstɚ ɹˈæfɚɾi wʌz nˈaʊ ɪn ɡˈʊd spˈɪɹɪts.
99 | DUMMY2/p270/p270_167.wav|8|wiː dˈɪd wˌʌt wiː hædtə dˈuː.
100 | DUMMY2/p360/p360_397.wav|60|ɪt ɪz ɐ ɹɪlˈiːf.
101 |
--------------------------------------------------------------------------------
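Both validation filelists use the same pipe-separated layout, `path|speaker_id|text`, with DUMMY2 standing in for the real VCTK audio root. They are parsed by `load_filepaths_and_text` from utils.py (shown next in this dump); a minimal usage sketch, assuming it is run from the repository root:

from utils import load_filepaths_and_text

rows = load_filepaths_and_text('filelists/vctk_audio_sid_text_val_filelist.txt.cleaned')
path, sid, text = rows[0]
print(path, int(sid), text)  # DUMMY2/p364/p364_240.wav 88 ɪt hɐd hˈæpənd tə hˌɪm.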
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import sys
4 | import argparse
5 | import logging
6 | import json
7 | import subprocess
8 | import numpy as np
9 | from scipy.io.wavfile import read
10 | import torch
11 |
12 | MATPLOTLIB_FLAG = False
13 |
14 | logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
15 | logger = logging
16 |
17 |
18 | def load_checkpoint(checkpoint_path, model, optimizer=None):
19 | assert os.path.isfile(checkpoint_path)
20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
21 | iteration = checkpoint_dict['iteration']
22 | learning_rate = checkpoint_dict['learning_rate']
23 | if optimizer is not None:
24 | optimizer.load_state_dict(checkpoint_dict['optimizer'])
25 | saved_state_dict = checkpoint_dict['model']
26 | if hasattr(model, 'module'):
27 | state_dict = model.module.state_dict()
28 | else:
29 | state_dict = model.state_dict()
30 | new_state_dict = {}
31 | for k, v in state_dict.items():
32 | try:
33 | new_state_dict[k] = saved_state_dict[k]
34 | except KeyError:
35 | logger.info("%s is not in the checkpoint" % k)
36 | new_state_dict[k] = v
37 | if hasattr(model, 'module'):
38 | model.module.load_state_dict(new_state_dict)
39 | else:
40 | model.load_state_dict(new_state_dict)
41 | logger.info("Loaded checkpoint '{}' (iteration {})" .format(
42 | checkpoint_path, iteration))
43 | return model, optimizer, learning_rate, iteration
44 |
45 |
46 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
47 | logger.info("Saving model and optimizer state at iteration {} to {}".format(
48 | iteration, checkpoint_path))
49 | if hasattr(model, 'module'):
50 | state_dict = model.module.state_dict()
51 | else:
52 | state_dict = model.state_dict()
53 | torch.save({'model': state_dict,
54 | 'iteration': iteration,
55 | 'optimizer': optimizer.state_dict(),
56 | 'learning_rate': learning_rate}, checkpoint_path)
57 |
58 |
59 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
60 | for k, v in scalars.items():
61 | writer.add_scalar(k, v, global_step)
62 | for k, v in histograms.items():
63 | writer.add_histogram(k, v, global_step)
64 | for k, v in images.items():
65 | writer.add_image(k, v, global_step, dataformats='HWC')
66 | for k, v in audios.items():
67 | writer.add_audio(k, v, global_step, audio_sampling_rate)
68 |
69 |
70 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
71 | f_list = glob.glob(os.path.join(dir_path, regex))
72 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
73 | x = f_list[-1]
74 | print(x)
75 | return x
76 |
77 |
78 | def plot_spectrogram_to_numpy(spectrogram):
79 | global MATPLOTLIB_FLAG
80 | if not MATPLOTLIB_FLAG:
81 | import matplotlib
82 | matplotlib.use("Agg")
83 | MATPLOTLIB_FLAG = True
84 | mpl_logger = logging.getLogger('matplotlib')
85 | mpl_logger.setLevel(logging.WARNING)
86 | import matplotlib.pylab as plt
87 | import numpy as np
88 |
89 | fig, ax = plt.subplots(figsize=(10,2))
90 | im = ax.imshow(spectrogram, aspect="auto", origin="lower",
91 | interpolation='none')
92 | plt.colorbar(im, ax=ax)
93 | plt.xlabel("Frames")
94 | plt.ylabel("Channels")
95 | plt.tight_layout()
96 |
97 | fig.canvas.draw()
98 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
99 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
100 | plt.close()
101 | return data
102 |
103 |
104 | def plot_alignment_to_numpy(alignment, info=None):
105 | global MATPLOTLIB_FLAG
106 | if not MATPLOTLIB_FLAG:
107 | import matplotlib
108 | matplotlib.use("Agg")
109 | MATPLOTLIB_FLAG = True
110 | mpl_logger = logging.getLogger('matplotlib')
111 | mpl_logger.setLevel(logging.WARNING)
112 | import matplotlib.pylab as plt
113 | import numpy as np
114 |
115 | fig, ax = plt.subplots(figsize=(6, 4))
116 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
117 | interpolation='none')
118 | fig.colorbar(im, ax=ax)
119 | xlabel = 'Decoder timestep'
120 | if info is not None:
121 | xlabel += '\n\n' + info
122 | plt.xlabel(xlabel)
123 | plt.ylabel('Encoder timestep')
124 | plt.tight_layout()
125 |
126 | fig.canvas.draw()
127 | data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
128 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
129 | plt.close()
130 | return data
131 |
132 |
133 | def load_wav_to_torch(full_path):
134 | sampling_rate, data = read(full_path)
135 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate
136 |
137 |
138 | def load_filepaths_and_text(filename, split="|"):
139 | with open(filename, encoding='utf-8') as f:
140 | filepaths_and_text = [line.strip().split(split) for line in f]
141 | return filepaths_and_text
142 |
143 |
144 | def get_hparams(init=True):
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
147 | help='JSON file for configuration')
148 | parser.add_argument('-m', '--model', type=str, required=True,
149 | help='Model name')
150 |
151 | args = parser.parse_args()
152 | model_dir = os.path.join("./logs", args.model)
153 |
154 | if not os.path.exists(model_dir):
155 | os.makedirs(model_dir)
156 |
157 | config_path = args.config
158 | config_save_path = os.path.join(model_dir, "config.json")
159 | if init:
160 | with open(config_path, "r") as f:
161 | data = f.read()
162 | with open(config_save_path, "w") as f:
163 | f.write(data)
164 | else:
165 | with open(config_save_path, "r") as f:
166 | data = f.read()
167 | config = json.loads(data)
168 |
169 | hparams = HParams(**config)
170 | hparams.model_dir = model_dir
171 | return hparams
172 |
173 |
174 | def get_hparams_from_dir(model_dir):
175 | config_save_path = os.path.join(model_dir, "config.json")
176 | with open(config_save_path, "r") as f:
177 | data = f.read()
178 | config = json.loads(data)
179 |
180 | hparams = HParams(**config)
181 | hparams.model_dir = model_dir
182 | return hparams
183 |
184 |
185 | def get_hparams_from_file(config_path):
186 | with open(config_path, "r") as f:
187 | data = f.read()
188 | config = json.loads(data)
189 |
190 | hparams = HParams(**config)
191 | return hparams
192 |
193 |
194 | def check_git_hash(model_dir):
195 | source_dir = os.path.dirname(os.path.realpath(__file__))
196 | if not os.path.exists(os.path.join(source_dir, ".git")):
197 | logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
198 | source_dir
199 | ))
200 | return
201 |
202 | cur_hash = subprocess.getoutput("git rev-parse HEAD")
203 |
204 | path = os.path.join(model_dir, "githash")
205 | if os.path.exists(path):
206 | saved_hash = open(path).read()
207 | if saved_hash != cur_hash:
208 | logger.warning("git hash values are different. {}(saved) != {}(current)".format(
209 | saved_hash[:8], cur_hash[:8]))
210 | else:
211 | open(path, "w").write(cur_hash)
212 |
213 |
214 | def get_logger(model_dir, filename="train.log"):
215 | global logger
216 | logger = logging.getLogger(os.path.basename(model_dir))
217 | logger.setLevel(logging.DEBUG)
218 |
219 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
220 | if not os.path.exists(model_dir):
221 | os.makedirs(model_dir)
222 | h = logging.FileHandler(os.path.join(model_dir, filename))
223 | h.setLevel(logging.DEBUG)
224 | h.setFormatter(formatter)
225 | logger.addHandler(h)
226 | return logger
227 |
228 |
229 | class HParams():
230 | def __init__(self, **kwargs):
231 | for k, v in kwargs.items():
232 | if isinstance(v, dict):
233 | v = HParams(**v)
234 | self[k] = v
235 |
236 | def keys(self):
237 | return self.__dict__.keys()
238 |
239 | def items(self):
240 | return self.__dict__.items()
241 |
242 | def values(self):
243 | return self.__dict__.values()
244 |
245 | def __len__(self):
246 | return len(self.__dict__)
247 |
248 | def __getitem__(self, key):
249 | return getattr(self, key)
250 |
251 | def __setitem__(self, key, value):
252 | return setattr(self, key, value)
253 |
254 | def __contains__(self, key):
255 | return key in self.__dict__
256 |
257 | def __repr__(self):
258 | return self.__dict__.__repr__()
259 |
--------------------------------------------------------------------------------
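A short sketch of the HParams plumbing above: nested config dicts become attribute-accessible objects while keeping the dict protocol. The config path is one that exists in this repository; the `train.batch_size` key is assumed to follow the usual VITS config layout.

from utils import get_hparams_from_file

hps = get_hparams_from_file('./configs/ljs_mb_istft_vits.json')
print(hps.train.batch_size)       # nested dicts become nested HParams
print('model' in hps, len(hps))   # __contains__ and __len__ still behave like a dict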
/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 | inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 | outside_interval_mask = ~inside_interval_mask
67 |
68 | outputs = torch.zeros_like(inputs)
69 | logabsdet = torch.zeros_like(inputs)
70 |
71 | if tails == 'linear':
72 | unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 | constant = np.log(np.exp(1 - min_derivative) - 1)
74 | unnormalized_derivatives[..., 0] = constant
75 | unnormalized_derivatives[..., -1] = constant
76 |
77 | outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 | logabsdet[outside_interval_mask] = 0
79 | else:
80 | raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 | outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 | inputs=inputs[inside_interval_mask],
84 | unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 | unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 | unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 | inverse=inverse,
88 | left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 | min_bin_width=min_bin_width,
90 | min_bin_height=min_bin_height,
91 | min_derivative=min_derivative
92 | )
93 |
94 | return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 | raise ValueError('Minimal bin width too large for the number of bins')
112 | if min_bin_height * num_bins > 1.0:
113 | raise ValueError('Minimal bin height too large for the number of bins')
114 |
115 | widths = F.softmax(unnormalized_widths, dim=-1)
116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
117 | cumwidths = torch.cumsum(widths, dim=-1)
118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
119 | cumwidths = (right - left) * cumwidths + left
120 | cumwidths[..., 0] = left
121 | cumwidths[..., -1] = right
122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
123 |
124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
125 |
126 | heights = F.softmax(unnormalized_heights, dim=-1)
127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
128 | cumheights = torch.cumsum(heights, dim=-1)
129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
130 | cumheights = (top - bottom) * cumheights + bottom
131 | cumheights[..., 0] = bottom
132 | cumheights[..., -1] = top
133 | heights = cumheights[..., 1:] - cumheights[..., :-1]
134 |
135 | if inverse:
136 | bin_idx = searchsorted(cumheights, inputs)[..., None]
137 | else:
138 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
139 |
140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
142 |
143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
144 | delta = heights / widths
145 | input_delta = delta.gather(-1, bin_idx)[..., 0]
146 |
147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
149 |
150 | input_heights = heights.gather(-1, bin_idx)[..., 0]
151 |
152 | if inverse:
153 | a = (((inputs - input_cumheights) * (input_derivatives
154 | + input_derivatives_plus_one
155 | - 2 * input_delta)
156 | + input_heights * (input_delta - input_derivatives)))
157 | b = (input_heights * input_derivatives
158 | - (inputs - input_cumheights) * (input_derivatives
159 | + input_derivatives_plus_one
160 | - 2 * input_delta))
161 | c = - input_delta * (inputs - input_cumheights)
162 |
163 | discriminant = b.pow(2) - 4 * a * c
164 | assert (discriminant >= 0).all()
165 |
166 | root = (2 * c) / (-b - torch.sqrt(discriminant))
167 | outputs = root * input_bin_widths + input_cumwidths
168 |
169 | theta_one_minus_theta = root * (1 - root)
170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
171 | * theta_one_minus_theta)
172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
173 | + 2 * input_delta * theta_one_minus_theta
174 | + input_derivatives * (1 - root).pow(2))
175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
176 |
177 | return outputs, -logabsdet
178 | else:
179 | theta = (inputs - input_cumwidths) / input_bin_widths
180 | theta_one_minus_theta = theta * (1 - theta)
181 |
182 | numerator = input_heights * (input_delta * theta.pow(2)
183 | + input_derivatives * theta_one_minus_theta)
184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
185 | * theta_one_minus_theta)
186 | outputs = input_cumheights + numerator / denominator
187 |
188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
189 | + 2 * input_delta * theta_one_minus_theta
190 | + input_derivatives * (1 - theta).pow(2))
191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
--------------------------------------------------------------------------------
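The spline above is analytically invertible, which is what makes it usable as a flow layer. The round-trip check below is a sketch: shapes follow the VITS convention where, with linear tails, the derivative tensor has one entry per interior knot (num_bins - 1).

import torch
from transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
num_bins = 8
x = torch.rand(4, 10) * 2 - 1            # inputs inside (-tail_bound, tail_bound)
w = torch.randn(4, 10, num_bins)         # unnormalized bin widths
h = torch.randn(4, 10, num_bins)         # unnormalized bin heights
d = torch.randn(4, 10, num_bins - 1)     # one derivative per interior knot

y, _ = piecewise_rational_quadratic_transform(
    x, w, h, d, inverse=False, tails='linear', tail_bound=1.0)
x_rec, _ = piecewise_rational_quadratic_transform(
    y, w, h, d, inverse=True, tails='linear', tail_bound=1.0)
print(torch.allclose(x, x_rec, atol=1e-4))  # True: forward/inverse round-trip, up to float error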
/stft.py:
--------------------------------------------------------------------------------
1 | """
2 | BSD 3-Clause License
3 | Copyright (c) 2017, Prem Seetharaman
4 | All rights reserved.
5 | * Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 | * Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 | * Neither the name of the copyright holder nor the names of its
13 | contributors may be used to endorse or promote products derived from this
14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import torch
28 | import numpy as np
29 | import torch.nn.functional as F
30 | from torch.autograd import Variable
31 | from scipy.signal import get_window
32 | from librosa.util import pad_center, tiny
33 | import librosa.util as librosa_util
34 |
35 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
36 | n_fft=800, dtype=np.float32, norm=None):
37 | """
38 | # from librosa 0.6
39 | Compute the sum-square envelope of a window function at a given hop length.
40 | This is used to estimate modulation effects induced by windowing
41 | observations in short-time Fourier transforms.
42 | Parameters
43 | ----------
44 | window : string, tuple, number, callable, or list-like
45 | Window specification, as in `get_window`
46 | n_frames : int > 0
47 | The number of analysis frames
48 | hop_length : int > 0
49 | The number of samples to advance between frames
50 | win_length : [optional]
51 | The length of the window function. By default, this matches `n_fft`.
52 | n_fft : int > 0
53 | The length of each analysis frame.
54 | dtype : np.dtype
55 | The data type of the output
56 | Returns
57 | -------
58 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
59 | The sum-squared envelope of the window function
60 | """
61 | if win_length is None:
62 | win_length = n_fft
63 |
64 | n = n_fft + hop_length * (n_frames - 1)
65 | x = np.zeros(n, dtype=dtype)
66 |
67 | # Compute the squared window at the desired length
68 | win_sq = get_window(window, win_length, fftbins=True)
69 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2
70 | win_sq = librosa_util.pad_center(win_sq, n_fft)
71 |
72 | # Fill the envelope
73 | for i in range(n_frames):
74 | sample = i * hop_length
75 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
76 | return x
77 |
78 |
79 | class STFT(torch.nn.Module):
80 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
81 | def __init__(self, filter_length=800, hop_length=200, win_length=800,
82 | window='hann'):
83 | super(STFT, self).__init__()
84 | self.filter_length = filter_length
85 | self.hop_length = hop_length
86 | self.win_length = win_length
87 | self.window = window
88 | self.forward_transform = None
89 | scale = self.filter_length / self.hop_length
90 | fourier_basis = np.fft.fft(np.eye(self.filter_length))
91 |
92 | cutoff = int((self.filter_length / 2 + 1))
93 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
94 | np.imag(fourier_basis[:cutoff, :])])
95 |
96 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
97 | inverse_basis = torch.FloatTensor(
98 | np.linalg.pinv(scale * fourier_basis).T[:, None, :])
99 |
100 | if window is not None:
101 | assert(filter_length >= win_length)
102 | # get window and zero center pad it to filter_length
103 | fft_window = get_window(window, win_length, fftbins=True)
104 | fft_window = pad_center(fft_window, filter_length)
105 | fft_window = torch.from_numpy(fft_window).float()
106 |
107 | # window the bases
108 | forward_basis *= fft_window
109 | inverse_basis *= fft_window
110 |
111 | self.register_buffer('forward_basis', forward_basis.float())
112 | self.register_buffer('inverse_basis', inverse_basis.float())
113 |
114 | def transform(self, input_data):
115 | num_batches = input_data.size(0)
116 | num_samples = input_data.size(1)
117 |
118 | self.num_samples = num_samples
119 |
120 | # similar to librosa, reflect-pad the input
121 | input_data = input_data.view(num_batches, 1, num_samples)
122 | input_data = F.pad(
123 | input_data.unsqueeze(1),
124 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
125 | mode='reflect')
126 | input_data = input_data.squeeze(1)
127 |
128 | forward_transform = F.conv1d(
129 | input_data,
130 | Variable(self.forward_basis, requires_grad=False),
131 | stride=self.hop_length,
132 | padding=0)
133 |
134 | cutoff = int((self.filter_length / 2) + 1)
135 | real_part = forward_transform[:, :cutoff, :]
136 | imag_part = forward_transform[:, cutoff:, :]
137 |
138 | magnitude = torch.sqrt(real_part**2 + imag_part**2)
139 | phase = torch.autograd.Variable(
140 | torch.atan2(imag_part.data, real_part.data))
141 |
142 | return magnitude, phase
143 |
144 | def inverse(self, magnitude, phase):
145 | recombine_magnitude_phase = torch.cat(
146 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
147 |
148 | inverse_transform = F.conv_transpose1d(
149 | recombine_magnitude_phase,
150 | Variable(self.inverse_basis, requires_grad=False),
151 | stride=self.hop_length,
152 | padding=0)
153 |
154 | if self.window is not None:
155 | window_sum = window_sumsquare(
156 | self.window, magnitude.size(-1), hop_length=self.hop_length,
157 | win_length=self.win_length, n_fft=self.filter_length,
158 | dtype=np.float32)
159 | # remove modulation effects
160 | approx_nonzero_indices = torch.from_numpy(
161 | np.where(window_sum > tiny(window_sum))[0])
162 | window_sum = torch.autograd.Variable(
163 | torch.from_numpy(window_sum), requires_grad=False)
164 | window_sum = window_sum.to(inverse_transform.device) if magnitude.is_cuda else window_sum
165 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
166 |
167 | # scale by hop ratio
168 | inverse_transform *= float(self.filter_length) / self.hop_length
169 |
170 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
171 | inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
172 |
173 | return inverse_transform
174 |
175 | def forward(self, input_data):
176 | self.magnitude, self.phase = self.transform(input_data)
177 | reconstruction = self.inverse(self.magnitude, self.phase)
178 | return reconstruction
179 |
180 |
181 | class TorchSTFT(torch.nn.Module):
182 | def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
183 | super().__init__()
184 | self.filter_length = filter_length
185 | self.hop_length = hop_length
186 | self.win_length = win_length
187 | self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
188 |
189 | def transform(self, input_data):
190 | forward_transform = torch.stft(
191 | input_data,
192 | self.filter_length, self.hop_length, self.win_length, window=self.window,
193 | return_complex=True)
194 |
195 | return torch.abs(forward_transform), torch.angle(forward_transform)
196 |
197 | def inverse(self, magnitude, phase):
198 | inverse_transform = torch.istft(
199 | magnitude * torch.exp(phase * 1j),
200 | self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
201 |
202 | return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
203 |
204 | def forward(self, input_data):
205 | self.magnitude, self.phase = self.transform(input_data)
206 | reconstruction = self.inverse(self.magnitude, self.phase)
207 | return reconstruction
208 |
209 |
210 |
--------------------------------------------------------------------------------
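A smoke test for the torch-native STFT wrapper above. Note that `return_complex=True` requires a newer PyTorch than the 1.6.0 pinned in requirements.txt (complex STFT support landed in 1.7), so treat this as a sketch for a current install.

import torch
from stft import TorchSTFT

stft = TorchSTFT()              # defaults: filter_length=800, hop_length=200
audio = torch.randn(2, 8000)    # [batch, samples]
mag, phase = stft.transform(audio)
recon = stft(audio)             # transform followed by inverse
print(mag.shape)                # torch.Size([2, 401, 41])
print(recon.shape)              # torch.Size([2, 1, 8000]); matches the conv_transpose1d layout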
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read.
2 | DUMMY1/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too.
3 | DUMMY1/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five.
4 | DUMMY1/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect:
5 | DUMMY1/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others.
6 | DUMMY1/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated,
7 | DUMMY1/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others.
8 | DUMMY1/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies
9 | DUMMY1/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery.
10 | DUMMY1/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade.
11 | DUMMY1/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President.
12 | DUMMY1/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four.
13 | DUMMY1/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example."
14 | DUMMY1/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald:
15 | DUMMY1/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here.
16 | DUMMY1/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work.
17 | DUMMY1/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area
18 | DUMMY1/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon
19 | DUMMY1/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote.
20 | DUMMY1/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound,
21 | DUMMY1/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window.
22 | DUMMY1/LJ026-0068.wav|Energy enters the plant, to a small extent,
23 | DUMMY1/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary.
24 | DUMMY1/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized
25 | DUMMY1/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that
26 | DUMMY1/LJ012-0161.wav|he was reported to have fallen away to a shadow.
27 | DUMMY1/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to
28 | DUMMY1/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines.
29 | DUMMY1/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on.
30 | DUMMY1/LJ024-0083.wav|This plan of mine is no attack on the Court;
31 | DUMMY1/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough.
32 | DUMMY1/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup.
33 | DUMMY1/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles.
34 | DUMMY1/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive.
35 | DUMMY1/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen,
36 | DUMMY1/LJ009-0076.wav|We come to the sermon.
37 | DUMMY1/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution.
38 | DUMMY1/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes.
39 | DUMMY1/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
40 | DUMMY1/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount
41 | DUMMY1/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy.
42 | DUMMY1/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties,
43 | DUMMY1/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand.
44 | DUMMY1/LJ012-0235.wav|While they were in a state of insensibility the murder was committed.
45 | DUMMY1/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald.
46 | DUMMY1/LJ014-0030.wav|These were damnatory facts which well supported the prosecution.
47 | DUMMY1/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome?
48 | DUMMY1/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters.
49 | DUMMY1/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London,
50 | DUMMY1/LJ028-0275.wav|At last, in the twentieth month,
51 | DUMMY1/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed.
52 | DUMMY1/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm,
53 | DUMMY1/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County,
54 | DUMMY1/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view.
55 | DUMMY1/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning.
56 | DUMMY1/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words,
57 | DUMMY1/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands.
58 | DUMMY1/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy.
59 | DUMMY1/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace,
60 | DUMMY1/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
61 | DUMMY1/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true,
62 | DUMMY1/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him,
63 | DUMMY1/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits
64 | DUMMY1/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail.
65 | DUMMY1/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders.
66 | DUMMY1/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal.
67 | DUMMY1/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there.
68 | DUMMY1/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files
69 | DUMMY1/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator.
70 | DUMMY1/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash.
71 | DUMMY1/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
72 | DUMMY1/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely.
73 | DUMMY1/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present,
74 | DUMMY1/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him.
75 | DUMMY1/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense.
76 | DUMMY1/LJ008-0294.wav|nearly indefinitely deferred.
77 | DUMMY1/LJ047-0148.wav|On October twenty-five,
78 | DUMMY1/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner.
79 | DUMMY1/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old.
80 | DUMMY1/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male,
81 | DUMMY1/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one.
82 | DUMMY1/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved
83 | DUMMY1/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands.
84 | DUMMY1/LJ012-0250.wav|On the seventh July, eighteen thirty-seven,
85 | DUMMY1/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job.
86 | DUMMY1/LJ016-0138.wav|at a distance from the prison.
87 | DUMMY1/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology.
88 | DUMMY1/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally.
89 | DUMMY1/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.
90 | DUMMY1/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects.
91 | DUMMY1/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle.
92 | DUMMY1/LJ038-0199.wav|eleven. If I am alive and taken prisoner,
93 | DUMMY1/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came.
94 | DUMMY1/LJ033-0047.wav|I noticed when I went out that the light was on, end quote,
95 | DUMMY1/LJ040-0027.wav|He was never satisfied with anything.
96 | DUMMY1/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly.
97 | DUMMY1/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity.
98 | DUMMY1/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days,
99 | DUMMY1/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston.
100 | DUMMY1/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce.
101 |
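Each entry pairs a wav path with its transcript, separated by "|" (DUMMY1 is a placeholder, typically symlinked to the LJ Speech wav directory). A minimal parsing sketch of the kind the data loader performs; the helper name here is illustrative:

def load_filepaths_and_text(filename, split="|"):
    # Each non-empty line: <wav path><split><transcript>.
    with open(filename, encoding="utf-8") as f:
        return [line.strip().split(split) for line in f if line.strip()]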
--------------------------------------------------------------------------------
/train_latest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import itertools
5 | import math
6 | import torch
7 | from torch import nn, optim
8 | from torch.nn import functional as F
9 | from torch.utils.data import DataLoader
10 | from torch.utils.tensorboard import SummaryWriter
11 | import torch.multiprocessing as mp
12 | import torch.distributed as dist
13 | from torch.nn.parallel import DistributedDataParallel as DDP
14 | from torch.cuda.amp import autocast, GradScaler
15 | from pqmf import PQMF
16 |
17 | import commons
18 | import utils
19 | from data_utils import (
20 | TextAudioLoader,
21 | TextAudioCollate,
22 | DistributedBucketSampler
23 | )
24 | from models import (
25 | SynthesizerTrn,
26 | MultiPeriodDiscriminator,
27 | )
28 | from losses import (
29 | generator_loss,
30 | discriminator_loss,
31 | feature_loss,
32 | kl_loss,
33 | subband_stft_loss
34 | )
35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
36 | from text.symbols import symbols
37 |
38 | torch.autograd.set_detect_anomaly(True)  # aids debugging but slows training considerably
39 | torch.backends.cudnn.benchmark = True
40 | global_step = 0
41 |
42 |
43 | def main():
44 | """Assume Single Node Multi GPUs Training Only"""
45 | assert torch.cuda.is_available(), "CPU training is not allowed."
46 |
47 | n_gpus = torch.cuda.device_count()
48 | os.environ['MASTER_ADDR'] = 'localhost'
49 | os.environ['MASTER_PORT'] = '65520'
50 | # n_gpus = 1
51 |
52 | hps = utils.get_hparams()
53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
54 |
55 |
56 | def run(rank, n_gpus, hps):
57 | global global_step
58 | if rank == 0:
59 | logger = utils.get_logger(hps.model_dir)
60 | logger.info(hps)
61 | utils.check_git_hash(hps.model_dir)
62 | writer = SummaryWriter(log_dir=hps.model_dir)
63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
64 |
65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
66 | torch.manual_seed(hps.train.seed)
67 | torch.cuda.set_device(rank)
68 |
69 | train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
70 | train_sampler = DistributedBucketSampler(
71 | train_dataset,
72 | hps.train.batch_size,
73 | [32,300,400,500,600,700,800,900,1000],
74 | num_replicas=n_gpus,
75 | rank=rank,
76 | shuffle=True)
77 | collate_fn = TextAudioCollate()
78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
79 | collate_fn=collate_fn, batch_sampler=train_sampler)
80 | if rank == 0:
81 | eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data)
82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
83 | batch_size=hps.train.batch_size, pin_memory=True,
84 | drop_last=False, collate_fn=collate_fn)
85 |
86 | net_g = SynthesizerTrn(
87 | len(symbols),
88 | hps.data.filter_length // 2 + 1,
89 | hps.train.segment_size // hps.data.hop_length,
90 | **hps.model).cuda(rank)
91 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
92 | optim_g = torch.optim.AdamW(
93 | net_g.parameters(),
94 | hps.train.learning_rate,
95 | betas=hps.train.betas,
96 | eps=hps.train.eps)
97 | optim_d = torch.optim.AdamW(
98 | net_d.parameters(),
99 | hps.train.learning_rate,
100 | betas=hps.train.betas,
101 | eps=hps.train.eps)
102 | net_g = DDP(net_g, device_ids=[rank])
103 | net_d = DDP(net_d, device_ids=[rank])
104 |
105 | try:
106 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
108 | global_step = (epoch_str - 1) * len(train_loader)
109 | except Exception:  # no checkpoint found; start from scratch
110 | epoch_str = 1
111 | global_step = 0
112 |
113 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
114 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
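    # On resume, last_epoch=epoch_str-2 restores the exponential decay to the
    # checkpointed epoch (the scheduler steps once during construction, landing
    # on lr * lr_decay**(epoch_str-1)); a fresh run passes the default -1.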
115 |
116 | scaler = GradScaler(enabled=hps.train.fp16_run)
117 |
118 | for epoch in range(epoch_str, hps.train.epochs + 1):
119 | if rank==0:
120 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
121 | else:
122 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
123 | scheduler_g.step()
124 | scheduler_d.step()
125 |
126 |
127 |
128 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
129 | net_g, net_d = nets
130 | optim_g, optim_d = optims
131 | scheduler_g, scheduler_d = schedulers
132 | train_loader, eval_loader = loaders
133 | if writers is not None:
134 | writer, writer_eval = writers
135 |
136 | train_loader.batch_sampler.set_epoch(epoch)
137 | global global_step
138 |
139 | net_g.train()
140 | net_d.train()
141 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(train_loader):
142 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
143 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
144 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
145 |
146 | with autocast(enabled=hps.train.fp16_run):
147 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
148 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths)
149 |
150 | mel = spec_to_mel_torch(
151 | spec,
152 | hps.data.filter_length,
153 | hps.data.n_mel_channels,
154 | hps.data.sampling_rate,
155 | hps.data.mel_fmin,
156 | hps.data.mel_fmax)
157 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
158 | y_hat_mel = mel_spectrogram_torch(
159 | y_hat.squeeze(1),
160 | hps.data.filter_length,
161 | hps.data.n_mel_channels,
162 | hps.data.sampling_rate,
163 | hps.data.hop_length,
164 | hps.data.win_length,
165 | hps.data.mel_fmin,
166 | hps.data.mel_fmax
167 | )
168 |
169 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
170 |
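        # Alternating GAN update: the discriminator is trained first on real
        # audio vs. the detached generator output; the generator is then updated
        # through the discriminator together with its auxiliary losses (mel,
        # duration, KL, feature matching, and optional sub-band STFT).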
171 | # Discriminator
172 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
173 | with autocast(enabled=False):
174 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
175 | loss_disc_all = loss_disc
176 | optim_d.zero_grad()
177 | scaler.scale(loss_disc_all).backward()
178 | scaler.unscale_(optim_d)
179 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
180 | scaler.step(optim_d)
181 |
182 |
183 |
184 |
185 | with autocast(enabled=hps.train.fp16_run):
186 | # Generator
187 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
188 | with autocast(enabled=False):
189 | loss_dur = torch.sum(l_length.float())
190 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
191 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
192 |
193 | loss_fm = feature_loss(fmap_r, fmap_g)
194 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
195 |
196 | if hps.model.mb_istft_vits:
197 | pqmf = PQMF(y.device)
198 | y_mb = pqmf.analysis(y)
199 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
200 | else:
201 | loss_subband = torch.tensor(0.0)
202 |
203 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband
204 |
205 | optim_g.zero_grad()
206 | scaler.scale(loss_gen_all).backward()
207 | scaler.unscale_(optim_g)
208 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
209 | scaler.step(optim_g)
210 | scaler.update()
211 |
212 | if rank==0:
213 | if global_step % hps.train.log_interval == 0:
214 | lr = optim_g.param_groups[0]['lr']
215 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband]
216 | logger.info('Train Epoch: {} [{:.0f}%]'.format(
217 | epoch,
218 | 100. * batch_idx / len(train_loader)))
219 | logger.info([x.item() for x in losses] + [global_step, lr])
220 |
221 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
222 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband})
223 |
224 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
225 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
226 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
227 | image_dict = {
228 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
229 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
230 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
231 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
232 | }
233 | utils.summarize(
234 | writer=writer,
235 | global_step=global_step,
236 | images=image_dict,
237 | scalars=scalar_dict)
238 |
239 | if global_step % hps.train.eval_interval == 0:
240 | evaluate(hps, net_g, eval_loader, writer_eval)
241 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
242 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
243 | global_step += 1
244 |
245 |
246 | if rank == 0:
247 | logger.info('====> Epoch: {}'.format(epoch))
248 |
249 |
250 |
251 |
252 | def evaluate(hps, generator, eval_loader, writer_eval):
253 | generator.eval()
254 | with torch.no_grad():
255 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths) in enumerate(eval_loader):
256 | x, x_lengths = x.cuda(0), x_lengths.cuda(0)
257 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
258 | y, y_lengths = y.cuda(0), y_lengths.cuda(0)
259 |
260 | # keep only the first utterance of the first batch for a quick qualitative check
261 | x = x[:1]
262 | x_lengths = x_lengths[:1]
263 | spec = spec[:1]
264 | spec_lengths = spec_lengths[:1]
265 | y = y[:1]
266 | y_lengths = y_lengths[:1]
267 | break
268 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, max_len=1000)
269 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
270 |
271 | mel = spec_to_mel_torch(
272 | spec,
273 | hps.data.filter_length,
274 | hps.data.n_mel_channels,
275 | hps.data.sampling_rate,
276 | hps.data.mel_fmin,
277 | hps.data.mel_fmax)
278 | y_hat_mel = mel_spectrogram_torch(
279 | y_hat.squeeze(1).float(),
280 | hps.data.filter_length,
281 | hps.data.n_mel_channels,
282 | hps.data.sampling_rate,
283 | hps.data.hop_length,
284 | hps.data.win_length,
285 | hps.data.mel_fmin,
286 | hps.data.mel_fmax
287 | )
288 | image_dict = {
289 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
290 | }
291 | audio_dict = {
292 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
293 | }
294 | if global_step == 0:
295 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
296 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
297 |
298 | utils.summarize(
299 | writer=writer_eval,
300 | global_step=global_step,
301 | images=image_dict,
302 | audios=audio_dict,
303 | audio_sampling_rate=hps.data.sampling_rate
304 | )
305 | generator.train()
306 |
307 |
308 | if __name__ == "__main__":
309 | os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
312 | main()
313 |
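# Launch sketch (single node, all visible GPUs). This assumes utils.get_hparams
# follows the standard VITS CLI of -c <config path> and -m <run name>:
#
#   python train_latest.py -c configs/ljs_mb_istft_vits.json -m ljs_mb_istft_vits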
--------------------------------------------------------------------------------
/filelists/filelist_train2.txt.cleaned:
--------------------------------------------------------------------------------
1 | ./tsukuyomi/VOICEACTRESS100_001.wav|ma↑ta, to↓ojino yo↓oni, go↓dai myo↑oo↓oto yo↑bareru, ʃu↑yoona myo↑oo↓ono ʧu↑uo↓oni ha↓isareru ko↑to↓mo o↓oi.
2 | ./tsukuyomi/VOICEACTRESS100_002.wav|nyu↑uiNguraNdo↓fuuwa, gyu↑unyuuo be↓esUto ʃI↑ta, ʃi↑ro↓i ku↑riimusu↓upudeari, bo↑sUtoNkuramuʧa↓udaatomo yo↑bareru.
3 | ./tsukuyomi/VOICEACTRESS100_003.wav|ko↑Npyuutage↓emuno me↓ekaaya, gyo↑okaida↓Ntainadoni ka↑NreN su↑ru ji↓Nbutsuno ka↑te↓gori.
4 | ./tsukuyomi/VOICEACTRESS100_004.wav|sa↑abisumaneejaadoonyuu↓ekino ta↑me, o↑oi↓maʧi e↓kIkara, e↑NkakUka↓Nri ʃI↑te i↑ru.
5 | ./tsukuyomi/VOICEACTRESS100_005.wav|ʃi↓rubaa sa↑afaaʃuugekiji↓keNmadeni, ri↓ʧaazuwa, ʧi↑imu↓meeto to↑moni, ko↑kUsai↓tekini su↑upaahi↓iroo, o↓yobi, yu↑ume↓ejiNto ʃI↑te, ni↓NʧI sa↑rete i↑ru.
6 | ./tsukuyomi/VOICEACTRESS100_006.wav|ts u yu↑reNharuto↓ryoowa, byu↑ruteNberuku↓ryooni he↑Nnyuu sa↑reta.
7 | ./tsukuyomi/VOICEACTRESS100_007.wav|ji↑kaN ryo↑oikIto, ku↑ukaNryo↓oikide kyo↑otsuu su↑ru ʃo↑riʃu↓hoowa, fi↑rutari↓Nguni yo↑ru, nyu↑uryokUʃi↓Ngoono kyo↓okadearu.
8 | ./tsukuyomi/VOICEACTRESS100_008.wav|ʃa↑Nʧiino se↑Ngyoo↓purowa, ʧi↓imukara ʃI↑hara↓wareru kyu↓uryooto, ta↑ikyoku↓hio, o↓mona ʃu↑unyuuto ʃI↑te i↑ru.
9 | ./tsukuyomi/VOICEACTRESS100_009.wav|ma↑ta ne↑jimeʃiwa, ʧu↑usa↓Nnoono o↑same↓ru, ryu↑ukyuuo↓okokUtono ko↑oekinimo sa↑Nka ʃI↑ta.
10 | ./tsukuyomi/VOICEACTRESS100_010.wav|su↑maato↓foNkara, fi↑iʧaafo↓Nmade, ma↑ruʧideba↓isuni ta↑ioo.
11 | ./tsukuyomi/VOICEACTRESS100_011.wav|ke↑emyoo ʃa↑datsuna na↑reeʃoNkara, jo↑oʧo↓kaN a↑fure↓ru ka↑tarima↓de, ha↑bahiro↓i hyo↑ogeNryo↓kuo mo↓tsu.
12 | ./tsukuyomi/VOICEACTRESS100_012.wav|ko↑ozoowa, ha↑ganeseeno ta↑Nitsu a↓aʧide, kyo↑okyakuwa, i↑ʃItsumidearu.
13 | ./tsukuyomi/VOICEACTRESS100_013.wav|so↑koe, o↓onaaga a↑taraʃi↓i ʃe↓fUto ʃI↑te, u↑dekIkino hyo↑nu↓kuo ma↑ne↓ku.
14 | ./tsukuyomi/VOICEACTRESS100_014.wav|ku↑iiNzuabenyuua↓rufani ʃo↑zoku ʃI↑te i↑ru.
15 | ./tsukuyomi/VOICEACTRESS100_015.wav|i↑Qpo↓ode, gyo↓gyooto ʃo↓ogyoode, rya↑nesUko↓owa ha↑Nee ʃI↑te i↑ta.
16 | ./tsukuyomi/VOICEACTRESS100_016.wav|ko↑no, nyu↑usausuweeruzudaihyooʧi↓imuga, wa↑rabi↓izuno ʧu↑ukakUto na↓Qte i↑ku.
17 | ./tsukuyomi/VOICEACTRESS100_017.wav|ta↓daʃi, gya↑NburuizoNʃoono nyu↑uiNʧi↓ryooo i↑Qte i↑ru byo↑oiNwa, wa↓zukadearu.
18 | ./tsukuyomi/VOICEACTRESS100_018.wav|ta↓no me↓jaana di↑sUtoribyu↓uʃoNni ku↑rabe, se↑kyuritiijoono mo↑Ndaino ʃu↑useega, o↑soi ba↑aimo a↓ru.
19 | ./tsukuyomi/VOICEACTRESS100_019.wav|be↑rugaato↓oa ma↓eno, ve↑digeNuufaapa↓akuniwa, se↑Nsooto da↑Natsuno gi↑seeʃano ta↑me↓no ki↑neN↓higa ta↓Qte i↑ru.
20 | ./tsukuyomi/VOICEACTRESS100_020.wav|ze↑NbeepaburiQʃaazukyo↓okaino, be↑sUtosUtora↓tejiigeemuobuzaiyaao, ni↑QpoN↓jiNto ʃI↑te ju↑ʃoo.
21 | ./tsukuyomi/VOICEACTRESS100_021.wav|i↑tami↓wa, te↑Ntekiyo↓ri ʧi↑Ntsuuya↓kuo, jo↑omyakUto↓oyo su↑ru ko↑to↓de, ʧi↑Ntsuuo o↑konau.
22 | ./tsukuyomi/VOICEACTRESS100_022.wav|ko↑no to↓kini, fu↑yuutairikUpuruva↓mani a↓ru, ʧu↑uritsu↓koku, byu↑eru↓baga, a↓ru jo↑ohoosujikara, ba↑QʃUʃo↓oguNno ʃo↑keeto, ze↑No↓ojo, a↓aʃeno ji↓gaio ha↑Qpyoo.
23 | ./tsukuyomi/VOICEACTRESS100_023.wav|na↑Nsee↓bu wo↓oreNwa, be↑ia↓amaN fa↓amuzuto, fi↑Qtsujera↓rudono ʧi↓kude, ko↑osee sa↑reru.
24 | ./tsukuyomi/VOICEACTRESS100_024.wav|ko↑no ta↑me, pu↑razumaʧuuno i↓oNya, de↓Nʃino mo↓tsu, he↑ekiNuNdooene↓rugiio, o↓Ndode hyo↑oge↓N su↑ru ko↑to↓ga a↓ru.
25 | ./tsukuyomi/VOICEACTRESS100_025.wav|so↑no hyo↑ohyooto ʃI↑ta hI↑togaraga, ro↑onyakuna↓Nnyoni ʃI↑tawarete i↑ru.
26 | ./tsukuyomi/VOICEACTRESS100_026.wav|ge↓Nzai, nyu↑ujaajii↓ʃuu, mu↓ua zu↑ta↓uNni su↓Nde i↑ru.
27 | ./tsukuyomi/VOICEACTRESS100_027.wav|ʧo↑oikini a↓Qta, mi↑tsunesaN↓haNwa, na↑gaoka↓haNni, be↑e hya↑Q↓pyooo o↑kuQta ko↑to↓de yu↑umee.
28 | ./tsukuyomi/VOICEACTRESS100_028.wav|ko↑no to↓ki, pe↑rime↓edeewa, a↑mupIkutoriyu↓ooNni do↑okoo ʃI↑te, te↓ebaini ki↓te i↑ta, r i ky u mu↑ni↓osuni, tsu↓mato ʃI↑te a↑taerareta.
29 | ./tsukuyomi/VOICEACTRESS100_029.wav|ge↓Nzaino ka↑Qsooo mo↑kUtekIto ʃI↑ta, sU↑kiibu↓utsuwa, ka↑tai pu↑rasUʧiQku↓ʃeruto, ya↑waraka↓i i↑Nnaabu↓utsUkara na↓ru.
30 | ./tsukuyomi/VOICEACTRESS100_030.wav|bo↑ogo↓ori bu↓utsuwa, hyo↑ome↓Nni ha↑rareta, go↑museeno u↑sui ma↑ku↓de de↑ki↓te i↑ru.
31 | ./tsukuyomi/VOICEACTRESS100_031.wav|ko↓oʃano da↑ihyooga, we↑Qjiu↓Qdono, ja↑sUpaawe↓adearu.
32 | ./tsukuyomi/VOICEACTRESS100_032.wav|ki↑i↓kyokuga ha↑QʃiN su↑ru, nyu↑usuneQtowaaku↓meeo ka↑NʃIta ta↓itoruno, nyu↑usuba↓Ngumino na↓kadewa, re↑gyuraaho↓osooga, mo↑Qto↓mo o↓oi.
33 | ./tsukuyomi/VOICEACTRESS100_033.wav|ge↑enoopu↓rodakUʃoN, a↑myu↓uzuno gu↑ruupUki↓gyoo.
34 | ./tsukuyomi/VOICEACTRESS100_034.wav|ʧo↑obo↓iNo ʃo↑oryaku ʃI↑te, e↑ryu↓ʃioNtomo hyo↓okI sa↑reru.
35 | ./tsukuyomi/VOICEACTRESS100_035.wav|mo↑rinagano o↑iʃi↓i gyu↑unyuuwa, ko↓i a↑oironi, gyu↑unyuu↓biNo a↑ʃira↓Qta de↑za↓iNno, pa↑Qkugyu↓unyuudearu.
36 | ./tsukuyomi/VOICEACTRESS100_036.wav|ba↑Ngumibo↓otooo, to↑okyoomuubiise↓esakuno a↑nime↓eʃoNde, hyo↑oge↓N su↑ru te↑Nmo, kyo↑otsuu ʃI↑te i↑ta.
37 | ./tsukuyomi/VOICEACTRESS100_037.wav|ko↑myu↓uNwa, se↑enu↓gawato, e↑soNnu↓kawano, go↑oryuuʧi↓teNto na↓Qte i↑ru.
38 | ./tsukuyomi/VOICEACTRESS100_038.wav|do↑ojini, fU↑kuimi↓rakuruerefaNtsuni, ko↑oʧIke↓NniNde, nyu↑udaN su↑ru ko↑to↓ga ha↑Qpyoo sa↑reta.
39 | ./tsukuyomi/VOICEACTRESS100_039.wav|o↑Qtodearu ko↑muroga, kyu↑ukyuuʃa↓o yo↑bi, to↑naibyo↓oiNni, ki↑Nkyuu ha↑Nsoo sa↑reru.
40 | ./tsukuyomi/VOICEACTRESS100_040.wav|gi↑re↓sUpiiwa, ma↓Qgiio tsu↑ujite, i↓nesUto ʃi↑ria↓Qta.
41 | ./tsukuyomi/VOICEACTRESS100_041.wav|fo↑Nteenuburooyo↓oʃIkidewa, gu↑ui↓tekina e↓ga, ʃi↑Qkuino mo↓orudoni tsU↑kawarete i↑ru.
42 | ./tsukuyomi/VOICEACTRESS100_042.wav|sa↑ijiNwa, bi↑ʃunuhano se↓ejiN, su↓waa mi↑inaaraayaN.
43 | ./tsukuyomi/VOICEACTRESS100_043.wav|ha↓adee su↑ga, pe↑ruse↓poneeni ko↓io ʃI↑ta no↑wa, a↑purodi↓iteeno, sa↑kuryakudearuto sa↑rete i↑ru.
44 | ./tsukuyomi/VOICEACTRESS100_044.wav|ku↓weeNbaaNʧaaNwa, ʧi↓isana ko↑myu↓nitiide, no↓ogyooya, ʃo↓ogyooo ʧu↑uʃiNni, na↑rita↓Qte i↑ta↓to, ka↑Nga↓erarete i↑ru.
45 | ./tsukuyomi/VOICEACTRESS100_045.wav|ve↑ezaajiteNʃa↓dooya, myu↓ureN ru↓utoni ʃI↑taga↓Qta, sa↑ikuriNgutsu↓aawa, pe↓etaasuhaageNo, ke↑eyu su↑ru.
46 | ./tsukuyomi/VOICEACTRESS100_046.wav|fo↑omyura↓kaawa, tsu↑ujoo, o↑opuNhoi↓irude, ʃi↑Nguruʃi↓itaadearu.
47 | ./tsukuyomi/VOICEACTRESS100_047.wav|do↑ojitsu a↓sani, o↑osakana↓Nbade, ʃu↑QpatsUse↓remoniiga ka↑isai sa↑re, e↑egyoou↓NteNni, ju↑utoo sa↑reta.
48 | ./tsukuyomi/VOICEACTRESS100_048.wav|so↑ʃIte, i↑NdepeNdeNto↓ʃino, do↑kUʃato↓ohyoode e↑ra↓bu, pu↑remiariigusaiyuuʃuugooruki↓ipaani e↑ra↓bareta.
49 | ./tsukuyomi/VOICEACTRESS100_049.wav|pu↑reiyaa↓kyarakUtaawa, kyu↑udeNo se↓Nkyo ʃI↑ta, ja↑akuna ku↑ri↓iʧaani so↑oguu su↑ru.
50 | ./tsukuyomi/VOICEACTRESS100_050.wav|fi↑irudomaake↓tiNguwa, re↑kIʃi↓tekiniwa, i↑Qpo↓o tsu↑ukoono ko↑myunikeeʃoNtsu↓uruto ʃI↑te, ka↑Nga↓erarete ki↓ta.
51 | ./tsukuyomi/VOICEACTRESS100_051.wav|de↑byuu↓gono su↑une↓NkaNwa, be↑biife↓isUto ʃI↑te, ho↓Nmyoode ka↑tsudoo.
52 | ./tsukuyomi/VOICEACTRESS100_052.wav|ga↑Qkooya byo↑oiNna↓dono, kyu↑uʃokugyo↓omude, e↑eyo↓osoo ke↑esaN su↑ru jo↑ode, ju↑uyoona ʃi↓ryoono hI↑to↓tsudearu.
53 | ./tsukuyomi/VOICEACTRESS100_053.wav|to↓oji, a↑yaʃii wa↓arudoni jo↑oʧuu ʃI↑te i↑ta gi↑ko↓nekoga, ku↑uhakuni↓te ha↑Qpyoo.
54 | ./tsukuyomi/VOICEACTRESS100_054.wav|yu↓ufUkuna nyu↑uyookaa↓taʧiwa, gu↑re↓evuseNdo, ke↑ebajooya, ʃi↓ipuʃeQdobei, ke↑ebajoona↓doni tsu↑do↓i, u↑mizoino ko↑okyuu re↓sUtoraNya, ho↓teruo ri↑yoo ʃI↑ta.
55 | ./tsukuyomi/VOICEACTRESS100_055.wav|wo↑riaazumiQkusumaaʃaruaatsuakademiiʃo↓zoku.
56 | ./tsukuyomi/VOICEACTRESS100_056.wav|to↑koro↓ga, e↑riyuʃIkuto↓oNwa, nyu↓mupeeno se↑eʃimo kI↑kazuni, de↑emeete↓eruno ka↓ʃio, ki↑ritao↓ʃIta.
57 | ./tsukuyomi/VOICEACTRESS100_057.wav|ko↑no je↑ʃii↓yakude sU↑tei↓mosuwa, e↑mii↓ʃooni no↑mine↓eto sa↑reta ko↑to↓mo a↓ru.
58 | ./tsukuyomi/VOICEACTRESS100_058.wav|su↑weedeNi↓miNno ryo↓oʃiNno mo↑to↓ni, ma↑saʧuuseQtsu↓ʃuu, ke↓NburiQjinite u↑mareru.
59 | ./tsukuyomi/VOICEACTRESS100_059.wav|kyu↑ueNno fa↑Nto↓ohyoodemo, ni↑Nkiga gu↑uzooka ʃI↑te i↑ta, na↑gaʃima ʃi↑geoni ni↑kUhakU su↑ru.
60 | ./tsukuyomi/VOICEACTRESS100_060.wav|ha↓hawa, pi↑itaamariQtsuba↓aguno se↑eʃiNbyo↓oiNni nyu↑uiN ʃI↑te i↑ru to↓kini, be↓Qʃiio u↑mu.
61 | ./tsukuyomi/VOICEACTRESS100_061.wav|po↑iNtoga↓adokara, su↑moorufo↓waadomade ko↑nase↓ru, so↑ogooryo↓kuga ta↑ka↓i yu↑utiritiipu↓reeyaadearu.
62 | ./tsukuyomi/VOICEACTRESS100_062.wav|gu↑re↓Qguwa, mi↑ʃIʃiQpi↓ʃuu, a↑badi↓iNni a↓ru, o↑Qdoferoozu↓boʧini ma↑isoo sa↑reru ko↑to↓ni na↓Qta.
63 | ./tsukuyomi/VOICEACTRESS100_063.wav|o↑oatariʃuuryoo↓gowa, gu↑radieetaaʧa↓Nsuni to↑tsunyuu su↑ru.
64 | ./tsukuyomi/VOICEACTRESS100_064.wav|ko↑no ki↓Nni yo↑ru byo↑okiwa, ha↑iirokabibyooto na↑zuke↓rarete i↑ru mo↑no↓ga o↓oi.
65 | ./tsukuyomi/VOICEACTRESS100_065.wav|re↑gyuraame↓Nbaano ka↑oja↓ʃiNo ku↑ri↓Qku ʃI↑ta a↓toni, mu↑ubiipureiyaa↓fuuni sa↑isee sa↑reruto i↑u, to↑kuina ke↑eʃIkito na↓Qte i↑ru.
66 | ./tsukuyomi/VOICEACTRESS100_066.wav|ka↑Nzooeno sa↑Nsokyo↓okyuuwa, ka↑Ndo↓omyakUto, te↑eatsu↓keeno mo↑N↓myakuo ka↓iʃIte, o↑konawarete i↑ru.
67 | ./tsukuyomi/VOICEACTRESS100_067.wav|de↑Qdo↓kiiwa, ta↑ipura↓itaaya, ko↑Npyu↓utano ki↑ibo↓odoni o↑keru, to↑kUʃuna so↑oʃoku↓kiidearu.
68 | ./tsukuyomi/VOICEACTRESS100_068.wav|ʃa↓NʃaN u↑ma↓wa, u↑dojiNgu↓ue sa↑Npai su↑ru, ʃi↑NkoNfu↓ufuga no↑Qte i↑ta u↑ma↓no ko↑to.
69 | ./tsukuyomi/VOICEACTRESS100_069.wav|bu↑ruuriQjisa↓Nmyakuno ge↑Nryuukara, ri↑Qʧimo↓Ndomade, o↓okuno ha↑yaseya fU↑ʧi↓ga, tsu↑riya kyu↑uryuuku↓dario ta↑noʃi↓masete ku↑reru.
70 | ./tsukuyomi/VOICEACTRESS100_070.wav|bo↑o↓haNwa, i↑isUtomaN↓ra, gya↓Nguno sU↑piikui↓ijiino a↑garikara, wa↓iroo to↓Qte i↑ta↓tomo u↑wasa sa↑reta.
71 | ./tsukuyomi/VOICEACTRESS100_071.wav|pe↑Nʃirubenia↓ʃuu, fi↑raderu↓fiano ko↓ogai, wi↑Nre↓Qdono re↑Nkinaubyo↓oiNde u↑mareta.
72 | ./tsukuyomi/VOICEACTRESS100_072.wav|bu↑ra↓Qguwa, byu↓u e↑ru↓guNyorimo, re↑QseedaQta ta↑me↓ni, ko↑no ki↓kaio i↑ka↓sU ko↑to↓o ʧu↓uʧo ʃI↑ta.
73 | ./tsukuyomi/VOICEACTRESS100_073.wav|jo↑oiNgi↓iNto ʃI↑te, ba↓aNweruwa, ka↑riforunia↓ʃuuno, re↑Npooka↓nyuuni sa↑Nsee ʃI↑ta.
74 | ./tsukuyomi/VOICEACTRESS100_074.wav|re↑jeNdoʃiri↓izuo be↓esuni, yo↑o fu↑riiki↓kooo so↑nae↓ta, byu↓u ka↓mera.
75 | ./tsukuyomi/VOICEACTRESS100_075.wav|ga↑Qkyokuno se↑Ntaapoji↓ʃoNwa, e↑ikeebiifootii↓eitono, ta↑ka↓haʃi mi↓namiga tsU↑tome↓ta.
76 | ./tsukuyomi/VOICEACTRESS100_076.wav|di↑onyu↓usosuno, ʧo↑oaio u↑ke↓ru, o↑ineusu↓ooto, h i, a↑rutai↓aano a↑idani, ka↓riyu do↑oNno o↓ojoto ʃI↑te, se↓eo u↑ke↓ta.
77 | ./tsukuyomi/VOICEACTRESS100_077.wav|o↑oniʃi yo↑ojoono, ju↑Nkoo ko↓odokara, do↓oryoku na↓ʃide, ʧi↑jooe ka↑Qkuuhi↓koo ʃ i, ki↑Nkyuu ʧa↑kurikuni se↑ekoo ʃI↑ta.
78 | ./tsukuyomi/VOICEACTRESS100_078.wav|hyo↑ogeNgyo↓oretsuno ʃI↑hyoohyooo, bu↓Nʃino ta↑iʃooseeo a↑rawa↓su, te↑N↓guNno ʃI↑hyoohyooo mo↑ʧii↓te, su↑Nde ya↑kUhyooge↓Ne bu↑Nkai su↑ru.
79 | ./tsukuyomi/VOICEACTRESS100_079.wav|ta↑iyoogyogyooo↓onaano, na↓kabe ke↑NkIʧino i↓noʧio u↑ke↓te, pu↑royakyuukyu↓udaNno, ta↑iyoohoe↓eruzuni ka↑kawa↓ru.
80 | ./tsukuyomi/VOICEACTRESS100_080.wav|ka↓sUkani kI↑koete ku↓ru se↓N kyu↓uhyakU sa↓Njuu i↑ʧine↓NbaNno sa↑Nbi↓kaga, ʃi↑daini o↓okIkunaQte i↑ku.
81 | ./tsukuyomi/VOICEACTRESS100_081.wav|mo↓o i↑ideeNgaNpekIʧii↓kino jo↓obuwa, pu↑raasaatopurawihaaNjiiNi↓sekie tsu↑nagaru, ta↑igawa sa↑Ndooni tsu↑zuite i↑ru.
82 | ./tsukuyomi/VOICEACTRESS100_082.wav|ka↑amira↓boʃIto yo↑barete i↑ru wa↑kUseekara, u↑ʧuuseNni no↑Qte, ʧI↑kyuuni ʃi↑Nnyuu ʃI↑ta u↑ʧuu↓jiN.
83 | ./tsukuyomi/VOICEACTRESS100_083.wav|do↑Qgaaba↓Nkuwa, ta↓raya ni↓ʃiNno gyo↑kaku↓ryouga o↓oi, ju↑uyoona, gyo↑joodearu.
84 | ./tsukuyomi/VOICEACTRESS100_084.wav|ʃo↑oneNji↓daiwa, ro↑ʃiate↓ekoku, ʧe↑runiihiu↓keN, pu↑ruiruukui↓guN, to↑rosUʧanuiitsuyamurade su↑go↓ʃIta.
85 | ./tsukuyomi/VOICEACTRESS100_085.wav|i↑haino ho↑to↓Ndowa, su↑weedeNniʃIka↓igaNno, bu↓u hyu↑usureeNʧIho↓ono ko↑jimani a↓ru gyo↑soN, f u y a r u ba↓Qka ʃu↑uheNno u↓mini, sa↑NkotsU sa↑reta.
86 | ./tsukuyomi/VOICEACTRESS100_086.wav|ko↑Qkyooo ko↑ete, re↑Qʃawa, ka↑iryoo sa↑reta za↑iraiseNni so↓Qte, a↑aheNʧuuoo↓ekini mu↑kau.
87 | ./tsukuyomi/VOICEACTRESS100_087.wav|fU↑kuokadaieeho↓okUsudewanaku, ʧo↑okyorihoono ho↑kyooo me↑za↓ʃIte i↑ta, o↑osakakiNtetsuba↓faroozukara, o↓faao u↑ke↓te nyu↑udaN.
88 | ./tsukuyomi/VOICEACTRESS100_088.wav|so↑koniwa, hya↑kudoru↓satsUto, a↑merikani ko↓ito i↑u, mi↑jika↓i me↑QseejidakedaQta.
89 | ./tsukuyomi/VOICEACTRESS100_089.wav|ge↓Nzaiwa, ba↓Qhao mo↑ʧi↓ifUto ʃI↑ta, ha↑apUʃiko↓odono sa↑Qkyoku↓kato ʃI↑te, ki↑okU sa↑rete i↑ru.
90 | ./tsukuyomi/VOICEACTRESS100_090.wav|se↑Ntoo↓fUkuwa, ryo↑o↓udeo ro↑ʃUtsu ʃ i, ryo↑okyakuga, a↑Ndaasu↓utsude o↑owarete i↑ru.
91 | ./tsukuyomi/VOICEACTRESS100_091.wav|do↑obo↓aneni, su↑weedeNo↓ohi, jo↑zefi↓inuga i↑ru.
92 | ./tsukuyomi/VOICEACTRESS100_092.wav|ʃi↑gaiseNwa, hyo↑omeNene↓rugiino, ʧi↑isa↓i po↓rimaao se↑QʧakU su↑ru sa↓ino, ze↑N↓ʃorini ri↑yoo sa↑reru.
93 | ./tsukuyomi/VOICEACTRESS100_093.wav|ji↓ʃiNno pe↑ejide, me↓Qseejiya, ko↑okaiko↓meNtoo to↑oʃi↓te, re↓byuuo to↑okoo ʃI↑ta yu↓uzaato, ko↑myunike↓eʃoNo to↓ru ko↑to↓ga ka↑noodearu.
94 | ./tsukuyomi/VOICEACTRESS100_094.wav|wa↓kakI hi↑no ha↑Ngyaku↓yueni, u↓ʧuuno ʧu↑uo↓oo tsu↑ihoo sa↑rete, wa↑kUsee, ʧI↑kyuuni ya↑QtekIta ʃu↑ji↓Nkoo, be↓ruzebabuga, u↑ʧuuseN ka↓runaakuno na↓kade, ma↑go↓ni ka↑taru so↑odaina mo↑noga↓tari.
95 | ./tsukuyomi/VOICEACTRESS100_095.wav|ja↓gaatowa ta↑iʃoo↓tekini, bo↑diibi↓rudaao ho↑ofUtsuto sa↑seru, ma↓Qʧona ta↑iikUkai↓keeno ga↑ikeNga to↑kUʧoo.
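These transcripts are pre-phonemized Japanese: romanized phonemes with ↑/↓ pitch-accent marks, the output style of a pyopenjtalk-based cleaner. A minimal sketch of the underlying grapheme-to-phoneme step (the accent arrows are layered on top of this by the repository's Japanese cleaner):

import pyopenjtalk

# Plain g2p; the comma "、" becomes the pause token "pau" in the output.
print(pyopenjtalk.g2p("こんにちは、世界"))  # "k o N n i ch i w a pau s e k a i"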
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹɪɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wˌʌt ðeɪ hˈɪɹ ænd wˌʌt ðeɪ ɹˈiːd.
2 | DUMMY1/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː, ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt, tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ, ænd ˈɔːl ðə fˈɜːnɪtʃɚ, ˈaɪ wʊd biː mˈæd æz hˈɛl, tˈuː.
3 | DUMMY1/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹɪpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪnkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn eɪtˈiːn θˈɜːɾifˈaɪv.
4 | DUMMY1/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹɪspˈɛkt:
5 | DUMMY1/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹɪspˈɛkt wʌz tə θɹˈoʊ ðə ɹɪspˌɑːnsəbˈɪlɪɾi ˌɑːn ˈʌðɚz.
6 | DUMMY1/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛlɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌnkənvˈɪktᵻd pɹˈɪzənɚ, ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt, ænd stˈɪl ʌnkəntˈæmᵻnˌeɪɾᵻd,
7 | DUMMY1/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔːstˈeɪʃənɚz. hɪz ɐɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz.
8 | DUMMY1/LJ047-0044.wav|ˈɑːswəld wʌz, haʊˈɛvɚ, wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz. hiː dɪnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz
9 | DUMMY1/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ tʃˈɑːɹlz dʒˈeɪ. kˈæɹɪkˌoʊ, ɐ ɹˈɛzɪdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi.
10 | DUMMY1/LJ048-0194.wav|dˈʊɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛntitˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd.
11 | DUMMY1/LJ049-0026.wav|ˌɑːn əkˈeɪʒən ðə sˈiːkɹət sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt.
12 | DUMMY1/LJ004-0152.wav|ɑːlðˈoʊ æt mˈɪstɚ bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən, ðə fˈɜːst stˈɛp tʊwˈɔːɹdz ɹɪfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˌɛvəntˈiːn sˈɛvəntifˈoːɹ.
13 | DUMMY1/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni, ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsəsɚɹi tuː "mˌeɪk ɐn ɛɡzˈæmpəl."
14 | DUMMY1/LJ043-0002.wav|ðə wˈɔːɹən kəmˈɪʃən ɹɪpˈoːɹt. baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɑːnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi. tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld:
15 | DUMMY1/LJ009-0114.wav|mˈɪstɚ wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dɪskɹˈaɪbɪŋ ɐnˈʌðɚ ɹɪlˈɪdʒəs sˈɜːvɪs, wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪətli biː ɪnsˈɜːɾᵻd hˈɪɹ.
16 | DUMMY1/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk.
17 | DUMMY1/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd. ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzɪz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə
18 | DUMMY1/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp, hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪlɪˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən
19 | DUMMY1/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl, kwˈoʊt, wiː hɐd ɐ mˈoʊɾɚkˌeɪd wɛɹɹˈɛvɚ wiː wˈɛnt, ˈɛnd kwˈoʊt.
20 | DUMMY1/LJ031-0070.wav|dˈɑːktɚ klˈɑːɹk, hˌuː mˈoʊst klˈoʊsli ɑːbzˈɜːvd ðə hˈɛd wˈuːnd,
21 | DUMMY1/LJ034-0198.wav|jˈuːɪnz, hˌuː wʌz ɑːnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstɪfˌaɪd ðæt hiː kʊd nˌɑːt dɪskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ.
22 | DUMMY1/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt, tʊ ɐ smˈɔːl ɛkstˈɛnt,
23 | DUMMY1/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɑːnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsəsɚɹi.
24 | DUMMY1/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd
25 | DUMMY1/LJ005-0014.wav|spˈiːkɪŋ ˌɑːn ɐ dɪbˈeɪt ˌɑːn pɹˈɪzən mˈæɾɚz, hiː dᵻklˈɛɹd ðˈæt
26 | DUMMY1/LJ012-0161.wav|hiː wʌz ɹɪpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ.
27 | DUMMY1/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹɪpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹɪfˈɜːd tuː
28 | DUMMY1/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛdwˈiːl wʌz ɪn jˈuːs, ðɛɹ sˈɛljʊlɚ kɹˈæŋks, ɔːɹ hˈɑːɹdlˈeɪbɚ məʃˈiːnz.
29 | DUMMY1/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɑːn.
30 | DUMMY1/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɑːnðə kˈoːɹt;
31 | DUMMY1/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz, nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz. ˈaɪ hæv hɐd ɪnˈʌf.
32 | DUMMY1/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp.
33 | DUMMY1/LJ046-0058.wav|dˈʊɹɪŋ hɪz pɹˈɛzɪdənsi, fɹˈæŋklɪn dˈiː. ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹəd dʒˈɜːnɪz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹəd fˈɪfti θˈaʊzənd mˈaɪlz.
34 | DUMMY1/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ, ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv.
35 | DUMMY1/LJ002-0043.wav|lˈɑːŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾisˈɪks fˈiːt, sˈɪks twˈɛntiθɹˈiː fˈiːt, ænd ðɪ ˈeɪtθ eɪtˈiːn,
36 | DUMMY1/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən.
37 | DUMMY1/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hɐd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹɪpɹˈiːv, ænd wɪðˌɪn ɐ fjˈuː ˈaɪʊɹz ʌv ˌɛksɪkjˈuːʃən.
38 | DUMMY1/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹət sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹɪlˈiːsd ɔːɹ ɛskˈeɪps.
39 | DUMMY1/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ, ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt.
40 | DUMMY1/LJ042-0096.wav| ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt
41 | DUMMY1/LJ049-0050.wav|hˈɪl hɐd bˈoʊθ fˈiːt ɑːnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mɪsˈɛs kˈɛnədi.
42 | DUMMY1/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt, nˈuːɡeɪt ɹɪsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntɪz,
43 | DUMMY1/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs, ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsən ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd.
44 | DUMMY1/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd.
45 | DUMMY1/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kənklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɑːnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld.
46 | DUMMY1/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən.
47 | DUMMY1/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt, ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ɪlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm?
48 | DUMMY1/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪt ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz.
49 | DUMMY1/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪsd ænd ɹɪpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɪz ʌvðə sˈɪɾi ʌv lˈʌndən,
50 | DUMMY1/LJ028-0275.wav|æt lˈæst, ɪnðə twˈɛntiəθ mˈʌnθ,
51 | DUMMY1/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋplˈeɪs wɪð ɐ tɹˈæpdˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd.
52 | DUMMY1/LJ011-0096.wav|hiː mˈæɹɪd ɐ lˈeɪdi ˈɑːlsoʊ bɪlˈɑːŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz, hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃən, wˈɪtʃ, ænd hɪz ˈoʊn mˈʌni, hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm,
53 | DUMMY1/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː. kɹˈeɪɡ, ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti,
54 | DUMMY1/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz, ɡɹˈeɪt lˈɔɪɚz, ɡˈʌvɚnɚz ʌv pɹˈɪzənz, ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː.
55 | DUMMY1/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst, ɐ səspˈɪʃəs sˈɜːkəmstˌæns, æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ.
56 | DUMMY1/LJ027-0141.wav|ɪz klˈoʊsli ɹɪpɹədˈuːst ɪnðə lˈaɪfhˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ. ˈɔːɹ, ɪn ˈʌðɚ wˈɜːdz,
57 | DUMMY1/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi, ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz.
58 | DUMMY1/LJ031-0202.wav|mɪsˈɛs kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hɐd sˈɜːvd ɪnðə nˈeɪvi.
59 | DUMMY1/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊptfɔːɹ pˈiəɹɪəd ʌv pˈiːs,
60 | DUMMY1/LJ016-0288.wav|"mˈʌlɚ, mˈʌlɚ, hiːz ðə mˈæn," tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz, wˌɪtʃ wʌz ɹɪsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz.
61 | DUMMY1/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ, wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdɪli dɪstˈɪŋɡwɪʃ ðə fˈɑːls fɹʌmðə tɹˈuː,
62 | DUMMY1/LJ018-0081.wav|hɪz dɪfˈɛns bˌiːɪŋ ðæt hiː hɐd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd, bˌʌt ðˈæt, ɑːnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hɐd ɹˈɔŋd hˌɪm,
63 | DUMMY1/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪnkɹiːs ɪnðə pˈeɪɹoʊlz, ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts
64 | DUMMY1/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp, bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd, ænd ðə mˈæn wʌz kˈæɹɪd bˈæk tə dʒˈeɪl.
65 | DUMMY1/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz, ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz.
66 | DUMMY1/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən, ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl.
67 | DUMMY1/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs, ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts, ðə hˈaʊskiːpɚ ðˈɛɹ.
68 | DUMMY1/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛntitˈuː, naɪntˈiːn sˈɪkstiθɹˈiː, fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈiəɹɪəl fɚðə pˌiːˌɑːɹˈɛs dʒˈɛnɚɹəl fˈaɪlz
69 | DUMMY1/LJ017-0044.wav|ænd ðə dˈiːpəst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm, ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn, ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ.
70 | DUMMY1/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ, ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn, ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ.
71 | DUMMY1/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɐɹˈɛstᵻd ˌɑːn səspˈɪʃən, ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd;
72 | DUMMY1/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn, bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd, ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sɪvˈɪɹli.
73 | DUMMY1/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹɪhˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ. ɑːlðˈoʊ ɪt ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt,
74 | DUMMY1/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm.
75 | DUMMY1/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹɪkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɑːŋ ɪn səspˈɛns.
76 | DUMMY1/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dɪfˈɜːd.
77 | DUMMY1/LJ047-0148.wav|ˌɑːn ɑːktˈoʊbɚ twˈɛntifˈaɪv,
78 | DUMMY1/LJ008-0111.wav|ðeɪ ˈɛntɚd ˈeɪ "stˈoʊn kˈoʊld ɹˈuːm," ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ.
79 | DUMMY1/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstɪfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld.
80 | DUMMY1/LJ037-0234.wav|mɪsˈɛs mˈɛɹi bɹˈɑːk, ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən, wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl,
81 | DUMMY1/LJ040-0002.wav|tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: bˈækɡɹaʊnd ænd pˈɑːsəbəl mˈoʊɾɪvz, pˈɑːɹt wˌʌn.
82 | DUMMY1/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstɪfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bɪkˈʌmɪŋ ɪnvˈɑːlvd
83 | DUMMY1/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɑːn wˈɑːtʃᵻz, wɜː kˈɛɹfəli ɹɪmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz.
84 | DUMMY1/LJ012-0250.wav|ɑːnðə sˈɛvənθ dʒuːlˈaɪ, eɪtˈiːn θˈɜːɾisˈɛvən,
85 | DUMMY1/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈɛnɚz tə wˈɜːk baɪ ðə dʒˈɑːb.
86 | DUMMY1/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən.
87 | DUMMY1/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ɪsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi.
88 | DUMMY1/LJ031-0134.wav|ˌɑːn wˈʌn əkˈeɪʒən mɪsˈɛs dʒˈɑːnsən, ɐkˈʌmpənɪd baɪ tˈuː sˈiːkɹət sˈɜːvɪs ˈeɪdʒənts, lˈɛft ðə ɹˈuːm tə sˈiː mɪsˈɛs kˈɛnədi ænd mɪsˈɛs kənˈæli.
89 | DUMMY1/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn.
90 | DUMMY1/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd, ˈoʊpənd, ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts.
91 | DUMMY1/LJ034-0160.wav|ˌɑːn bɹˈɛnənz sˈʌbsɪkwənt sˈɜːtən aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl.
92 | DUMMY1/LJ038-0199.wav|ɪlˈɛvən. ɪf ˈaɪ æm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ,
93 | DUMMY1/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈɐd fɔːɹ hˌɪm, ænd ɹɪmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm.
94 | DUMMY1/LJ033-0047.wav|ˈaɪ nˈoʊɾɪsd wɛn ˈaɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɑːn, ˈɛnd kwˈoʊt,
95 | DUMMY1/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ.
96 | DUMMY1/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli.
97 | DUMMY1/LJ003-0111.wav|hiː wʌz ɪn kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː, ˈɛnd kwˈoʊt. ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɪˈɑːsɪɾi.
98 | DUMMY1/LJ008-0258.wav|lˈɛt mˌiː ɹɪtɹˈeɪs maɪ stˈɛps, ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz,
99 | DUMMY1/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæn kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt, mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs, fˈɔːɹt wˈɜːθ, sˌæn æntˈoʊnɪˌoʊ, ænd hjˈuːstən.
100 | DUMMY1/LJ004-0045.wav|mˈɪstɚ stˈɜːdʒᵻz bˈoːɹn, sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ, sˌɜː dʒˈeɪmz skˈɑːɹlɪt, ænd wˈɪljəm wˈɪlbɚfˌoːɹs.
101 |
--------------------------------------------------------------------------------
/train_latest_ms.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import itertools
5 | import math
6 | import torch
7 | from torch import nn, optim
8 | from torch.nn import functional as F
9 | from torch.utils.data import DataLoader
10 | from torch.utils.tensorboard import SummaryWriter
11 | import torch.multiprocessing as mp
12 | import torch.distributed as dist
13 | from torch.nn.parallel import DistributedDataParallel as DDP
14 | from torch.cuda.amp import autocast, GradScaler
15 | from pqmf import PQMF
16 |
17 | import commons
18 | import utils
19 | from data_utils import (
20 | TextAudioSpeakerLoader,
21 | TextAudioSpeakerCollate,
22 | DistributedBucketSampler
23 | )
24 | from models import (
25 | SynthesizerTrn,
26 | MultiPeriodDiscriminator,
27 | )
28 | from losses import (
29 | generator_loss,
30 | discriminator_loss,
31 | feature_loss,
32 | kl_loss,
33 | subband_stft_loss
34 | )
35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
36 | from text.symbols import symbols
37 |
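    | # Note: anomaly detection makes autograd check every op for NaNs/Infs,
    | # which helps when debugging diverging losses but noticeably slows training.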
38 | torch.autograd.set_detect_anomaly(True)
39 | torch.backends.cudnn.benchmark = True
40 | global_step = 0
41 |
42 |
43 | def main():
44 | """Assume Single Node Multi GPUs Training Only"""
45 | assert torch.cuda.is_available(), "CPU training is not allowed."
46 |
47 | n_gpus = torch.cuda.device_count()
48 | os.environ['MASTER_ADDR'] = 'localhost'
49 | os.environ['MASTER_PORT'] = '65520'
50 | # n_gpus = 1
51 |
52 | hps = utils.get_hparams()
53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
54 |
55 |
56 | def run(rank, n_gpus, hps):
57 | global global_step
58 | if rank == 0:
59 | logger = utils.get_logger(hps.model_dir)
60 | logger.info(hps)
61 | utils.check_git_hash(hps.model_dir)
62 | writer = SummaryWriter(log_dir=hps.model_dir)
63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
64 |
65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
66 | torch.manual_seed(hps.train.seed)
67 | torch.cuda.set_device(rank)
68 |
69 | train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data)
70 | train_sampler = DistributedBucketSampler(
71 | train_dataset,
72 | hps.train.batch_size,
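    | # length-bucket boundaries (likely in spectrogram frames); batching
    | # similarly sized utterances together reduces padding waste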
73 | [32,300,400,500,600,700,800,900,1000],
74 | num_replicas=n_gpus,
75 | rank=rank,
76 | shuffle=True)
77 | collate_fn = TextAudioSpeakerCollate()
78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
79 | collate_fn=collate_fn, batch_sampler=train_sampler)
80 | if rank == 0:
81 | eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
83 | batch_size=hps.train.batch_size, pin_memory=True,
84 | drop_last=False, collate_fn=collate_fn)
85 |
86 | net_g = SynthesizerTrn(
87 | len(symbols),
88 | hps.data.filter_length // 2 + 1,
89 | hps.train.segment_size // hps.data.hop_length,
90 | n_speakers=hps.data.n_speakers,
91 | **hps.model).cuda(rank)
92 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
93 | optim_g = torch.optim.AdamW(
94 | net_g.parameters(),
95 | hps.train.learning_rate,
96 | betas=hps.train.betas,
97 | eps=hps.train.eps)
98 | optim_d = torch.optim.AdamW(
99 | net_d.parameters(),
100 | hps.train.learning_rate,
101 | betas=hps.train.betas,
102 | eps=hps.train.eps)
103 | net_g = DDP(net_g, device_ids=[rank])
104 | net_d = DDP(net_d, device_ids=[rank])
105 |
106 | try:
107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
108 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
109 | global_step = (epoch_str - 1) * len(train_loader)
110 | except Exception:
111 | epoch_str = 1
112 | global_step = 0
113 |
114 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
115 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
116 |
117 | scaler = GradScaler(enabled=hps.train.fp16_run)
118 |
119 | for epoch in range(epoch_str, hps.train.epochs + 1):
120 | if rank==0:
121 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
122 | else:
123 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
124 | scheduler_g.step()
125 | scheduler_d.step()
126 |
127 |
128 |
129 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
130 | net_g, net_d = nets
131 | optim_g, optim_d = optims
132 | scheduler_g, scheduler_d = schedulers
133 | train_loader, eval_loader = loaders
134 | if writers is not None:
135 | writer, writer_eval = writers
136 |
137 | train_loader.batch_sampler.set_epoch(epoch)
138 | global global_step
139 |
140 | net_g.train()
141 | net_d.train()
142 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(train_loader):
143 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
144 | spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
145 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
146 | speakers = speakers.cuda(rank, non_blocking=True)
147 |
148 | with autocast(enabled=hps.train.fp16_run):
149 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
150 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers)
151 |
152 | mel = spec_to_mel_torch(
153 | spec,
154 | hps.data.filter_length,
155 | hps.data.n_mel_channels,
156 | hps.data.sampling_rate,
157 | hps.data.mel_fmin,
158 | hps.data.mel_fmax)
159 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
160 | y_hat_mel = mel_spectrogram_torch(
161 | y_hat.squeeze(1),
162 | hps.data.filter_length,
163 | hps.data.n_mel_channels,
164 | hps.data.sampling_rate,
165 | hps.data.hop_length,
166 | hps.data.win_length,
167 | hps.data.mel_fmin,
168 | hps.data.mel_fmax
169 | )
170 |
171 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
172 |
173 | # Discriminator
174 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
175 | with autocast(enabled=False):
176 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
177 | loss_disc_all = loss_disc
178 | optim_d.zero_grad()
179 | scaler.scale(loss_disc_all).backward()
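    | # unscale gradients before clipping so clip_grad_value_ sees true magnitudes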
180 | scaler.unscale_(optim_d)
181 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
182 | scaler.step(optim_d)
183 |
184 |
185 |
186 |
187 | with autocast(enabled=hps.train.fp16_run):
188 | # Generator
189 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
190 | with autocast(enabled=False):
191 | loss_dur = torch.sum(l_length.float())
192 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
193 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
194 |
195 | loss_fm = feature_loss(fmap_r, fmap_g)
196 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
197 |
198 | if hps.model.mb_istft_vits:
199 | pqmf = PQMF(y.device)
200 | y_mb = pqmf.analysis(y)
201 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
202 | else:
203 | loss_subband = torch.tensor(0.0)
204 |
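    | # total generator objective: adversarial + feature matching + mel
    | # reconstruction + duration + KL + (optional) subband STFT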
205 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + loss_subband
206 |
207 | optim_g.zero_grad()
208 | scaler.scale(loss_gen_all).backward()
209 | scaler.unscale_(optim_g)
210 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
211 | scaler.step(optim_g)
212 | scaler.update()
213 |
214 | if rank==0:
215 | if global_step % hps.train.log_interval == 0:
216 | lr = optim_g.param_groups[0]['lr']
217 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl, loss_subband]
218 | logger.info('Train Epoch: {} [{:.0f}%]'.format(
219 | epoch,
220 | 100. * batch_idx / len(train_loader)))
221 | logger.info([x.item() for x in losses] + [global_step, lr])
222 |
223 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
224 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband})
225 |
226 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
227 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
228 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
229 | image_dict = {
230 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
231 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
232 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
233 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
234 | }
235 | utils.summarize(
236 | writer=writer,
237 | global_step=global_step,
238 | images=image_dict,
239 | scalars=scalar_dict)
240 |
241 | if global_step % hps.train.eval_interval == 0:
242 | evaluate(hps, net_g, eval_loader, writer_eval)
243 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
244 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
245 | global_step += 1
246 |
247 |
248 | if rank == 0:
249 | logger.info('====> Epoch: {}'.format(epoch))
250 |
251 |
252 |
253 |
254 | def evaluate(hps, generator, eval_loader, writer_eval):
255 | generator.eval()
256 | with torch.no_grad():
257 | for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader):
258 | x, x_lengths = x.cuda(0), x_lengths.cuda(0)
259 | spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
260 | y, y_lengths = y.cuda(0), y_lengths.cuda(0)
261 | speakers = speakers.cuda(0)
262 |
263 | # evaluate only the first utterance of the first batch
264 | x = x[:1]
265 | x_lengths = x_lengths[:1]
266 | spec = spec[:1]
267 | spec_lengths = spec_lengths[:1]
268 | y = y[:1]
269 | y_lengths = y_lengths[:1]
270 | speakers = speakers[:1]
271 | break
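    | # the generator is DDP-wrapped, so infer() lives on the underlying .module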
272 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000)
273 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
274 |
275 | mel = spec_to_mel_torch(
276 | spec,
277 | hps.data.filter_length,
278 | hps.data.n_mel_channels,
279 | hps.data.sampling_rate,
280 | hps.data.mel_fmin,
281 | hps.data.mel_fmax)
282 | y_hat_mel = mel_spectrogram_torch(
283 | y_hat.squeeze(1).float(),
284 | hps.data.filter_length,
285 | hps.data.n_mel_channels,
286 | hps.data.sampling_rate,
287 | hps.data.hop_length,
288 | hps.data.win_length,
289 | hps.data.mel_fmin,
290 | hps.data.mel_fmax
291 | )
292 | image_dict = {
293 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
294 | }
295 | audio_dict = {
296 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
297 | }
298 | if global_step == 0:
299 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
300 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
301 |
302 | utils.summarize(
303 | writer=writer_eval,
304 | global_step=global_step,
305 | images=image_dict,
306 | audios=audio_dict,
307 | audio_sampling_rate=hps.data.sampling_rate
308 | )
309 | generator.train()
310 |
311 |
312 | if __name__ == "__main__":
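    | # DETAIL-level distributed debug logging helps diagnose DDP hangs and
    | # parameter desyncs across ranks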
313 | os.environ[
314 | "TORCH_DISTRIBUTED_DEBUG"
315 | ] = "DETAIL"
316 | main()
317 |
--------------------------------------------------------------------------------
/attentions.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 |
8 | import commons
9 | import modules
10 | from modules import LayerNorm
11 |
12 |
13 | class Encoder(nn.Module):
14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15 | super().__init__()
16 | self.hidden_channels = hidden_channels
17 | self.filter_channels = filter_channels
18 | self.n_heads = n_heads
19 | self.n_layers = n_layers
20 | self.kernel_size = kernel_size
21 | self.p_dropout = p_dropout
22 | self.window_size = window_size
23 |
24 | self.drop = nn.Dropout(p_dropout)
25 | self.attn_layers = nn.ModuleList()
26 | self.norm_layers_1 = nn.ModuleList()
27 | self.ffn_layers = nn.ModuleList()
28 | self.norm_layers_2 = nn.ModuleList()
29 | for i in range(self.n_layers):
30 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31 | self.norm_layers_1.append(LayerNorm(hidden_channels))
32 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33 | self.norm_layers_2.append(LayerNorm(hidden_channels))
34 |
35 | def forward(self, x, x_mask):
36 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37 | x = x * x_mask
38 | for i in range(self.n_layers):
39 | y = self.attn_layers[i](x, x, attn_mask)
40 | y = self.drop(y)
41 | x = self.norm_layers_1[i](x + y)
42 |
43 | y = self.ffn_layers[i](x, x_mask)
44 | y = self.drop(y)
45 | x = self.norm_layers_2[i](x + y)
46 | x = x * x_mask
47 | return x
48 |
49 |
50 | class Decoder(nn.Module):
51 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52 | super().__init__()
53 | self.hidden_channels = hidden_channels
54 | self.filter_channels = filter_channels
55 | self.n_heads = n_heads
56 | self.n_layers = n_layers
57 | self.kernel_size = kernel_size
58 | self.p_dropout = p_dropout
59 | self.proximal_bias = proximal_bias
60 | self.proximal_init = proximal_init
61 |
62 | self.drop = nn.Dropout(p_dropout)
63 | self.self_attn_layers = nn.ModuleList()
64 | self.norm_layers_0 = nn.ModuleList()
65 | self.encdec_attn_layers = nn.ModuleList()
66 | self.norm_layers_1 = nn.ModuleList()
67 | self.ffn_layers = nn.ModuleList()
68 | self.norm_layers_2 = nn.ModuleList()
69 | for i in range(self.n_layers):
70 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71 | self.norm_layers_0.append(LayerNorm(hidden_channels))
72 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73 | self.norm_layers_1.append(LayerNorm(hidden_channels))
74 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75 | self.norm_layers_2.append(LayerNorm(hidden_channels))
76 |
77 | def forward(self, x, x_mask, h, h_mask):
78 | """
79 | x: decoder input
80 | h: encoder output
81 | """
82 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84 | x = x * x_mask
85 | for i in range(self.n_layers):
86 | y = self.self_attn_layers[i](x, x, self_attn_mask)
87 | y = self.drop(y)
88 | x = self.norm_layers_0[i](x + y)
89 |
90 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91 | y = self.drop(y)
92 | x = self.norm_layers_1[i](x + y)
93 |
94 | y = self.ffn_layers[i](x, x_mask)
95 | y = self.drop(y)
96 | x = self.norm_layers_2[i](x + y)
97 | x = x * x_mask
98 | return x
99 |
100 |
101 | class MultiHeadAttention(nn.Module):
102 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103 | super().__init__()
104 | assert channels % n_heads == 0
105 |
106 | self.channels = channels
107 | self.out_channels = out_channels
108 | self.n_heads = n_heads
109 | self.p_dropout = p_dropout
110 | self.window_size = window_size
111 | self.heads_share = heads_share
112 | self.block_length = block_length
113 | self.proximal_bias = proximal_bias
114 | self.proximal_init = proximal_init
115 | self.attn = None
116 |
117 | self.k_channels = channels // n_heads
118 | self.conv_q = nn.Conv1d(channels, channels, 1)
119 | self.conv_k = nn.Conv1d(channels, channels, 1)
120 | self.conv_v = nn.Conv1d(channels, channels, 1)
121 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
122 | self.drop = nn.Dropout(p_dropout)
123 |
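    | # learnable relative positional embeddings over a +/- window_size window,
    | # in the style of Shaw et al. (2018)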
124 | if window_size is not None:
125 | n_heads_rel = 1 if heads_share else n_heads
126 | rel_stddev = self.k_channels**-0.5
127 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129 |
130 | nn.init.xavier_uniform_(self.conv_q.weight)
131 | nn.init.xavier_uniform_(self.conv_k.weight)
132 | nn.init.xavier_uniform_(self.conv_v.weight)
133 | if proximal_init:
134 | with torch.no_grad():
135 | self.conv_k.weight.copy_(self.conv_q.weight)
136 | self.conv_k.bias.copy_(self.conv_q.bias)
137 |
138 | def forward(self, x, c, attn_mask=None):
139 | q = self.conv_q(x)
140 | k = self.conv_k(c)
141 | v = self.conv_v(c)
142 |
143 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
144 |
145 | x = self.conv_o(x)
146 | return x
147 |
148 | def attention(self, query, key, value, mask=None):
149 | # reshape [b, d, t] -> [b, n_h, t, d_k]
150 | b, d, t_s, t_t = (*key.size(), query.size(2))
151 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154 |
155 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156 | if self.window_size is not None:
157 | assert t_s == t_t, "Relative attention is only available for self-attention."
158 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159 | rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
160 | scores_local = self._relative_position_to_absolute_position(rel_logits)
161 | scores = scores + scores_local
162 | if self.proximal_bias:
163 | assert t_s == t_t, "Proximal bias is only available for self-attention."
164 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165 | if mask is not None:
166 | scores = scores.masked_fill(mask == 0, -1e4)
167 | if self.block_length is not None:
168 | assert t_s == t_t, "Local attention is only available for self-attention."
169 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170 | scores = scores.masked_fill(block_mask == 0, -1e4)
171 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172 | p_attn = self.drop(p_attn)
173 | output = torch.matmul(p_attn, value)
174 | if self.window_size is not None:
175 | relative_weights = self._absolute_position_to_relative_position(p_attn)
176 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179 | return output, p_attn
180 |
181 | def _matmul_with_relative_values(self, x, y):
182 | """
183 | x: [b, h, l, m]
184 | y: [h or 1, m, d]
185 | ret: [b, h, l, d]
186 | """
187 | ret = torch.matmul(x, y.unsqueeze(0))
188 | return ret
189 |
190 | def _matmul_with_relative_keys(self, x, y):
191 | """
192 | x: [b, h, l, d]
193 | y: [h or 1, m, d]
194 | ret: [b, h, l, m]
195 | """
196 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197 | return ret
198 |
199 | def _get_relative_embeddings(self, relative_embeddings, length):
200 | max_relative_position = 2 * self.window_size + 1
201 | # Pad first before slice to avoid using cond ops.
202 | pad_length = max(length - (self.window_size + 1), 0)
203 | slice_start_position = max((self.window_size + 1) - length, 0)
204 | slice_end_position = slice_start_position + 2 * length - 1
205 | if pad_length > 0:
206 | padded_relative_embeddings = F.pad(
207 | relative_embeddings,
208 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209 | else:
210 | padded_relative_embeddings = relative_embeddings
211 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212 | return used_relative_embeddings
213 |
214 | def _relative_position_to_absolute_position(self, x):
215 | """
216 | x: [b, h, l, 2*l-1]
217 | ret: [b, h, l, l]
218 | """
219 | batch, heads, length, _ = x.size()
220 | # Concat columns of pad to shift from relative to absolute indexing.
221 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222 |
223 | # Concat extra elements so as to add up to shape (len+1, 2*len-1).
224 | x_flat = x.view([batch, heads, length * 2 * length])
225 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226 |
227 | # Reshape and slice out the padded elements.
228 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229 | return x_final
230 |
231 | def _absolute_position_to_relative_position(self, x):
232 | """
233 | x: [b, h, l, l]
234 | ret: [b, h, l, 2*l-1]
235 | """
236 | batch, heads, length, _ = x.size()
237 | # pad along the column dimension
238 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239 | x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240 | # prepend zeros that skew the elements into place after the reshape
241 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243 | return x_final
244 |
245 | def _attention_bias_proximal(self, length):
246 | """Bias for self-attention to encourage attention to close positions.
247 | Args:
248 | length: an integer scalar.
249 | Returns:
250 | a Tensor with shape [1, 1, length, length]
251 | """
252 | r = torch.arange(length, dtype=torch.float32)
253 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255 |
256 |
257 | class FFN(nn.Module):
258 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259 | super().__init__()
260 | self.in_channels = in_channels
261 | self.out_channels = out_channels
262 | self.filter_channels = filter_channels
263 | self.kernel_size = kernel_size
264 | self.p_dropout = p_dropout
265 | self.activation = activation
266 | self.causal = causal
267 |
268 | if causal:
269 | self.padding = self._causal_padding
270 | else:
271 | self.padding = self._same_padding
272 |
273 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275 | self.drop = nn.Dropout(p_dropout)
276 |
277 | def forward(self, x, x_mask):
278 | x = self.conv_1(self.padding(x * x_mask))
279 | if self.activation == "gelu":
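    | # fast sigmoid approximation of GELU: x * sigmoid(1.702 * x)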
280 | x = x * torch.sigmoid(1.702 * x)
281 | else:
282 | x = torch.relu(x)
283 | x = self.drop(x)
284 | x = self.conv_2(self.padding(x * x_mask))
285 | return x * x_mask
286 |
287 | def _causal_padding(self, x):
288 | if self.kernel_size == 1:
289 | return x
290 | pad_l = self.kernel_size - 1
291 | pad_r = 0
292 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293 | x = F.pad(x, commons.convert_pad_shape(padding))
294 | return x
295 |
296 | def _same_padding(self, x):
297 | if self.kernel_size == 1:
298 | return x
299 | pad_l = (self.kernel_size - 1) // 2
300 | pad_r = self.kernel_size // 2
301 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302 | x = F.pad(x, commons.convert_pad_shape(padding))
303 | return x
304 |
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import scipy
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10 | from torch.nn.utils import weight_norm, remove_weight_norm
11 |
12 | import commons
13 | from commons import init_weights, get_padding
14 | from transforms import piecewise_rational_quadratic_transform
15 |
16 |
17 | LRELU_SLOPE = 0.1
18 |
19 |
20 | class LayerNorm(nn.Module):
21 | def __init__(self, channels, eps=1e-5):
22 | super().__init__()
23 | self.channels = channels
24 | self.eps = eps
25 |
26 | self.gamma = nn.Parameter(torch.ones(channels))
27 | self.beta = nn.Parameter(torch.zeros(channels))
28 |
29 | def forward(self, x):
30 | x = x.transpose(1, -1)
31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32 | return x.transpose(1, -1)
33 |
34 |
35 | class ConvReluNorm(nn.Module):
36 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37 | super().__init__()
38 | self.in_channels = in_channels
39 | self.hidden_channels = hidden_channels
40 | self.out_channels = out_channels
41 | self.kernel_size = kernel_size
42 | self.n_layers = n_layers
43 | self.p_dropout = p_dropout
44 | assert n_layers > 1, "Number of layers should be larger than 1."
45 |
46 | self.conv_layers = nn.ModuleList()
47 | self.norm_layers = nn.ModuleList()
48 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49 | self.norm_layers.append(LayerNorm(hidden_channels))
50 | self.relu_drop = nn.Sequential(
51 | nn.ReLU(),
52 | nn.Dropout(p_dropout))
53 | for _ in range(n_layers-1):
54 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55 | self.norm_layers.append(LayerNorm(hidden_channels))
56 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57 | self.proj.weight.data.zero_()
58 | self.proj.bias.data.zero_()
59 |
60 | def forward(self, x, x_mask):
61 | x_org = x
62 | for i in range(self.n_layers):
63 | x = self.conv_layers[i](x * x_mask)
64 | x = self.norm_layers[i](x)
65 | x = self.relu_drop(x)
66 | x = x_org + self.proj(x)
67 | return x * x_mask
68 |
69 |
70 | class DDSConv(nn.Module):
71 | """
72 | Dilated and Depth-Separable Convolution
73 | """
74 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75 | super().__init__()
76 | self.channels = channels
77 | self.kernel_size = kernel_size
78 | self.n_layers = n_layers
79 | self.p_dropout = p_dropout
80 |
81 | self.drop = nn.Dropout(p_dropout)
82 | self.convs_sep = nn.ModuleList()
83 | self.convs_1x1 = nn.ModuleList()
84 | self.norms_1 = nn.ModuleList()
85 | self.norms_2 = nn.ModuleList()
86 | for i in range(n_layers):
87 | dilation = kernel_size ** i
88 | padding = (kernel_size * dilation - dilation) // 2
89 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90 | groups=channels, dilation=dilation, padding=padding
91 | ))
92 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93 | self.norms_1.append(LayerNorm(channels))
94 | self.norms_2.append(LayerNorm(channels))
95 |
96 | def forward(self, x, x_mask, g=None):
97 | if g is not None:
98 | x = x + g
99 | for i in range(self.n_layers):
100 | y = self.convs_sep[i](x * x_mask)
101 | y = self.norms_1[i](y)
102 | y = F.gelu(y)
103 | y = self.convs_1x1[i](y)
104 | y = self.norms_2[i](y)
105 | y = F.gelu(y)
106 | y = self.drop(y)
107 | x = x + y
108 | return x * x_mask
109 |
110 |
111 | class WN(torch.nn.Module):
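    | # WaveNet-style stack of dilated convolutions with gated tanh/sigmoid
    | # activations; optional global conditioning g (e.g. a speaker embedding)
    | # is injected through cond_layer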
112 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113 | super(WN, self).__init__()
114 | assert kernel_size % 2 == 1
115 | self.hidden_channels = hidden_channels
116 | self.kernel_size = kernel_size
117 | self.dilation_rate = dilation_rate
118 | self.n_layers = n_layers
119 | self.gin_channels = gin_channels
120 | self.p_dropout = p_dropout
121 |
122 | self.in_layers = torch.nn.ModuleList()
123 | self.res_skip_layers = torch.nn.ModuleList()
124 | self.drop = nn.Dropout(p_dropout)
125 |
126 | if gin_channels != 0:
127 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129 |
130 | for i in range(n_layers):
131 | dilation = dilation_rate ** i
132 | padding = int((kernel_size * dilation - dilation) / 2)
133 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134 | dilation=dilation, padding=padding)
135 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136 | self.in_layers.append(in_layer)
137 |
138 | # the last layer outputs only the skip connection; no residual is needed
139 | if i < n_layers - 1:
140 | res_skip_channels = 2 * hidden_channels
141 | else:
142 | res_skip_channels = hidden_channels
143 |
144 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146 | self.res_skip_layers.append(res_skip_layer)
147 |
148 | def forward(self, x, x_mask, g=None, **kwargs):
149 | output = torch.zeros_like(x)
150 | n_channels_tensor = torch.IntTensor([self.hidden_channels])
151 |
152 | if g is not None:
153 | g = self.cond_layer(g)
154 |
155 | for i in range(self.n_layers):
156 | x_in = self.in_layers[i](x)
157 | if g is not None:
158 | cond_offset = i * 2 * self.hidden_channels
159 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160 | else:
161 | g_l = torch.zeros_like(x_in)
162 |
163 | acts = commons.fused_add_tanh_sigmoid_multiply(
164 | x_in,
165 | g_l,
166 | n_channels_tensor)
167 | acts = self.drop(acts)
168 |
169 | res_skip_acts = self.res_skip_layers[i](acts)
170 | if i < self.n_layers - 1:
171 | res_acts = res_skip_acts[:,:self.hidden_channels,:]
172 | x = (x + res_acts) * x_mask
173 | output = output + res_skip_acts[:,self.hidden_channels:,:]
174 | else:
175 | output = output + res_skip_acts
176 | return output * x_mask
177 |
178 | def remove_weight_norm(self):
179 | if self.gin_channels != 0:
180 | torch.nn.utils.remove_weight_norm(self.cond_layer)
181 | for l in self.in_layers:
182 | torch.nn.utils.remove_weight_norm(l)
183 | for l in self.res_skip_layers:
184 | torch.nn.utils.remove_weight_norm(l)
185 |
186 |
187 | class ResBlock1(torch.nn.Module):
188 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
189 | super(ResBlock1, self).__init__()
190 | self.convs1 = nn.ModuleList([
191 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
192 | padding=get_padding(kernel_size, dilation[0]))),
193 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
194 | padding=get_padding(kernel_size, dilation[1]))),
195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
196 | padding=get_padding(kernel_size, dilation[2])))
197 | ])
198 | self.convs1.apply(init_weights)
199 |
200 | self.convs2 = nn.ModuleList([
201 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
202 | padding=get_padding(kernel_size, 1))),
203 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
204 | padding=get_padding(kernel_size, 1))),
205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
206 | padding=get_padding(kernel_size, 1)))
207 | ])
208 | self.convs2.apply(init_weights)
209 |
210 | def forward(self, x, x_mask=None):
211 | for c1, c2 in zip(self.convs1, self.convs2):
212 | xt = F.leaky_relu(x, LRELU_SLOPE)
213 | if x_mask is not None:
214 | xt = xt * x_mask
215 | xt = c1(xt)
216 | xt = F.leaky_relu(xt, LRELU_SLOPE)
217 | if x_mask is not None:
218 | xt = xt * x_mask
219 | xt = c2(xt)
220 | x = xt + x
221 | if x_mask is not None:
222 | x = x * x_mask
223 | return x
224 |
225 | def remove_weight_norm(self):
226 | for l in self.convs1:
227 | remove_weight_norm(l)
228 | for l in self.convs2:
229 | remove_weight_norm(l)
230 |
231 |
232 | class ResBlock2(torch.nn.Module):
233 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
234 | super(ResBlock2, self).__init__()
235 | self.convs = nn.ModuleList([
236 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
237 | padding=get_padding(kernel_size, dilation[0]))),
238 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
239 | padding=get_padding(kernel_size, dilation[1])))
240 | ])
241 | self.convs.apply(init_weights)
242 |
243 | def forward(self, x, x_mask=None):
244 | for c in self.convs:
245 | xt = F.leaky_relu(x, LRELU_SLOPE)
246 | if x_mask is not None:
247 | xt = xt * x_mask
248 | xt = c(xt)
249 | x = xt + x
250 | if x_mask is not None:
251 | x = x * x_mask
252 | return x
253 |
254 | def remove_weight_norm(self):
255 | for l in self.convs:
256 | remove_weight_norm(l)
257 |
258 |
259 | class Log(nn.Module):
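    | # invertible log transform: y = log(x), with log|det J| = sum(-y)
    | # since d(log x)/dx = 1/x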
260 | def forward(self, x, x_mask, reverse=False, **kwargs):
261 | if not reverse:
262 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263 | logdet = torch.sum(-y, [1, 2])
264 | return y, logdet
265 | else:
266 | x = torch.exp(x) * x_mask
267 | return x
268 |
269 |
270 | class Flip(nn.Module):
271 | def forward(self, x, *args, reverse=False, **kwargs):
272 | x = torch.flip(x, [1])
273 | if not reverse:
274 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275 | return x, logdet
276 | else:
277 | return x
278 |
279 |
280 | class ElementwiseAffine(nn.Module):
281 | def __init__(self, channels):
282 | super().__init__()
283 | self.channels = channels
284 | self.m = nn.Parameter(torch.zeros(channels,1))
285 | self.logs = nn.Parameter(torch.zeros(channels,1))
286 |
287 | def forward(self, x, x_mask, reverse=False, **kwargs):
288 | if not reverse:
289 | y = self.m + torch.exp(self.logs) * x
290 | y = y * x_mask
291 | logdet = torch.sum(self.logs * x_mask, [1,2])
292 | return y, logdet
293 | else:
294 | x = (x - self.m) * torch.exp(-self.logs) * x_mask
295 | return x
296 |
297 |
298 | class ResidualCouplingLayer(nn.Module):
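    | # affine coupling layer: the first half of the channels passes through
    | # unchanged and predicts a shift (and, unless mean_only, a log-scale)
    | # for the second half, so the inverse is cheap to compute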
299 | def __init__(self,
300 | channels,
301 | hidden_channels,
302 | kernel_size,
303 | dilation_rate,
304 | n_layers,
305 | p_dropout=0,
306 | gin_channels=0,
307 | mean_only=False):
308 | assert channels % 2 == 0, "channels should be divisible by 2"
309 | super().__init__()
310 | self.channels = channels
311 | self.hidden_channels = hidden_channels
312 | self.kernel_size = kernel_size
313 | self.dilation_rate = dilation_rate
314 | self.n_layers = n_layers
315 | self.half_channels = channels // 2
316 | self.mean_only = mean_only
317 |
318 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321 | self.post.weight.data.zero_()
322 | self.post.bias.data.zero_()
323 |
324 | def forward(self, x, x_mask, g=None, reverse=False):
325 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326 | h = self.pre(x0) * x_mask
327 | h = self.enc(h, x_mask, g=g)
328 | stats = self.post(h) * x_mask
329 | if not self.mean_only:
330 | m, logs = torch.split(stats, [self.half_channels]*2, 1)
331 | else:
332 | m = stats
333 | logs = torch.zeros_like(m)
334 |
335 | if not reverse:
336 | x1 = m + x1 * torch.exp(logs) * x_mask
337 | x = torch.cat([x0, x1], 1)
338 | logdet = torch.sum(logs, [1,2])
339 | return x, logdet
340 | else:
341 | x1 = (x1 - m) * torch.exp(-logs) * x_mask
342 | x = torch.cat([x0, x1], 1)
343 | return x
344 |
345 |
346 | class ConvFlow(nn.Module):
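    | # convolutional neural spline flow: predicts piecewise rational-quadratic
    | # spline parameters (widths, heights, derivatives) for half the channels,
    | # as in Durkan et al. (2019)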
347 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348 | super().__init__()
349 | self.in_channels = in_channels
350 | self.filter_channels = filter_channels
351 | self.kernel_size = kernel_size
352 | self.n_layers = n_layers
353 | self.num_bins = num_bins
354 | self.tail_bound = tail_bound
355 | self.half_channels = in_channels // 2
356 |
357 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360 | self.proj.weight.data.zero_()
361 | self.proj.bias.data.zero_()
362 |
363 | def forward(self, x, x_mask, g=None, reverse=False):
364 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365 | h = self.pre(x0)
366 | h = self.convs(h, x_mask, g=g)
367 | h = self.proj(h) * x_mask
368 |
369 | b, c, t = x0.shape
370 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, c*n_params, t] -> [b, c, t, n_params]
371 |
372 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374 | unnormalized_derivatives = h[..., 2 * self.num_bins:]
375 |
376 | x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377 | unnormalized_widths,
378 | unnormalized_heights,
379 | unnormalized_derivatives,
380 | inverse=reverse,
381 | tails='linear',
382 | tail_bound=self.tail_bound
383 | )
384 |
385 | x = torch.cat([x0, x1], 1) * x_mask
386 | logdet = torch.sum(logabsdet * x_mask, [1,2])
387 | if not reverse:
388 | return x, logdet
389 | else:
390 | return x
391 |
--------------------------------------------------------------------------------