├── DUMMY1
├── .gitignore
├── fig
│   └── proposed_model.png
├── text
│   ├── __pycache__
│   │   ├── __init__.cpython-38.pyc
│   │   ├── cleaners.cpython-38.pyc
│   │   └── symbols.cpython-38.pyc
│   ├── symbols.py
│   ├── LICENSE
│   ├── __init__.py
│   └── cleaners.py
├── monotonic_align
│   ├── build
│   │   └── temp.linux-x86_64-3.8
│   │       └── core.o
│   ├── __pycache__
│   │   └── __init__.cpython-38.pyc
│   ├── monotonic_align
│   │   └── core.cpython-38-x86_64-linux-gnu.so
│   ├── setup.py
│   ├── __init__.py
│   └── core.pyx
├── requirements.txt
├── preprocess.py
├── configs
│   ├── ljs_istft_vits.json
│   ├── ljs_mini_istft_vits.json
│   ├── ljs_mini_mb_istft_vits.json
│   ├── ljs_mb_istft_vits.json
│   └── ljs_ms_istft_vits.json
├── losses.py
├── inference.ipynb
├── README.md
├── pqmf.py
├── stft_loss.py
├── commons.py
├── mel_processing.py
├── filelists
│   ├── vctk_audio_sid_text_val_filelist.txt
│   ├── vctk_audio_sid_text_val_filelist.txt.cleaned
│   ├── ljs_audio_text_val_filelist.txt
│   └── ljs_audio_text_val_filelist.txt.cleaned
├── utils.py
├── transforms.py
├── stft.py
├── LICENSE
├── attentions.py
├── train_latest.py
├── modules.py
├── data_utils.py
└── models.py
/DUMMY1:
--------------------------------------------------------------------------------
1 | /home/hcy71/DATA/LJSpeech-1.1/wavs
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | DUMMY1
2 | logs
3 | *.pyc
4 | __pycache__
--------------------------------------------------------------------------------
/fig/proposed_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/fig/proposed_model.png
--------------------------------------------------------------------------------
/text/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/text/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/text/__pycache__/cleaners.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/text/__pycache__/cleaners.cpython-38.pyc
--------------------------------------------------------------------------------
/text/__pycache__/symbols.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/text/__pycache__/symbols.cpython-38.pyc
--------------------------------------------------------------------------------
/monotonic_align/build/temp.linux-x86_64-3.8/core.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/monotonic_align/build/temp.linux-x86_64-3.8/core.o
--------------------------------------------------------------------------------
/monotonic_align/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/monotonic_align/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/monotonic_align/monotonic_align/core.cpython-38-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hcy71o/MB-iSTFT-VITS-with-AutoVocoder/HEAD/monotonic_align/monotonic_align/core.cpython-38-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython==0.29.21
2 | librosa==0.8.0
3 | matplotlib==3.3.1
4 | numpy==1.18.5
5 | phonemizer==2.2.1
6 | scipy==1.5.2
7 | tensorboard==2.3.0
8 | torch==1.6.0
9 | torchvision==0.7.0
10 | Unidecode==1.1.1
11 |
--------------------------------------------------------------------------------
/monotonic_align/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | import numpy
4 |
5 | setup(
6 | name = 'monotonic_align',
7 | ext_modules = cythonize("core.pyx"),
8 | include_dirs=[numpy.get_include()]
9 | )
10 |
--------------------------------------------------------------------------------
/text/symbols.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Defines the set of symbols used in text input to the model.
5 | '''
6 | _pad = '_'
7 | _punctuation = ';:,.!?¡¿—…"«»“” '
8 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
9 | _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
10 |
11 |
12 | # Export all symbols:
13 | symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
14 |
15 | # Special symbol ids
16 | SPACE_ID = symbols.index(" ")
17 |
--------------------------------------------------------------------------------
/monotonic_align/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from .monotonic_align.core import maximum_path_c
4 |
5 |
6 | def maximum_path(neg_cent, mask):
7 | """ Cython optimized version.
8 | neg_cent: [b, t_t, t_s]
9 | mask: [b, t_t, t_s]
10 | """
11 | device = neg_cent.device
12 | dtype = neg_cent.dtype
13 | neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
14 | path = np.zeros(neg_cent.shape, dtype=np.int32)
15 |
16 | t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
17 | t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
18 | maximum_path_c(path, neg_cent, t_t_max, t_s_max)
19 | return torch.from_numpy(path).to(device=device, dtype=dtype)
20 |
--------------------------------------------------------------------------------
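The wrapper above exposes a single entry point. A minimal usage sketch (our illustration, assuming the extension has been built as described in the README):

```python
# Hypothetical usage of monotonic_align.maximum_path; shapes follow the docstring.
import torch
import monotonic_align  # requires: cd monotonic_align && python setup.py build_ext --inplace

b, t_t, t_s = 2, 50, 20                    # batch, rows, columns (t_s <= t_t for a valid path)
neg_cent = torch.randn(b, t_t, t_s)        # alignment scores
mask = torch.ones(b, t_t, t_s)             # marks valid positions per batch element
path = monotonic_align.maximum_path(neg_cent, mask)  # hard monotonic path, same shape
```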
/text/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Keith Ito
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import text
3 | from utils import load_filepaths_and_text
4 |
5 | if __name__ == '__main__':
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--out_extension", default="cleaned")
8 | parser.add_argument("--text_index", default=1, type=int)
9 | parser.add_argument("--filelists", nargs="+", default=["filelists/ljs_audio_text_val_filelist.txt", "filelists/ljs_audio_text_test_filelist.txt"])
10 | parser.add_argument("--text_cleaners", nargs="+", default=["english_cleaners2"])
11 |
12 | args = parser.parse_args()
13 |
14 |
15 | for filelist in args.filelists:
16 | print("START:", filelist)
17 | filepaths_and_text = load_filepaths_and_text(filelist)
18 | for i in range(len(filepaths_and_text)):
19 | original_text = filepaths_and_text[i][args.text_index]
20 | cleaned_text = text._clean_text(original_text, args.text_cleaners)
21 | filepaths_and_text[i][args.text_index] = cleaned_text
22 |
23 | new_filelist = filelist + "." + args.out_extension
24 | with open(new_filelist, "w", encoding="utf-8") as f:
25 | f.writelines(["|".join(x) + "\n" for x in filepaths_and_text])
26 |
--------------------------------------------------------------------------------
/monotonic_align/core.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | from cython.parallel import prange
3 |
4 |
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
8 | cdef int x
9 | cdef int y
10 | cdef float v_prev
11 | cdef float v_cur
12 | cdef float tmp
13 | cdef int index = t_x - 1
14 |
15 | for y in range(t_y):
16 | for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
17 | if x == y:
18 | v_cur = max_neg_val
19 | else:
20 | v_cur = value[y-1, x]
21 | if x == 0:
22 | if y == 0:
23 | v_prev = 0.
24 | else:
25 | v_prev = max_neg_val
26 | else:
27 | v_prev = value[y-1, x-1]
28 | value[y, x] += max(v_prev, v_cur)
29 |
30 | for y in range(t_y - 1, -1, -1):
31 | path[y, index] = 1
32 | if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
33 | index = index - 1
34 |
35 |
36 | @cython.boundscheck(False)
37 | @cython.wraparound(False)
38 | cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
39 | cdef int b = paths.shape[0]
40 | cdef int i
41 | for i in prange(b, nogil=True):
42 | maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
43 |
--------------------------------------------------------------------------------
/text/__init__.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 | from text import cleaners
3 | from text.symbols import symbols
4 |
5 |
6 | # Mappings from symbol to numeric ID and vice versa:
7 | _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8 | _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9 |
10 |
11 | def text_to_sequence(text, cleaner_names):
12 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13 | Args:
14 | text: string to convert to a sequence
15 | cleaner_names: names of the cleaner functions to run the text through
16 | Returns:
17 | List of integers corresponding to the symbols in the text
18 | '''
19 | sequence = []
20 |
21 | clean_text = _clean_text(text, cleaner_names)
22 | for symbol in clean_text:
23 | symbol_id = _symbol_to_id[symbol]
24 | sequence += [symbol_id]
25 | return sequence
26 |
27 |
28 | def cleaned_text_to_sequence(cleaned_text):
29 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
30 | Args:
31 |       cleaned_text: string of cleaned text to convert to a sequence
32 | Returns:
33 | List of integers corresponding to the symbols in the text
34 | '''
35 | sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
36 | return sequence
37 |
38 |
39 | def sequence_to_text(sequence):
40 | '''Converts a sequence of IDs back to a string'''
41 | result = ''
42 | for symbol_id in sequence:
43 | s = _id_to_symbol[symbol_id]
44 | result += s
45 | return result
46 |
47 |
48 | def _clean_text(text, cleaner_names):
49 | for name in cleaner_names:
50 | cleaner = getattr(cleaners, name)
51 | if not cleaner:
52 | raise Exception('Unknown cleaner: %s' % name)
53 | text = cleaner(text)
54 | return text
55 |
--------------------------------------------------------------------------------
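A minimal usage sketch for the mapping functions above (our example; `english_cleaners2` additionally requires the espeak backend listed in the README's prerequisites):

```python
# Hypothetical example: text -> symbol ids -> phoneme string.
from text import text_to_sequence, sequence_to_text

ids = text_to_sequence("Hello world.", ["english_cleaners2"])  # phonemize, then map to ids
print(sequence_to_text(ids))                                   # IPA phoneme string
```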
/configs/ljs_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
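A consistency property worth noting across these configs (our reading of iSTFT-VITS conventions; the repo does not state it): the decoder's total upsampling factor should equal `data.hop_length` so the generated waveform aligns with the input frames.

```python
# iSTFT-VITS-style configs: prod(upsample_rates) * gen_istft_hop_size (* subbands) = hop_length
assert 8 * 8 * 4 == 256           # ljs_istft_vits, ljs_mini_istft_vits
assert 4 * 4 * 4 * 4 == 256       # ljs_mini_mb_istft_vits, ljs_ms_istft_vits
assert 64 * 4 == 256              # ljs_mb_istft_vits: AutoVocoder decoder, no upsampling modules
```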
/configs/ljs_mini_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": false,
42 | "istft_vits": true,
43 | "subbands": false,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [8,8],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mini_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": false,
41 | "mb_istft_vits": true,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 96,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 3,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 256,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/configs/ljs_mb_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_wav": 100,
18 | "c_kl": 1.0,
19 | "fft_sizes": [384, 683, 171],
20 | "hop_sizes": [30, 60, 10],
21 | "win_lengths": [150, 300, 60],
22 | "window": "hann_window"
23 | },
24 | "data": {
25 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
26 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
27 | "text_cleaners":["english_cleaners2"],
28 | "max_wav_value": 32768.0,
29 | "sampling_rate": 22050,
30 | "filter_length": 1024,
31 | "hop_length": 256,
32 | "win_length": 1024,
33 | "n_mel_channels": 80,
34 | "mel_fmin": 0.0,
35 | "mel_fmax": null,
36 | "add_blank": true,
37 | "n_speakers": 0,
38 | "cleaned_text": true
39 | },
40 | "model": {
41 | "ms_istft_vits": false,
42 | "mb_istft_vits": true,
43 | "istft_vits": false,
44 | "subbands": 4,
45 | "n_blocks": 11,
46 | "latent_dim": 192,
47 | "gen_istft_n_fft": 256,
48 | "gen_istft_hop_size": 64,
49 | "inter_channels": 192,
50 | "hidden_channels": 192,
51 | "filter_channels": 768,
52 | "n_heads": 2,
53 | "n_layers": 6,
54 | "kernel_size": 3,
55 | "p_dropout": 0.1,
56 | "resblock": "1",
57 | "resblock_kernel_sizes": [3,7,11],
58 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
59 | "upsample_rates": [4,4],
60 | "upsample_initial_channel": 512,
61 | "upsample_kernel_sizes": [16,16],
62 | "n_layers_q": 3,
63 | "use_spectral_norm": false,
64 | "use_sdp": false
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/losses.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 | from stft_loss import MultiResolutionSTFTLoss
4 |
5 |
6 | import commons
7 |
8 |
9 | def feature_loss(fmap_r, fmap_g):
10 | loss = 0
11 | for dr, dg in zip(fmap_r, fmap_g):
12 | for rl, gl in zip(dr, dg):
13 | rl = rl.float().detach()
14 | gl = gl.float()
15 | loss += torch.mean(torch.abs(rl - gl))
16 |
17 | return loss * 2
18 |
19 |
20 | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
21 | loss = 0
22 | r_losses = []
23 | g_losses = []
24 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
25 | dr = dr.float()
26 | dg = dg.float()
27 | r_loss = torch.mean((1-dr)**2)
28 | g_loss = torch.mean(dg**2)
29 | loss += (r_loss + g_loss)
30 | r_losses.append(r_loss.item())
31 | g_losses.append(g_loss.item())
32 |
33 | return loss, r_losses, g_losses
34 |
35 |
36 | def generator_loss(disc_outputs):
37 | loss = 0
38 | gen_losses = []
39 | for dg in disc_outputs:
40 | dg = dg.float()
41 | l = torch.mean((1-dg)**2)
42 | gen_losses.append(l)
43 | loss += l
44 |
45 | return loss, gen_losses
46 |
47 |
48 | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
49 | """
50 | z_p, logs_q: [b, h, t_t]
51 | m_p, logs_p: [b, h, t_t]
52 | """
53 | z_p = z_p.float()
54 | logs_q = logs_q.float()
55 | m_p = m_p.float()
56 | logs_p = logs_p.float()
57 | z_mask = z_mask.float()
58 |
59 | kl = logs_p - logs_q - 0.5
60 | kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
61 | kl = torch.sum(kl * z_mask)
62 | l = kl / torch.sum(z_mask)
63 | return l
64 |
65 | def subband_stft_loss(h, y_mb, y_hat_mb):
66 | sub_stft_loss = MultiResolutionSTFTLoss(h.train.fft_sizes, h.train.hop_sizes, h.train.win_lengths)
67 | y_mb = y_mb.view(-1, y_mb.size(2))
68 | y_hat_mb = y_hat_mb.view(-1, y_hat_mb.size(2))
69 | sub_sc_loss, sub_mag_loss = sub_stft_loss(y_hat_mb[:, :y_mb.size(-1)], y_mb)
70 | return sub_sc_loss+sub_mag_loss
71 |
72 |
--------------------------------------------------------------------------------
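For reference (our gloss, not stated in the source): with prior $p=\mathcal{N}(m_p, e^{2\,\mathrm{logs}_p})$ and a posterior sample $z_p \sim q = \mathcal{N}(m_q, e^{2\,\mathrm{logs}_q})$, the masked mean that `kl_loss` computes,

$$\log\sigma_p - \log\sigma_q - \frac{1}{2} + \frac{(z_p - m_p)^2}{2\sigma_p^2},$$

is an unbiased single-sample estimate of $\mathrm{KL}(q\,\|\,p)$ per dimension (the posterior's quadratic term is replaced by its expectation $\tfrac{1}{2}$), matching `logs_p - logs_q - 0.5 + 0.5*(z_p - m_p)**2 * torch.exp(-2*logs_p)` in the code.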
/configs/ljs_ms_istft_vits.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": {
3 | "log_interval": 200,
4 | "eval_interval": 100000,
5 | "seed": 1234,
6 | "epochs": 20000,
7 | "learning_rate": 2e-4,
8 | "betas": [0.8, 0.99],
9 | "eps": 1e-9,
10 | "batch_size": 64,
11 | "fp16_run": false,
12 | "lr_decay": 0.999875,
13 | "segment_size": 8192,
14 | "init_lr_ratio": 1,
15 | "warmup_epochs": 0,
16 | "c_mel": 45,
17 | "c_kl": 1.0,
18 | "fft_sizes": [384, 683, 171],
19 | "hop_sizes": [30, 60, 10],
20 | "win_lengths": [150, 300, 60],
21 | "window": "hann_window"
22 | },
23 | "data": {
24 | "training_files":"filelists/ljs_audio_text_train_filelist.txt.cleaned",
25 | "validation_files":"filelists/ljs_audio_text_val_filelist.txt.cleaned",
26 | "text_cleaners":["english_cleaners2"],
27 | "max_wav_value": 32768.0,
28 | "sampling_rate": 22050,
29 | "filter_length": 1024,
30 | "hop_length": 256,
31 | "win_length": 1024,
32 | "n_mel_channels": 80,
33 | "mel_fmin": 0.0,
34 | "mel_fmax": null,
35 | "add_blank": true,
36 | "n_speakers": 0,
37 | "cleaned_text": true
38 | },
39 | "model": {
40 | "ms_istft_vits": true,
41 | "mb_istft_vits": false,
42 | "istft_vits": false,
43 | "subbands": 4,
44 | "gen_istft_n_fft": 16,
45 | "gen_istft_hop_size": 4,
46 | "inter_channels": 192,
47 | "hidden_channels": 192,
48 | "filter_channels": 768,
49 | "n_heads": 2,
50 | "n_layers": 6,
51 | "kernel_size": 3,
52 | "p_dropout": 0.1,
53 | "resblock": "1",
54 | "resblock_kernel_sizes": [3,7,11],
55 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
56 | "upsample_rates": [4,4],
57 | "upsample_initial_channel": 512,
58 | "upsample_kernel_sizes": [16,16],
59 | "n_layers_q": 3,
60 | "use_spectral_norm": false,
61 | "use_sdp": false
62 | }
63 |
64 | }
65 |
--------------------------------------------------------------------------------
/inference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%matplotlib inline\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "import IPython.display as ipd\n",
12 | "\n",
13 | "import os\n",
14 | "import json\n",
15 | "import math\n",
16 | "import torch\n",
17 | "from torch import nn\n",
18 | "from torch.nn import functional as F\n",
19 | "from torch.utils.data import DataLoader\n",
20 | "\n",
21 | "import commons\n",
22 | "import utils\n",
23 | "from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate\n",
24 | "from models import SynthesizerTrn\n",
25 | "from text.symbols import symbols\n",
26 | "from text import text_to_sequence\n",
27 | "\n",
28 | "from scipy.io.wavfile import write\n",
29 | "\n",
30 | "\n",
31 | "def get_text(text, hps):\n",
32 | " text_norm = text_to_sequence(text, hps.data.text_cleaners)\n",
33 | " if hps.data.add_blank:\n",
34 | " text_norm = commons.intersperse(text_norm, 0)\n",
35 | " text_norm = torch.LongTensor(text_norm)\n",
36 | " return text_norm"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "## MB-iSTFT-VITS"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "hps = utils.get_hparams_from_file(\"./configs/ljs_mb_istft_vits.json\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "net_g = SynthesizerTrn(\n",
62 | " len(symbols),\n",
63 | " hps.data.filter_length // 2 + 1,\n",
64 | " hps.train.segment_size // hps.data.hop_length,\n",
65 | " **hps.model).cuda()\n",
66 | "_ = net_g.eval()\n",
67 | "\n",
68 | "_ = utils.load_checkpoint(\"./logs/ljs_mb_istft_vits/G_800000.pth\", net_g, None)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "stn_tst = get_text(\"This is a sample audio\", hps)\n",
78 | "with torch.no_grad():\n",
79 | " x_tst = stn_tst.cuda().unsqueeze(0)\n",
80 | " x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()\n",
81 | " audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()\n",
82 | "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
83 | ]
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "Python 3",
89 | "language": "python",
90 | "name": "python3"
91 | },
92 | "language_info": {
93 | "codemirror_mode": {
94 | "name": "ipython",
95 | "version": 3
96 | },
97 | "file_extension": ".py",
98 | "mimetype": "text/x-python",
99 | "name": "python",
100 | "nbconvert_exporter": "python",
101 | "pygments_lexer": "ipython3",
102 | "version": "3.8.13"
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 4
107 | }
108 |
--------------------------------------------------------------------------------
/text/cleaners.py:
--------------------------------------------------------------------------------
1 | """ from https://github.com/keithito/tacotron """
2 |
3 | '''
4 | Cleaners are transformations that run over the input text at both training and eval time.
5 |
6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8 | 1. "english_cleaners" for English text
9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12 | the symbols in symbols.py to match your data).
13 | '''
14 |
15 | import re
16 | from unidecode import unidecode
17 | from phonemizer import phonemize
18 |
19 |
20 | # Regular expression matching whitespace:
21 | _whitespace_re = re.compile(r'\s+')
22 |
23 | # List of (regular expression, replacement) pairs for abbreviations:
24 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
25 | ('mrs', 'misess'),
26 | ('mr', 'mister'),
27 | ('dr', 'doctor'),
28 | ('st', 'saint'),
29 | ('co', 'company'),
30 | ('jr', 'junior'),
31 | ('maj', 'major'),
32 | ('gen', 'general'),
33 | ('drs', 'doctors'),
34 | ('rev', 'reverend'),
35 | ('lt', 'lieutenant'),
36 | ('hon', 'honorable'),
37 | ('sgt', 'sergeant'),
38 | ('capt', 'captain'),
39 | ('esq', 'esquire'),
40 | ('ltd', 'limited'),
41 | ('col', 'colonel'),
42 | ('ft', 'fort'),
43 | ]]
44 |
45 |
46 | def expand_abbreviations(text):
47 | for regex, replacement in _abbreviations:
48 | text = re.sub(regex, replacement, text)
49 | return text
50 |
51 |
52 | def expand_numbers(text):
53 |   return normalize_numbers(text)  # NOTE: normalize_numbers is not defined in this repo; unused helper retained from keithito's cleaners
54 |
55 |
56 | def lowercase(text):
57 | return text.lower()
58 |
59 |
60 | def collapse_whitespace(text):
61 | return re.sub(_whitespace_re, ' ', text)
62 |
63 |
64 | def convert_to_ascii(text):
65 | return unidecode(text)
66 |
67 |
68 | def basic_cleaners(text):
69 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
70 | text = lowercase(text)
71 | text = collapse_whitespace(text)
72 | return text
73 |
74 |
75 | def transliteration_cleaners(text):
76 | '''Pipeline for non-English text that transliterates to ASCII.'''
77 | text = convert_to_ascii(text)
78 | text = lowercase(text)
79 | text = collapse_whitespace(text)
80 | return text
81 |
82 |
83 | def english_cleaners(text):
84 | '''Pipeline for English text, including abbreviation expansion.'''
85 | text = convert_to_ascii(text)
86 | text = lowercase(text)
87 | text = expand_abbreviations(text)
88 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
89 | phonemes = collapse_whitespace(phonemes)
90 | return phonemes
91 |
92 |
93 | def english_cleaners2(text):
94 | '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
95 | text = convert_to_ascii(text)
96 | text = lowercase(text)
97 | text = expand_abbreviations(text)
98 | phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
99 | phonemes = collapse_whitespace(phonemes)
100 | return phonemes
101 |
--------------------------------------------------------------------------------
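A quick illustrative call of the default cleaner used by the configs (our example; it needs the espeak backend from the README's prerequisites):

```python
# Hypothetical example; actual phoneme output depends on the installed espeak version.
from text.cleaners import english_cleaners2

print(english_cleaners2("Dr. Smith arrived!"))  # IPA phonemes with stress marks, punctuation preserved
```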
/README.md:
--------------------------------------------------------------------------------
1 | # MB-iSTFT-VITS with AutoVocoder
2 |
3 | ## Motivation for implementation
4 | Starting from [VITS](https://arxiv.org/abs/2106.06103), [MB-iSTFT-VITS](https://arxiv.org/abs/2210.15975) improves synthesis speed with the following techniques:
5 | 1. A multi-band parallel generation strategy that decomposes speech signals into sub-band signals
6 | 2. An iSTFT-based waveform generation process
7 |
8 | Based on this well-designed framework, this repository aims to further improve sound quality and inference speed with [AutoVocoder](https://github.com/hcy71o/AutoVocoder). This repo is based on [MB-iSTFT-VITS](https://github.com/MasayaKawamura/MB-iSTFT-VITS), and the expected modifications and enhancements are as follows:
9 | - [x] 1. Replace the iSTFTNet-based decoder with an AutoVocoder-based decoder.
10 |
11 | - [x] 2. In the iSTFT operation, use real/imaginary components instead of phase/magnitude components to construct the complex spectrogram, and add a time-domain reconstruction loss (a minimal sketch follows this file).
12 |
13 | - [x] 3. Revise the posterior encoder to accept 4 complex components instead of a linear spectrogram.
14 |
15 | * Owing to the nature of VITS, which models powerful latents, AutoVocoder is a fitting application thanks to its autoencoder architecture. It also offers fast inference
16 | by directly generating the waveform with `(1024, 256, 1024)` FFT/hop/window sizes, without upsampling modules. (The multi-band strategy is maintained.)
17 | * In conventional TTS models, including VITS, modeling phase information has been entirely the role of the decoder (vocoder). In `Mod 3.`, by providing phase information to the latents, we test whether the prior can reliably approximate these latents.
18 |
19 | `Disclaimer: This repo is built for testing purposes. Performance is not guaranteed. Contributions are welcome.`
20 |
21 | ## Note
22 | * For easy comparison, we did not change the overall architecture of the posterior encoder. Instead, we only used group convolution in the front part to process the revised inputs (4 complex components).
23 | * Currently, this repo implements an MB-iSTFT-VITS-based model. Applying it to the mini, MS, and without-MB variants is left as future work.
24 |
25 | ## Explanation (from [MB-iSTFT-VITS](https://github.com/MasayaKawamura/MB-iSTFT-VITS))
26 |
27 | ### 0. Baseline: MB-iSTFT-VITS
28 |
29 | ![proposed model](fig/proposed_model.png)
30 |
31 | ### 1. Pre-requisites
32 |
33 | 0. Python >= 3.6
34 | 0. Clone this repository
35 | 0. Install Python requirements. Please refer to [requirements.txt](requirements.txt)
36 | 1. You may need to install espeak first: `apt-get install espeak`
37 | 0. Download datasets
38 | 1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then rename or create a link to the dataset folder: `ln -s /path/to/LJSpeech-1.1/wavs DUMMY1`
39 | 0. Build Monotonic Alignment Search and run preprocessing if you use your own datasets.
40 | ```sh
41 | # Cython-version Monotonic Alignment Search
42 | cd monotonic_align
43 | mkdir monotonic_align
44 | python setup.py build_ext --inplace
45 | ```
46 |
47 | ### 2. Training
48 | In the case of MB-iSTFT-VITS training, run the following script:
49 | ```sh
50 | python train_latest.py -c configs/ljs_mb_istft_vits.json -m ljs_mb_istft_vits
51 |
52 | ```
53 |
54 | After training, you can check the generated audio using [inference.ipynb](inference.ipynb).
55 |
56 | ## References
57 | - MB-iSTFT-VITS: [Paper](https://arxiv.org/abs/2210.15975) / [Code](https://github.com/MasayaKawamura/MB-iSTFT-VITS)
58 | - AutoVocoder: [Paper](https://arxiv.org/abs/2211.06989) / [Code](https://github.com/hcy71o/AutoVocoder) (unofficial)
--------------------------------------------------------------------------------
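As referenced in Mod 2 of the README above, a minimal sketch of the idea (ours, not the repo's decoder code; it assumes a PyTorch recent enough for complex tensors, while requirements.txt pins torch 1.6, whose iSTFT takes the older real-valued layout):

```python
# Reconstruct a waveform directly from predicted real/imaginary components,
# skipping any phase/magnitude parameterization.
import torch

n_fft, hop = 16, 4                             # gen_istft_n_fft / gen_istft_hop_size in the configs
frames = 100                                   # arbitrary frame count for illustration
real = torch.randn(1, n_fft // 2 + 1, frames)  # hypothetical decoder output: real part
imag = torch.randn(1, n_fft // 2 + 1, frames)  # hypothetical decoder output: imaginary part
spec = torch.complex(real, imag)               # complex spectrogram
wav = torch.istft(spec, n_fft, hop_length=hop, win_length=n_fft,
                  window=torch.hann_window(n_fft))  # (1, T) waveform
```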
/pqmf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2020 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """Pseudo QMF modules."""
7 |
8 | import numpy as np
9 | import torch
10 | import torch.nn.functional as F
11 |
12 | from scipy.signal import kaiser
13 |
14 |
15 | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
16 | """Design prototype filter for PQMF.
17 | This method is based on `A Kaiser window approach for the design of prototype
18 | filters of cosine modulated filterbanks`_.
19 | Args:
20 | taps (int): The number of filter taps.
21 | cutoff_ratio (float): Cut-off frequency ratio.
22 | beta (float): Beta coefficient for kaiser window.
23 | Returns:
24 |         ndarray: Impulse response of the prototype filter (taps + 1,).
25 | .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
26 | https://ieeexplore.ieee.org/abstract/document/681427
27 | """
28 | # check the arguments are valid
29 |     assert taps % 2 == 0, "The number of taps must be an even number."
30 | assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
31 |
32 | # make initial filter
33 | omega_c = np.pi * cutoff_ratio
34 | with np.errstate(invalid='ignore'):
35 | h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \
36 | / (np.pi * (np.arange(taps + 1) - 0.5 * taps))
37 | h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
38 |
39 | # apply kaiser window
40 | w = kaiser(taps + 1, beta)
41 | h = h_i * w
42 |
43 | return h
44 |
45 |
46 | class PQMF(torch.nn.Module):
47 | """PQMF module.
48 | This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
49 | .. _`Near-perfect-reconstruction pseudo-QMF banks`:
50 | https://ieeexplore.ieee.org/document/258122
51 | """
52 |
53 | def __init__(self, device, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
54 | """Initilize PQMF module.
55 | Args:
56 | subbands (int): The number of subbands.
57 | taps (int): The number of filter taps.
58 | cutoff_ratio (float): Cut-off frequency ratio.
59 | beta (float): Beta coefficient for kaiser window.
60 | """
61 | super(PQMF, self).__init__()
62 |
63 | # define filter coefficient
64 | h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
65 | h_analysis = np.zeros((subbands, len(h_proto)))
66 | h_synthesis = np.zeros((subbands, len(h_proto)))
67 | for k in range(subbands):
68 | h_analysis[k] = 2 * h_proto * np.cos(
69 | (2 * k + 1) * (np.pi / (2 * subbands)) *
70 | (np.arange(taps + 1) - ((taps - 1) / 2)) +
71 | (-1) ** k * np.pi / 4)
72 | h_synthesis[k] = 2 * h_proto * np.cos(
73 | (2 * k + 1) * (np.pi / (2 * subbands)) *
74 | (np.arange(taps + 1) - ((taps - 1) / 2)) -
75 | (-1) ** k * np.pi / 4)
76 |
77 | # convert to tensor
78 | analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).cuda(device)
79 | synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).cuda(device)
80 |
81 |         # register coefficients as buffer
82 | self.register_buffer("analysis_filter", analysis_filter)
83 | self.register_buffer("synthesis_filter", synthesis_filter)
84 |
85 | # filter for downsampling & upsampling
86 | updown_filter = torch.zeros((subbands, subbands, subbands)).float().cuda(device)
87 | for k in range(subbands):
88 | updown_filter[k, k, 0] = 1.0
89 | self.register_buffer("updown_filter", updown_filter)
90 | self.subbands = subbands
91 |
92 | # keep padding info
93 | self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
94 |
95 | def analysis(self, x):
96 | """Analysis with PQMF.
97 | Args:
98 | x (Tensor): Input tensor (B, 1, T).
99 | Returns:
100 | Tensor: Output tensor (B, subbands, T // subbands).
101 | """
102 | x = F.conv1d(self.pad_fn(x), self.analysis_filter)
103 | return F.conv1d(x, self.updown_filter, stride=self.subbands)
104 |
105 | def synthesis(self, x):
106 | """Synthesis with PQMF.
107 | Args:
108 | x (Tensor): Input tensor (B, subbands, T // subbands).
109 | Returns:
110 | Tensor: Output tensor (B, 1, T).
111 | """
112 |         # NOTE(kan-bayashi): Power will be decreased, so multiply by # subbands here.
113 |         # Not sure if this is the correct way; it is better to check again.
114 | # TODO(kan-bayashi): Understand the reconstruction procedure
115 | x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
116 | return F.conv1d(self.pad_fn(x), self.synthesis_filter)
--------------------------------------------------------------------------------
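A hypothetical round-trip with the PQMF module above (ours; note this implementation moves its filters to CUDA in `__init__`, so a GPU is assumed):

```python
import torch
from pqmf import PQMF

device = torch.device("cuda:0")
pqmf = PQMF(device, subbands=4)                 # 4 sub-bands, as in the mb configs
x = torch.randn(1, 1, 8192, device=device)      # (B, 1, T) waveform
sub = pqmf.analysis(x)                          # (B, 4, T // 4) sub-band signals
x_hat = pqmf.synthesis(sub)                     # (B, 1, T) near-perfect reconstruction
```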
/stft_loss.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Copyright 2019 Tomoki Hayashi
4 | # MIT License (https://opensource.org/licenses/MIT)
5 |
6 | """STFT-based Loss modules."""
7 |
8 | import torch
9 | import torch.nn.functional as F
10 |
11 |
12 | def stft(x, fft_size, hop_size, win_length, window):
13 | """Perform STFT and convert to magnitude spectrogram.
14 | Args:
15 | x (Tensor): Input signal tensor (B, T).
16 | fft_size (int): FFT size.
17 | hop_size (int): Hop size.
18 | win_length (int): Window length.
19 | window (str): Window function type.
20 | Returns:
21 | Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
22 | """
23 | x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device))
24 | real = x_stft[..., 0]
25 | imag = x_stft[..., 1]
26 |
27 | # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
28 | return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
29 |
30 |
31 | class SpectralConvergengeLoss(torch.nn.Module):
32 | """Spectral convergence loss module."""
33 |
34 | def __init__(self):
35 | """Initilize spectral convergence loss module."""
36 | super(SpectralConvergengeLoss, self).__init__()
37 |
38 | def forward(self, x_mag, y_mag):
39 | """Calculate forward propagation.
40 | Args:
41 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
42 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
43 | Returns:
44 | Tensor: Spectral convergence loss value.
45 | """
46 | return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
47 |
48 |
49 | class LogSTFTMagnitudeLoss(torch.nn.Module):
50 | """Log STFT magnitude loss module."""
51 |
52 | def __init__(self):
53 | """Initilize los STFT magnitude loss module."""
54 | super(LogSTFTMagnitudeLoss, self).__init__()
55 |
56 | def forward(self, x_mag, y_mag):
57 | """Calculate forward propagation.
58 | Args:
59 | x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
60 | y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
61 | Returns:
62 | Tensor: Log STFT magnitude loss value.
63 | """
64 | return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
65 |
66 |
67 | class STFTLoss(torch.nn.Module):
68 | """STFT loss module."""
69 |
70 | def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
71 | """Initialize STFT loss module."""
72 | super(STFTLoss, self).__init__()
73 | self.fft_size = fft_size
74 | self.shift_size = shift_size
75 | self.win_length = win_length
76 | self.window = getattr(torch, window)(win_length)
77 | self.spectral_convergenge_loss = SpectralConvergengeLoss()
78 | self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
79 |
80 | def forward(self, x, y):
81 | """Calculate forward propagation.
82 | Args:
83 | x (Tensor): Predicted signal (B, T).
84 | y (Tensor): Groundtruth signal (B, T).
85 | Returns:
86 | Tensor: Spectral convergence loss value.
87 | Tensor: Log STFT magnitude loss value.
88 | """
89 | x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
90 | y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
91 | sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
92 | mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
93 |
94 | return sc_loss, mag_loss
95 |
96 |
97 | class MultiResolutionSTFTLoss(torch.nn.Module):
98 | """Multi resolution STFT loss module."""
99 |
100 | def __init__(self,
101 | fft_sizes=[1024, 2048, 512],
102 | hop_sizes=[120, 240, 50],
103 | win_lengths=[600, 1200, 240],
104 | window="hann_window"):
105 | """Initialize Multi resolution STFT loss module.
106 | Args:
107 | fft_sizes (list): List of FFT sizes.
108 | hop_sizes (list): List of hop sizes.
109 | win_lengths (list): List of window lengths.
110 | window (str): Window function type.
111 | """
112 | super(MultiResolutionSTFTLoss, self).__init__()
113 | assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
114 | self.stft_losses = torch.nn.ModuleList()
115 | for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
116 | self.stft_losses += [STFTLoss(fs, ss, wl, window)]
117 |
118 | def forward(self, x, y):
119 | """Calculate forward propagation.
120 | Args:
121 | x (Tensor): Predicted signal (B, T).
122 | y (Tensor): Groundtruth signal (B, T).
123 | Returns:
124 | Tensor: Multi resolution spectral convergence loss value.
125 | Tensor: Multi resolution log STFT magnitude loss value.
126 | """
127 | sc_loss = 0.0
128 | mag_loss = 0.0
129 | for f in self.stft_losses:
130 | sc_l, mag_l = f(x, y)
131 | sc_loss += sc_l
132 | mag_loss += mag_l
133 | sc_loss /= len(self.stft_losses)
134 | mag_loss /= len(self.stft_losses)
135 |
136 | return sc_loss, mag_loss
--------------------------------------------------------------------------------
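A hypothetical usage sketch for the multi-resolution loss above (ours; written against the pinned torch==1.6 API, since newer PyTorch requires `return_complex` in `torch.stft`):

```python
import torch
from stft_loss import MultiResolutionSTFTLoss

mrstft = MultiResolutionSTFTLoss()       # defaults: fft_sizes=[1024, 2048, 512], ...
y_hat = torch.randn(2, 8192)             # (B, T) predicted waveform
y = torch.randn(2, 8192)                 # (B, T) ground-truth waveform
sc_loss, mag_loss = mrstft(y_hat, y)     # each averaged over the three resolutions
```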
/commons.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 |
8 | def init_weights(m, mean=0.0, std=0.01):
9 | classname = m.__class__.__name__
10 | if classname.find("Conv") != -1:
11 | m.weight.data.normal_(mean, std)
12 |
13 |
14 | def get_padding(kernel_size, dilation=1):
15 | return int((kernel_size*dilation - dilation)/2)
16 |
17 |
18 | def convert_pad_shape(pad_shape):
19 | l = pad_shape[::-1]
20 | pad_shape = [item for sublist in l for item in sublist]
21 | return pad_shape
22 |
23 |
24 | def intersperse(lst, item):
25 | result = [item] * (len(lst) * 2 + 1)
26 | result[1::2] = lst
27 | return result
28 |
29 |
30 | def kl_divergence(m_p, logs_p, m_q, logs_q):
31 | """KL(P||Q)"""
32 | kl = (logs_q - logs_p) - 0.5
33 | kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
34 | return kl
35 |
36 |
37 | def rand_gumbel(shape):
38 | """Sample from the Gumbel distribution, protect from overflows."""
39 | uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40 | return -torch.log(-torch.log(uniform_samples))
41 |
42 |
43 | def rand_gumbel_like(x):
44 | g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45 | return g
46 |
47 |
48 | def slice_segments(x, ids_str, segment_size=4):
49 | ret = torch.zeros_like(x[:, :, :segment_size])
50 | for i in range(x.size(0)):
51 | idx_str = ids_str[i]
52 | idx_end = idx_str + segment_size
53 | ret[i] = x[i, :, idx_str:idx_end]
54 | return ret
55 |
56 |
57 | def rand_slice_segments(x, x_lengths=None, segment_size=4):
58 | b, d, t = x.size()
59 | if x_lengths is None:
60 | x_lengths = t
61 | ids_str_max = x_lengths - segment_size + 1
62 | ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63 | ret = slice_segments(x, ids_str, segment_size)
64 | return ret, ids_str
65 |
66 |
67 | def get_timing_signal_1d(
68 | length, channels, min_timescale=1.0, max_timescale=1.0e4):
69 | position = torch.arange(length, dtype=torch.float)
70 | num_timescales = channels // 2
71 | log_timescale_increment = (
72 | math.log(float(max_timescale) / float(min_timescale)) /
73 | (num_timescales - 1))
74 | inv_timescales = min_timescale * torch.exp(
75 | torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
76 | scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77 | signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78 | signal = F.pad(signal, [0, 0, 0, channels % 2])
79 | signal = signal.view(1, channels, length)
80 | return signal
81 |
82 |
83 | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84 | b, channels, length = x.size()
85 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86 | return x + signal.to(dtype=x.dtype, device=x.device)
87 |
88 |
89 | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90 | b, channels, length = x.size()
91 | signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92 | return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93 |
94 |
95 | def subsequent_mask(length):
96 | mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97 | return mask
98 |
99 |
100 | @torch.jit.script
101 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102 | n_channels_int = n_channels[0]
103 | in_act = input_a + input_b
104 | t_act = torch.tanh(in_act[:, :n_channels_int, :])
105 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106 | acts = t_act * s_act
107 | return acts
108 |
109 |
110 | def convert_pad_shape(pad_shape):
111 | l = pad_shape[::-1]
112 | pad_shape = [item for sublist in l for item in sublist]
113 | return pad_shape
114 |
115 |
116 | def shift_1d(x):
117 | x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118 | return x
119 |
120 |
121 | def sequence_mask(length, max_length=None):
122 | if max_length is None:
123 | max_length = length.max()
124 | x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125 | return x.unsqueeze(0) < length.unsqueeze(1)
126 |
127 |
128 | def generate_path(duration, mask):
129 | """
130 | duration: [b, 1, t_x]
131 | mask: [b, 1, t_y, t_x]
132 | """
133 | device = duration.device
134 |
135 | b, _, t_y, t_x = mask.shape
136 | cum_duration = torch.cumsum(duration, -1)
137 |
138 | cum_duration_flat = cum_duration.view(b * t_x)
139 | path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
140 | path = path.view(b, t_x, t_y)
141 | path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
142 | path = path.unsqueeze(1).transpose(2,3) * mask
143 | return path
144 |
145 |
146 | def clip_grad_value_(parameters, clip_value, norm_type=2):
147 | if isinstance(parameters, torch.Tensor):
148 | parameters = [parameters]
149 | parameters = list(filter(lambda p: p.grad is not None, parameters))
150 | norm_type = float(norm_type)
151 | if clip_value is not None:
152 | clip_value = float(clip_value)
153 |
154 | total_norm = 0
155 | for p in parameters:
156 | param_norm = p.grad.data.norm(norm_type)
157 | total_norm += param_norm.item() ** norm_type
158 | if clip_value is not None:
159 | p.grad.data.clamp_(min=-clip_value, max=clip_value)
160 | total_norm = total_norm ** (1. / norm_type)
161 | return total_norm
162 |
--------------------------------------------------------------------------------
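As a quick illustration of `intersperse`, which inference.ipynb uses when `add_blank` is set (our example):

```python
from commons import intersperse

print(intersperse([5, 9, 2], 0))   # [0, 5, 0, 9, 0, 2, 0] -- blank id 0 around every symbol id
```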
/mel_processing.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import random
4 | import torch
5 | from torch import nn
6 | import torch.nn.functional as F
7 | import torch.utils.data
8 | import numpy as np
9 | import librosa
10 | import librosa.util as librosa_util
11 | from librosa.util import normalize, pad_center, tiny
12 | from scipy.signal import get_window
13 | from scipy.io.wavfile import read
14 | from librosa.filters import mel as librosa_mel_fn
15 |
16 | MAX_WAV_VALUE = 32768.0
17 |
18 |
19 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20 | """
21 | PARAMS
22 | ------
23 | C: compression factor
24 | """
25 | return torch.log(torch.clamp(x, min=clip_val) * C)
26 |
27 |
28 | def dynamic_range_decompression_torch(x, C=1):
29 | """
30 | PARAMS
31 | ------
32 | C: compression factor used to compress
33 | """
34 | return torch.exp(x) / C
35 |
36 |
37 | def spectral_normalize_torch(magnitudes):
38 | output = dynamic_range_compression_torch(magnitudes)
39 | return output
40 |
41 |
42 | def spectral_de_normalize_torch(magnitudes):
43 | output = dynamic_range_decompression_torch(magnitudes)
44 | return output
45 |
46 |
47 | mel_basis = {}
48 | hann_window = {}
49 |
50 | def complx_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
51 |
52 | global hann_window
53 | dtype_device = str(y.dtype) + '_' + str(y.device)
54 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
55 | if wnsize_dtype_device not in hann_window:
56 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
57 |
58 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
59 | y = y.squeeze(1)
60 |
61 | # (B, N, T)
62 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
63 | center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
64 |
65 | spec = torch.view_as_real(spec) # (B, N, T, 2)
66 | mag = torch.sqrt(spec.pow(2).sum(-1)+(1e-6)) # (B, N, T)
67 |     phase = torch.atan2(spec[..., 1], spec[..., 0]) # (B, N, T), angle from imag/real parts
68 | spec = spec.permute(0,3,1,2) #(B, 2, N, T)
69 |
70 | # (B, 4, N, T)
71 | complex_comp = torch.cat((spec,mag.unsqueeze(1),phase.unsqueeze(1)), dim=1)
72 |
73 | return complex_comp
74 |
75 | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
76 | # if torch.min(y) < -1.:
77 | # print('min value is ', torch.min(y))
78 | # if torch.max(y) > 1.:
79 | # print('max value is ', torch.max(y))
80 |
81 | global hann_window
82 | dtype_device = str(y.dtype) + '_' + str(y.device)
83 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
84 | if wnsize_dtype_device not in hann_window:
85 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
86 |
87 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
88 | y = y.squeeze(1)
89 |
90 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
91 | center=center, pad_mode='reflect', normalized=False, onesided=True)
92 |
93 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
94 | # (B, N, T)
95 | return spec
96 |
97 |
98 | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
99 | global mel_basis
100 | dtype_device = str(spec.dtype) + '_' + str(spec.device)
101 | fmax_dtype_device = str(fmax) + '_' + dtype_device
102 | if fmax_dtype_device not in mel_basis:
103 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
104 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
105 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
106 | spec = spectral_normalize_torch(spec)
107 | return spec
108 |
109 |
110 | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
111 | # if torch.min(y) < -1.:
112 | # print('min value is ', torch.min(y))
113 | # if torch.max(y) > 1.:
114 | # print('max value is ', torch.max(y))
115 |
116 | global mel_basis, hann_window
117 | dtype_device = str(y.dtype) + '_' + str(y.device)
118 | fmax_dtype_device = str(fmax) + '_' + dtype_device
119 | wnsize_dtype_device = str(win_size) + '_' + dtype_device
120 | if fmax_dtype_device not in mel_basis:
121 | mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
122 | mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
123 | if wnsize_dtype_device not in hann_window:
124 | hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
125 |
126 | y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
127 | y = y.squeeze(1)
128 |
129 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
130 | center=center, pad_mode='reflect', normalized=False, onesided=True)
131 |
132 | spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
133 |
134 | spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
135 | spec = spectral_normalize_torch(spec)
136 |
137 | return spec
138 |
--------------------------------------------------------------------------------
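A sketch of producing the 4 complex components that the revised posterior encoder consumes per the README's Mod 3 (ours; assumes a PyTorch version where `torch.stft` supports `return_complex=True`):

```python
import torch
from mel_processing import complx_torch

y = torch.randn(1, 8192)                          # (B, T) waveform
comp = complx_torch(y, n_fft=1024, sampling_rate=22050,
                    hop_size=256, win_size=1024)  # (B, 4, n_fft//2+1, frames): real, imag, mag, phase
```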
/filelists/vctk_audio_sid_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|It had happened to him.
2 | DUMMY2/p280/p280_148.wav|52|It is open season on the Old Firm.
3 | DUMMY2/p231/p231_320.wav|50|However, he is a coach, and he remains a coach at heart.
4 | DUMMY2/p282/p282_129.wav|83|It is not a U-turn.
5 | DUMMY2/p254/p254_015.wav|41|The Greeks used to imagine that it was a sign from the gods to foretell war or heavy rain.
6 | DUMMY2/p228/p228_285.wav|57|The songs are just so good.
7 | DUMMY2/p334/p334_307.wav|38|If they don't, they can expect their funding to be cut.
8 | DUMMY2/p287/p287_081.wav|77|I've never seen anything like it.
9 | DUMMY2/p247/p247_083.wav|14|It is a job creation scheme.)
10 | DUMMY2/p264/p264_051.wav|65|We were leading by two goals.)
11 | DUMMY2/p335/p335_058.wav|49|Let's see that increase over the years.
12 | DUMMY2/p236/p236_225.wav|75|There is no quick fix.
13 | DUMMY2/p374/p374_353.wav|11|And that brings us to the point.
14 | DUMMY2/p272/p272_076.wav|69|Sounds like The Sixth Sense?
15 | DUMMY2/p271/p271_152.wav|27|The petition was formally presented at Downing Street yesterday.
16 | DUMMY2/p228/p228_127.wav|57|They've got to account for it.
17 | DUMMY2/p276/p276_223.wav|106|It's been a humbling year.
18 | DUMMY2/p262/p262_248.wav|45|The project has already secured the support of Sir Sean Connery.
19 | DUMMY2/p314/p314_086.wav|51|The team this year is going places.
20 | DUMMY2/p225/p225_038.wav|101|Diving is no part of football.
21 | DUMMY2/p279/p279_088.wav|25|The shareholders will vote to wind up the company on Friday morning.
22 | DUMMY2/p272/p272_018.wav|69|Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain.
23 | DUMMY2/p256/p256_098.wav|90|She told The Herald.
24 | DUMMY2/p261/p261_218.wav|100|All will be revealed in due course.
25 | DUMMY2/p265/p265_063.wav|73|IT shouldn't come as a surprise, but it does.
26 | DUMMY2/p314/p314_042.wav|51|It is all about people being assaulted, abused.
27 | DUMMY2/p241/p241_188.wav|86|I wish I could say something.
28 | DUMMY2/p283/p283_111.wav|95|It's good to have a voice.
29 | DUMMY2/p275/p275_006.wav|40|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
30 | DUMMY2/p228/p228_092.wav|57|Today I couldn't run on it.
31 | DUMMY2/p295/p295_343.wav|92|The atmosphere is businesslike.
32 | DUMMY2/p228/p228_187.wav|57|They will run a mile.
33 | DUMMY2/p294/p294_317.wav|104|It didn't put me off.
34 | DUMMY2/p231/p231_445.wav|50|It sounded like a bomb.
35 | DUMMY2/p272/p272_086.wav|69|Today she has been released.
36 | DUMMY2/p255/p255_210.wav|31|It was worth a photograph.
37 | DUMMY2/p229/p229_060.wav|67|And a film maker was born.
38 | DUMMY2/p260/p260_232.wav|81|The Home Office would not release any further details about the group.
39 | DUMMY2/p245/p245_025.wav|59|Johnson was pretty low.
40 | DUMMY2/p333/p333_185.wav|64|This area is perfect for children.
41 | DUMMY2/p244/p244_242.wav|78|He is a man of the people.
42 | DUMMY2/p376/p376_187.wav|71|"It is a terrible loss."
43 | DUMMY2/p239/p239_156.wav|48|It is a good lifestyle.
44 | DUMMY2/p307/p307_037.wav|22|He released a half-dozen solo albums.
45 | DUMMY2/p305/p305_185.wav|54|I am not even thinking about that.
46 | DUMMY2/p272/p272_081.wav|69|It was magic.
47 | DUMMY2/p302/p302_297.wav|30|I'm trying to stay open on that.
48 | DUMMY2/p275/p275_320.wav|40|We are in the end game.
49 | DUMMY2/p239/p239_231.wav|48|Then we will face the Danish champions.
50 | DUMMY2/p268/p268_301.wav|87|It was only later that the condition was diagnosed.
51 | DUMMY2/p336/p336_088.wav|98|They failed to reach agreement yesterday.
52 | DUMMY2/p278/p278_255.wav|10|They made such decisions in London.
53 | DUMMY2/p361/p361_132.wav|79|That got me out.
54 | DUMMY2/p307/p307_146.wav|22|You hope he prevails.
55 | DUMMY2/p244/p244_147.wav|78|They could not ignore the will of parliament, he claimed.
56 | DUMMY2/p294/p294_283.wav|104|This is our unfinished business.
57 | DUMMY2/p283/p283_300.wav|95|I would have the hammer in the crowd.
58 | DUMMY2/p239/p239_079.wav|48|I can understand the frustrations of our fans.
59 | DUMMY2/p264/p264_009.wav|65|There is , according to legend, a boiling pot of gold at one end. )
60 | DUMMY2/p307/p307_348.wav|22|He did not oppose the divorce.
61 | DUMMY2/p304/p304_308.wav|72|We are the gateway to justice.
62 | DUMMY2/p281/p281_056.wav|36|None has ever been found.
63 | DUMMY2/p267/p267_158.wav|0|We were given a warm and friendly reception.
64 | DUMMY2/p300/p300_169.wav|102|Who do these people think they are?
65 | DUMMY2/p276/p276_177.wav|106|They exist in name alone.
66 | DUMMY2/p228/p228_245.wav|57|It is a policy which has the full support of the minister.
67 | DUMMY2/p300/p300_303.wav|102|I'm wondering what you feel about the youngest.
68 | DUMMY2/p362/p362_247.wav|15|This would give Scotland around eight members.
69 | DUMMY2/p326/p326_031.wav|28|United were in control without always being dominant.
70 | DUMMY2/p361/p361_288.wav|79|I did not think it was very proper.
71 | DUMMY2/p286/p286_145.wav|63|Tiger is not the norm.
72 | DUMMY2/p234/p234_071.wav|3|She did that for the rest of her life.
73 | DUMMY2/p263/p263_296.wav|39|The decision was announced at its annual conference in Dunfermline.
74 | DUMMY2/p323/p323_228.wav|34|She became a heroine of my childhood.
75 | DUMMY2/p280/p280_346.wav|52|It was a bit like having children.
76 | DUMMY2/p333/p333_080.wav|64|But the tragedy did not stop there.
77 | DUMMY2/p226/p226_268.wav|43|That decision is for the British Parliament and people.
78 | DUMMY2/p362/p362_314.wav|15|Is that right?
79 | DUMMY2/p240/p240_047.wav|93|It is so sad.
80 | DUMMY2/p250/p250_207.wav|24|You could feel the heat.
81 | DUMMY2/p273/p273_176.wav|56|Neither side would reveal the details of the offer.
82 | DUMMY2/p316/p316_147.wav|85|And frankly, it's been a while.
83 | DUMMY2/p265/p265_047.wav|73|It is unique.
84 | DUMMY2/p336/p336_353.wav|98|Sometimes you get them, sometimes you don't.
85 | DUMMY2/p230/p230_376.wav|35|This hasn't happened in a vacuum.
86 | DUMMY2/p308/p308_209.wav|107|There is great potential on this river.
87 | DUMMY2/p250/p250_442.wav|24|We have not yet received a letter from the Irish.
88 | DUMMY2/p260/p260_037.wav|81|It's a fact.
89 | DUMMY2/p299/p299_345.wav|58|We're very excited and challenged by the project.
90 | DUMMY2/p269/p269_218.wav|94|A Grampian Police spokesman said.
91 | DUMMY2/p306/p306_014.wav|12|To the Hebrews it was a token that there would be no more universal floods.
92 | DUMMY2/p271/p271_292.wav|27|It's a record label, not a form of music.
93 | DUMMY2/p247/p247_225.wav|14|I am considered a teenager.)
94 | DUMMY2/p294/p294_094.wav|104|It should be a condition of employment.
95 | DUMMY2/p269/p269_031.wav|94|Is this accurate?
96 | DUMMY2/p275/p275_116.wav|40|It's not fair.
97 | DUMMY2/p265/p265_006.wav|73|When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow.
98 | DUMMY2/p285/p285_072.wav|2|Mr Irvine said Mr Rafferty was now in good spirits.
99 | DUMMY2/p270/p270_167.wav|8|We did what we had to do.
100 | DUMMY2/p360/p360_397.wav|60|It is a relief.
101 |
--------------------------------------------------------------------------------
/filelists/vctk_audio_sid_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY2/p364/p364_240.wav|88|ɪt hɐd hˈæpənd tə hˌɪm.
2 | DUMMY2/p280/p280_148.wav|52|ɪt ɪz ˈoʊpən sˈiːzən ɑːnðɪ ˈoʊld fˈɜːm.
3 | DUMMY2/p231/p231_320.wav|50|haʊˈɛvɚ, hiː ɪz ɐ kˈoʊtʃ, ænd hiː ɹɪmˈeɪnz ɐ kˈoʊtʃ æt hˈɑːɹt.
4 | DUMMY2/p282/p282_129.wav|83|ɪt ɪz nˌɑːɾə jˈuːtˈɜːn.
5 | DUMMY2/p254/p254_015.wav|41|ðə ɡɹˈiːks jˈuːzd tʊ ɪmˈædʒɪn ðˌɐɾɪt wʌzɐ sˈaɪn fɹʌmðə ɡˈɑːdz tə foːɹtˈɛl wˈɔːɹ ɔːɹ hˈɛvi ɹˈeɪn.
6 | DUMMY2/p228/p228_285.wav|57|ðə sˈɔŋz ɑːɹ dʒˈʌst sˌoʊ ɡˈʊd.
7 | DUMMY2/p334/p334_307.wav|38|ɪf ðeɪ dˈoʊnt, ðeɪ kæn ɛkspˈɛkt ðɛɹ fˈʌndɪŋ təbi kˈʌt.
8 | DUMMY2/p287/p287_081.wav|77|aɪv nˈɛvɚ sˈiːn ˈɛnɪθˌɪŋ lˈaɪk ɪt.
9 | DUMMY2/p247/p247_083.wav|14|ɪt ɪz ɐ dʒˈɑːb kɹiːˈeɪʃən skˈiːm.
10 | DUMMY2/p264/p264_051.wav|65|wiː wɜː lˈiːdɪŋ baɪ tˈuː ɡˈoʊlz.
11 | DUMMY2/p335/p335_058.wav|49|lˈɛts sˈiː ðæt ˈɪnkɹiːs ˌoʊvɚ ðə jˈɪɹz.
12 | DUMMY2/p236/p236_225.wav|75|ðɛɹ ɪz nˈoʊ kwˈɪk fˈɪks.
13 | DUMMY2/p374/p374_353.wav|11|ænd ðæt bɹˈɪŋz ˌʌs tə ðə pˈɔɪnt.
14 | DUMMY2/p272/p272_076.wav|69|sˈaʊndz lˈaɪk ðə sˈɪksθ sˈɛns?
15 | DUMMY2/p271/p271_152.wav|27|ðə pətˈɪʃən wʌz fˈɔːɹməli pɹɪzˈɛntᵻd æt dˈaʊnɪŋ stɹˈiːt jˈɛstɚdˌeɪ.
16 | DUMMY2/p228/p228_127.wav|57|ðeɪv ɡɑːt tʊ ɐkˈaʊnt fɔːɹ ɪt.
17 | DUMMY2/p276/p276_223.wav|106|ɪts bˌɪn ɐ hˈʌmblɪŋ jˈɪɹ.
18 | DUMMY2/p262/p262_248.wav|45|ðə pɹˈɑːdʒɛkt hɐz ɔːlɹˌɛdi sɪkjˈʊɹd ðə səpˈoːɹt ʌv sˌɜː ʃˈɔːn kɑːnɚɹi.
19 | DUMMY2/p314/p314_086.wav|51|ðə tˈiːm ðɪs jˈɪɹ ɪz ɡˌoʊɪŋ plˈeɪsᵻz.
20 | DUMMY2/p225/p225_038.wav|101|dˈaɪvɪŋ ɪz nˈoʊ pˈɑːɹt ʌv fˈʊtbɔːl.
21 | DUMMY2/p279/p279_088.wav|25|ðə ʃˈɛɹhoʊldɚz wɪl vˈoʊt tə wˈaɪnd ˈʌp ðə kˈʌmpəni ˌɑːn fɹˈaɪdeɪ mˈɔːɹnɪŋ.
22 | DUMMY2/p272/p272_018.wav|69|ˈæɹɪstˌɑːɾəl θˈɔːt ðætðə ɹˈeɪnboʊ wʌz kˈɔːzd baɪ ɹɪflˈɛkʃən ʌvðə sˈʌnz ɹˈeɪz baɪ ðə ɹˈeɪn.
23 | DUMMY2/p256/p256_098.wav|90|ʃiː tˈoʊld ðə hˈɛɹəld.
24 | DUMMY2/p261/p261_218.wav|100|ˈɔːl wɪl biː ɹɪvˈiːld ɪn dˈuː kˈoːɹs.
25 | DUMMY2/p265/p265_063.wav|73|ɪt ʃˌʊdənt kˈʌm æz ɐ sɚpɹˈaɪz, bˌʌt ɪt dˈʌz.
26 | DUMMY2/p314/p314_042.wav|51|ɪt ɪz ˈɔːl ɐbˌaʊt pˈiːpəl bˌiːɪŋ ɐsˈɑːltᵻd, ɐbjˈuːsd.
27 | DUMMY2/p241/p241_188.wav|86|ˈaɪ wˈɪʃ ˈaɪ kʊd sˈeɪ sˈʌmθɪŋ.
28 | DUMMY2/p283/p283_111.wav|95|ɪts ɡˈʊd tə hæv ɐ vˈɔɪs.
29 | DUMMY2/p275/p275_006.wav|40|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
30 | DUMMY2/p228/p228_092.wav|57|tədˈeɪ ˈaɪ kˌʊdənt ɹˈʌn ˈɑːn ɪt.
31 | DUMMY2/p295/p295_343.wav|92|ðɪ ˈætməsfˌɪɹ ɪz bˈɪznəslˌaɪk.
32 | DUMMY2/p228/p228_187.wav|57|ðeɪ wɪl ɹˈʌn ɐ mˈaɪl.
33 | DUMMY2/p294/p294_317.wav|104|ɪt dˈɪdnt pˌʊt mˌiː ˈɔf.
34 | DUMMY2/p231/p231_445.wav|50|ɪt sˈaʊndᵻd lˈaɪk ɐ bˈɑːm.
35 | DUMMY2/p272/p272_086.wav|69|tədˈeɪ ʃiː hɐzbɪn ɹɪlˈiːsd.
36 | DUMMY2/p255/p255_210.wav|31|ɪt wʌz wˈɜːθ ɐ fˈoʊɾəɡɹˌæf.
37 | DUMMY2/p229/p229_060.wav|67|ænd ɐ fˈɪlm mˈeɪkɚ wʌz bˈɔːɹn.
38 | DUMMY2/p260/p260_232.wav|81|ðə hˈoʊm ˈɑːfɪs wʊd nˌɑːt ɹɪlˈiːs ˌɛni fˈɜːðɚ diːtˈeɪlz ɐbˌaʊt ðə ɡɹˈuːp.
39 | DUMMY2/p245/p245_025.wav|59|dʒˈɑːnsən wʌz pɹˈɪɾi lˈoʊ.
40 | DUMMY2/p333/p333_185.wav|64|ðɪs ˈɛɹiə ɪz pˈɜːfɛkt fɔːɹ tʃˈɪldɹən.
41 | DUMMY2/p244/p244_242.wav|78|hiː ɪz ɐ mˈæn ʌvðə pˈiːpəl.
42 | DUMMY2/p376/p376_187.wav|71|"ɪt ɪz ɐ tˈɛɹəbəl lˈɔs."
43 | DUMMY2/p239/p239_156.wav|48|ɪt ɪz ɐ ɡˈʊd lˈaɪfstaɪl.
44 | DUMMY2/p307/p307_037.wav|22|hiː ɹɪlˈiːsd ɐ hˈæfdˈʌzən sˈoʊloʊ ˈælbəmz.
45 | DUMMY2/p305/p305_185.wav|54|ˈaɪ æm nˌɑːt ˈiːvən θˈɪŋkɪŋ ɐbˌaʊt ðˈæt.
46 | DUMMY2/p272/p272_081.wav|69|ɪt wʌz mˈædʒɪk.
47 | DUMMY2/p302/p302_297.wav|30|aɪm tɹˈaɪɪŋ tə stˈeɪ ˈoʊpən ˌɑːn ðˈæt.
48 | DUMMY2/p275/p275_320.wav|40|wiː ɑːɹ ɪnðɪ ˈɛnd ɡˈeɪm.
49 | DUMMY2/p239/p239_231.wav|48|ðˈɛn wiː wɪl fˈeɪs ðə dˈeɪnɪʃ tʃˈæmpiənz.
50 | DUMMY2/p268/p268_301.wav|87|ɪt wʌz ˈoʊnli lˈeɪɾɚ ðætðə kəndˈɪʃən wʌz dˌaɪəɡnˈoʊzd.
51 | DUMMY2/p336/p336_088.wav|98|ðeɪ fˈeɪld tə ɹˈiːtʃ ɐɡɹˈiːmənt jˈɛstɚdˌeɪ.
52 | DUMMY2/p278/p278_255.wav|10|ðeɪ mˌeɪd sˈʌtʃ dᵻsˈɪʒənz ɪn lˈʌndən.
53 | DUMMY2/p361/p361_132.wav|79|ðæt ɡɑːt mˌiː ˈaʊt.
54 | DUMMY2/p307/p307_146.wav|22|juː hˈoʊp hiː pɹɪvˈeɪlz.
55 | DUMMY2/p244/p244_147.wav|78|ðeɪ kʊd nˌɑːt ɪɡnˈoːɹ ðə wɪl ʌv pˈɑːɹləmənt, hiː klˈeɪmd.
56 | DUMMY2/p294/p294_283.wav|104|ðɪs ɪz ˌaʊɚɹ ʌnfˈɪnɪʃt bˈɪznəs.
57 | DUMMY2/p283/p283_300.wav|95|ˈaɪ wʊdhɐv ðə hˈæmɚɹ ɪnðə kɹˈaʊd.
58 | DUMMY2/p239/p239_079.wav|48|ˈaɪ kæn ˌʌndɚstˈænd ðə fɹʌstɹˈeɪʃənz ʌv ˌaʊɚ fˈænz.
59 | DUMMY2/p264/p264_009.wav|65|ðɛɹˈɪz , ɐkˈoːɹdɪŋ tə lˈɛdʒənd, ɐ bˈɔɪlɪŋ pˈɑːt ʌv ɡˈoʊld æt wˈʌn ˈɛnd.
60 | DUMMY2/p307/p307_348.wav|22|hiː dɪdnˌɑːt əpˈoʊz ðə dɪvˈoːɹs.
61 | DUMMY2/p304/p304_308.wav|72|wiː ɑːɹ ðə ɡˈeɪtweɪ tə dʒˈʌstɪs.
62 | DUMMY2/p281/p281_056.wav|36|nˈʌn hɐz ˈɛvɚ bˌɪn fˈaʊnd.
63 | DUMMY2/p267/p267_158.wav|0|wiː wɜː ɡˈɪvən ɐ wˈɔːɹm ænd fɹˈɛndli ɹɪsˈɛpʃən.
64 | DUMMY2/p300/p300_169.wav|102|hˌuː dˈuː ðiːz pˈiːpəl θˈɪŋk ðeɪ ɑːɹ?
65 | DUMMY2/p276/p276_177.wav|106|ðeɪ ɛɡzˈɪst ɪn nˈeɪm ɐlˈoʊn.
66 | DUMMY2/p228/p228_245.wav|57|ɪt ɪz ɐ pˈɑːlɪsi wˌɪtʃ hɐz ðə fˈʊl səpˈoːɹt ʌvðə mˈɪnɪstɚ.
67 | DUMMY2/p300/p300_303.wav|102|aɪm wˈʌndɚɹɪŋ wˌʌt juː fˈiːl ɐbˌaʊt ðə jˈʌŋɡəst.
68 | DUMMY2/p362/p362_247.wav|15|ðɪs wʊd ɡˈɪv skˈɑːtlənd ɐɹˈaʊnd ˈeɪt mˈɛmbɚz.
69 | DUMMY2/p326/p326_031.wav|28|juːnˈaɪɾᵻd wɜːɹ ɪn kəntɹˈoʊl wɪðˌaʊt ˈɔːlweɪz bˌiːɪŋ dˈɑːmɪnənt.
70 | DUMMY2/p361/p361_288.wav|79|ˈaɪ dɪdnˌɑːt θˈɪŋk ɪt wʌz vˈɛɹi pɹˈɑːpɚ.
71 | DUMMY2/p286/p286_145.wav|63|tˈaɪɡɚɹ ɪz nˌɑːt ðə nˈɔːɹm.
72 | DUMMY2/p234/p234_071.wav|3|ʃiː dˈɪd ðæt fɚðə ɹˈɛst ʌv hɜː lˈaɪf.
73 | DUMMY2/p263/p263_296.wav|39|ðə dᵻsˈɪʒən wʌz ɐnˈaʊnst æt ɪts ˈænjuːəl kˈɑːnfɹəns ɪn dˈʌnfɚmlˌaɪn.
74 | DUMMY2/p323/p323_228.wav|34|ʃiː bɪkˌeɪm ɐ hˈɛɹoʊˌɪn ʌv maɪ tʃˈaɪldhʊd.
75 | DUMMY2/p280/p280_346.wav|52|ɪt wʌzɐ bˈɪt lˈaɪk hˌævɪŋ tʃˈɪldɹən.
76 | DUMMY2/p333/p333_080.wav|64|bˌʌt ðə tɹˈædʒədi dɪdnˌɑːt stˈɑːp ðˈɛɹ.
77 | DUMMY2/p226/p226_268.wav|43|ðæt dᵻsˈɪʒən ɪz fɚðə bɹˈɪɾɪʃ pˈɑːɹləmənt ænd pˈiːpəl.
78 | DUMMY2/p362/p362_314.wav|15|ɪz ðæt ɹˈaɪt?
79 | DUMMY2/p240/p240_047.wav|93|ɪt ɪz sˌoʊ sˈæd.
80 | DUMMY2/p250/p250_207.wav|24|juː kʊd fˈiːl ðə hˈiːt.
81 | DUMMY2/p273/p273_176.wav|56|nˈiːðɚ sˈaɪd wʊd ɹɪvˈiːl ðə diːtˈeɪlz ʌvðɪ ˈɑːfɚ.
82 | DUMMY2/p316/p316_147.wav|85|ænd fɹˈæŋkli, ɪts bˌɪn ɐ wˈaɪl.
83 | DUMMY2/p265/p265_047.wav|73|ɪt ɪz juːnˈiːk.
84 | DUMMY2/p336/p336_353.wav|98|sˈʌmtaɪmz juː ɡˈɛt ðˌɛm, sˈʌmtaɪmz juː dˈoʊnt.
85 | DUMMY2/p230/p230_376.wav|35|ðɪs hˈæzənt hˈæpənd ɪn ɐ vˈækjuːm.
86 | DUMMY2/p308/p308_209.wav|107|ðɛɹ ɪz ɡɹˈeɪt pətˈɛnʃəl ˌɑːn ðɪs ɹˈɪvɚ.
87 | DUMMY2/p250/p250_442.wav|24|wiː hɐvnˌɑːt jˈɛt ɹɪsˈiːvd ɐ lˈɛɾɚ fɹʌmðɪ ˈaɪɹɪʃ.
88 | DUMMY2/p260/p260_037.wav|81|ɪts ɐ fˈækt.
89 | DUMMY2/p299/p299_345.wav|58|wɪɹ vˈɛɹi ɛksˈaɪɾᵻd ænd tʃˈælɪndʒd baɪ ðə pɹˈɑːdʒɛkt.
90 | DUMMY2/p269/p269_218.wav|94|ɐ ɡɹˈæmpiən pəlˈiːs spˈoʊksmən sˈɛd.
91 | DUMMY2/p306/p306_014.wav|12|tə ðə hˈiːbɹuːz ɪt wʌzɐ tˈoʊkən ðæt ðɛɹ wʊd biː nˈoʊmˌoːɹ jˌuːnɪvˈɜːsəl flˈʌdz.
92 | DUMMY2/p271/p271_292.wav|27|ɪts ɐ ɹˈɛkɚd lˈeɪbəl, nˌɑːɾə fˈɔːɹm ʌv mjˈuːzɪk.
93 | DUMMY2/p247/p247_225.wav|14|ˈaɪ æm kənsˈɪdɚd ɐ tˈiːneɪdʒɚ.
94 | DUMMY2/p294/p294_094.wav|104|ɪt ʃˌʊd biː ɐ kəndˈɪʃən ʌv ɛmplˈɔɪmənt.
95 | DUMMY2/p269/p269_031.wav|94|ɪz ðɪs ˈækjʊɹət?
96 | DUMMY2/p275/p275_116.wav|40|ɪts nˌɑːt fˈɛɹ.
97 | DUMMY2/p265/p265_006.wav|73|wˌɛn ðə sˈʌnlaɪt stɹˈaɪks ɹˈeɪndɹɑːps ɪnðɪ ˈɛɹ, ðeɪ ˈækt æz ɐ pɹˈɪzəm ænd fˈɔːɹm ɐ ɹˈeɪnboʊ.
98 | DUMMY2/p285/p285_072.wav|2|mˈɪstɚɹ ˈɜːvaɪn sˈɛd mˈɪstɚ ɹˈæfɚɾi wʌz nˈaʊ ɪn ɡˈʊd spˈɪɹɪts.
99 | DUMMY2/p270/p270_167.wav|8|wiː dˈɪd wˌʌt wiː hædtə dˈuː.
100 | DUMMY2/p360/p360_397.wav|60|ɪt ɪz ɐ ɹɪlˈiːf.
101 |
--------------------------------------------------------------------------------
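Both validation filelists above follow the path|speaker_id|text convention, where the .cleaned variant carries the phonemized transcript. load_filepaths_and_text in utils.py below splits each line on the pipe character, as in this parse of the first entry:

    line = "DUMMY2/p364/p364_240.wav|88|It had happened to him."
    path, sid, text = line.strip().split("|")
    print(path, int(sid), text)  # wav path, integer speaker id, transcript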
/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import sys
4 | import argparse
5 | import logging
6 | import json
7 | import subprocess
8 | import numpy as np
9 | from scipy.io.wavfile import read
10 | import torch
11 |
12 | MATPLOTLIB_FLAG = False
13 |
14 | logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
15 | logger = logging
16 |
17 |
18 | def load_checkpoint(checkpoint_path, model, optimizer=None):
19 | assert os.path.isfile(checkpoint_path)
20 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
21 | iteration = checkpoint_dict['iteration']
22 | learning_rate = checkpoint_dict['learning_rate']
23 | if optimizer is not None:
24 | optimizer.load_state_dict(checkpoint_dict['optimizer'])
25 | saved_state_dict = checkpoint_dict['model']
26 | if hasattr(model, 'module'):
27 | state_dict = model.module.state_dict()
28 | else:
29 | state_dict = model.state_dict()
30 |   new_state_dict = {}
31 | for k, v in state_dict.items():
32 | try:
33 | new_state_dict[k] = saved_state_dict[k]
34 |     except KeyError:  # parameter missing from checkpoint; keep the model's current weights
35 | logger.info("%s is not in the checkpoint" % k)
36 | new_state_dict[k] = v
37 | if hasattr(model, 'module'):
38 | model.module.load_state_dict(new_state_dict)
39 | else:
40 | model.load_state_dict(new_state_dict)
41 | logger.info("Loaded checkpoint '{}' (iteration {})" .format(
42 | checkpoint_path, iteration))
43 | return model, optimizer, learning_rate, iteration
44 |
45 |
46 | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
47 | logger.info("Saving model and optimizer state at iteration {} to {}".format(
48 | iteration, checkpoint_path))
49 | if hasattr(model, 'module'):
50 | state_dict = model.module.state_dict()
51 | else:
52 | state_dict = model.state_dict()
53 | torch.save({'model': state_dict,
54 | 'iteration': iteration,
55 | 'optimizer': optimizer.state_dict(),
56 | 'learning_rate': learning_rate}, checkpoint_path)
57 |
58 |
59 | def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
60 | for k, v in scalars.items():
61 | writer.add_scalar(k, v, global_step)
62 | for k, v in histograms.items():
63 | writer.add_histogram(k, v, global_step)
64 | for k, v in images.items():
65 | writer.add_image(k, v, global_step, dataformats='HWC')
66 | for k, v in audios.items():
67 | writer.add_audio(k, v, global_step, audio_sampling_rate)
68 |
69 |
70 | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
71 | f_list = glob.glob(os.path.join(dir_path, regex))
72 | f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
73 | x = f_list[-1]
74 | print(x)
75 | return x
76 |
77 |
78 | def plot_spectrogram_to_numpy(spectrogram):
79 | global MATPLOTLIB_FLAG
80 | if not MATPLOTLIB_FLAG:
81 | import matplotlib
82 | matplotlib.use("Agg")
83 | MATPLOTLIB_FLAG = True
84 | mpl_logger = logging.getLogger('matplotlib')
85 | mpl_logger.setLevel(logging.WARNING)
86 | import matplotlib.pylab as plt
87 | import numpy as np
88 |
89 | fig, ax = plt.subplots(figsize=(10,2))
90 | im = ax.imshow(spectrogram, aspect="auto", origin="lower",
91 | interpolation='none')
92 | plt.colorbar(im, ax=ax)
93 | plt.xlabel("Frames")
94 | plt.ylabel("Channels")
95 | plt.tight_layout()
96 |
97 | fig.canvas.draw()
98 |   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
99 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
100 | plt.close()
101 | return data
102 |
103 |
104 | def plot_alignment_to_numpy(alignment, info=None):
105 | global MATPLOTLIB_FLAG
106 | if not MATPLOTLIB_FLAG:
107 | import matplotlib
108 | matplotlib.use("Agg")
109 | MATPLOTLIB_FLAG = True
110 | mpl_logger = logging.getLogger('matplotlib')
111 | mpl_logger.setLevel(logging.WARNING)
112 | import matplotlib.pylab as plt
113 | import numpy as np
114 |
115 | fig, ax = plt.subplots(figsize=(6, 4))
116 | im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
117 | interpolation='none')
118 | fig.colorbar(im, ax=ax)
119 | xlabel = 'Decoder timestep'
120 | if info is not None:
121 | xlabel += '\n\n' + info
122 | plt.xlabel(xlabel)
123 | plt.ylabel('Encoder timestep')
124 | plt.tight_layout()
125 |
126 | fig.canvas.draw()
127 |   data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
128 | data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
129 | plt.close()
130 | return data
131 |
132 |
133 | def load_wav_to_torch(full_path):
134 | sampling_rate, data = read(full_path)
135 | return torch.FloatTensor(data.astype(np.float32)), sampling_rate
136 |
137 |
138 | def load_filepaths_and_text(filename, split="|"):
139 | with open(filename, encoding='utf-8') as f:
140 | filepaths_and_text = [line.strip().split(split) for line in f]
141 | return filepaths_and_text
142 |
143 |
144 | def get_hparams(init=True):
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
147 | help='JSON file for configuration')
148 | parser.add_argument('-m', '--model', type=str, required=True,
149 | help='Model name')
150 |
151 | args = parser.parse_args()
152 | model_dir = os.path.join("./logs", args.model)
153 |
154 | if not os.path.exists(model_dir):
155 | os.makedirs(model_dir)
156 |
157 | config_path = args.config
158 | config_save_path = os.path.join(model_dir, "config.json")
159 | if init:
160 | with open(config_path, "r") as f:
161 | data = f.read()
162 | with open(config_save_path, "w") as f:
163 | f.write(data)
164 | else:
165 | with open(config_save_path, "r") as f:
166 | data = f.read()
167 | config = json.loads(data)
168 |
169 | hparams = HParams(**config)
170 | hparams.model_dir = model_dir
171 | return hparams
172 |
173 |
174 | def get_hparams_from_dir(model_dir):
175 | config_save_path = os.path.join(model_dir, "config.json")
176 | with open(config_save_path, "r") as f:
177 | data = f.read()
178 | config = json.loads(data)
179 |
180 |   hparams = HParams(**config)
181 | hparams.model_dir = model_dir
182 | return hparams
183 |
184 |
185 | def get_hparams_from_file(config_path):
186 | with open(config_path, "r") as f:
187 | data = f.read()
188 | config = json.loads(data)
189 |
190 |   hparams = HParams(**config)
191 | return hparams
192 |
193 |
194 | def check_git_hash(model_dir):
195 | source_dir = os.path.dirname(os.path.realpath(__file__))
196 | if not os.path.exists(os.path.join(source_dir, ".git")):
197 |     logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
198 | source_dir
199 | ))
200 | return
201 |
202 | cur_hash = subprocess.getoutput("git rev-parse HEAD")
203 |
204 | path = os.path.join(model_dir, "githash")
205 | if os.path.exists(path):
206 | saved_hash = open(path).read()
207 | if saved_hash != cur_hash:
208 |       logger.warning("git hash values are different. {}(saved) != {}(current)".format(
209 | saved_hash[:8], cur_hash[:8]))
210 | else:
211 | open(path, "w").write(cur_hash)
212 |
213 |
214 | def get_logger(model_dir, filename="train.log"):
215 | global logger
216 | logger = logging.getLogger(os.path.basename(model_dir))
217 | logger.setLevel(logging.DEBUG)
218 |
219 | formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
220 | if not os.path.exists(model_dir):
221 | os.makedirs(model_dir)
222 | h = logging.FileHandler(os.path.join(model_dir, filename))
223 | h.setLevel(logging.DEBUG)
224 | h.setFormatter(formatter)
225 | logger.addHandler(h)
226 | return logger
227 |
228 |
229 | class HParams():
230 | def __init__(self, **kwargs):
231 | for k, v in kwargs.items():
232 |       if isinstance(v, dict):
233 | v = HParams(**v)
234 | self[k] = v
235 |
236 | def keys(self):
237 | return self.__dict__.keys()
238 |
239 | def items(self):
240 | return self.__dict__.items()
241 |
242 | def values(self):
243 | return self.__dict__.values()
244 |
245 | def __len__(self):
246 | return len(self.__dict__)
247 |
248 | def __getitem__(self, key):
249 | return getattr(self, key)
250 |
251 | def __setitem__(self, key, value):
252 | return setattr(self, key, value)
253 |
254 | def __contains__(self, key):
255 | return key in self.__dict__
256 |
257 | def __repr__(self):
258 | return self.__dict__.__repr__()
259 |
--------------------------------------------------------------------------------
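A short sketch of how the HParams container and the loaders above are typically used; the config path and the train.learning_rate key are illustrative, not guaranteed by this file:

    from utils import get_hparams_from_file

    hps = get_hparams_from_file("./configs/ljs_mb_istft_vits.json")  # illustrative path
    # Nested dicts become nested HParams, so dotted access works end to end.
    print(hps.train.learning_rate)  # assumes the config has a train.learning_rate entry
    print("train" in hps)           # __contains__ checks top-level keys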
/transforms.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional as F
3 |
4 | import numpy as np
5 |
6 |
7 | DEFAULT_MIN_BIN_WIDTH = 1e-3
8 | DEFAULT_MIN_BIN_HEIGHT = 1e-3
9 | DEFAULT_MIN_DERIVATIVE = 1e-3
10 |
11 |
12 | def piecewise_rational_quadratic_transform(inputs,
13 | unnormalized_widths,
14 | unnormalized_heights,
15 | unnormalized_derivatives,
16 | inverse=False,
17 | tails=None,
18 | tail_bound=1.,
19 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
20 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
21 | min_derivative=DEFAULT_MIN_DERIVATIVE):
22 |
23 | if tails is None:
24 | spline_fn = rational_quadratic_spline
25 | spline_kwargs = {}
26 | else:
27 | spline_fn = unconstrained_rational_quadratic_spline
28 | spline_kwargs = {
29 | 'tails': tails,
30 | 'tail_bound': tail_bound
31 | }
32 |
33 | outputs, logabsdet = spline_fn(
34 | inputs=inputs,
35 | unnormalized_widths=unnormalized_widths,
36 | unnormalized_heights=unnormalized_heights,
37 | unnormalized_derivatives=unnormalized_derivatives,
38 | inverse=inverse,
39 | min_bin_width=min_bin_width,
40 | min_bin_height=min_bin_height,
41 | min_derivative=min_derivative,
42 | **spline_kwargs
43 | )
44 | return outputs, logabsdet
45 |
46 |
47 | def searchsorted(bin_locations, inputs, eps=1e-6):
48 | bin_locations[..., -1] += eps
49 | return torch.sum(
50 | inputs[..., None] >= bin_locations,
51 | dim=-1
52 | ) - 1
53 |
54 |
55 | def unconstrained_rational_quadratic_spline(inputs,
56 | unnormalized_widths,
57 | unnormalized_heights,
58 | unnormalized_derivatives,
59 | inverse=False,
60 | tails='linear',
61 | tail_bound=1.,
62 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
63 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
64 | min_derivative=DEFAULT_MIN_DERIVATIVE):
65 |   inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
66 |   outside_interval_mask = ~inside_interval_mask
67 |
68 |   outputs = torch.zeros_like(inputs)
69 |   logabsdet = torch.zeros_like(inputs)
70 |
71 |   if tails == 'linear':
72 |     unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
73 |     constant = np.log(np.exp(1 - min_derivative) - 1)
74 |     unnormalized_derivatives[..., 0] = constant
75 |     unnormalized_derivatives[..., -1] = constant
76 |
77 |     outputs[outside_interval_mask] = inputs[outside_interval_mask]
78 |     logabsdet[outside_interval_mask] = 0
79 |   else:
80 |     raise RuntimeError('{} tails are not implemented.'.format(tails))
81 |
82 |   outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
83 |     inputs=inputs[inside_interval_mask],
84 |     unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85 |     unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86 |     unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87 |     inverse=inverse,
88 |     left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
89 |     min_bin_width=min_bin_width,
90 |     min_bin_height=min_bin_height,
91 |     min_derivative=min_derivative
92 |   )
93 |
94 |   return outputs, logabsdet
95 |
96 | def rational_quadratic_spline(inputs,
97 | unnormalized_widths,
98 | unnormalized_heights,
99 | unnormalized_derivatives,
100 | inverse=False,
101 | left=0., right=1., bottom=0., top=1.,
102 | min_bin_width=DEFAULT_MIN_BIN_WIDTH,
103 | min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
104 | min_derivative=DEFAULT_MIN_DERIVATIVE):
105 | if torch.min(inputs) < left or torch.max(inputs) > right:
106 | raise ValueError('Input to a transform is not within its domain')
107 |
108 | num_bins = unnormalized_widths.shape[-1]
109 |
110 | if min_bin_width * num_bins > 1.0:
111 | raise ValueError('Minimal bin width too large for the number of bins')
112 | if min_bin_height * num_bins > 1.0:
113 | raise ValueError('Minimal bin height too large for the number of bins')
114 |
115 | widths = F.softmax(unnormalized_widths, dim=-1)
116 | widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
117 | cumwidths = torch.cumsum(widths, dim=-1)
118 | cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
119 | cumwidths = (right - left) * cumwidths + left
120 | cumwidths[..., 0] = left
121 | cumwidths[..., -1] = right
122 | widths = cumwidths[..., 1:] - cumwidths[..., :-1]
123 |
124 | derivatives = min_derivative + F.softplus(unnormalized_derivatives)
125 |
126 | heights = F.softmax(unnormalized_heights, dim=-1)
127 | heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
128 | cumheights = torch.cumsum(heights, dim=-1)
129 | cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
130 | cumheights = (top - bottom) * cumheights + bottom
131 | cumheights[..., 0] = bottom
132 | cumheights[..., -1] = top
133 | heights = cumheights[..., 1:] - cumheights[..., :-1]
134 |
135 | if inverse:
136 | bin_idx = searchsorted(cumheights, inputs)[..., None]
137 | else:
138 | bin_idx = searchsorted(cumwidths, inputs)[..., None]
139 |
140 | input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
141 | input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
142 |
143 | input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
144 | delta = heights / widths
145 | input_delta = delta.gather(-1, bin_idx)[..., 0]
146 |
147 | input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
148 | input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
149 |
150 | input_heights = heights.gather(-1, bin_idx)[..., 0]
151 |
152 | if inverse:
153 | a = (((inputs - input_cumheights) * (input_derivatives
154 | + input_derivatives_plus_one
155 | - 2 * input_delta)
156 | + input_heights * (input_delta - input_derivatives)))
157 | b = (input_heights * input_derivatives
158 | - (inputs - input_cumheights) * (input_derivatives
159 | + input_derivatives_plus_one
160 | - 2 * input_delta))
161 | c = - input_delta * (inputs - input_cumheights)
162 |
163 | discriminant = b.pow(2) - 4 * a * c
164 | assert (discriminant >= 0).all()
165 |
166 | root = (2 * c) / (-b - torch.sqrt(discriminant))
167 | outputs = root * input_bin_widths + input_cumwidths
168 |
169 | theta_one_minus_theta = root * (1 - root)
170 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
171 | * theta_one_minus_theta)
172 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
173 | + 2 * input_delta * theta_one_minus_theta
174 | + input_derivatives * (1 - root).pow(2))
175 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
176 |
177 | return outputs, -logabsdet
178 | else:
179 | theta = (inputs - input_cumwidths) / input_bin_widths
180 | theta_one_minus_theta = theta * (1 - theta)
181 |
182 | numerator = input_heights * (input_delta * theta.pow(2)
183 | + input_derivatives * theta_one_minus_theta)
184 | denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
185 | * theta_one_minus_theta)
186 | outputs = input_cumheights + numerator / denominator
187 |
188 | derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
189 | + 2 * input_delta * theta_one_minus_theta
190 | + input_derivatives * (1 - theta).pow(2))
191 | logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
192 |
193 | return outputs, logabsdet
194 |
--------------------------------------------------------------------------------
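A self-contained round-trip check for the spline above: with tails='linear', points outside the tail bound pass through as the identity, and running the transform forward and then inverse should recover the input with the log-determinants negated. Shapes and num_bins here are illustrative:

    import torch
    from transforms import piecewise_rational_quadratic_transform

    num_bins = 10
    x = torch.rand(4, 6) * 4 - 2                 # some points lie outside tail_bound=1
    w = torch.randn(4, 6, num_bins)
    h = torch.randn(4, 6, num_bins)
    d = torch.randn(4, 6, num_bins - 1)          # linear tails pad this to num_bins + 1 internally

    y, logdet = piecewise_rational_quadratic_transform(x, w, h, d, inverse=False, tails='linear')
    x_rec, inv_logdet = piecewise_rational_quadratic_transform(y, w, h, d, inverse=True, tails='linear')
    # Both checks should print True up to numerical tolerance.
    print(torch.allclose(x, x_rec, atol=1e-4), torch.allclose(logdet, -inv_logdet, atol=1e-4))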
/stft.py:
--------------------------------------------------------------------------------
1 | """
2 | BSD 3-Clause License
3 | Copyright (c) 2017, Prem Seetharaman
4 | All rights reserved.
5 | * Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 | * Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 | * Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 | * Neither the name of the copyright holder nor the names of its
13 | contributors may be used to endorse or promote products derived from this
14 | software without specific prior written permission.
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
19 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
22 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import torch
28 | import numpy as np
29 | import torch.nn.functional as F
30 | from torch.autograd import Variable
31 | from scipy.signal import get_window
32 | from librosa.util import pad_center, tiny
33 | import librosa.util as librosa_util
34 |
35 | def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
36 | n_fft=800, dtype=np.float32, norm=None):
37 | """
38 | # from librosa 0.6
39 | Compute the sum-square envelope of a window function at a given hop length.
40 | This is used to estimate modulation effects induced by windowing
41 | observations in short-time fourier transforms.
42 | Parameters
43 | ----------
44 | window : string, tuple, number, callable, or list-like
45 | Window specification, as in `get_window`
46 | n_frames : int > 0
47 | The number of analysis frames
48 | hop_length : int > 0
49 | The number of samples to advance between frames
50 | win_length : [optional]
51 | The length of the window function. By default, this matches `n_fft`.
52 | n_fft : int > 0
53 | The length of each analysis frame.
54 | dtype : np.dtype
55 | The data type of the output
56 | Returns
57 | -------
58 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
59 | The sum-squared envelope of the window function
60 | """
61 | if win_length is None:
62 | win_length = n_fft
63 |
64 | n = n_fft + hop_length * (n_frames - 1)
65 | x = np.zeros(n, dtype=dtype)
66 |
67 | # Compute the squared window at the desired length
68 | win_sq = get_window(window, win_length, fftbins=True)
69 | win_sq = librosa_util.normalize(win_sq, norm=norm)**2
70 | win_sq = librosa_util.pad_center(win_sq, n_fft)
71 |
72 | # Fill the envelope
73 | for i in range(n_frames):
74 | sample = i * hop_length
75 | x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
76 | return x
77 |
78 |
79 | class STFT(torch.nn.Module):
80 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
81 | def __init__(self, filter_length=800, hop_length=200, win_length=800,
82 | window='hann'):
83 | super(STFT, self).__init__()
84 | self.filter_length = filter_length
85 | self.hop_length = hop_length
86 | self.win_length = win_length
87 | self.window = window
88 | self.forward_transform = None
89 | scale = self.filter_length / self.hop_length
90 | fourier_basis = np.fft.fft(np.eye(self.filter_length))
91 |
92 | cutoff = int((self.filter_length / 2 + 1))
93 | fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
94 | np.imag(fourier_basis[:cutoff, :])])
95 |
96 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
97 | inverse_basis = torch.FloatTensor(
98 | np.linalg.pinv(scale * fourier_basis).T[:, None, :])
99 |
100 | if window is not None:
101 | assert(filter_length >= win_length)
102 | # get window and zero center pad it to filter_length
103 | fft_window = get_window(window, win_length, fftbins=True)
104 | fft_window = pad_center(fft_window, filter_length)
105 | fft_window = torch.from_numpy(fft_window).float()
106 |
107 | # window the bases
108 | forward_basis *= fft_window
109 | inverse_basis *= fft_window
110 |
111 | self.register_buffer('forward_basis', forward_basis.float())
112 | self.register_buffer('inverse_basis', inverse_basis.float())
113 |
114 | def transform(self, input_data):
115 | num_batches = input_data.size(0)
116 | num_samples = input_data.size(1)
117 |
118 | self.num_samples = num_samples
119 |
120 | # similar to librosa, reflect-pad the input
121 | input_data = input_data.view(num_batches, 1, num_samples)
122 | input_data = F.pad(
123 | input_data.unsqueeze(1),
124 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
125 | mode='reflect')
126 | input_data = input_data.squeeze(1)
127 |
128 | forward_transform = F.conv1d(
129 | input_data,
130 | Variable(self.forward_basis, requires_grad=False),
131 | stride=self.hop_length,
132 | padding=0)
133 |
134 | cutoff = int((self.filter_length / 2) + 1)
135 | real_part = forward_transform[:, :cutoff, :]
136 | imag_part = forward_transform[:, cutoff:, :]
137 |
138 | magnitude = torch.sqrt(real_part**2 + imag_part**2)
139 | phase = torch.autograd.Variable(
140 | torch.atan2(imag_part.data, real_part.data))
141 |
142 | return magnitude, phase
143 |
144 | def inverse(self, magnitude, phase):
145 | recombine_magnitude_phase = torch.cat(
146 | [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
147 |
148 | inverse_transform = F.conv_transpose1d(
149 | recombine_magnitude_phase,
150 | Variable(self.inverse_basis, requires_grad=False),
151 | stride=self.hop_length,
152 | padding=0)
153 |
154 | if self.window is not None:
155 | window_sum = window_sumsquare(
156 | self.window, magnitude.size(-1), hop_length=self.hop_length,
157 | win_length=self.win_length, n_fft=self.filter_length,
158 | dtype=np.float32)
159 | # remove modulation effects
160 | approx_nonzero_indices = torch.from_numpy(
161 | np.where(window_sum > tiny(window_sum))[0])
162 | window_sum = torch.autograd.Variable(
163 | torch.from_numpy(window_sum), requires_grad=False)
164 |             window_sum = window_sum.to(inverse_transform.device) if magnitude.is_cuda else window_sum
165 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
166 |
167 | # scale by hop ratio
168 | inverse_transform *= float(self.filter_length) / self.hop_length
169 |
170 | inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
171 |         inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
172 |
173 | return inverse_transform
174 |
175 | def forward(self, input_data):
176 | self.magnitude, self.phase = self.transform(input_data)
177 | reconstruction = self.inverse(self.magnitude, self.phase)
178 | return reconstruction
179 |
180 |
181 | class TorchSTFT(torch.nn.Module):
182 | def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
183 | super().__init__()
184 | self.filter_length = filter_length
185 | self.hop_length = hop_length
186 | self.win_length = win_length
187 | self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))
188 |
189 | def transform(self, input_data):
190 | forward_transform = torch.stft(
191 | input_data,
192 | self.filter_length, self.hop_length, self.win_length, window=self.window,
193 | return_complex=True)
194 |
195 | return torch.abs(forward_transform), torch.angle(forward_transform)
196 |
197 | def inverse(self, magnitude, phase):
198 | inverse_transform = torch.istft(
199 | magnitude * torch.exp(phase * 1j),
200 | self.filter_length, self.hop_length, self.win_length, window=self.window.to(magnitude.device))
201 |
202 | return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
203 |
204 | def cartesian_inverse(self, real, imag):
205 | complex_num = real + 1j*imag
206 |         inverse_transform = torch.istft(complex_num, self.filter_length, self.hop_length, self.win_length, window=self.window.to(real.device))
207 |
208 | return inverse_transform.unsqueeze(-2) # unsqueeze to stay consistent with conv_transpose1d implementation
209 |
210 | def forward(self, input_data):
211 | self.magnitude, self.phase = self.transform(input_data)
212 | reconstruction = self.inverse(self.magnitude, self.phase)
213 | return reconstruction
214 |
215 |
216 |
--------------------------------------------------------------------------------
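A quick smoke test of the TorchSTFT wrapper above (an analysis/synthesis round trip on random audio; the filter, hop, and window lengths are illustrative rather than taken from a config):

    import torch
    from stft import TorchSTFT

    stft = TorchSTFT(filter_length=16, hop_length=4, win_length=16)
    wav = torch.randn(2, 8000)              # (batch, samples)
    mag, phase = stft.transform(wav)        # polar STFT computed with return_complex=True
    recon = stft.inverse(mag, phase)        # (batch, 1, samples'), unsqueezed to match conv_transpose1d
    print(mag.shape, recon.shape)

cartesian_inverse plays the same role when a model predicts real and imaginary spectrogram parts directly instead of magnitude and phase.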
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|The overwhelming majority of people in this country know how to sift the wheat from the chaff in what they hear and what they read.
2 | DUMMY1/LJ043-0030.wav|If somebody did that to me, a lousy trick like that, to take my wife away, and all the furniture, I would be mad as hell, too.
3 | DUMMY1/LJ005-0201.wav|as is shown by the report of the Commissioners to inquire into the state of the municipal corporations in eighteen thirty-five.
4 | DUMMY1/LJ001-0110.wav|Even the Caslon type when enlarged shows great shortcomings in this respect:
5 | DUMMY1/LJ003-0345.wav|All the committee could do in this respect was to throw the responsibility on others.
6 | DUMMY1/LJ007-0154.wav|These pungent and well-grounded strictures applied with still greater force to the unconvicted prisoner, the man who came to the prison innocent, and still uncontaminated,
7 | DUMMY1/LJ018-0098.wav|and recognized as one of the frequenters of the bogus law-stationers. His arrest led to that of others.
8 | DUMMY1/LJ047-0044.wav|Oswald was, however, willing to discuss his contacts with Soviet authorities. He denied having any involvement with Soviet intelligence agencies
9 | DUMMY1/LJ031-0038.wav|The first physician to see the President at Parkland Hospital was Dr. Charles J. Carrico, a resident in general surgery.
10 | DUMMY1/LJ048-0194.wav|during the morning of November twenty-two prior to the motorcade.
11 | DUMMY1/LJ049-0026.wav|On occasion the Secret Service has been permitted to have an agent riding in the passenger compartment with the President.
12 | DUMMY1/LJ004-0152.wav|although at Mr. Buxton's visit a new jail was in process of erection, the first step towards reform since Howard's visitation in seventeen seventy-four.
13 | DUMMY1/LJ008-0278.wav|or theirs might be one of many, and it might be considered necessary to "make an example."
14 | DUMMY1/LJ043-0002.wav|The Warren Commission Report. By The President's Commission on the Assassination of President Kennedy. Chapter seven. Lee Harvey Oswald:
15 | DUMMY1/LJ009-0114.wav|Mr. Wakefield winds up his graphic but somewhat sensational account by describing another religious service, which may appropriately be inserted here.
16 | DUMMY1/LJ028-0506.wav|A modern artist would have difficulty in doing such accurate work.
17 | DUMMY1/LJ050-0168.wav|with the particular purposes of the agency involved. The Commission recognizes that this is a controversial area
18 | DUMMY1/LJ039-0223.wav|Oswald's Marine training in marksmanship, his other rifle experience and his established familiarity with this particular weapon
19 | DUMMY1/LJ029-0032.wav|According to O'Donnell, quote, we had a motorcade wherever we went, end quote.
20 | DUMMY1/LJ031-0070.wav|Dr. Clark, who most closely observed the head wound,
21 | DUMMY1/LJ034-0198.wav|Euins, who was on the southwest corner of Elm and Houston Streets testified that he could not describe the man he saw in the window.
22 | DUMMY1/LJ026-0068.wav|Energy enters the plant, to a small extent,
23 | DUMMY1/LJ039-0075.wav|once you know that you must put the crosshairs on the target and that is all that is necessary.
24 | DUMMY1/LJ004-0096.wav|the fatal consequences whereof might be prevented if the justices of the peace were duly authorized
25 | DUMMY1/LJ005-0014.wav|Speaking on a debate on prison matters, he declared that
26 | DUMMY1/LJ012-0161.wav|he was reported to have fallen away to a shadow.
27 | DUMMY1/LJ018-0239.wav|His disappearance gave color and substance to evil reports already in circulation that the will and conveyance above referred to
28 | DUMMY1/LJ019-0257.wav|Here the tread-wheel was in use, there cellular cranks, or hard-labor machines.
29 | DUMMY1/LJ028-0008.wav|you tap gently with your heel upon the shoulder of the dromedary to urge her on.
30 | DUMMY1/LJ024-0083.wav|This plan of mine is no attack on the Court;
31 | DUMMY1/LJ042-0129.wav|No night clubs or bowling alleys, no places of recreation except the trade union dances. I have had enough.
32 | DUMMY1/LJ036-0103.wav|The police asked him whether he could pick out his passenger from the lineup.
33 | DUMMY1/LJ046-0058.wav|During his Presidency, Franklin D. Roosevelt made almost four hundred journeys and traveled more than three hundred fifty thousand miles.
34 | DUMMY1/LJ014-0076.wav|He was seen afterwards smoking and talking with his hosts in their back parlor, and never seen again alive.
35 | DUMMY1/LJ002-0043.wav|long narrow rooms -- one thirty-six feet, six twenty-three feet, and the eighth eighteen,
36 | DUMMY1/LJ009-0076.wav|We come to the sermon.
37 | DUMMY1/LJ017-0131.wav|even when the high sheriff had told him there was no possibility of a reprieve, and within a few hours of execution.
38 | DUMMY1/LJ046-0184.wav|but there is a system for the immediate notification of the Secret Service by the confining institution when a subject is released or escapes.
39 | DUMMY1/LJ014-0263.wav|When other pleasures palled he took a theatre, and posed as a munificent patron of the dramatic art.
40 | DUMMY1/LJ042-0096.wav|(old exchange rate) in addition to his factory salary of approximately equal amount
41 | DUMMY1/LJ049-0050.wav|Hill had both feet on the car and was climbing aboard to assist President and Mrs. Kennedy.
42 | DUMMY1/LJ019-0186.wav|seeing that since the establishment of the Central Criminal Court, Newgate received prisoners for trial from several counties,
43 | DUMMY1/LJ028-0307.wav|then let twenty days pass, and at the end of that time station near the Chaldasan gates a body of four thousand.
44 | DUMMY1/LJ012-0235.wav|While they were in a state of insensibility the murder was committed.
45 | DUMMY1/LJ034-0053.wav|reached the same conclusion as Latona that the prints found on the cartons were those of Lee Harvey Oswald.
46 | DUMMY1/LJ014-0030.wav|These were damnatory facts which well supported the prosecution.
47 | DUMMY1/LJ015-0203.wav|but were the precautions too minute, the vigilance too close to be eluded or overcome?
48 | DUMMY1/LJ028-0093.wav|but his scribe wrote it in the manner customary for the scribes of those days to write of their royal masters.
49 | DUMMY1/LJ002-0018.wav|The inadequacy of the jail was noticed and reported upon again and again by the grand juries of the city of London,
50 | DUMMY1/LJ028-0275.wav|At last, in the twentieth month,
51 | DUMMY1/LJ012-0042.wav|which he kept concealed in a hiding-place with a trap-door just under his bed.
52 | DUMMY1/LJ011-0096.wav|He married a lady also belonging to the Society of Friends, who brought him a large fortune, which, and his own money, he put into a city firm,
53 | DUMMY1/LJ036-0077.wav|Roger D. Craig, a deputy sheriff of Dallas County,
54 | DUMMY1/LJ016-0318.wav|Other officials, great lawyers, governors of prisons, and chaplains supported this view.
55 | DUMMY1/LJ013-0164.wav|who came from his room ready dressed, a suspicious circumstance, as he was always late in the morning.
56 | DUMMY1/LJ027-0141.wav|is closely reproduced in the life-history of existing deer. Or, in other words,
57 | DUMMY1/LJ028-0335.wav|accordingly they committed to him the command of their whole army, and put the keys of their city into his hands.
58 | DUMMY1/LJ031-0202.wav|Mrs. Kennedy chose the hospital in Bethesda for the autopsy because the President had served in the Navy.
59 | DUMMY1/LJ021-0145.wav|From those willing to join in establishing this hoped-for period of peace,
60 | DUMMY1/LJ016-0288.wav|"Müller, Müller, He's the man," till a diversion was created by the appearance of the gallows, which was received with continuous yells.
61 | DUMMY1/LJ028-0081.wav|Years later, when the archaeologists could readily distinguish the false from the true,
62 | DUMMY1/LJ018-0081.wav|his defense being that he had intended to commit suicide, but that, on the appearance of this officer who had wronged him,
63 | DUMMY1/LJ021-0066.wav|together with a great increase in the payrolls, there has come a substantial rise in the total of industrial profits
64 | DUMMY1/LJ009-0238.wav|After this the sheriffs sent for another rope, but the spectators interfered, and the man was carried back to jail.
65 | DUMMY1/LJ005-0079.wav|and improve the morals of the prisoners, and shall insure the proper measure of punishment to convicted offenders.
66 | DUMMY1/LJ035-0019.wav|drove to the northwest corner of Elm and Houston, and parked approximately ten feet from the traffic signal.
67 | DUMMY1/LJ036-0174.wav|This is the approximate time he entered the roominghouse, according to Earlene Roberts, the housekeeper there.
68 | DUMMY1/LJ046-0146.wav|The criteria in effect prior to November twenty-two, nineteen sixty-three, for determining whether to accept material for the PRS general files
69 | DUMMY1/LJ017-0044.wav|and the deepest anxiety was felt that the crime, if crime there had been, should be brought home to its perpetrator.
70 | DUMMY1/LJ017-0070.wav|but his sporting operations did not prosper, and he became a needy man, always driven to desperate straits for cash.
71 | DUMMY1/LJ014-0020.wav|He was soon afterwards arrested on suspicion, and a search of his lodgings brought to light several garments saturated with blood;
72 | DUMMY1/LJ016-0020.wav|He never reached the cistern, but fell back into the yard, injuring his legs severely.
73 | DUMMY1/LJ045-0230.wav|when he was finally apprehended in the Texas Theatre. Although it is not fully corroborated by others who were present,
74 | DUMMY1/LJ035-0129.wav|and she must have run down the stairs ahead of Oswald and would probably have seen or heard him.
75 | DUMMY1/LJ008-0307.wav|afterwards express a wish to murder the Recorder for having kept them so long in suspense.
76 | DUMMY1/LJ008-0294.wav|nearly indefinitely deferred.
77 | DUMMY1/LJ047-0148.wav|On October twenty-five,
78 | DUMMY1/LJ008-0111.wav|They entered a "stone cold room," and were presently joined by the prisoner.
79 | DUMMY1/LJ034-0042.wav|that he could only testify with certainty that the print was less than three days old.
80 | DUMMY1/LJ037-0234.wav|Mrs. Mary Brock, the wife of a mechanic who worked at the station, was there at the time and she saw a white male,
81 | DUMMY1/LJ040-0002.wav|Chapter seven. Lee Harvey Oswald: Background and Possible Motives, Part one.
82 | DUMMY1/LJ045-0140.wav|The arguments he used to justify his use of the alias suggest that Oswald may have come to think that the whole world was becoming involved
83 | DUMMY1/LJ012-0035.wav|the number and names on watches, were carefully removed or obliterated after the goods passed out of his hands.
84 | DUMMY1/LJ012-0250.wav|On the seventh July, eighteen thirty-seven,
85 | DUMMY1/LJ016-0179.wav|contracted with sheriffs and conveners to work by the job.
86 | DUMMY1/LJ016-0138.wav|at a distance from the prison.
87 | DUMMY1/LJ027-0052.wav|These principles of homology are essential to a correct interpretation of the facts of morphology.
88 | DUMMY1/LJ031-0134.wav|On one occasion Mrs. Johnson, accompanied by two Secret Service agents, left the room to see Mrs. Kennedy and Mrs. Connally.
89 | DUMMY1/LJ019-0273.wav|which Sir Joshua Jebb told the committee he considered the proper elements of penal discipline.
90 | DUMMY1/LJ014-0110.wav|At the first the boxes were impounded, opened, and found to contain many of O'Connor's effects.
91 | DUMMY1/LJ034-0160.wav|on Brennan's subsequent certain identification of Lee Harvey Oswald as the man he saw fire the rifle.
92 | DUMMY1/LJ038-0199.wav|eleven. If I am alive and taken prisoner,
93 | DUMMY1/LJ014-0010.wav|yet he could not overcome the strange fascination it had for him, and remained by the side of the corpse till the stretcher came.
94 | DUMMY1/LJ033-0047.wav|I noticed when I went out that the light was on, end quote,
95 | DUMMY1/LJ040-0027.wav|He was never satisfied with anything.
96 | DUMMY1/LJ048-0228.wav|and others who were present say that no agent was inebriated or acted improperly.
97 | DUMMY1/LJ003-0111.wav|He was in consequence put out of the protection of their internal law, end quote. Their code was a subject of some curiosity.
98 | DUMMY1/LJ008-0258.wav|Let me retrace my steps, and speak more in detail of the treatment of the condemned in those bloodthirsty and brutally indifferent days,
99 | DUMMY1/LJ029-0022.wav|The original plan called for the President to spend only one day in the State, making whirlwind visits to Dallas, Fort Worth, San Antonio, and Houston.
100 | DUMMY1/LJ004-0045.wav|Mr. Sturges Bourne, Sir James Mackintosh, Sir James Scarlett, and William Wilberforce.
101 |
--------------------------------------------------------------------------------
/filelists/ljs_audio_text_val_filelist.txt.cleaned:
--------------------------------------------------------------------------------
1 | DUMMY1/LJ022-0023.wav|ðɪ ˌoʊvɚwˈɛlmɪŋ mədʒˈɔːɹɪɾi ʌv pˈiːpəl ɪn ðɪs kˈʌntɹi nˈoʊ hˌaʊ tə sˈɪft ðə wˈiːt fɹʌmðə tʃˈæf ɪn wˌʌt ðeɪ hˈɪɹ ænd wˌʌt ðeɪ ɹˈiːd.
2 | DUMMY1/LJ043-0030.wav|ɪf sˈʌmbɑːdi dˈɪd ðˈæt tə mˌiː, ɐ lˈaʊsi tɹˈɪk lˈaɪk ðˈæt, tə tˈeɪk maɪ wˈaɪf ɐwˈeɪ, ænd ˈɔːl ðə fˈɜːnɪtʃɚ, ˈaɪ wʊd biː mˈæd æz hˈɛl, tˈuː.
3 | DUMMY1/LJ005-0201.wav|ˌæzˌɪz ʃˈoʊn baɪ ðə ɹɪpˈoːɹt ʌvðə kəmˈɪʃənɚz tʊ ɪnkwˈaɪɚɹ ˌɪntʊ ðə stˈeɪt ʌvðə mjuːnˈɪsɪpəl kˌɔːɹpɚɹˈeɪʃənz ɪn eɪtˈiːn θˈɜːɾifˈaɪv.
4 | DUMMY1/LJ001-0110.wav|ˈiːvən ðə kˈæslɑːn tˈaɪp wɛn ɛnlˈɑːɹdʒd ʃˈoʊz ɡɹˈeɪt ʃˈɔːɹtkʌmɪŋz ɪn ðɪs ɹɪspˈɛkt:
5 | DUMMY1/LJ003-0345.wav|ˈɔːl ðə kəmˈɪɾi kʊd dˈuː ɪn ðɪs ɹɪspˈɛkt wʌz tə θɹˈoʊ ðə ɹɪspˌɑːnsəbˈɪlɪɾi ˌɑːn ˈʌðɚz.
6 | DUMMY1/LJ007-0154.wav|ðiːz pˈʌndʒənt ænd wˈɛlɡɹˈaʊndᵻd stɹˈɪktʃɚz ɐplˈaɪd wɪð stˈɪl ɡɹˈeɪɾɚ fˈoːɹs tə ðɪ ʌnkənvˈɪktᵻd pɹˈɪzənɚ, ðə mˈæn hˌuː kˈeɪm tə ðə pɹˈɪzən ˈɪnəsənt, ænd stˈɪl ʌnkəntˈæmᵻnˌeɪɾᵻd,
7 | DUMMY1/LJ018-0098.wav|ænd ɹˈɛkəɡnˌaɪzd æz wˈʌn ʌvðə fɹˈiːkwɛntɚz ʌvðə bˈoʊɡəs lˈɔːstˈeɪʃənɚz. hɪz ɐɹˈɛst lˈɛd tə ðæt ʌv ˈʌðɚz.
8 | DUMMY1/LJ047-0044.wav|ˈɑːswəld wʌz, haʊˈɛvɚ, wˈɪlɪŋ tə dɪskˈʌs hɪz kˈɑːntækts wɪð sˈoʊviət ɐθˈɔːɹɪɾiz. hiː dɪnˈaɪd hˌævɪŋ ˌɛni ɪnvˈɑːlvmənt wɪð sˈoʊviət ɪntˈɛlɪdʒəns ˈeɪdʒənsiz
9 | DUMMY1/LJ031-0038.wav|ðə fˈɜːst fɪzˈɪʃən tə sˈiː ðə pɹˈɛzɪdənt æt pˈɑːɹklənd hˈɑːspɪɾəl wʌz dˈɑːktɚ tʃˈɑːɹlz dʒˈeɪ. kˈæɹɪkˌoʊ, ɐ ɹˈɛzɪdənt ɪn dʒˈɛnɚɹəl sˈɜːdʒɚɹi.
10 | DUMMY1/LJ048-0194.wav|dˈʊɹɪŋ ðə mˈɔːɹnɪŋ ʌv noʊvˈɛmbɚ twˈɛntitˈuː pɹˈaɪɚ tə ðə mˈoʊɾɚkˌeɪd.
11 | DUMMY1/LJ049-0026.wav|ˌɑːn əkˈeɪʒən ðə sˈiːkɹət sˈɜːvɪs hɐzbɪn pɚmˈɪɾᵻd tə hæv ɐn ˈeɪdʒənt ɹˈaɪdɪŋ ɪnðə pˈæsɪndʒɚ kəmpˈɑːɹtmənt wɪððə pɹˈɛzɪdənt.
12 | DUMMY1/LJ004-0152.wav|ɑːlðˈoʊ æt mˈɪstɚ bˈʌkstənz vˈɪzɪt ɐ nˈuː dʒˈeɪl wʌz ɪn pɹˈɑːsɛs ʌv ɪɹˈɛkʃən, ðə fˈɜːst stˈɛp tʊwˈɔːɹdz ɹɪfˈɔːɹm sˈɪns hˈaʊɚdz vˌɪzɪtˈeɪʃən ɪn sˌɛvəntˈiːn sˈɛvəntifˈoːɹ.
13 | DUMMY1/LJ008-0278.wav|ɔːɹ ðˈɛɹz mˌaɪt biː wˈʌn ʌv mˈɛni, ænd ɪt mˌaɪt biː kənsˈɪdɚd nˈɛsəsɚɹi tuː "mˌeɪk ɐn ɛɡzˈæmpəl."
14 | DUMMY1/LJ043-0002.wav|ðə wˈɔːɹən kəmˈɪʃən ɹɪpˈoːɹt. baɪ ðə pɹˈɛzɪdənts kəmˈɪʃən ɑːnðɪ ɐsˌæsᵻnˈeɪʃən ʌv pɹˈɛzɪdənt kˈɛnədi. tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld:
15 | DUMMY1/LJ009-0114.wav|mˈɪstɚ wˈeɪkfiːld wˈaɪndz ˈʌp hɪz ɡɹˈæfɪk bˌʌt sˈʌmwʌt sɛnsˈeɪʃənəl ɐkˈaʊnt baɪ dɪskɹˈaɪbɪŋ ɐnˈʌðɚ ɹɪlˈɪdʒəs sˈɜːvɪs, wˌɪtʃ mˈeɪ ɐpɹˈoʊpɹɪətli biː ɪnsˈɜːɾᵻd hˈɪɹ.
16 | DUMMY1/LJ028-0506.wav|ɐ mˈɑːdɚn ˈɑːɹɾɪst wʊdhɐv dˈɪfɪkˌʌlti ɪn dˌuːɪŋ sˈʌtʃ ˈækjʊɹət wˈɜːk.
17 | DUMMY1/LJ050-0168.wav|wɪððə pɚtˈɪkjʊlɚ pˈɜːpəsᵻz ʌvðɪ ˈeɪdʒənsi ɪnvˈɑːlvd. ðə kəmˈɪʃən ɹˈɛkəɡnˌaɪzɪz ðæt ðɪs ɪz ɐ kˌɑːntɹəvˈɜːʃəl ˈɛɹiə
18 | DUMMY1/LJ039-0223.wav|ˈɑːswəldz mɚɹˈiːn tɹˈeɪnɪŋ ɪn mˈɑːɹksmənʃˌɪp, hɪz ˈʌðɚ ɹˈaɪfəl ɛkspˈiəɹɪəns ænd hɪz ɪstˈæblɪʃt fəmˌɪlɪˈæɹɪɾi wɪð ðɪs pɚtˈɪkjʊlɚ wˈɛpən
19 | DUMMY1/LJ029-0032.wav|ɐkˈoːɹdɪŋ tʊ oʊdˈɑːnəl, kwˈoʊt, wiː hɐd ɐ mˈoʊɾɚkˌeɪd wɛɹɹˈɛvɚ wiː wˈɛnt, ˈɛnd kwˈoʊt.
20 | DUMMY1/LJ031-0070.wav|dˈɑːktɚ klˈɑːɹk, hˌuː mˈoʊst klˈoʊsli ɑːbzˈɜːvd ðə hˈɛd wˈuːnd,
21 | DUMMY1/LJ034-0198.wav|jˈuːɪnz, hˌuː wʌz ɑːnðə saʊθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən stɹˈiːts tˈɛstɪfˌaɪd ðæt hiː kʊd nˌɑːt dɪskɹˈaɪb ðə mˈæn hiː sˈɔː ɪnðə wˈɪndoʊ.
22 | DUMMY1/LJ026-0068.wav|ˈɛnɚdʒi ˈɛntɚz ðə plˈænt, tʊ ɐ smˈɔːl ɛkstˈɛnt,
23 | DUMMY1/LJ039-0075.wav|wˈʌns juː nˈoʊ ðæt juː mˈʌst pˌʊt ðə kɹˈɔshɛɹz ɑːnðə tˈɑːɹɡɪt ænd ðæt ɪz ˈɔːl ðæt ɪz nˈɛsəsɚɹi.
24 | DUMMY1/LJ004-0096.wav|ðə fˈeɪɾəl kˈɑːnsɪkwənsᵻz wˈɛɹɑːf mˌaɪt biː pɹɪvˈɛntᵻd ɪf ðə dʒˈʌstɪsᵻz ʌvðə pˈiːs wɜː djˈuːli ˈɔːθɚɹˌaɪzd
25 | DUMMY1/LJ005-0014.wav|spˈiːkɪŋ ˌɑːn ɐ dɪbˈeɪt ˌɑːn pɹˈɪzən mˈæɾɚz, hiː dᵻklˈɛɹd ðˈæt
26 | DUMMY1/LJ012-0161.wav|hiː wʌz ɹɪpˈoːɹɾᵻd tə hæv fˈɔːlən ɐwˈeɪ tʊ ɐ ʃˈædoʊ.
27 | DUMMY1/LJ018-0239.wav|hɪz dˌɪsɐpˈɪɹəns ɡˈeɪv kˈʌlɚ ænd sˈʌbstəns tʊ ˈiːvəl ɹɪpˈoːɹts ɔːlɹˌɛdi ɪn sˌɜːkjʊlˈeɪʃən ðætðə wɪl ænd kənvˈeɪəns əbˌʌv ɹɪfˈɜːd tuː
28 | DUMMY1/LJ019-0257.wav|hˈɪɹ ðə tɹˈɛdwˈiːl wʌz ɪn jˈuːs, ðɛɹ sˈɛljʊlɚ kɹˈæŋks, ɔːɹ hˈɑːɹdlˈeɪbɚ məʃˈiːnz.
29 | DUMMY1/LJ028-0008.wav|juː tˈæp dʒˈɛntli wɪð jʊɹ hˈiːl əpˌɑːn ðə ʃˈoʊldɚɹ ʌvðə dɹˈoʊmdɚɹi tʊ ˈɜːdʒ hɜːɹ ˈɑːn.
30 | DUMMY1/LJ024-0083.wav|ðɪs plˈæn ʌv mˈaɪn ɪz nˈoʊ ɐtˈæk ɑːnðə kˈoːɹt;
31 | DUMMY1/LJ042-0129.wav|nˈoʊ nˈaɪt klˈʌbz ɔːɹ bˈoʊlɪŋ ˈælɪz, nˈoʊ plˈeɪsᵻz ʌv ɹˌɛkɹiːˈeɪʃən ɛksˈɛpt ðə tɹˈeɪd jˈuːniən dˈænsᵻz. ˈaɪ hæv hɐd ɪnˈʌf.
32 | DUMMY1/LJ036-0103.wav|ðə pəlˈiːs ˈæskt hˌɪm wˈɛðɚ hiː kʊd pˈɪk ˈaʊt hɪz pˈæsɪndʒɚ fɹʌmðə lˈaɪnʌp.
33 | DUMMY1/LJ046-0058.wav|dˈʊɹɪŋ hɪz pɹˈɛzɪdənsi, fɹˈæŋklɪn dˈiː. ɹˈoʊzəvˌɛlt mˌeɪd ˈɔːlmoʊst fˈoːɹ hˈʌndɹəd dʒˈɜːnɪz ænd tɹˈævəld mˈoːɹ ðɐn θɹˈiː hˈʌndɹəd fˈɪfti θˈaʊzənd mˈaɪlz.
34 | DUMMY1/LJ014-0076.wav|hiː wʌz sˈiːn ˈæftɚwɚdz smˈoʊkɪŋ ænd tˈɔːkɪŋ wɪð hɪz hˈoʊsts ɪn ðɛɹ bˈæk pˈɑːɹlɚ, ænd nˈɛvɚ sˈiːn ɐɡˈɛn ɐlˈaɪv.
35 | DUMMY1/LJ002-0043.wav|lˈɑːŋ nˈæɹoʊ ɹˈuːmz wˈʌn θˈɜːɾisˈɪks fˈiːt, sˈɪks twˈɛntiθɹˈiː fˈiːt, ænd ðɪ ˈeɪtθ eɪtˈiːn,
36 | DUMMY1/LJ009-0076.wav|wiː kˈʌm tə ðə sˈɜːmən.
37 | DUMMY1/LJ017-0131.wav|ˈiːvən wɛn ðə hˈaɪ ʃˈɛɹɪf hɐd tˈoʊld hˌɪm ðɛɹwˌʌz nˈoʊ pˌɑːsəbˈɪlɪɾi əvɚ ɹɪpɹˈiːv, ænd wɪðˌɪn ɐ fjˈuː ˈaɪʊɹz ʌv ˌɛksɪkjˈuːʃən.
38 | DUMMY1/LJ046-0184.wav|bˌʌt ðɛɹ ɪz ɐ sˈɪstəm fɚðɪ ɪmˈiːdɪət nˌoʊɾɪfɪkˈeɪʃən ʌvðə sˈiːkɹət sˈɜːvɪs baɪ ðə kənfˈaɪnɪŋ ˌɪnstɪtˈuːʃən wɛn ɐ sˈʌbdʒɛkt ɪz ɹɪlˈiːsd ɔːɹ ɛskˈeɪps.
39 | DUMMY1/LJ014-0263.wav|wˌɛn ˈʌðɚ plˈɛʒɚz pˈɔːld hiː tˈʊk ɐ θˈiəɾɚ, ænd pˈoʊzd æz ɐ mjuːnˈɪfɪsənt pˈeɪtɹən ʌvðə dɹəmˈæɾɪk ˈɑːɹt.
40 | DUMMY1/LJ042-0096.wav| ˈoʊld ɛkstʃˈeɪndʒ ɹˈeɪt ɪn ɐdˈɪʃən tə hɪz fˈæktɚɹi sˈælɚɹi ʌv ɐpɹˈɑːksɪmətli ˈiːkwəl ɐmˈaʊnt
41 | DUMMY1/LJ049-0050.wav|hˈɪl hɐd bˈoʊθ fˈiːt ɑːnðə kˈɑːɹ ænd wʌz klˈaɪmɪŋ ɐbˈoːɹd tʊ ɐsˈɪst pɹˈɛzɪdənt ænd mɪsˈɛs kˈɛnədi.
42 | DUMMY1/LJ019-0186.wav|sˈiːɪŋ ðæt sˈɪns ðɪ ɪstˈæblɪʃmənt ʌvðə sˈɛntɹəl kɹˈɪmɪnəl kˈoːɹt, nˈuːɡeɪt ɹɪsˈiːvd pɹˈɪzənɚz fɔːɹ tɹˈaɪəl fɹʌm sˈɛvɹəl kˈaʊntɪz,
43 | DUMMY1/LJ028-0307.wav|ðˈɛn lˈɛt twˈɛnti dˈeɪz pˈæs, ænd æt ðɪ ˈɛnd ʌv ðæt tˈaɪm stˈeɪʃən nˌɪɹ ðə tʃˈældæsən ɡˈeɪts ɐ bˈɑːdi ʌv fˈoːɹ θˈaʊzənd.
44 | DUMMY1/LJ012-0235.wav|wˌaɪl ðeɪ wɜːɹ ɪn ɐ stˈeɪt ʌv ɪnsˌɛnsəbˈɪlɪɾi ðə mˈɜːdɚ wʌz kəmˈɪɾᵻd.
45 | DUMMY1/LJ034-0053.wav|ɹˈiːtʃt ðə sˈeɪm kənklˈuːʒən æz lætˈoʊnə ðætðə pɹˈɪnts fˈaʊnd ɑːnðə kˈɑːɹtənz wɜː ðoʊz ʌv lˈiː hˈɑːɹvi ˈɑːswəld.
46 | DUMMY1/LJ014-0030.wav|ðiːz wɜː dˈæmnətˌoːɹi fˈækts wˌɪtʃ wˈɛl səpˈoːɹɾᵻd ðə pɹˌɑːsɪkjˈuːʃən.
47 | DUMMY1/LJ015-0203.wav|bˌʌt wɜː ðə pɹɪkˈɔːʃənz tˈuː mˈɪnɪt, ðə vˈɪdʒɪləns tˈuː klˈoʊs təbi ɪlˈuːdᵻd ɔːɹ ˌoʊvɚkˈʌm?
48 | DUMMY1/LJ028-0093.wav|bˌʌt hɪz skɹˈaɪb ɹˈoʊt ɪt ɪnðə mˈænɚ kˈʌstəmˌɛɹi fɚðə skɹˈaɪbz ʌv ðoʊz dˈeɪz tə ɹˈaɪt ʌv ðɛɹ ɹˈɔɪəl mˈæstɚz.
49 | DUMMY1/LJ002-0018.wav|ðɪ ɪnˈædɪkwəsi ʌvðə dʒˈeɪl wʌz nˈoʊɾɪsd ænd ɹɪpˈoːɹɾᵻd əpˌɑːn ɐɡˈɛn ænd ɐɡˈɛn baɪ ðə ɡɹˈænd dʒˈʊɹɪz ʌvðə sˈɪɾi ʌv lˈʌndən,
50 | DUMMY1/LJ028-0275.wav|æt lˈæst, ɪnðə twˈɛntiəθ mˈʌnθ,
51 | DUMMY1/LJ012-0042.wav|wˌɪtʃ hiː kˈɛpt kənsˈiːld ɪn ɐ hˈaɪdɪŋplˈeɪs wɪð ɐ tɹˈæpdˈoːɹ dʒˈʌst ˌʌndɚ hɪz bˈɛd.
52 | DUMMY1/LJ011-0096.wav|hiː mˈæɹɪd ɐ lˈeɪdi ˈɑːlsoʊ bɪlˈɑːŋɪŋ tə ðə səsˈaɪəɾi ʌv fɹˈɛndz, hˌuː bɹˈɔːt hˌɪm ɐ lˈɑːɹdʒ fˈɔːɹtʃən, wˈɪtʃ, ænd hɪz ˈoʊn mˈʌni, hiː pˌʊt ˌɪntʊ ɐ sˈɪɾi fˈɜːm,
53 | DUMMY1/LJ036-0077.wav|ɹˈɑːdʒɚ dˈiː. kɹˈeɪɡ, ɐ dˈɛpjuːɾi ʃˈɛɹɪf ʌv dˈæləs kˈaʊnti,
54 | DUMMY1/LJ016-0318.wav|ˈʌðɚɹ əfˈɪʃəlz, ɡɹˈeɪt lˈɔɪɚz, ɡˈʌvɚnɚz ʌv pɹˈɪzənz, ænd tʃˈæplɪnz səpˈoːɹɾᵻd ðɪs vjˈuː.
55 | DUMMY1/LJ013-0164.wav|hˌuː kˈeɪm fɹʌm hɪz ɹˈuːm ɹˈɛdi dɹˈɛst, ɐ səspˈɪʃəs sˈɜːkəmstˌæns, æz hiː wʌz ˈɔːlweɪz lˈeɪt ɪnðə mˈɔːɹnɪŋ.
56 | DUMMY1/LJ027-0141.wav|ɪz klˈoʊsli ɹɪpɹədˈuːst ɪnðə lˈaɪfhˈɪstɚɹi ʌv ɛɡzˈɪstɪŋ dˈɪɹ. ˈɔːɹ, ɪn ˈʌðɚ wˈɜːdz,
57 | DUMMY1/LJ028-0335.wav|ɐkˈoːɹdɪŋli ðeɪ kəmˈɪɾᵻd tə hˌɪm ðə kəmˈænd ʌv ðɛɹ hˈoʊl ˈɑːɹmi, ænd pˌʊt ðə kˈiːz ʌv ðɛɹ sˈɪɾi ˌɪntʊ hɪz hˈændz.
58 | DUMMY1/LJ031-0202.wav|mɪsˈɛs kˈɛnədi tʃˈoʊz ðə hˈɑːspɪɾəl ɪn bəθˈɛzdə fɚðɪ ˈɔːtɑːpsi bɪkˈʌz ðə pɹˈɛzɪdənt hɐd sˈɜːvd ɪnðə nˈeɪvi.
59 | DUMMY1/LJ021-0145.wav|fɹʌm ðoʊz wˈɪlɪŋ tə dʒˈɔɪn ɪn ɪstˈæblɪʃɪŋ ðɪs hˈoʊptfɔːɹ pˈiəɹɪəd ʌv pˈiːs,
60 | DUMMY1/LJ016-0288.wav|"mˈʌlɚ, mˈʌlɚ, hiːz ðə mˈæn," tˈɪl ɐ daɪvˈɜːʒən wʌz kɹiːˈeɪɾᵻd baɪ ðɪ ɐpˈɪɹəns ʌvðə ɡˈæloʊz, wˌɪtʃ wʌz ɹɪsˈiːvd wɪð kəntˈɪnjuːəs jˈɛlz.
61 | DUMMY1/LJ028-0081.wav|jˈɪɹz lˈeɪɾɚ, wˌɛn ðɪ ˌɑːɹkiːˈɑːlədʒˌɪsts kʊd ɹˈɛdɪli dɪstˈɪŋɡwɪʃ ðə fˈɑːls fɹʌmðə tɹˈuː,
62 | DUMMY1/LJ018-0081.wav|hɪz dɪfˈɛns bˌiːɪŋ ðæt hiː hɐd ɪntˈɛndᵻd tə kəmˈɪt sˈuːɪsˌaɪd, bˌʌt ðˈæt, ɑːnðɪ ɐpˈɪɹəns ʌv ðɪs ˈɑːfɪsɚ hˌuː hɐd ɹˈɔŋd hˌɪm,
63 | DUMMY1/LJ021-0066.wav|təɡˌɛðɚ wɪð ɐ ɡɹˈeɪt ˈɪnkɹiːs ɪnðə pˈeɪɹoʊlz, ðɛɹ hɐz kˈʌm ɐ səbstˈænʃəl ɹˈaɪz ɪnðə tˈoʊɾəl ʌv ɪndˈʌstɹɪəl pɹˈɑːfɪts
64 | DUMMY1/LJ009-0238.wav|ˈæftɚ ðɪs ðə ʃˈɛɹɪfs sˈɛnt fɔːɹ ɐnˈʌðɚ ɹˈoʊp, bˌʌt ðə spɛktˈeɪɾɚz ˌɪntəfˈɪɹd, ænd ðə mˈæn wʌz kˈæɹɪd bˈæk tə dʒˈeɪl.
65 | DUMMY1/LJ005-0079.wav|ænd ɪmpɹˈuːv ðə mˈɔːɹəlz ʌvðə pɹˈɪzənɚz, ænd ʃˌæl ɪnʃˈʊɹ ðə pɹˈɑːpɚ mˈɛʒɚɹ ʌv pˈʌnɪʃmənt tə kənvˈɪktᵻd əfˈɛndɚz.
66 | DUMMY1/LJ035-0019.wav|dɹˈoʊv tə ðə nɔːɹθwˈɛst kˈɔːɹnɚɹ ʌv ˈɛlm ænd hjˈuːstən, ænd pˈɑːɹkt ɐpɹˈɑːksɪmətli tˈɛn fˈiːt fɹʌmðə tɹˈæfɪk sˈɪɡnəl.
67 | DUMMY1/LJ036-0174.wav|ðɪs ɪz ðɪ ɐpɹˈɑːksɪmət tˈaɪm hiː ˈɛntɚd ðə ɹˈuːmɪŋhˌaʊs, ɐkˈoːɹdɪŋ tʊ ˈɜːliːn ɹˈɑːbɚts, ðə hˈaʊskiːpɚ ðˈɛɹ.
68 | DUMMY1/LJ046-0146.wav|ðə kɹaɪtˈiəɹɪə ɪn ɪfˈɛkt pɹˈaɪɚ tə noʊvˈɛmbɚ twˈɛntitˈuː, naɪntˈiːn sˈɪkstiθɹˈiː, fɔːɹ dɪtˈɜːmɪnɪŋ wˈɛðɚ tʊ ɐksˈɛpt mətˈiəɹɪəl fɚðə pˌiːˌɑːɹˈɛs dʒˈɛnɚɹəl fˈaɪlz
69 | DUMMY1/LJ017-0044.wav|ænd ðə dˈiːpəst æŋzˈaɪəɾi wʌz fˈɛlt ðætðə kɹˈaɪm, ɪf kɹˈaɪm ðˈɛɹ hɐdbɪn, ʃˌʊd biː bɹˈɔːt hˈoʊm tʊ ɪts pˈɜːpɪtɹˌeɪɾɚ.
70 | DUMMY1/LJ017-0070.wav|bˌʌt hɪz spˈoːɹɾɪŋ ˌɑːpɚɹˈeɪʃənz dɪdnˌɑːt pɹˈɑːspɚ, ænd hiː bɪkˌeɪm ɐ nˈiːdi mˈæn, ˈɔːlweɪz dɹˈɪvən tə dˈɛspɚɹət stɹˈeɪts fɔːɹ kˈæʃ.
71 | DUMMY1/LJ014-0020.wav|hiː wʌz sˈuːn ˈæftɚwɚdz ɐɹˈɛstᵻd ˌɑːn səspˈɪʃən, ænd ɐ sˈɜːtʃ ʌv hɪz lˈɑːdʒɪŋz bɹˈɔːt tə lˈaɪt sˈɛvɹəl ɡˈɑːɹmənts sˈætʃɚɹˌeɪɾᵻd wɪð blˈʌd;
72 | DUMMY1/LJ016-0020.wav|hiː nˈɛvɚ ɹˈiːtʃt ðə sˈɪstɚn, bˌʌt fˈɛl bˈæk ˌɪntʊ ðə jˈɑːɹd, ˈɪndʒɚɹɪŋ hɪz lˈɛɡz sɪvˈɪɹli.
73 | DUMMY1/LJ045-0230.wav|wˌɛn hiː wʌz fˈaɪnəli ˌæpɹɪhˈɛndᵻd ɪnðə tˈɛksəs θˈiəɾɚ. ɑːlðˈoʊ ɪt ɪz nˌɑːt fˈʊli kɚɹˈɑːbɚɹˌeɪɾᵻd baɪ ˈʌðɚz hˌuː wɜː pɹˈɛzənt,
74 | DUMMY1/LJ035-0129.wav|ænd ʃiː mˈʌstɐv ɹˈʌn dˌaʊn ðə stˈɛɹz ɐhˈɛd ʌv ˈɑːswəld ænd wʊd pɹˈɑːbəbli hæv sˈiːn ɔːɹ hˈɜːd hˌɪm.
75 | DUMMY1/LJ008-0307.wav|ˈæftɚwɚdz ɛkspɹˈɛs ɐ wˈɪʃ tə mˈɜːdɚ ðə ɹɪkˈoːɹdɚ fɔːɹ hˌævɪŋ kˈɛpt ðˌɛm sˌoʊ lˈɑːŋ ɪn səspˈɛns.
76 | DUMMY1/LJ008-0294.wav|nˌɪɹli ɪndˈɛfɪnətli dɪfˈɜːd.
77 | DUMMY1/LJ047-0148.wav|ˌɑːn ɑːktˈoʊbɚ twˈɛntifˈaɪv,
78 | DUMMY1/LJ008-0111.wav|ðeɪ ˈɛntɚd ˈeɪ "stˈoʊn kˈoʊld ɹˈuːm," ænd wɜː pɹˈɛzəntli dʒˈɔɪnd baɪ ðə pɹˈɪzənɚ.
79 | DUMMY1/LJ034-0042.wav|ðæt hiː kʊd ˈoʊnli tˈɛstɪfˌaɪ wɪð sˈɜːtənti ðætðə pɹˈɪnt wʌz lˈɛs ðɐn θɹˈiː dˈeɪz ˈoʊld.
80 | DUMMY1/LJ037-0234.wav|mɪsˈɛs mˈɛɹi bɹˈɑːk, ðə wˈaɪf əvə mɪkˈænɪk hˌuː wˈɜːkt æt ðə stˈeɪʃən, wʌz ðɛɹ æt ðə tˈaɪm ænd ʃiː sˈɔː ɐ wˈaɪt mˈeɪl,
81 | DUMMY1/LJ040-0002.wav|tʃˈæptɚ sˈɛvən. lˈiː hˈɑːɹvi ˈɑːswəld: bˈækɡɹaʊnd ænd pˈɑːsəbəl mˈoʊɾɪvz, pˈɑːɹt wˌʌn.
82 | DUMMY1/LJ045-0140.wav|ðɪ ˈɑːɹɡjuːmənts hiː jˈuːzd tə dʒˈʌstɪfˌaɪ hɪz jˈuːs ʌvðɪ ˈeɪliəs sədʒˈɛst ðæt ˈɑːswəld mˌeɪhɐv kˈʌm tə θˈɪŋk ðætðə hˈoʊl wˈɜːld wʌz bɪkˈʌmɪŋ ɪnvˈɑːlvd
83 | DUMMY1/LJ012-0035.wav|ðə nˈʌmbɚ ænd nˈeɪmz ˌɑːn wˈɑːtʃᵻz, wɜː kˈɛɹfəli ɹɪmˈuːvd ɔːɹ əblˈɪɾɚɹˌeɪɾᵻd ˈæftɚ ðə ɡˈʊdz pˈæst ˌaʊɾəv hɪz hˈændz.
84 | DUMMY1/LJ012-0250.wav|ɑːnðə sˈɛvənθ dʒuːlˈaɪ, eɪtˈiːn θˈɜːɾisˈɛvən,
85 | DUMMY1/LJ016-0179.wav|kəntɹˈæktᵻd wɪð ʃˈɛɹɪfs ænd kənvˈɛnɚz tə wˈɜːk baɪ ðə dʒˈɑːb.
86 | DUMMY1/LJ016-0138.wav|æɾə dˈɪstəns fɹʌmðə pɹˈɪzən.
87 | DUMMY1/LJ027-0052.wav|ðiːz pɹˈɪnsɪpəlz ʌv həmˈɑːlədʒi ɑːɹ ɪsˈɛnʃəl tʊ ɐ kɚɹˈɛkt ɪntˌɜːpɹɪtˈeɪʃən ʌvðə fˈækts ʌv mɔːɹfˈɑːlədʒi.
88 | DUMMY1/LJ031-0134.wav|ˌɑːn wˈʌn əkˈeɪʒən mɪsˈɛs dʒˈɑːnsən, ɐkˈʌmpənɪd baɪ tˈuː sˈiːkɹət sˈɜːvɪs ˈeɪdʒənts, lˈɛft ðə ɹˈuːm tə sˈiː mɪsˈɛs kˈɛnədi ænd mɪsˈɛs kənˈæli.
89 | DUMMY1/LJ019-0273.wav|wˌɪtʃ sˌɜː dʒˈɑːʃjuːə dʒˈɛb tˈoʊld ðə kəmˈɪɾi hiː kənsˈɪdɚd ðə pɹˈɑːpɚɹ ˈɛlɪmənts ʌv pˈiːnəl dˈɪsɪplˌɪn.
90 | DUMMY1/LJ014-0110.wav|æt ðə fˈɜːst ðə bˈɑːksᵻz wɜːɹ ɪmpˈaʊndᵻd, ˈoʊpənd, ænd fˈaʊnd tə kəntˈeɪn mˈɛnɪəv oʊkˈɑːnɚz ɪfˈɛkts.
91 | DUMMY1/LJ034-0160.wav|ˌɑːn bɹˈɛnənz sˈʌbsɪkwənt sˈɜːtən aɪdˈɛntɪfɪkˈeɪʃən ʌv lˈiː hˈɑːɹvi ˈɑːswəld æz ðə mˈæn hiː sˈɔː fˈaɪɚ ðə ɹˈaɪfəl.
92 | DUMMY1/LJ038-0199.wav|ɪlˈɛvən. ɪf ˈaɪ æm ɐlˈaɪv ænd tˈeɪkən pɹˈɪzənɚ,
93 | DUMMY1/LJ014-0010.wav|jˈɛt hiː kʊd nˌɑːt ˌoʊvɚkˈʌm ðə stɹˈeɪndʒ fˌæsᵻnˈeɪʃən ɪt hˈɐd fɔːɹ hˌɪm, ænd ɹɪmˈeɪnd baɪ ðə sˈaɪd ʌvðə kˈɔːɹps tˈɪl ðə stɹˈɛtʃɚ kˈeɪm.
94 | DUMMY1/LJ033-0047.wav|ˈaɪ nˈoʊɾɪsd wɛn ˈaɪ wɛnt ˈaʊt ðætðə lˈaɪt wʌz ˈɑːn, ˈɛnd kwˈoʊt,
95 | DUMMY1/LJ040-0027.wav|hiː wʌz nˈɛvɚ sˈæɾɪsfˌaɪd wɪð ˈɛnɪθˌɪŋ.
96 | DUMMY1/LJ048-0228.wav|ænd ˈʌðɚz hˌuː wɜː pɹˈɛzənt sˈeɪ ðæt nˈoʊ ˈeɪdʒənt wʌz ɪnˈiːbɹɪˌeɪɾᵻd ɔːɹ ˈæktᵻd ɪmpɹˈɑːpɚli.
97 | DUMMY1/LJ003-0111.wav|hiː wʌz ɪn kˈɑːnsɪkwəns pˌʊt ˌaʊɾəv ðə pɹətˈɛkʃən ʌv ðɛɹ ɪntˈɜːnəl lˈɔː, ˈɛnd kwˈoʊt. ðɛɹ kˈoʊd wʌzɐ sˈʌbdʒɛkt ʌv sˌʌm kjˌʊɹɪˈɑːsɪɾi.
98 | DUMMY1/LJ008-0258.wav|lˈɛt mˌiː ɹɪtɹˈeɪs maɪ stˈɛps, ænd spˈiːk mˈoːɹ ɪn diːtˈeɪl ʌvðə tɹˈiːtmənt ʌvðə kəndˈɛmd ɪn ðoʊz blˈʌdθɜːsti ænd bɹˈuːɾəli ɪndˈɪfɹənt dˈeɪz,
99 | DUMMY1/LJ029-0022.wav|ðɪ ɚɹˈɪdʒɪnəl plˈæn kˈɔːld fɚðə pɹˈɛzɪdənt tə spˈɛnd ˈoʊnli wˈʌn dˈeɪ ɪnðə stˈeɪt, mˌeɪkɪŋ wˈɜːlwɪnd vˈɪzɪts tə dˈæləs, fˈɔːɹt wˈɜːθ, sˌæn æntˈoʊnɪˌoʊ, ænd hjˈuːstən.
100 | DUMMY1/LJ004-0045.wav|mˈɪstɚ stˈɜːdʒᵻz bˈoːɹn, sˌɜː dʒˈeɪmz mˈækɪntˌɑːʃ, sˌɜː dʒˈeɪmz skˈɑːɹlɪt, ænd wˈɪljəm wˈɪlbɚfˌoːɹs.
101 |
--------------------------------------------------------------------------------
/attentions.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import torch
5 | from torch import nn
6 | from torch.nn import functional as F
7 |
8 | import commons
9 | import modules
10 | from modules import LayerNorm
11 |
12 |
13 | class Encoder(nn.Module):
14 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15 | super().__init__()
16 | self.hidden_channels = hidden_channels
17 | self.filter_channels = filter_channels
18 | self.n_heads = n_heads
19 | self.n_layers = n_layers
20 | self.kernel_size = kernel_size
21 | self.p_dropout = p_dropout
22 | self.window_size = window_size
23 |
24 | self.drop = nn.Dropout(p_dropout)
25 | self.attn_layers = nn.ModuleList()
26 | self.norm_layers_1 = nn.ModuleList()
27 | self.ffn_layers = nn.ModuleList()
28 | self.norm_layers_2 = nn.ModuleList()
29 | for i in range(self.n_layers):
30 | self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31 | self.norm_layers_1.append(LayerNorm(hidden_channels))
32 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33 | self.norm_layers_2.append(LayerNorm(hidden_channels))
34 |
35 | def forward(self, x, x_mask):
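# x_mask has shape [b, 1, t]; the outer product on the next line expands it into a [b, 1, t, t] pairwise attention mask.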
36 | attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37 | x = x * x_mask
38 | for i in range(self.n_layers):
39 | y = self.attn_layers[i](x, x, attn_mask)
40 | y = self.drop(y)
41 | x = self.norm_layers_1[i](x + y)
42 |
43 | y = self.ffn_layers[i](x, x_mask)
44 | y = self.drop(y)
45 | x = self.norm_layers_2[i](x + y)
46 | x = x * x_mask
47 | return x
48 |
49 |
50 | class Decoder(nn.Module):
51 | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52 | super().__init__()
53 | self.hidden_channels = hidden_channels
54 | self.filter_channels = filter_channels
55 | self.n_heads = n_heads
56 | self.n_layers = n_layers
57 | self.kernel_size = kernel_size
58 | self.p_dropout = p_dropout
59 | self.proximal_bias = proximal_bias
60 | self.proximal_init = proximal_init
61 |
62 | self.drop = nn.Dropout(p_dropout)
63 | self.self_attn_layers = nn.ModuleList()
64 | self.norm_layers_0 = nn.ModuleList()
65 | self.encdec_attn_layers = nn.ModuleList()
66 | self.norm_layers_1 = nn.ModuleList()
67 | self.ffn_layers = nn.ModuleList()
68 | self.norm_layers_2 = nn.ModuleList()
69 | for i in range(self.n_layers):
70 | self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71 | self.norm_layers_0.append(LayerNorm(hidden_channels))
72 | self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73 | self.norm_layers_1.append(LayerNorm(hidden_channels))
74 | self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75 | self.norm_layers_2.append(LayerNorm(hidden_channels))
76 |
77 | def forward(self, x, x_mask, h, h_mask):
78 | """
79 | x: decoder input
80 | h: encoder output
81 | """
82 | self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83 | encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84 | x = x * x_mask
85 | for i in range(self.n_layers):
86 | y = self.self_attn_layers[i](x, x, self_attn_mask)
87 | y = self.drop(y)
88 | x = self.norm_layers_0[i](x + y)
89 |
90 | y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91 | y = self.drop(y)
92 | x = self.norm_layers_1[i](x + y)
93 |
94 | y = self.ffn_layers[i](x, x_mask)
95 | y = self.drop(y)
96 | x = self.norm_layers_2[i](x + y)
97 | x = x * x_mask
98 | return x
99 |
100 |
101 | class MultiHeadAttention(nn.Module):
102 | def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103 | super().__init__()
104 | assert channels % n_heads == 0
105 |
106 | self.channels = channels
107 | self.out_channels = out_channels
108 | self.n_heads = n_heads
109 | self.p_dropout = p_dropout
110 | self.window_size = window_size
111 | self.heads_share = heads_share
112 | self.block_length = block_length
113 | self.proximal_bias = proximal_bias
114 | self.proximal_init = proximal_init
115 | self.attn = None
116 |
117 | self.k_channels = channels // n_heads
118 | self.conv_q = nn.Conv1d(channels, channels, 1)
119 | self.conv_k = nn.Conv1d(channels, channels, 1)
120 | self.conv_v = nn.Conv1d(channels, channels, 1)
121 | self.conv_o = nn.Conv1d(channels, out_channels, 1)
122 | self.drop = nn.Dropout(p_dropout)
123 |
124 | if window_size is not None:
125 | n_heads_rel = 1 if heads_share else n_heads
126 | rel_stddev = self.k_channels**-0.5
127 | self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128 | self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129 |
130 | nn.init.xavier_uniform_(self.conv_q.weight)
131 | nn.init.xavier_uniform_(self.conv_k.weight)
132 | nn.init.xavier_uniform_(self.conv_v.weight)
133 | if proximal_init:
134 | with torch.no_grad():
135 | self.conv_k.weight.copy_(self.conv_q.weight)
136 | self.conv_k.bias.copy_(self.conv_q.bias)
137 |
138 | def forward(self, x, c, attn_mask=None):
139 | q = self.conv_q(x)
140 | k = self.conv_k(c)
141 | v = self.conv_v(c)
142 |
143 | x, self.attn = self.attention(q, k, v, mask=attn_mask)
144 |
145 | x = self.conv_o(x)
146 | return x
147 |
148 | def attention(self, query, key, value, mask=None):
149 | # reshape [b, d, t] -> [b, n_h, t, d_k]
150 | b, d, t_s, t_t = (*key.size(), query.size(2))
151 | query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152 | key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153 | value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154 |
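# Scaled dot-product attention over the reshaped heads; relative-position logits are added to these scores when window_size is set.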
155 | scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156 | if self.window_size is not None:
157 | assert t_s == t_t, "Relative attention is only available for self-attention."
158 | key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159 | rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
160 | scores_local = self._relative_position_to_absolute_position(rel_logits)
161 | scores = scores + scores_local
162 | if self.proximal_bias:
163 | assert t_s == t_t, "Proximal bias is only available for self-attention."
164 | scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165 | if mask is not None:
166 | scores = scores.masked_fill(mask == 0, -1e4)
167 | if self.block_length is not None:
168 | assert t_s == t_t, "Local attention is only available for self-attention."
169 | block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170 | scores = scores.masked_fill(block_mask == 0, -1e4)
171 | p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172 | p_attn = self.drop(p_attn)
173 | output = torch.matmul(p_attn, value)
174 | if self.window_size is not None:
175 | relative_weights = self._absolute_position_to_relative_position(p_attn)
176 | value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177 | output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178 | output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179 | return output, p_attn
180 |
181 | def _matmul_with_relative_values(self, x, y):
182 | """
183 | x: [b, h, l, m]
184 | y: [h or 1, m, d]
185 | ret: [b, h, l, d]
186 | """
187 | ret = torch.matmul(x, y.unsqueeze(0))
188 | return ret
189 |
190 | def _matmul_with_relative_keys(self, x, y):
191 | """
192 | x: [b, h, l, d]
193 | y: [h or 1, m, d]
194 | ret: [b, h, l, m]
195 | """
196 | ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197 | return ret
198 |
199 | def _get_relative_embeddings(self, relative_embeddings, length):
200 | max_relative_position = 2 * self.window_size + 1
201 | # Pad first before slice to avoid using cond ops.
202 | pad_length = max(length - (self.window_size + 1), 0)
203 | slice_start_position = max((self.window_size + 1) - length, 0)
204 | slice_end_position = slice_start_position + 2 * length - 1
205 | if pad_length > 0:
206 | padded_relative_embeddings = F.pad(
207 | relative_embeddings,
208 | commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209 | else:
210 | padded_relative_embeddings = relative_embeddings
211 | used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212 | return used_relative_embeddings
213 |
214 | def _relative_position_to_absolute_position(self, x):
215 | """
216 | x: [b, h, l, 2*l-1]
217 | ret: [b, h, l, l]
218 | """
219 | batch, heads, length, _ = x.size()
220 | # Concat columns of pad to shift from relative to absolute indexing.
221 | x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222 |
223 | # Concat extra elements so the flat tensor adds up to shape (len+1, 2*len-1).
224 | x_flat = x.view([batch, heads, length * 2 * length])
225 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226 |
227 | # Reshape and slice out the padded elements.
228 | x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229 | return x_final
230 |
231 | def _absolute_position_to_relative_position(self, x):
232 | """
233 | x: [b, h, l, l]
234 | ret: [b, h, l, 2*l-1]
235 | """
236 | batch, heads, length, _ = x.size()
237 | # pad along column
238 | x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239 | x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240 | # add 0's in the beginning that will skew the elements after reshape
241 | x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242 | x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243 | return x_final
244 |
245 | def _attention_bias_proximal(self, length):
246 | """Bias for self-attention to encourage attention to close positions.
247 | Args:
248 | length: an integer scalar.
249 | Returns:
250 | a Tensor with shape [1, 1, length, length]
251 | """
252 | r = torch.arange(length, dtype=torch.float32)
253 | diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254 | return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255 |
256 |
257 | class FFN(nn.Module):
258 | def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259 | super().__init__()
260 | self.in_channels = in_channels
261 | self.out_channels = out_channels
262 | self.filter_channels = filter_channels
263 | self.kernel_size = kernel_size
264 | self.p_dropout = p_dropout
265 | self.activation = activation
266 | self.causal = causal
267 |
268 | if causal:
269 | self.padding = self._causal_padding
270 | else:
271 | self.padding = self._same_padding
272 |
273 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274 | self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275 | self.drop = nn.Dropout(p_dropout)
276 |
277 | def forward(self, x, x_mask):
278 | x = self.conv_1(self.padding(x * x_mask))
279 | if self.activation == "gelu":
280 | x = x * torch.sigmoid(1.702 * x)  # sigmoid approximation of GELU
281 | else:
282 | x = torch.relu(x)
283 | x = self.drop(x)
284 | x = self.conv_2(self.padding(x * x_mask))
285 | return x * x_mask
286 |
287 | def _causal_padding(self, x):
288 | if self.kernel_size == 1:
289 | return x
290 | pad_l = self.kernel_size - 1
291 | pad_r = 0
292 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293 | x = F.pad(x, commons.convert_pad_shape(padding))
294 | return x
295 |
296 | def _same_padding(self, x):
297 | if self.kernel_size == 1:
298 | return x
299 | pad_l = (self.kernel_size - 1) // 2
300 | pad_r = self.kernel_size // 2
301 | padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302 | x = F.pad(x, commons.convert_pad_shape(padding))
303 | return x
304 |
--------------------------------------------------------------------------------
/train_latest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | import itertools
5 | import math
6 | import torch
7 | from torch import nn, optim
8 | from torch.nn import functional as F
9 | from torch.utils.data import DataLoader
10 | from torch.utils.tensorboard import SummaryWriter
11 | import torch.multiprocessing as mp
12 | import torch.distributed as dist
13 | from torch.nn.parallel import DistributedDataParallel as DDP
14 | from torch.cuda.amp import autocast, GradScaler
15 | from pqmf import PQMF
16 |
17 | import commons
18 | import utils
19 | from data_utils import (
20 | TextAudioLoader,
21 | TextAudioCollate,
22 | DistributedBucketSampler
23 | )
24 | from models import (
25 | SynthesizerTrn,
26 | MultiPeriodDiscriminator,
27 | )
28 | from losses import (
29 | generator_loss,
30 | discriminator_loss,
31 | feature_loss,
32 | kl_loss,
33 | subband_stft_loss
34 | )
35 | from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
36 | from text.symbols import symbols
37 |
38 | torch.autograd.set_detect_anomaly(True)  # anomaly detection slows training noticeably; enable only while debugging
39 | torch.backends.cudnn.benchmark = True
40 | global_step = 0
41 |
42 |
43 | def main():
44 | """Assume Single Node Multi GPUs Training Only"""
45 | assert torch.cuda.is_available(), "CPU training is not allowed."
46 |
47 | n_gpus = torch.cuda.device_count()
48 | os.environ['MASTER_ADDR'] = 'localhost'
49 | os.environ['MASTER_PORT'] = '65520'
50 | # n_gpus = 1
51 |
52 | hps = utils.get_hparams()
53 | mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
54 |
55 |
56 | def run(rank, n_gpus, hps):
57 | global global_step
58 | if rank == 0:
59 | logger = utils.get_logger(hps.model_dir)
60 | logger.info(hps)
61 | utils.check_git_hash(hps.model_dir)
62 | writer = SummaryWriter(log_dir=hps.model_dir)
63 | writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
64 |
65 | dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
66 | torch.manual_seed(hps.train.seed)
67 | torch.cuda.set_device(rank)
68 |
69 | train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
70 | train_sampler = DistributedBucketSampler(
71 | train_dataset,
72 | hps.train.batch_size,
73 | [32,300,400,500,600,700,800,900,1000],
74 | num_replicas=n_gpus,
75 | rank=rank,
76 | shuffle=True)
77 | collate_fn = TextAudioCollate()
78 | train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
79 | collate_fn=collate_fn, batch_sampler=train_sampler)
80 | if rank == 0:
81 | eval_dataset = TextAudioLoader(hps.data.validation_files, hps.data)
82 | eval_loader = DataLoader(eval_dataset, num_workers=1, shuffle=False,
83 | batch_size=hps.train.batch_size, pin_memory=True,
84 | drop_last=False, collate_fn=collate_fn)
85 |
86 | net_g = SynthesizerTrn(
87 | len(symbols),
88 | hps.data.filter_length // 2 + 1,
89 | hps.train.segment_size // hps.data.hop_length,
90 | **hps.model).cuda(rank)
91 | net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
92 | optim_g = torch.optim.AdamW(
93 | net_g.parameters(),
94 | hps.train.learning_rate,
95 | betas=hps.train.betas,
96 | eps=hps.train.eps)
97 | optim_d = torch.optim.AdamW(
98 | net_d.parameters(),
99 | hps.train.learning_rate,
100 | betas=hps.train.betas,
101 | eps=hps.train.eps)
102 | net_g = DDP(net_g, device_ids=[rank])
103 | net_d = DDP(net_d, device_ids=[rank])
104 |
105 | try:
106 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g)
107 | _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d)
108 | global_step = (epoch_str - 1) * len(train_loader)
109 | except Exception:
110 | epoch_str = 1
111 | global_step = 0
112 |
113 | scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
114 | scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str-2)
115 |
116 | scaler = GradScaler(enabled=hps.train.fp16_run)
117 |
118 | for epoch in range(epoch_str, hps.train.epochs + 1):
119 | if rank==0:
120 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, eval_loader], logger, [writer, writer_eval])
121 | else:
122 | train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler, [train_loader, None], None, None)
123 | scheduler_g.step()
124 | scheduler_d.step()
125 |
126 |
127 |
128 | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
129 | net_g, net_d = nets
130 | optim_g, optim_d = optims
131 | scheduler_g, scheduler_d = schedulers
132 | train_loader, eval_loader = loaders
133 | if writers is not None:
134 | writer, writer_eval = writers
135 |
136 | train_loader.batch_sampler.set_epoch(epoch)
137 | global global_step
138 |
139 | net_g.train()
140 | net_d.train()
141 | for batch_idx, (x, x_lengths, complx, complx_lengths, y, y_lengths) in enumerate(train_loader):
142 | x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
143 | complx, complx_lengths = complx.cuda(rank, non_blocking=True), complx_lengths.cuda(rank, non_blocking=True)
144 | y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
145 |
146 | with autocast(enabled=hps.train.fp16_run):
147 | y_hat, y_hat_mb, l_length, attn, ids_slice, x_mask, z_mask,\
148 | (z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, complx, complx_lengths)
149 |
150 | '''
151 | mel = spec_to_mel_torch(
152 | spec,
153 | hps.data.filter_length,
154 | hps.data.n_mel_channels,
155 | hps.data.sampling_rate,
156 | hps.data.mel_fmin,
157 | hps.data.mel_fmax)
158 | y_mel = commons.slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
159 | '''
160 | y_hat_mel = mel_spectrogram_torch(
161 | y_hat.squeeze(1),
162 | hps.data.filter_length,
163 | hps.data.n_mel_channels,
164 | hps.data.sampling_rate,
165 | hps.data.hop_length,
166 | hps.data.win_length,
167 | hps.data.mel_fmin,
168 | hps.data.mel_fmax
169 | )
170 |
171 | y_all = y # unsliced waveform, used only for plotting
172 | y = commons.slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
173 | y_mel = mel_spectrogram_torch(
174 | y.squeeze(1),
175 | hps.data.filter_length,
176 | hps.data.n_mel_channels,
177 | hps.data.sampling_rate,
178 | hps.data.hop_length,
179 | hps.data.win_length,
180 | hps.data.mel_fmin,
181 | hps.data.mel_fmax
182 | )
183 |
184 | # Discriminator
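# y_hat is detached here so this backward pass updates only the discriminator, not the generator.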
185 | y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
186 | with autocast(enabled=False):
187 | loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
188 | loss_disc_all = loss_disc
189 | optim_d.zero_grad()
190 | scaler.scale(loss_disc_all).backward()
191 | scaler.unscale_(optim_d)
192 | grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
193 | scaler.step(optim_d)
194 |
195 |
196 |
197 |
198 | with autocast(enabled=hps.train.fp16_run):
199 | # Generator
200 | y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
201 | with autocast(enabled=False):
202 | loss_dur = torch.sum(l_length.float())
203 | loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
204 | loss_wav = F.mse_loss(y, y_hat) * hps.train.c_wav # Time-Domain MSE Loss
205 | loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
206 |
207 | loss_fm = feature_loss(fmap_r, fmap_g)
208 | loss_gen, losses_gen = generator_loss(y_d_hat_g)
209 |
210 | if hps.model.mb_istft_vits:
211 | pqmf = PQMF(y.device)
212 | y_mb = pqmf.analysis(y)
213 | loss_subband = subband_stft_loss(hps, y_mb, y_hat_mb)
214 | else:
215 | loss_subband = torch.tensor(0.0)
216 |
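# Total generator objective: adversarial + feature-matching + mel + time-domain MSE + duration + KL (+ sub-band STFT for the multi-band model).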
217 | loss_gen_all = loss_gen + loss_fm + loss_mel + loss_wav + loss_dur + loss_kl + loss_subband
218 |
219 | optim_g.zero_grad()
220 | scaler.scale(loss_gen_all).backward()
221 | scaler.unscale_(optim_g)
222 | grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
223 | scaler.step(optim_g)
224 | scaler.update()
225 |
226 | if rank==0:
227 | if global_step % hps.train.log_interval == 0:
228 | lr = optim_g.param_groups[0]['lr']
229 | losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_wav, loss_dur, loss_kl, loss_subband]
230 | logger.info('Train Epoch: {} [{:.0f}%]'.format(
231 | epoch,
232 | 100. * batch_idx / len(train_loader)))
233 | logger.info([x.item() for x in losses] + [global_step, lr])
234 |
235 | scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
236 | scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/wav": loss_wav, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl, "loss/g/subband": loss_subband})
237 |
238 | scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
239 | scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
240 | scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
241 | mel = mel_spectrogram_torch(
242 | y_all.squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate,
243 | hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax
244 | )
245 | image_dict = {
246 | "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
247 | "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
248 | "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
249 | "all/attn": utils.plot_alignment_to_numpy(attn[0,0].data.cpu().numpy())
250 | }
251 | utils.summarize(
252 | writer=writer,
253 | global_step=global_step,
254 | images=image_dict,
255 | scalars=scalar_dict)
256 |
257 | if global_step % hps.train.eval_interval == 0:
258 | evaluate(hps, net_g, eval_loader, writer_eval)
259 | utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
260 | utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
261 | global_step += 1
262 |
263 |
264 | if rank == 0:
265 | logger.info('====> Epoch: {}'.format(epoch))
266 |
267 |
268 |
269 |
270 | def evaluate(hps, generator, eval_loader, writer_eval):
271 | generator.eval()
272 | with torch.no_grad():
273 | for batch_idx, (x, x_lengths, complx, complx_lengths, y, y_lengths) in enumerate(eval_loader):
274 | x, x_lengths = x.cuda(0), x_lengths.cuda(0)
275 | complx, complx_lengths = complx.cuda(0), complx_lengths.cuda(0)
276 | y, y_lengths = y.cuda(0), y_lengths.cuda(0)
277 |
278 | # evaluate only the first sample of the first batch
279 | x = x[:1]
280 | x_lengths = x_lengths[:1]
281 | complx = complx[:1]
282 | complx_lengths = complx_lengths[:1]
283 | y = y[:1]
284 | y_lengths = y_lengths[:1]
285 | break
286 | y_hat, y_hat_mb, attn, mask, *_ = generator.module.infer(x, x_lengths, max_len=1000)
287 | y_hat_lengths = mask.sum([1,2]).long() * hps.data.hop_length
288 |
289 | mel = mel_spectrogram_torch(
290 | y.squeeze(1), hps.data.filter_length, hps.data.n_mel_channels, hps.data.sampling_rate,
291 | hps.data.hop_length, hps.data.win_length, hps.data.mel_fmin, hps.data.mel_fmax
292 | )
293 | y_hat_mel = mel_spectrogram_torch(
294 | y_hat.squeeze(1).float(),
295 | hps.data.filter_length,
296 | hps.data.n_mel_channels,
297 | hps.data.sampling_rate,
298 | hps.data.hop_length,
299 | hps.data.win_length,
300 | hps.data.mel_fmin,
301 | hps.data.mel_fmax
302 | )
303 | image_dict = {
304 | "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
305 | }
306 | audio_dict = {
307 | "gen/audio": y_hat[0,:,:y_hat_lengths[0]]
308 | }
309 | if global_step == 0:
310 | image_dict.update({"gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
311 | audio_dict.update({"gt/audio": y[0,:,:y_lengths[0]]})
312 |
313 | utils.summarize(
314 | writer=writer_eval,
315 | global_step=global_step,
316 | images=image_dict,
317 | audios=audio_dict,
318 | audio_sampling_rate=hps.data.sampling_rate
319 | )
320 | generator.train()
321 |
322 |
323 | if __name__ == "__main__":
324 | os.environ[
325 | "TORCH_DISTRIBUTED_DEBUG"
326 | ] = "DETAIL"
327 | main()
328 |
--------------------------------------------------------------------------------
/modules.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import numpy as np
4 | import scipy
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 |
9 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10 | from torch.nn.utils import weight_norm, remove_weight_norm
11 |
12 | import commons
13 | from commons import init_weights, get_padding
14 | from transforms import piecewise_rational_quadratic_transform
15 |
16 |
17 | LRELU_SLOPE = 0.1
18 |
19 |
20 | class LayerNorm(nn.Module):
21 | def __init__(self, channels, eps=1e-5):
22 | super().__init__()
23 | self.channels = channels
24 | self.eps = eps
25 |
26 | self.gamma = nn.Parameter(torch.ones(channels))
27 | self.beta = nn.Parameter(torch.zeros(channels))
28 |
29 | def forward(self, x):
30 | x = x.transpose(1, -1)
31 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32 | return x.transpose(1, -1)
33 |
34 |
35 | class ConvReluNorm(nn.Module):
36 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
37 | super().__init__()
38 | self.in_channels = in_channels
39 | self.hidden_channels = hidden_channels
40 | self.out_channels = out_channels
41 | self.kernel_size = kernel_size
42 | self.n_layers = n_layers
43 | self.p_dropout = p_dropout
44 | assert n_layers > 1, "Number of layers should be larger than 1."
45 |
46 | self.conv_layers = nn.ModuleList()
47 | self.norm_layers = nn.ModuleList()
48 | self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
49 | self.norm_layers.append(LayerNorm(hidden_channels))
50 | self.relu_drop = nn.Sequential(
51 | nn.ReLU(),
52 | nn.Dropout(p_dropout))
53 | for _ in range(n_layers-1):
54 | self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
55 | self.norm_layers.append(LayerNorm(hidden_channels))
56 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
57 | self.proj.weight.data.zero_()
58 | self.proj.bias.data.zero_()
59 |
60 | def forward(self, x, x_mask):
61 | x_org = x
62 | for i in range(self.n_layers):
63 | x = self.conv_layers[i](x * x_mask)
64 | x = self.norm_layers[i](x)
65 | x = self.relu_drop(x)
66 | x = x_org + self.proj(x)
67 | return x * x_mask
68 |
69 |
70 | class DDSConv(nn.Module):
71 | """
72 | Dilated and Depth-Separable Convolution
73 | """
74 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
75 | super().__init__()
76 | self.channels = channels
77 | self.kernel_size = kernel_size
78 | self.n_layers = n_layers
79 | self.p_dropout = p_dropout
80 |
81 | self.drop = nn.Dropout(p_dropout)
82 | self.convs_sep = nn.ModuleList()
83 | self.convs_1x1 = nn.ModuleList()
84 | self.norms_1 = nn.ModuleList()
85 | self.norms_2 = nn.ModuleList()
86 | for i in range(n_layers):
87 | dilation = kernel_size ** i
88 | padding = (kernel_size * dilation - dilation) // 2
89 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
90 | groups=channels, dilation=dilation, padding=padding
91 | ))
92 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
93 | self.norms_1.append(LayerNorm(channels))
94 | self.norms_2.append(LayerNorm(channels))
95 |
96 | def forward(self, x, x_mask, g=None):
97 | if g is not None:
98 | x = x + g
99 | for i in range(self.n_layers):
100 | y = self.convs_sep[i](x * x_mask)
101 | y = self.norms_1[i](y)
102 | y = F.gelu(y)
103 | y = self.convs_1x1[i](y)
104 | y = self.norms_2[i](y)
105 | y = F.gelu(y)
106 | y = self.drop(y)
107 | x = x + y
108 | return x * x_mask
109 |
110 |
111 | class WN(torch.nn.Module):
112 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
113 | super(WN, self).__init__()
114 | assert(kernel_size % 2 == 1)
115 | self.hidden_channels = hidden_channels
116 | self.kernel_size = kernel_size
117 | self.dilation_rate = dilation_rate
118 | self.n_layers = n_layers
119 | self.gin_channels = gin_channels
120 | self.p_dropout = p_dropout
121 |
122 | self.in_layers = torch.nn.ModuleList()
123 | self.res_skip_layers = torch.nn.ModuleList()
124 | self.drop = nn.Dropout(p_dropout)
125 |
126 | if gin_channels != 0:
127 | cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
128 | self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
129 |
130 | for i in range(n_layers):
131 | dilation = dilation_rate ** i
132 | padding = int((kernel_size * dilation - dilation) / 2)
133 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
134 | dilation=dilation, padding=padding)
135 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
136 | self.in_layers.append(in_layer)
137 |
138 | # last one is not necessary
139 | if i < n_layers - 1:
140 | res_skip_channels = 2 * hidden_channels
141 | else:
142 | res_skip_channels = hidden_channels
143 |
144 | res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
145 | res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
146 | self.res_skip_layers.append(res_skip_layer)
147 |
148 | def forward(self, x, x_mask, g=None, **kwargs):
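# WaveNet-style stack: each layer applies a gated activation (with optional conditioning g) and contributes residual and skip outputs; the skips are summed into the output tensor.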
149 | output = torch.zeros_like(x)
150 | n_channels_tensor = torch.IntTensor([self.hidden_channels])
151 |
152 | if g is not None:
153 | g = self.cond_layer(g)
154 |
155 | for i in range(self.n_layers):
156 | x_in = self.in_layers[i](x)
157 | if g is not None:
158 | cond_offset = i * 2 * self.hidden_channels
159 | g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
160 | else:
161 | g_l = torch.zeros_like(x_in)
162 |
163 | acts = commons.fused_add_tanh_sigmoid_multiply(
164 | x_in,
165 | g_l,
166 | n_channels_tensor)
167 | acts = self.drop(acts)
168 |
169 | res_skip_acts = self.res_skip_layers[i](acts)
170 | if i < self.n_layers - 1:
171 | res_acts = res_skip_acts[:,:self.hidden_channels,:]
172 | x = (x + res_acts) * x_mask
173 | output = output + res_skip_acts[:,self.hidden_channels:,:]
174 | else:
175 | output = output + res_skip_acts
176 | return output * x_mask
177 |
178 | def remove_weight_norm(self):
179 | if self.gin_channels != 0:
180 | torch.nn.utils.remove_weight_norm(self.cond_layer)
181 | for l in self.in_layers:
182 | torch.nn.utils.remove_weight_norm(l)
183 | for l in self.res_skip_layers:
184 | torch.nn.utils.remove_weight_norm(l)
185 |
186 |
187 | class ResBlock1(torch.nn.Module):
188 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
189 | super(ResBlock1, self).__init__()
190 | self.convs1 = nn.ModuleList([
191 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
192 | padding=get_padding(kernel_size, dilation[0]))),
193 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
194 | padding=get_padding(kernel_size, dilation[1]))),
195 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
196 | padding=get_padding(kernel_size, dilation[2])))
197 | ])
198 | self.convs1.apply(init_weights)
199 |
200 | self.convs2 = nn.ModuleList([
201 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
202 | padding=get_padding(kernel_size, 1))),
203 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
204 | padding=get_padding(kernel_size, 1))),
205 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
206 | padding=get_padding(kernel_size, 1)))
207 | ])
208 | self.convs2.apply(init_weights)
209 |
210 | def forward(self, x, x_mask=None):
211 | for c1, c2 in zip(self.convs1, self.convs2):
212 | xt = F.leaky_relu(x, LRELU_SLOPE)
213 | if x_mask is not None:
214 | xt = xt * x_mask
215 | xt = c1(xt)
216 | xt = F.leaky_relu(xt, LRELU_SLOPE)
217 | if x_mask is not None:
218 | xt = xt * x_mask
219 | xt = c2(xt)
220 | x = xt + x
221 | if x_mask is not None:
222 | x = x * x_mask
223 | return x
224 |
225 | def remove_weight_norm(self):
226 | for l in self.convs1:
227 | remove_weight_norm(l)
228 | for l in self.convs2:
229 | remove_weight_norm(l)
230 |
231 |
232 | class ResBlock2(torch.nn.Module):
233 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
234 | super(ResBlock2, self).__init__()
235 | self.convs = nn.ModuleList([
236 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
237 | padding=get_padding(kernel_size, dilation[0]))),
238 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
239 | padding=get_padding(kernel_size, dilation[1])))
240 | ])
241 | self.convs.apply(init_weights)
242 |
243 | def forward(self, x, x_mask=None):
244 | for c in self.convs:
245 | xt = F.leaky_relu(x, LRELU_SLOPE)
246 | if x_mask is not None:
247 | xt = xt * x_mask
248 | xt = c(xt)
249 | x = xt + x
250 | if x_mask is not None:
251 | x = x * x_mask
252 | return x
253 |
254 | def remove_weight_norm(self):
255 | for l in self.convs:
256 | remove_weight_norm(l)
257 |
258 |
259 | class Log(nn.Module):
260 | def forward(self, x, x_mask, reverse=False, **kwargs):
261 | if not reverse:
262 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
263 | logdet = torch.sum(-y, [1, 2])
264 | return y, logdet
265 | else:
266 | x = torch.exp(x) * x_mask
267 | return x
268 |
269 |
270 | class Flip(nn.Module):
271 | def forward(self, x, *args, reverse=False, **kwargs):
272 | x = torch.flip(x, [1])
273 | if not reverse:
274 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
275 | return x, logdet
276 | else:
277 | return x
278 |
279 |
280 | class ElementwiseAffine(nn.Module):
281 | def __init__(self, channels):
282 | super().__init__()
283 | self.channels = channels
284 | self.m = nn.Parameter(torch.zeros(channels,1))
285 | self.logs = nn.Parameter(torch.zeros(channels,1))
286 |
287 | def forward(self, x, x_mask, reverse=False, **kwargs):
288 | if not reverse:
289 | y = self.m + torch.exp(self.logs) * x
290 | y = y * x_mask
291 | logdet = torch.sum(self.logs * x_mask, [1,2])
292 | return y, logdet
293 | else:
294 | x = (x - self.m) * torch.exp(-self.logs) * x_mask
295 | return x
296 |
297 |
298 | class ResidualCouplingLayer(nn.Module):
299 | def __init__(self,
300 | channels,
301 | hidden_channels,
302 | kernel_size,
303 | dilation_rate,
304 | n_layers,
305 | p_dropout=0,
306 | gin_channels=0,
307 | mean_only=False):
308 | assert channels % 2 == 0, "channels should be divisible by 2"
309 | super().__init__()
310 | self.channels = channels
311 | self.hidden_channels = hidden_channels
312 | self.kernel_size = kernel_size
313 | self.dilation_rate = dilation_rate
314 | self.n_layers = n_layers
315 | self.half_channels = channels // 2
316 | self.mean_only = mean_only
317 |
318 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
319 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
320 | self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
321 | self.post.weight.data.zero_()
322 | self.post.bias.data.zero_()
323 |
324 | def forward(self, x, x_mask, g=None, reverse=False):
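# Affine coupling: x0 passes through unchanged and conditions the shift/scale applied to x1, so the flow stays exactly invertible in reverse mode.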
325 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
326 | h = self.pre(x0) * x_mask
327 | h = self.enc(h, x_mask, g=g)
328 | stats = self.post(h) * x_mask
329 | if not self.mean_only:
330 | m, logs = torch.split(stats, [self.half_channels]*2, 1)
331 | else:
332 | m = stats
333 | logs = torch.zeros_like(m)
334 |
335 | if not reverse:
336 | x1 = m + x1 * torch.exp(logs) * x_mask
337 | x = torch.cat([x0, x1], 1)
338 | logdet = torch.sum(logs, [1,2])
339 | return x, logdet
340 | else:
341 | x1 = (x1 - m) * torch.exp(-logs) * x_mask
342 | x = torch.cat([x0, x1], 1)
343 | return x
344 |
345 |
346 | class ConvFlow(nn.Module):
347 | def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
348 | super().__init__()
349 | self.in_channels = in_channels
350 | self.filter_channels = filter_channels
351 | self.kernel_size = kernel_size
352 | self.n_layers = n_layers
353 | self.num_bins = num_bins
354 | self.tail_bound = tail_bound
355 | self.half_channels = in_channels // 2
356 |
357 | self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
358 | self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
359 | self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
360 | self.proj.weight.data.zero_()
361 | self.proj.bias.data.zero_()
362 |
363 | def forward(self, x, x_mask, g=None, reverse=False):
364 | x0, x1 = torch.split(x, [self.half_channels]*2, 1)
365 | h = self.pre(x0)
366 | h = self.convs(h, x_mask, g=g)
367 | h = self.proj(h) * x_mask
368 |
369 | b, c, t = x0.shape
370 | h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
371 |
372 | unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
373 | unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
374 | unnormalized_derivatives = h[..., 2 * self.num_bins:]
375 |
376 | x1, logabsdet = piecewise_rational_quadratic_transform(x1,
377 | unnormalized_widths,
378 | unnormalized_heights,
379 | unnormalized_derivatives,
380 | inverse=reverse,
381 | tails='linear',
382 | tail_bound=self.tail_bound
383 | )
384 |
385 | x = torch.cat([x0, x1], 1) * x_mask
386 | logdet = torch.sum(logabsdet * x_mask, [1,2])
387 | if not reverse:
388 | return x, logdet
389 | else:
390 | return x
391 |
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import random
4 | import numpy as np
5 | import torch
6 | import torch.utils.data
7 |
8 | import commons
9 | from mel_processing import spectrogram_torch, complx_torch
10 | from utils import load_wav_to_torch, load_filepaths_and_text
11 | from text import text_to_sequence, cleaned_text_to_sequence
12 |
13 |
14 | class TextAudioLoader(torch.utils.data.Dataset):
15 | """
16 | 1) loads audio, text pairs
 17 |     2) normalizes text and converts it to sequences of integers
 18 |     3) computes 4 complex spectrogram components from the audio files.
19 | """
20 | def __init__(self, audiopaths_and_text, hparams):
21 | self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
22 | self.text_cleaners = hparams.text_cleaners
23 | self.max_wav_value = hparams.max_wav_value
24 | self.sampling_rate = hparams.sampling_rate
25 | self.filter_length = hparams.filter_length
26 | self.hop_length = hparams.hop_length
27 | self.win_length = hparams.win_length
28 | self.sampling_rate = hparams.sampling_rate
29 |
30 | self.cleaned_text = getattr(hparams, "cleaned_text", False)
31 |
32 | self.add_blank = hparams.add_blank
33 | self.min_text_len = getattr(hparams, "min_text_len", 1)
34 | self.max_text_len = getattr(hparams, "max_text_len", 190)
35 |
36 | random.seed(1234)
37 | random.shuffle(self.audiopaths_and_text)
38 | self._filter()
39 |
40 |
41 | def _filter(self):
42 | """
 43 |         Filter text & store complex component lengths
44 | """
45 | # Store spectrogram lengths for Bucketing
46 | # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
47 | # spec_length = wav_length // hop_length
48 |
49 | audiopaths_and_text_new = []
50 | lengths = []
51 | for audiopath, text in self.audiopaths_and_text:
 52 |             if self.min_text_len <= len(text) <= self.max_text_len:
53 | audiopaths_and_text_new.append([audiopath, text])
54 | lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
55 | self.audiopaths_and_text = audiopaths_and_text_new
56 | self.lengths = lengths
57 |
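    # Editor's note: the arithmetic behind the estimate above, as a tiny sketch
    # (the helper and its numbers are invented). For 16-bit mono PCM each sample is
    # 2 bytes, so file_size // 2 approximates the sample count (the small WAV header
    # is ignored) and dividing by hop_length gives a frame count for bucketing.
    @staticmethod
    def _demo_length_estimate(file_size_bytes=2_000_044, hop_length=256):
        approx_samples = file_size_bytes // 2          # ~1,000,022 samples
        return approx_samples // hop_length            # ~3906 frames
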
58 | def get_audio_text_pair(self, audiopath_and_text):
59 | # separate filename and text
60 | audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
61 | text = self.get_text(text)
62 | complx, wav = self.get_audio(audiopath)
63 | return (text, complx, wav)
64 |
65 | def get_audio(self, filename):
66 | audio, sampling_rate = load_wav_to_torch(filename)
67 | if sampling_rate != self.sampling_rate:
 69 |             raise ValueError("{} SR doesn't match target {} SR".format(
69 | sampling_rate, self.sampling_rate))
70 | audio_norm = audio / self.max_wav_value
71 | audio_norm = audio_norm.unsqueeze(0)
72 | # spec_filename = filename.replace(".wav", ".spec.pt")
73 | complx_filename = filename.replace(".wav", ".complx.pt")
74 | if os.path.exists(complx_filename):
75 | complx = torch.load(complx_filename)
76 | else:
77 | complx = complx_torch(audio_norm, self.filter_length,
78 | self.sampling_rate, self.hop_length, self.win_length,
79 | center=False)
80 | complx = torch.squeeze(complx, 0) #(4, N, T)
81 | torch.save(complx, complx_filename)
82 | return complx, audio_norm
83 |
84 | def get_text(self, text):
85 | if self.cleaned_text:
86 | text_norm = cleaned_text_to_sequence(text)
87 | else:
88 | text_norm = text_to_sequence(text, self.text_cleaners)
89 | if self.add_blank:
90 | text_norm = commons.intersperse(text_norm, 0)
91 | text_norm = torch.LongTensor(text_norm)
92 | return text_norm
93 |
94 | def __getitem__(self, index):
95 | return self.get_audio_text_pair(self.audiopaths_and_text[index])
96 |
97 | def __len__(self):
98 | return len(self.audiopaths_and_text)
99 |
100 |
101 | class TextAudioCollate():
102 | """ Zero-pads model inputs and targets
103 | """
104 | def __init__(self, return_ids=False):
105 | self.return_ids = return_ids
106 |
107 | def __call__(self, batch):
108 |         """Collates a training batch from normalized text and audio
109 | PARAMS
110 | ------
111 | batch: [text_normalized, complx_normalized, wav_normalized]
112 | """
113 | # Right zero-pad all one-hot text sequences to max input length
114 | _, ids_sorted_decreasing = torch.sort(
115 |             torch.LongTensor([x[1].size(2) for x in batch]),  # sort by the time axis of the (4, N, T) complx tensor
116 | dim=0, descending=True)
117 |
118 | max_text_len = max([len(x[0]) for x in batch])
119 | max_complx_len = max([x[1].size(2) for x in batch]) # (4, N, T) -> T
120 | max_wav_len = max([x[2].size(1) for x in batch])
121 |
122 | text_lengths = torch.LongTensor(len(batch))
123 | complx_lengths = torch.LongTensor(len(batch))
124 | wav_lengths = torch.LongTensor(len(batch))
125 |
126 | text_padded = torch.LongTensor(len(batch), max_text_len)
127 | complx_padded = torch.FloatTensor(len(batch), 4, batch[0][1].size(1), max_complx_len) # (B, 4, N, max_T)
128 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
129 | text_padded.zero_()
130 | complx_padded.zero_()
131 | wav_padded.zero_()
132 | for i in range(len(ids_sorted_decreasing)):
133 | row = batch[ids_sorted_decreasing[i]]
134 |
135 | text = row[0]
136 | text_padded[i, :text.size(0)] = text
137 | text_lengths[i] = text.size(0)
138 |
139 | complx = row[1]
140 | complx_padded[i, :, :, :complx.size(2)] = complx
141 | complx_lengths[i] = complx.size(2)
142 |
143 | wav = row[2]
144 | wav_padded[i, :, :wav.size(1)] = wav
145 | wav_lengths[i] = wav.size(1)
146 |
147 | if self.return_ids:
148 | return text_padded, text_lengths, complx_padded, complx_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
149 | return text_padded, text_lengths, complx_padded, complx_lengths, wav_padded, wav_lengths
150 |
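# Editor's note: a minimal sketch of the zero-padding pattern used by the collate
# functions in this file (the helper name and tensors are invented). Every field is
# padded to the longest item in the batch, and the true lengths are returned
# separately so padding can be masked out downstream.
def _demo_zero_padding():
    batch = [torch.LongTensor([1, 2, 3]), torch.LongTensor([4, 5])]
    max_len = max(t.size(0) for t in batch)
    padded = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, t in enumerate(batch):
        padded[i, :t.size(0)] = t
    return padded  # tensor([[1, 2, 3], [4, 5, 0]])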
151 |
152 | """Multi speaker version"""
153 | class TextAudioSpeakerLoader(torch.utils.data.Dataset):
154 | """
155 | 1) loads audio, speaker_id, text pairs
156 |     2) normalizes text and converts it to sequences of integers
157 | 3) computes spectrograms from audio files.
158 | """
159 | def __init__(self, audiopaths_sid_text, hparams):
160 | self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
161 | self.text_cleaners = hparams.text_cleaners
162 | self.max_wav_value = hparams.max_wav_value
163 | self.sampling_rate = hparams.sampling_rate
164 | self.filter_length = hparams.filter_length
165 | self.hop_length = hparams.hop_length
166 | self.win_length = hparams.win_length
167 | self.sampling_rate = hparams.sampling_rate
168 |
169 | self.cleaned_text = getattr(hparams, "cleaned_text", False)
170 |
171 | self.add_blank = hparams.add_blank
172 | self.min_text_len = getattr(hparams, "min_text_len", 1)
173 | self.max_text_len = getattr(hparams, "max_text_len", 190)
174 |
175 | random.seed(1234)
176 | random.shuffle(self.audiopaths_sid_text)
177 | self._filter()
178 |
179 | def _filter(self):
180 | """
181 | Filter text & store spec lengths
182 | """
183 | # Store spectrogram lengths for Bucketing
184 | # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
185 | # spec_length = wav_length // hop_length
186 |
187 | audiopaths_sid_text_new = []
188 | lengths = []
189 | for audiopath, sid, text in self.audiopaths_sid_text:
190 |             if self.min_text_len <= len(text) <= self.max_text_len:
191 | audiopaths_sid_text_new.append([audiopath, sid, text])
192 | lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
193 | self.audiopaths_sid_text = audiopaths_sid_text_new
194 | self.lengths = lengths
195 |
196 | def get_audio_text_speaker_pair(self, audiopath_sid_text):
197 | # separate filename, speaker_id and text
198 | audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
199 | text = self.get_text(text)
200 | spec, wav = self.get_audio(audiopath)
201 | sid = self.get_sid(sid)
202 | return (text, spec, wav, sid)
203 |
204 | def get_audio(self, filename):
205 | audio, sampling_rate = load_wav_to_torch(filename)
206 | if sampling_rate != self.sampling_rate:
207 |             raise ValueError("{} SR doesn't match target {} SR".format(
208 | sampling_rate, self.sampling_rate))
209 | audio_norm = audio / self.max_wav_value
210 | audio_norm = audio_norm.unsqueeze(0)
211 | spec_filename = filename.replace(".wav", ".spec.pt")
212 | if os.path.exists(spec_filename):
213 | spec = torch.load(spec_filename)
214 | else:
215 | spec = spectrogram_torch(audio_norm, self.filter_length,
216 | self.sampling_rate, self.hop_length, self.win_length,
217 | center=False)
218 | spec = torch.squeeze(spec, 0)
219 | torch.save(spec, spec_filename)
220 | return spec, audio_norm
221 |
222 | def get_text(self, text):
223 | if self.cleaned_text:
224 | text_norm = cleaned_text_to_sequence(text)
225 | else:
226 | text_norm = text_to_sequence(text, self.text_cleaners)
227 | if self.add_blank:
228 | text_norm = commons.intersperse(text_norm, 0)
229 | text_norm = torch.LongTensor(text_norm)
230 | return text_norm
231 |
232 | def get_sid(self, sid):
233 | sid = torch.LongTensor([int(sid)])
234 | return sid
235 |
236 | def __getitem__(self, index):
237 | return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
238 |
239 | def __len__(self):
240 | return len(self.audiopaths_sid_text)
241 |
242 |
243 | class TextAudioSpeakerCollate():
244 | """ Zero-pads model inputs and targets
245 | """
246 | def __init__(self, return_ids=False):
247 | self.return_ids = return_ids
248 |
249 | def __call__(self, batch):
250 |         """Collates a training batch from normalized text, audio and speaker identities
251 | PARAMS
252 | ------
253 | batch: [text_normalized, spec_normalized, wav_normalized, sid]
254 | """
255 | # Right zero-pad all one-hot text sequences to max input length
256 | _, ids_sorted_decreasing = torch.sort(
257 | torch.LongTensor([x[1].size(1) for x in batch]),
258 | dim=0, descending=True)
259 |
260 | max_text_len = max([len(x[0]) for x in batch])
261 | max_spec_len = max([x[1].size(1) for x in batch])
262 | max_wav_len = max([x[2].size(1) for x in batch])
263 |
264 | text_lengths = torch.LongTensor(len(batch))
265 | spec_lengths = torch.LongTensor(len(batch))
266 | wav_lengths = torch.LongTensor(len(batch))
267 | sid = torch.LongTensor(len(batch))
268 |
269 | text_padded = torch.LongTensor(len(batch), max_text_len)
270 | spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
271 | wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
272 | text_padded.zero_()
273 | spec_padded.zero_()
274 | wav_padded.zero_()
275 | for i in range(len(ids_sorted_decreasing)):
276 | row = batch[ids_sorted_decreasing[i]]
277 |
278 | text = row[0]
279 | text_padded[i, :text.size(0)] = text
280 | text_lengths[i] = text.size(0)
281 |
282 | spec = row[1]
283 | spec_padded[i, :, :spec.size(1)] = spec
284 | spec_lengths[i] = spec.size(1)
285 |
286 | wav = row[2]
287 | wav_padded[i, :, :wav.size(1)] = wav
288 | wav_lengths[i] = wav.size(1)
289 |
290 | sid[i] = row[3]
291 |
292 | if self.return_ids:
293 | return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
294 | return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
295 |
296 |
297 | class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
298 | """
299 | Maintain similar input lengths in a batch.
300 | Length groups are specified by boundaries.
301 |     Ex) boundaries = [b1, b2, b3] -> any batch is included in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
302 |
303 | It removes samples which are not included in the boundaries.
304 |     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 is discarded.
305 | """
306 | def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
307 | super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
308 | self.lengths = dataset.lengths
309 | self.batch_size = batch_size
310 | self.boundaries = boundaries
311 |
312 | self.buckets, self.num_samples_per_bucket = self._create_buckets()
313 | self.total_size = sum(self.num_samples_per_bucket)
314 | self.num_samples = self.total_size // self.num_replicas
315 |
316 | def _create_buckets(self):
317 | buckets = [[] for _ in range(len(self.boundaries) - 1)]
318 | for i in range(len(self.lengths)):
319 | length = self.lengths[i]
320 | idx_bucket = self._bisect(length)
321 | if idx_bucket != -1:
322 | buckets[idx_bucket].append(i)
323 |
324 | for i in range(len(buckets) - 1, 0, -1):
325 | if len(buckets[i]) == 0:
326 | buckets.pop(i)
327 | self.boundaries.pop(i+1)
328 |
329 | num_samples_per_bucket = []
330 | for i in range(len(buckets)):
331 | len_bucket = len(buckets[i])
332 | total_batch_size = self.num_replicas * self.batch_size
333 | rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
334 | num_samples_per_bucket.append(len_bucket + rem)
335 | return buckets, num_samples_per_bucket
336 |
337 | def __iter__(self):
338 | # deterministically shuffle based on epoch
339 | g = torch.Generator()
340 | g.manual_seed(self.epoch)
341 |
342 | indices = []
343 | if self.shuffle:
344 | for bucket in self.buckets:
345 | indices.append(torch.randperm(len(bucket), generator=g).tolist())
346 | else:
347 | for bucket in self.buckets:
348 | indices.append(list(range(len(bucket))))
349 |
350 | batches = []
351 | for i in range(len(self.buckets)):
352 | bucket = self.buckets[i]
353 | len_bucket = len(bucket)
354 | ids_bucket = indices[i]
355 | num_samples_bucket = self.num_samples_per_bucket[i]
356 |
357 | # add extra samples to make it evenly divisible
358 | rem = num_samples_bucket - len_bucket
359 | ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
360 |
361 | # subsample
362 | ids_bucket = ids_bucket[self.rank::self.num_replicas]
363 |
364 | # batching
365 | for j in range(len(ids_bucket) // self.batch_size):
366 | batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
367 | batches.append(batch)
368 |
369 | if self.shuffle:
370 | batch_ids = torch.randperm(len(batches), generator=g).tolist()
371 | batches = [batches[i] for i in batch_ids]
372 | self.batches = batches
373 |
374 | assert len(self.batches) * self.batch_size == self.num_samples
375 | return iter(self.batches)
376 |
377 | def _bisect(self, x, lo=0, hi=None):
378 | if hi is None:
379 | hi = len(self.boundaries) - 1
380 |
381 | if hi > lo:
382 | mid = (hi + lo) // 2
383 | if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
384 | return mid
385 | elif x <= self.boundaries[mid]:
386 | return self._bisect(x, lo, mid)
387 | else:
388 | return self._bisect(x, mid + 1, hi)
389 | else:
390 | return -1
391 |
392 | def __len__(self):
393 | return self.num_samples // self.batch_size
394 |
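# Editor's note: a worked example (values invented) of the bucket padding in
# _create_buckets above. Each bucket is grown by repeating its own indices until its
# size is a multiple of num_replicas * batch_size, so every rank draws the same
# number of full batches.
def _demo_bucket_padding(len_bucket=23, num_replicas=2, batch_size=4):
    total_batch_size = num_replicas * batch_size   # 8
    rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
    return len_bucket + rem                        # 23 -> 24, divisible by 8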
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import math
3 | import torch
4 | from torch import nn
5 | from torch.nn import functional as F
6 |
7 | import commons
8 | import modules
9 | import attentions
10 | import monotonic_align
11 |
12 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
13 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14 | from commons import init_weights, get_padding
15 | from pqmf import PQMF
16 | from stft import TorchSTFT
17 | import math
18 |
19 |
20 | class StochasticDurationPredictor(nn.Module):
21 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
22 | super().__init__()
23 |         filter_channels = in_channels  # NOTE: this override should be removed in a future version.
24 | self.in_channels = in_channels
25 | self.filter_channels = filter_channels
26 | self.kernel_size = kernel_size
27 | self.p_dropout = p_dropout
28 | self.n_flows = n_flows
29 | self.gin_channels = gin_channels
30 |
31 | self.log_flow = modules.Log()
32 | self.flows = nn.ModuleList()
33 | self.flows.append(modules.ElementwiseAffine(2))
34 | for i in range(n_flows):
35 | self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
36 | self.flows.append(modules.Flip())
37 |
38 | self.post_pre = nn.Conv1d(1, filter_channels, 1)
39 | self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
40 | self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
41 | self.post_flows = nn.ModuleList()
42 | self.post_flows.append(modules.ElementwiseAffine(2))
43 | for i in range(4):
44 | self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
45 | self.post_flows.append(modules.Flip())
46 |
47 | self.pre = nn.Conv1d(in_channels, filter_channels, 1)
48 | self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
49 | self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
50 | if gin_channels != 0:
51 | self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
52 |
53 | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
54 | x = torch.detach(x)
55 | x = self.pre(x)
56 | if g is not None:
57 | g = torch.detach(g)
58 | x = x + self.cond(g)
59 | x = self.convs(x, x_mask)
60 | x = self.proj(x) * x_mask
61 |
62 | if not reverse:
63 | flows = self.flows
64 | assert w is not None
65 |
66 | logdet_tot_q = 0
67 | h_w = self.post_pre(w)
68 | h_w = self.post_convs(h_w, x_mask)
69 | h_w = self.post_proj(h_w) * x_mask
70 | e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
71 | z_q = e_q
72 | for flow in self.post_flows:
73 | z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
74 | logdet_tot_q += logdet_q
75 | z_u, z1 = torch.split(z_q, [1, 1], 1)
76 | u = torch.sigmoid(z_u) * x_mask
77 | z0 = (w - u) * x_mask
78 | logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
79 | logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
80 |
81 | logdet_tot = 0
82 | z0, logdet = self.log_flow(z0, x_mask)
83 | logdet_tot += logdet
84 | z = torch.cat([z0, z1], 1)
85 | for flow in flows:
86 | z, logdet = flow(z, x_mask, g=x, reverse=reverse)
87 | logdet_tot = logdet_tot + logdet
88 | nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
89 | return nll + logq # [b]
90 | else:
91 | flows = list(reversed(self.flows))
92 | flows = flows[:-2] + [flows[-1]] # remove a useless vflow
93 | z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
94 | for flow in flows:
95 | z = flow(z, x_mask, g=x, reverse=reverse)
96 | z0, z1 = torch.split(z, [1, 1], 1)
97 | logw = z0
98 | return logw
99 |
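# Editor's note: a toy change-of-variables check (illustration only; not the model's
# flow) matching the NLL bookkeeping above. If z = f(w) with log|det df/dw| = logdet,
# then -log p(w) = 0.5 * (log(2*pi) + z**2) - logdet under a standard normal prior,
# which is what the masked sums in forward() accumulate per batch element.
def _demo_change_of_variables_nll():
    w = torch.rand(3) + 0.1
    z = torch.log(w)                 # same mapping as modules.Log
    logdet = torch.sum(-z)           # d(log w)/dw = 1/w  ->  log|det| = -log w
    return torch.sum(0.5 * (math.log(2 * math.pi) + z ** 2)) - logdet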
100 |
101 | class DurationPredictor(nn.Module):
102 | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
103 | super().__init__()
104 |
105 | self.in_channels = in_channels
106 | self.filter_channels = filter_channels
107 | self.kernel_size = kernel_size
108 | self.p_dropout = p_dropout
109 | self.gin_channels = gin_channels
110 |
111 | self.drop = nn.Dropout(p_dropout)
112 | self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
113 | self.norm_1 = modules.LayerNorm(filter_channels)
114 | self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
115 | self.norm_2 = modules.LayerNorm(filter_channels)
116 | self.proj = nn.Conv1d(filter_channels, 1, 1)
117 |
118 | if gin_channels != 0:
119 | self.cond = nn.Conv1d(gin_channels, in_channels, 1)
120 |
121 | def forward(self, x, x_mask, g=None):
122 | x = torch.detach(x)
123 | if g is not None:
124 | g = torch.detach(g)
125 | x = x + self.cond(g)
126 | x = self.conv_1(x * x_mask)
127 | x = torch.relu(x)
128 | x = self.norm_1(x)
129 | x = self.drop(x)
130 | x = self.conv_2(x * x_mask)
131 | x = torch.relu(x)
132 | x = self.norm_2(x)
133 | x = self.drop(x)
134 | x = self.proj(x * x_mask)
135 | return x * x_mask
136 |
137 |
138 | class TextEncoder(nn.Module):
139 | def __init__(self,
140 | n_vocab,
141 | out_channels,
142 | hidden_channels,
143 | filter_channels,
144 | n_heads,
145 | n_layers,
146 | kernel_size,
147 | p_dropout):
148 | super().__init__()
149 | self.n_vocab = n_vocab
150 | self.out_channels = out_channels
151 | self.hidden_channels = hidden_channels
152 | self.filter_channels = filter_channels
153 | self.n_heads = n_heads
154 | self.n_layers = n_layers
155 | self.kernel_size = kernel_size
156 | self.p_dropout = p_dropout
157 |
158 | self.emb = nn.Embedding(n_vocab, hidden_channels)
159 | nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
160 |
161 | self.encoder = attentions.Encoder(
162 | hidden_channels,
163 | filter_channels,
164 | n_heads,
165 | n_layers,
166 | kernel_size,
167 | p_dropout)
168 | self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
169 |
170 | def forward(self, x, x_lengths):
171 | x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
172 | x = torch.transpose(x, 1, -1) # [b, h, t]
173 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
174 |
175 | x = self.encoder(x * x_mask, x_mask)
176 | stats = self.proj(x) * x_mask
177 |
178 | m, logs = torch.split(stats, self.out_channels, dim=1)
179 | return x, m, logs, x_mask
180 |
181 |
182 | class ResidualCouplingBlock(nn.Module):
183 | def __init__(self,
184 | channels,
185 | hidden_channels,
186 | kernel_size,
187 | dilation_rate,
188 | n_layers,
189 | n_flows=4,
190 | gin_channels=0):
191 | super().__init__()
192 | self.channels = channels
193 | self.hidden_channels = hidden_channels
194 | self.kernel_size = kernel_size
195 | self.dilation_rate = dilation_rate
196 | self.n_layers = n_layers
197 | self.n_flows = n_flows
198 | self.gin_channels = gin_channels
199 |
200 | self.flows = nn.ModuleList()
201 | for i in range(n_flows):
202 | self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
203 | self.flows.append(modules.Flip())
204 |
205 | def forward(self, x, x_mask, g=None, reverse=False):
206 | if not reverse:
207 | for flow in self.flows:
208 | x, _ = flow(x, x_mask, g=g, reverse=reverse)
209 | else:
210 | for flow in reversed(self.flows):
211 | x = flow(x, x_mask, g=g, reverse=reverse)
212 | return x
213 |
214 | class PosteriorEncoder(nn.Module):
215 | def __init__(self,
216 | in_channels,
217 | out_channels,
218 | hidden_channels,
219 | kernel_size,
220 | dilation_rate,
221 | n_layers,
222 | gin_channels=0):
223 | super().__init__()
224 | self.in_channels = in_channels
225 | self.out_channels = out_channels
226 | self.hidden_channels = hidden_channels
227 | self.kernel_size = kernel_size
228 | self.dilation_rate = dilation_rate
229 | self.n_layers = n_layers
230 | self.gin_channels = gin_channels
231 |
232 |         self.pre = nn.Conv1d(4*in_channels, hidden_channels, 1, groups=4)  # group convolution over the 4 complex components
233 | self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
234 | self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
235 |
236 | def forward(self, x, x_lengths, g=None):
237 | '''
238 | x: (B, 4, N, T)
239 |         x_lengths: (B,)
240 | '''
241 | x = torch.reshape(x, (x.shape[0], -1, x.shape[-1])) # (B, 4N, T)
242 | x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
243 | x = self.pre(x) * x_mask
244 | x = self.enc(x, x_mask, g=g)
245 | stats = self.proj(x) * x_mask
246 | m, logs = torch.split(stats, self.out_channels, dim=1)
247 | z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
248 | return z, m, logs, x_mask
249 |
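# Editor's note: a shape sketch (illustration only; helper name invented) of the
# grouped input projection above. The (B, 4, N, T) complex components are flattened
# to (B, 4N, T), and the 1x1 conv with groups=4 gives each of the 4 components its
# own projection weights before the WN encoder mixes them.
def _demo_grouped_projection(B=2, N=513, T=11, hidden=192):
    x = torch.randn(B, 4, N, T)
    x = torch.reshape(x, (B, 4 * N, T))            # (B, 4N, T)
    pre = nn.Conv1d(4 * N, hidden, 1, groups=4)    # hidden must be divisible by 4
    return pre(x).shape                            # (B, hidden, T)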
250 |
251 | class iSTFT_Generator(torch.nn.Module):
252 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, gin_channels=0):
253 | super(iSTFT_Generator, self).__init__()
254 | # self.h = h
255 | self.gen_istft_n_fft = gen_istft_n_fft
256 | self.gen_istft_hop_size = gen_istft_hop_size
257 |
258 | self.num_kernels = len(resblock_kernel_sizes)
259 | self.num_upsamples = len(upsample_rates)
260 | self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
261 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
262 |
263 | self.ups = nn.ModuleList()
264 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
265 | self.ups.append(weight_norm(
266 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
267 | k, u, padding=(k-u)//2)))
268 |
269 | self.resblocks = nn.ModuleList()
270 | for i in range(len(self.ups)):
271 | ch = upsample_initial_channel//(2**(i+1))
272 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
273 | self.resblocks.append(resblock(ch, k, d))
274 |
275 | self.post_n_fft = self.gen_istft_n_fft
276 | self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
277 | self.ups.apply(init_weights)
278 | self.conv_post.apply(init_weights)
279 | self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
280 |         self.stft = TorchSTFT(filter_length=self.gen_istft_n_fft, hop_length=self.gen_istft_hop_size, win_length=self.gen_istft_n_fft)
281 |
282 |     def forward(self, x, g=None):
283 | x = self.conv_pre(x)
284 | for i in range(self.num_upsamples):
285 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
286 | x = self.ups[i](x)
287 | xs = None
288 | for j in range(self.num_kernels):
289 | if xs is None:
290 | xs = self.resblocks[i*self.num_kernels+j](x)
291 | else:
292 | xs += self.resblocks[i*self.num_kernels+j](x)
293 | x = xs / self.num_kernels
294 | x = F.leaky_relu(x)
295 | x = self.reflection_pad(x)
296 | x = self.conv_post(x)
297 | spec = torch.exp(x[:,:self.post_n_fft // 2 + 1, :])
298 | phase = math.pi*torch.sin(x[:, self.post_n_fft // 2 + 1:, :])
299 | out = self.stft.inverse(spec, phase).to(x.device)
300 | return out, None
301 |
302 | def remove_weight_norm(self):
303 | print('Removing weight norm...')
304 | for l in self.ups:
305 | remove_weight_norm(l)
306 | for l in self.resblocks:
307 | l.remove_weight_norm()
308 | remove_weight_norm(self.conv_pre)
309 | remove_weight_norm(self.conv_post)
310 |
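# Editor's note: a small arithmetic sketch (n_fft chosen arbitrarily for illustration)
# of the iSTFT head above. conv_post emits n_fft + 2 channels: n_fft//2 + 1
# log-magnitudes plus n_fft//2 + 1 phase logits; exp and pi*sin turn them into a
# magnitude spectrogram and a phase bounded to [-pi, pi].
def _demo_istft_head_split(n_fft=16):
    x = torch.randn(1, n_fft + 2, 5)
    spec = torch.exp(x[:, :n_fft // 2 + 1, :])              # (1, 9, 5)
    phase = math.pi * torch.sin(x[:, n_fft // 2 + 1:, :])   # (1, 9, 5)
    return spec.shape, phase.shape
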
311 | class ResBlock(torch.nn.Module):
312 | def __init__(self, in_ch, out_ch):
313 | super().__init__()
314 | self.convs = nn.ModuleList([
315 | Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
316 | nn.BatchNorm2d(out_ch),
317 | nn.ReLU()
318 | ])
319 | self.out_ch = out_ch
320 | self.in_ch = in_ch
321 |
322 | def forward(self, x):
323 | for c in self.convs:
324 | res = c(x)
325 | if self.out_ch == self.in_ch:
326 | x = res + x
327 | else:
328 | x = res
329 | return x
330 |
331 | class Multiband_iSTFT_Generator(torch.nn.Module):
332 | def __init__(self, latent_dim, n_blocks, gen_istft_n_fft, gen_istft_hop_size, subbands, gin_channels=0):
333 | super(Multiband_iSTFT_Generator, self).__init__()
334 | # self.h = h
335 | self.subbands = subbands
336 | self.linear = Conv1d(latent_dim, gen_istft_n_fft//2+1, 1, 1, 0)
337 | self.decs = nn.ModuleList()
338 | middle = n_blocks//2 + 1
339 | for i in range(1, n_blocks+1):
340 | if i < middle:
341 | self.decs.append(ResBlock(1,1))
342 | elif i == middle:
343 | self.decs.append(ResBlock(1,4))
344 | else:
345 | self.decs.append(ResBlock(4,4))
346 |
347 |         self.conv_post = Conv2d(4, self.subbands*2, 3, 1, padding=1) # Predict Real/Imag (default) or Magnitude/Phase
348 |
349 | self.reflection_pad = nn.ReflectionPad1d((1, 0))
350 | self.stft = TorchSTFT(filter_length=gen_istft_n_fft, hop_length=gen_istft_hop_size, win_length=gen_istft_n_fft)
351 |
352 | self.gen_istft_n_fft = gen_istft_n_fft
353 | self.gen_istft_hop_size = gen_istft_hop_size
354 |
355 | # self.pqmf = PQMF()
356 |
357 |
358 | def forward(self, x, g=None):
359 | pqmf = PQMF(x.device)
360 |
361 | # (B, 1, ch, length)
362 | x = self.linear(x).unsqueeze(1)
363 | for dec_block in self.decs:
364 | x = dec_block(x)
365 |
366 | # (B, 4, N, T)
367 | x = F.leaky_relu(x)
368 | x = x.contiguous().view(x.size(0),-1,x.size(-1)) # (B, 4*ch, T)
369 | x = self.reflection_pad(x)
370 | x = x.contiguous().view(x.size(0),4,-1,x.size(-1)) # (B, 4*ch, T') -> (B, 4, ch, T')
371 |
372 | # (B, 4, ch, T') -> (B, 4*2, N, T') subbands(4)*real/imag(2)
373 | x = self.conv_post(x)
374 | # (B, 4, N, T)
375 | real = x[:,:self.subbands,:,:]
376 | imag = x[:,self.subbands:,:,:]
377 |
378 | y_mb_hat = self.stft.cartesian_inverse(torch.reshape(real, (real.shape[0]*self.subbands, self.gen_istft_n_fft // 2 + 1, real.shape[-1])), torch.reshape(imag, (imag.shape[0]*self.subbands, self.gen_istft_n_fft // 2 + 1, imag.shape[-1])))
379 |         # (B*subbands, T) -> (B, subbands, 1, T)
380 | y_mb_hat = torch.reshape(y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1]))
381 | y_mb_hat = y_mb_hat.squeeze(-2)
382 |
383 | y_g_hat = pqmf.synthesis(y_mb_hat)
384 |
385 | return y_g_hat, y_mb_hat
386 |
387 |     def remove_weight_norm(self):
388 |         # This decoder has no weight-normalized layers (the ResBlock/Conv2d stacks above are plain), so there is nothing to remove.
389 |         pass
393 |
394 |
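# Editor's note: a pure-shape walkthrough (no audio is produced; helper name invented)
# of the sub-band bookkeeping in Multiband_iSTFT_Generator.forward. The B*subbands
# sub-band signals returned by the iSTFT are regrouped per utterance, after which
# pqmf.synthesis upsamples by the number of sub-bands and merges them into one
# full-band waveform.
def _demo_subband_regroup(B=2, subbands=4, T=100):
    y_mb = torch.randn(B * subbands, T)                          # stacked iSTFT output
    y_mb = torch.reshape(y_mb, (B, subbands, 1, T)).squeeze(-2)  # (B, subbands, T)
    return y_mb.shape  # pqmf.synthesis would then yield a (B, 1, ~subbands*T) waveform
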
395 | class Multistream_iSTFT_Generator(torch.nn.Module):
396 | def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, subbands, gin_channels=0):
397 | super(Multistream_iSTFT_Generator, self).__init__()
398 | # self.h = h
399 | self.subbands = subbands
400 | self.num_kernels = len(resblock_kernel_sizes)
401 | self.num_upsamples = len(upsample_rates)
402 | self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
403 | resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
404 |
405 | self.ups = nn.ModuleList()
406 | for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
407 | self.ups.append(weight_norm(
408 | ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
409 | k, u, padding=(k-u)//2)))
410 |
411 | self.resblocks = nn.ModuleList()
412 | for i in range(len(self.ups)):
413 | ch = upsample_initial_channel//(2**(i+1))
414 | for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
415 | self.resblocks.append(resblock(ch, k, d))
416 |
417 | self.post_n_fft = gen_istft_n_fft
418 | self.ups.apply(init_weights)
419 | self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
420 | self.reshape_pixelshuffle = []
421 |
422 | self.subband_conv_post = weight_norm(Conv1d(ch, self.subbands*(self.post_n_fft + 2), 7, 1, padding=3))
423 |
424 | self.subband_conv_post.apply(init_weights)
425 |
426 | self.gen_istft_n_fft = gen_istft_n_fft
427 | self.gen_istft_hop_size = gen_istft_hop_size
428 |
429 | updown_filter = torch.zeros((self.subbands, self.subbands, self.subbands)).float()
430 | for k in range(self.subbands):
431 | updown_filter[k, k, 0] = 1.0
432 | self.register_buffer("updown_filter", updown_filter)
433 | self.multistream_conv_post = weight_norm(Conv1d(4, 1, kernel_size=63, bias=False, padding=get_padding(63, 1)))
434 | self.multistream_conv_post.apply(init_weights)
435 |
438 | def forward(self, x, g=None):
439 | stft = TorchSTFT(filter_length=self.gen_istft_n_fft, hop_length=self.gen_istft_hop_size, win_length=self.gen_istft_n_fft).to(x.device)
440 | # pqmf = PQMF(x.device)
441 |
442 |         x = self.conv_pre(x)  # [B, ch, length]
443 |
444 | for i in range(self.num_upsamples):
447 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
448 | x = self.ups[i](x)
451 | xs = None
452 | for j in range(self.num_kernels):
453 | if xs is None:
454 | xs = self.resblocks[i*self.num_kernels+j](x)
455 | else:
456 | xs += self.resblocks[i*self.num_kernels+j](x)
457 | x = xs / self.num_kernels
458 |
459 | x = F.leaky_relu(x)
460 | x = self.reflection_pad(x)
461 | x = self.subband_conv_post(x)
462 | x = torch.reshape(x, (x.shape[0], self.subbands, x.shape[1]//self.subbands, x.shape[-1]))
463 |
464 | spec = torch.exp(x[:,:,:self.post_n_fft // 2 + 1, :])
465 | phase = math.pi*torch.sin(x[:,:, self.post_n_fft // 2 + 1:, :])
466 |
467 | y_mb_hat = stft.inverse(torch.reshape(spec, (spec.shape[0]*self.subbands, self.gen_istft_n_fft // 2 + 1, spec.shape[-1])), torch.reshape(phase, (phase.shape[0]*self.subbands, self.gen_istft_n_fft // 2 + 1, phase.shape[-1])))
468 | y_mb_hat = torch.reshape(y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1]))
469 | y_mb_hat = y_mb_hat.squeeze(-2)
470 |
471 |         y_mb_hat = F.conv_transpose1d(y_mb_hat, self.updown_filter.to(x.device) * self.subbands, stride=self.subbands)
472 |
473 | y_g_hat = self.multistream_conv_post(y_mb_hat)
474 |
475 | return y_g_hat, y_mb_hat
476 |
477 | def remove_weight_norm(self):
478 | print('Removing weight norm...')
479 | for l in self.ups:
480 | remove_weight_norm(l)
481 | for l in self.resblocks:
482 | l.remove_weight_norm()
483 |
484 |
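# Editor's note: a minimal check (illustration only) of the fixed updown_filter above.
# Applied with conv_transpose1d at stride=subbands it upsamples each stream by zero
# insertion; the trailing 1-D conv then learns to merge the streams, and the
# multiplication by subbands compensates for the energy of the inserted zeros.
def _demo_updown_filter(subbands=4, T=3):
    f = torch.zeros(subbands, subbands, subbands)
    for k in range(subbands):
        f[k, k, 0] = 1.0
    y = torch.ones(1, subbands, T)
    return F.conv_transpose1d(y, f * subbands, stride=subbands).shape  # (1, subbands, T*subbands)
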
485 | class DiscriminatorP(torch.nn.Module):
486 | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
487 | super(DiscriminatorP, self).__init__()
488 | self.period = period
489 | self.use_spectral_norm = use_spectral_norm
490 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm
491 | self.convs = nn.ModuleList([
492 | norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
493 | norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
494 | norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
495 | norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
496 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
497 | ])
498 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
499 |
500 | def forward(self, x):
501 | fmap = []
502 |
503 | # 1d to 2d
504 | b, c, t = x.shape
505 | if t % self.period != 0: # pad first
506 | n_pad = self.period - (t % self.period)
507 | x = F.pad(x, (0, n_pad), "reflect")
508 | t = t + n_pad
509 | x = x.view(b, c, t // self.period, self.period)
510 |
511 | for l in self.convs:
512 | x = l(x)
513 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
514 | fmap.append(x)
515 | x = self.conv_post(x)
516 | fmap.append(x)
517 | x = torch.flatten(x, 1, -1)
518 |
519 | return x, fmap
520 |
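# Editor's note: a tiny reshape sketch (illustration only) of the period trick in
# DiscriminatorP above. A length-t waveform is padded to a multiple of the period and
# viewed as a (t//period, period) image, so samples exactly one period apart line up
# in the same column for the 2-D convolutions.
def _demo_period_reshape(period=3, t=10):
    x = torch.arange(float(t)).view(1, 1, t)
    n_pad = (period - t % period) % period
    x = F.pad(x, (0, n_pad), "reflect")
    return x.view(1, 1, (t + n_pad) // period, period)  # (1, 1, 4, 3)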
521 |
522 | class DiscriminatorS(torch.nn.Module):
523 | def __init__(self, use_spectral_norm=False):
524 | super(DiscriminatorS, self).__init__()
525 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm
526 | self.convs = nn.ModuleList([
527 | norm_f(Conv1d(1, 16, 15, 1, padding=7)),
528 | norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
529 | norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
530 | norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
531 | norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
532 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
533 | ])
534 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
535 |
536 | def forward(self, x):
537 | fmap = []
538 |
539 | for l in self.convs:
540 | x = l(x)
541 | x = F.leaky_relu(x, modules.LRELU_SLOPE)
542 | fmap.append(x)
543 | x = self.conv_post(x)
544 | fmap.append(x)
545 | x = torch.flatten(x, 1, -1)
546 |
547 | return x, fmap
548 |
549 |
550 | class MultiPeriodDiscriminator(torch.nn.Module):
551 | def __init__(self, use_spectral_norm=False):
552 | super(MultiPeriodDiscriminator, self).__init__()
553 | periods = [2,3,5,7,11]
554 |
555 | discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
556 | discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
557 | self.discriminators = nn.ModuleList(discs)
558 |
559 | def forward(self, y, y_hat):
560 | y_d_rs = []
561 | y_d_gs = []
562 | fmap_rs = []
563 | fmap_gs = []
564 | for i, d in enumerate(self.discriminators):
565 | y_d_r, fmap_r = d(y)
566 | y_d_g, fmap_g = d(y_hat)
567 | y_d_rs.append(y_d_r)
568 | y_d_gs.append(y_d_g)
569 | fmap_rs.append(fmap_r)
570 | fmap_gs.append(fmap_g)
571 |
572 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs
573 |
574 |
575 |
576 | class SynthesizerTrn(nn.Module):
577 | """
578 | Synthesizer for Training
579 | """
580 |
581 | def __init__(self,
582 | n_vocab,
583 | spec_channels,
584 | segment_size,
585 | inter_channels,
586 | hidden_channels,
587 | filter_channels,
588 | n_heads,
589 | n_layers,
590 | kernel_size,
591 | p_dropout,
592 | resblock,
593 | resblock_kernel_sizes,
594 | resblock_dilation_sizes,
595 | upsample_rates,
596 | upsample_initial_channel,
597 | upsample_kernel_sizes,
598 | n_blocks,
599 | latent_dim,
600 | gen_istft_n_fft,
601 | gen_istft_hop_size,
602 | n_speakers=0,
603 | gin_channels=0,
604 | use_sdp=False,
605 | ms_istft_vits=False,
606 |                  mb_istft_vits=False,
607 |                  subbands=False,
608 | istft_vits=False,
609 | **kwargs):
610 |
611 | super().__init__()
612 | self.n_vocab = n_vocab
613 | self.spec_channels = spec_channels
614 | self.inter_channels = inter_channels
615 | self.hidden_channels = hidden_channels
616 | self.filter_channels = filter_channels
617 | self.n_heads = n_heads
618 | self.n_layers = n_layers
619 | self.kernel_size = kernel_size
620 | self.p_dropout = p_dropout
621 | self.resblock = resblock
622 | self.resblock_kernel_sizes = resblock_kernel_sizes
623 | self.resblock_dilation_sizes = resblock_dilation_sizes
624 | self.upsample_rates = upsample_rates
625 | self.upsample_initial_channel = upsample_initial_channel
626 | self.upsample_kernel_sizes = upsample_kernel_sizes
627 | self.segment_size = segment_size
628 | self.n_speakers = n_speakers
629 | self.gin_channels = gin_channels
630 | self.ms_istft_vits = ms_istft_vits
631 | self.mb_istft_vits = mb_istft_vits
632 | self.istft_vits = istft_vits
633 |
634 | self.use_sdp = use_sdp
635 |
636 | self.enc_p = TextEncoder(n_vocab,
637 | inter_channels,
638 | hidden_channels,
639 | filter_channels,
640 | n_heads,
641 | n_layers,
642 | kernel_size,
643 | p_dropout)
644 |         if mb_istft_vits:
645 |             print('Multi-band iSTFT VITS')
646 | self.dec = Multiband_iSTFT_Generator(latent_dim, n_blocks, gen_istft_n_fft, gen_istft_hop_size, subbands, gin_channels=gin_channels)
647 |         elif ms_istft_vits:
648 |             print('Multi-stream iSTFT VITS')
649 | self.dec = Multistream_iSTFT_Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, subbands, gin_channels=gin_channels)
650 |         elif istft_vits:
651 | print('iSTFT-VITS')
652 | self.dec = iSTFT_Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gen_istft_n_fft, gen_istft_hop_size, gin_channels=gin_channels)
653 |         else:
654 |             raise ValueError('No decoder type specified in the config json (set one of mb_istft_vits, ms_istft_vits or istft_vits)')
655 |
656 | self.enc_q = PosteriorEncoder(spec_channels, latent_dim, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
657 |
658 | self.flow = ResidualCouplingBlock(latent_dim, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
659 |
660 | if use_sdp:
661 | self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
662 | else:
663 | self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
664 |
665 |         if n_speakers > 0:  # keep consistent with the n_speakers > 0 checks in forward/infer
666 | self.emb_g = nn.Embedding(n_speakers, gin_channels)
667 |
668 | def forward(self, x, x_lengths, y, y_lengths, sid=None):
669 | '''
670 | y: complex components (B, 4, N, T)
671 | '''
672 |
673 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
674 | if self.n_speakers > 0:
675 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
676 | else:
677 | g = None
678 |
679 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
680 |
681 | z_p = self.flow(z, y_mask, g=g)
682 |
683 | with torch.no_grad():
684 | # negative cross-entropy
685 | s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
686 | neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
687 | neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
688 | neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
689 | neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
690 | neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
691 |
692 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
693 | attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
694 |
695 | w = attn.sum(2)
696 | if self.use_sdp:
697 | l_length = self.dp(x, x_mask, w, g=g)
698 | l_length = l_length / torch.sum(x_mask)
699 | else:
700 | logw_ = torch.log(w + 1e-6) * x_mask
701 | logw = self.dp(x, x_mask, g=g)
702 | l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
703 |
704 | # expand prior
705 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
706 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
707 |
708 | z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
709 | o, o_mb = self.dec(z_slice, g=g)
710 |
711 | return o, o_mb, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
712 |
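    # Editor's note: a sanity sketch (illustration only; helper name invented) of the
    # neg_cent decomposition in forward(). For a diagonal Gaussian, the channel-summed
    # log N(z; m, s) splits into exactly the four matmul/sum terms used above,
    # evaluated for every (text state, latent frame) pair at once.
    @staticmethod
    def _demo_neg_cent_equivalence(d=4, t_s=3):
        z = torch.randn(1, d, 1)            # one latent frame
        m = torch.randn(1, d, t_s)          # t_s candidate text states
        logs = 0.1 * torch.randn(1, d, t_s)
        s_sq_r = torch.exp(-2 * logs)
        direct = torch.sum(-0.5 * math.log(2 * math.pi) - logs
                           - 0.5 * (z - m) ** 2 * s_sq_r, dim=1)
        four_terms = (torch.sum(-0.5 * math.log(2 * math.pi) - logs, 1)
                      + torch.matmul(-0.5 * (z ** 2).transpose(1, 2), s_sq_r).squeeze(1)
                      + torch.matmul(z.transpose(1, 2), m * s_sq_r).squeeze(1)
                      + torch.sum(-0.5 * (m ** 2) * s_sq_r, 1))
        assert torch.allclose(direct, four_terms, atol=1e-5)
        return direct
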
713 | def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
714 | x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
715 | if self.n_speakers > 0:
716 | g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
717 | else:
718 | g = None
719 |
720 | if self.use_sdp:
721 | logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
722 | else:
723 | logw = self.dp(x, x_mask, g=g)
724 | w = torch.exp(logw) * x_mask * length_scale
725 | w_ceil = torch.ceil(w)
726 | y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
727 | y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
728 | attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
729 | attn = commons.generate_path(w_ceil, attn_mask)
730 |
731 | m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
732 | logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
733 |
734 | z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
735 | z = self.flow(z_p, y_mask, g=g, reverse=True)
736 | o, o_mb = self.dec((z * y_mask)[:,:,:max_len], g=g)
737 | return o, o_mb, attn, y_mask, (z, z_p, m_p, logs_p)
738 |
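    # Editor's note: a toy example (illustration only; helper name invented) of how
    # infer() turns per-token durations into a hard monotonic alignment;
    # commons.generate_path builds this kind of 0/1 matrix from w_ceil and the mask.
    @staticmethod
    def _demo_duration_expansion():
        durations = torch.tensor([2, 1, 3])
        path = torch.zeros(int(durations.sum()), len(durations))
        frame = 0
        for tok, d in enumerate(durations.tolist()):
            path[frame:frame + d, tok] = 1.0   # this token covers d consecutive frames
            frame += d
        return path  # (6, 3): frames 0-1 -> token 0, frame 2 -> token 1, frames 3-5 -> token 2
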
739 | def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
740 |         assert self.n_speakers > 0, "n_speakers has to be larger than 0."
741 | g_src = self.emb_g(sid_src).unsqueeze(-1)
742 | g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
743 | z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
744 | z_p = self.flow(z, y_mask, g=g_src)
745 | z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
746 | o_hat, o_hat_mb = self.dec(z_hat * y_mask, g=g_tgt)
747 | return o_hat, o_hat_mb, y_mask, (z, z_p, z_hat)
748 |
749 |
--------------------------------------------------------------------------------