├── README.md ├── audio_process.py ├── config.yaml ├── data_utils ├── clean.py ├── dataset.py ├── dataset_w_stats.py ├── makecsv.py ├── remove_bad_grid.py └── replace.sh ├── dataset_review ├── .ipynb_checkpoints │ └── speakers_stats-checkpoint.ipynb ├── filter_speakers.ipynb ├── hist.png ├── least20.png ├── short_train.txt ├── short_val.txt ├── speakers_short.json ├── speakers_stats.ipynb ├── speakers_to_remove.txt └── top20.png ├── examples.ipynb ├── examples ├── Airapetova_Darja_abooks_voxforge.wav ├── Arhipova_Natalja_abooks_voxforge.wav ├── Bolshakova_Ksenija_abooks_voxforge.wav ├── Chebaturkina_Elena_abooks_voxforge.wav ├── DrLutz_abooks_voxforge.wav ├── Efremov_Oleg_abooks_voxforge.wav ├── Goblin_abooks_voxforge.wav ├── Goblin_dance.wav ├── Grigorjev_Yurii_abooks_voxforge.wav ├── Kaljagin_A_abooks_voxforge.wav ├── Karpov_N_abooks_voxforge.wav ├── Konjahin_V_abooks_voxforge.wav ├── Kononov_Mikhail_abooks_voxforge.wav ├── Kotov_Alexandr_abooks_voxforge.wav ├── Kovaleva_Anna_abooks_voxforge.wav ├── Kuznetsov_Alexei_abooks_voxforge.wav ├── Kuznetsov_Vsevolod_abooks_voxforge.wav ├── Kvasha_Igor_abooks_voxforge.wav ├── Larionov_Vsevolod_abooks_voxforge.wav ├── Larionova-Ludm_abooks_voxforge.wav ├── Litvinov_I_abooks_voxforge.wav ├── Markin_Petr_abooks_voxforge.wav ├── Martjanov_O_abooks_voxforge.wav ├── Medvedeva_Galcova_Olga_abooks_voxforge.wav ├── Muhametzyanov_Radik_abooks_voxforge.wav ├── Pokrovsky_Boris_abooks_voxforge.wav ├── Popova_Alevtina_abooks_voxforge.wav ├── Rezalin_Aleksandr_abooks_voxforge.wav ├── Rosljakov_Mixail_abooks_voxforge.wav ├── Schirvind_A_abooks_voxforge.wav ├── Stukalov_Vladimir_abooks_voxforge.wav ├── Suetin_Pavel_abooks_voxforge.wav ├── Sushkov_Vladimir_abooks_voxforge.wav ├── Sytnik_I_abooks_voxforge.wav ├── Taratorkin_Georgiy_abooks_voxforge.wav ├── Tarinicheva_Tatjana_abooks_voxforge.wav ├── Terenkov_Alexandr_abooks_voxforge.wav ├── Trifilov_Nikolai_abooks_voxforge.wav ├── Vasiljev_Y_abooks_voxforge.wav ├── Vesnik_E_abooks_voxforge.wav ├── Vihrov_V_abooks_voxforge.wav ├── Vorobjeva_Irina_abooks_voxforge.wav ├── Zhirnov_Sergey_abooks_voxforge.wav ├── Zozulin_Viktor_abooks_voxforge.wav ├── goblin_opentts.wav ├── hajdurova_ailab.wav ├── ira_abooks_voxforge.wav ├── joh_abooks_voxforge.wav ├── june_shaman.wav ├── len_shaman.wav ├── mar_abooks_voxforge.wav ├── minaev_ailab.wav ├── morti_shaman.wav ├── nikolaev_ailab.wav ├── noname_opentts.wav ├── russian_single.wav ├── user11_mozilla.wav ├── user12_mozilla.wav ├── user17_mozilla.wav ├── user1_mozilla.wav ├── user20_mozilla.wav ├── user26_mozilla.wav ├── user4_mozilla.wav ├── user5_mozilla.wav ├── user6_mozilla.wav ├── user7_mozilla.wav ├── user8_mozilla.wav └── vsh_abooks_voxforge.wav ├── fs_two ├── README.md ├── audio │ ├── __init__.py │ ├── audio_processing.py │ ├── stft.py │ └── tools.py ├── cwt │ ├── __init__.py │ └── cwt_utils.py ├── dataset.py ├── evaluate.py ├── model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── fastspeech2.cpython-38.pyc │ │ ├── loss.cpython-38.pyc │ │ ├── modules.cpython-38.pyc │ │ └── optimizer.cpython-38.pyc │ ├── fastspeech2.py │ ├── loss.py │ ├── modules.py │ └── optimizer.py ├── prepare_align.py ├── preprocess.py ├── preprocessor │ ├── common_multi.py │ └── preprocessor.py ├── synthesize.py ├── text │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── cleaners.cpython-38.pyc │ │ ├── cmudict.cpython-38.pyc │ │ ├── numbers.cpython-38.pyc │ │ ├── pinyin.cpython-38.pyc │ │ ├── russian.cpython-38.pyc │ │ └── symbols.cpython-38.pyc │ 
├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ ├── pinyin.py │ ├── russian.py │ └── symbols.py ├── transformer │ ├── Constants.py │ ├── Layers.py │ ├── Models.py │ ├── Modules.py │ ├── SubLayers.py │ ├── __init__.py │ └── __pycache__ │ │ ├── Constants.cpython-38.pyc │ │ ├── Layers.cpython-38.pyc │ │ ├── Models.cpython-38.pyc │ │ ├── Modules.cpython-38.pyc │ │ ├── SubLayers.cpython-38.pyc │ │ └── __init__.cpython-38.pyc └── utils │ ├── __pycache__ │ └── tools.cpython-38.pyc │ ├── model.py │ └── tools.py ├── fsapi.py ├── hifi ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── models.cpython-38.pyc ├── env.py ├── meldataset.py ├── models.py ├── utils.py └── vocoder │ ├── __pycache__ │ └── utils.cpython-38.pyc │ └── utils.py ├── hifiapi.py ├── input_process.py ├── prepare_data.py ├── pretrained ├── rus_all.dict ├── speakers.json └── stats.json ├── requirements.txt ├── train.py ├── tts_king.py └── voice_over.ipynb
/README.md:
--------------------------------------------------------------------------------
 1 | #### A video example of TTS voice-over using several speakers:
 2 | [![Watch the video](https://img.youtube.com/vi/DB6pS-CoWVs/0.jpg)](https://www.youtube.com/watch?v=DB6pS-CoWVs&t)
 3 |
 4 |
 5 | ## Brief
 6 |
 7 | We started from this implementation: https://github.com/ming024/FastSpeech2
 8 |
 9 | However, we have made several changes, so the code is not identical.
10 |
11 | For example:
12 | - We use masking for input grapheme tokens during training;
13 | - CWT was implemented as in the original paper, but we did not observe any improvements. The final model was trained without CWT; you can still train a model on your own data with it via the use_cwt flag in the config;
14 | - Data preprocessing is slightly different, especially in the language-specific parts.
15 |
16 | ### Dataset:
17 |
18 | The Russian dataset was borrowed from https://github.com/vlomme/Multi-Tacotron-Voice-Cloning. We did not use all the speakers and filtered them based on length and recording quality. Only 65 speakers were used in the end. You can check all the examples in 'examples'.
19 |
20 | ### MFA:
21 |
22 | MFA was trained from scratch after preprocessing the text with russian_g2p. Using MFA might not be straightforward, so we refer to this manual: https://github.com/ivanvovk/DurIAN#6-how-to-align-your-own-data
23 |
24 |
25 |
26 | # Usage
27 |
28 | 1. We use russian_g2p, so you will need to install it first:
29 |
30 |        git init
31 |        git clone https://github.com/nsu-ai/russian_g2p.git
32 |        cd russian_g2p
33 |        pip3 install -r requirements.txt
34 |        pip install .
35 |
36 | 2. Then install the dependencies from requirements.txt.
37 |
38 | 3. Download the weights:
39 | https://drive.google.com/drive/folders/1dX7ELe9C9-ja_liYrgph3Uu5Z5EMljjh?usp=sharing
40 |
41 | - Move the HiFi-GAN and FS2 weights into 'pretrained';
42 | - Check that the paths in the config match:
43 |
44 | - tts.weights_path - path to the pretrained FastSpeech model;
45 | - add speakers_json to the same folder as the model weights (speaker names); it is already there for the pretrained model;
46 | - add stats_json to the same folder as the model weights (raw data pitch and energy stats);
47 | - hifi.weights_path - path to the pretrained HiFi-GAN.
48 |
49 |
50 | 4. If all of the above is set, check the notebook "examples.ipynb". A small path check is also sketched below.
51 |
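Before opening the notebook, you can sanity-check that the config actually points at the downloaded files. This is a minimal sketch, not part of the repository: it assumes PyYAML is installed, that you run it from the repository root, and that the speakers.json / stats.json file names match the ones shipped in the 'pretrained' folder.

```
import os
import yaml

# Load the top-level config and collect the files the README asks for.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

weights_dir = os.path.dirname(cfg["tts"]["weights_path"])
expected = [
    cfg["tts"]["weights_path"],                  # FastSpeech 2 checkpoint
    cfg["hifi"]["weights_path"],                 # HiFi-GAN checkpoint
    os.path.join(weights_dir, "speakers.json"),  # speaker names
    os.path.join(weights_dir, "stats.json"),     # raw pitch/energy stats
]
for path in expected:
    status = "ok" if os.path.isfile(path) else "MISSING"
    print(f"{status:8} {path}")
```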
52 | # Training your own model
53 |
54 | 1. We assume you have preprocessed the data with the MFA aligner. Your folder structure should be as follows:
55 |
56 |
57 | ```
58 | data
59 | ├── speaker_one
60 | │   ├── record_1.TextGrid # generated by MFA
61 | │   ├── record_1.wav
62 | │   └── record_1.lab # just a text file with a text string
63 | │
64 | └── speaker_two
65 |     ├── ...
66 |     └── ...
67 | ```
68 |
69 | 2. Once the data is organized and the path to it is set in the config key 'raw_path', run prepare_data.py.
70 |
71 | 3. prepare_data.py will generate additional files, such as energy and pitch values, in the folder set by 'preprocessed_path'.
72 |
73 | 4. Finally, set a path to a lexicon dict: words and their transliterations generated by russian_g2p. If you do not use russian_g2p, your dictionary will be different. An example can be found in the 'pretrained' folder.
74 |
75 | ## Have Fun!
--------------------------------------------------------------------------------
/audio_process.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import glob
 3 |
 4 |
 5 | def convert_mp3_to_wav(new_dir, filename, sr=None):
 6 |     # ffmpeg arguments must be passed as separate list items,
 7 |     # not as one long string.
 8 |     filename = filename.split(".mp3")[0]
 9 |     cmd = ["ffmpeg", "-i", f"{filename}.mp3"]
10 |     if sr:
11 |         cmd += ["-ar", str(sr)]
12 |     cmd.append(f"{new_dir}/{filename}.wav")
13 |     return subprocess.call(cmd)
14 |
15 |
16 | def convert_dataset(dir, new_dir, sr=None):
17 |     for filename in glob.glob(f"{dir}/**/*.mp3"):
18 |         convert_mp3_to_wav(new_dir, filename, sr)
19 |
20 |
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
 1 | exp_name: 'multi'
 2 | gpu: 'cpu'
 3 | run_debug_eval: false
 4 | logger:
 5 |   offline: false
 6 |   wandb_key:
 7 |
 8 |
 9 | tts:
10 |   weights_path: './pretrained/290000.pth.tar'
11 |   restore_step: 0
12 |
13 | hifi:
14 |   weights_path: './pretrained/hifi.pth'
15 |   MAX_WAV_VALUE: 32768
16 |   resblock: "1"
17 |   num_gpus: 0
18 |   batch_size: 8
19 |   learning_rate: 0.0002
20 |   adam_b1: 0.8
21 |   adam_b2: 0.99
22 |   lr_decay: 0.999
23 |   seed: 1234
24 |
25 |   upsample_rates: [8,8,2,2]
26 |   upsample_kernel_sizes: [16,16,4,4]
27 |   upsample_initial_channel: 512
28 |   resblock_kernel_sizes: [3,7,11]
29 |   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
30 |   resblock_initial_channel: 256
31 |
32 |   segment_size: 8192
33 |   num_mels: 80
34 |   num_freq: 1025
35 |   n_fft: 1024
36 |   hop_size: 256
37 |   win_size: 1024
38 |   sampling_rate: 22050
39 |
40 |
41 | train_config:
42 |   path:
43 |     ckpt_path: "../output/ckpt/multi_final"
44 |     result_path: "../output/result/multi_final"
45 |   optimizer:
46 |     batch_size: 16
47 |     betas: [0.95, 0.999]
48 |     eps: 0.00001
49 |     weight_decay: 0.0
50 |     grad_clip_thresh: 1.0
51 |     grad_acc_step: 4
52 |     warm_up_step: 4000
53 |     anneal_steps: [300000, 400000, 500000]
54 |     anneal_rate: 0.7
55 |   step:
56 |     total_step: 900000
57 |     log_step: 100
58 |     synth_step: 1000
59 |     val_step: 1000
60 |     save_step: 5000
61 |
62 |   max_masks_per_sentence: 0.15
63 |
64 | preprocess_config:
65 |   dataset: "MAIN"
66 |
67 |   path:
68 |     lexicon_path: "./rus_all.dict"
69 |     raw_path: "./speakers"
70 |     preprocessed_path: "./processed"
71 |
72 |
73 |   preprocessing:
74 |     val_size: 512
75 |     text:
76 |       text_cleaners: []
77 |       language: "ru"
78 |     audio:
79 |       sampling_rate: 22050
80 |       max_wav_value: 32768.0
81 |     stft:
82 |       filter_length: 1024
83 |       hop_length: 256
84 |       win_length: 1024
85 |     mel:
86 |       n_mel_channels: 80
87 |       mel_fmin: 0
88 |       mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
 89
| pitch: 90 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 91 | normalization: True 92 | energy: 93 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 94 | normalization: True 95 | 96 | model_config: 97 | transformer: 98 | encoder_layer: 4 99 | encoder_head: 2 100 | encoder_hidden: 256 101 | variance_hidden: 256 102 | decoder_layer: 6 103 | decoder_head: 2 104 | decoder_hidden: 256 105 | conv_filter_size: 1024 106 | conv_kernel_size: [9, 1] 107 | encoder_dropout: 0.2 108 | decoder_dropout: 0.2 109 | 110 | variance_predictor: 111 | filter_size: 256 112 | kernel_size: 3 113 | dropout: 0.5 114 | use_cwt: False 115 | variance_embedding: 116 | pitch_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing 117 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 118 | n_bins: 256 119 | 120 | multi_speaker: True 121 | 122 | max_seq_len: 1000 123 | 124 | vocoder: 125 | model: "HiFi-GAN" # support 'HiFi-GAN', 'MelGAN' 126 | speaker: "universal" # support 'LJSpeech', 'universal' 127 | use_cpu: true 128 | -------------------------------------------------------------------------------- /data_utils/clean.py: -------------------------------------------------------------------------------- 1 | from string import ascii_letters, digits, whitespace 2 | 3 | cyrillic_letters = ( 4 | "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя" 5 | ) 6 | 7 | 8 | def strip(text): 9 | allowed_chars = cyrillic_letters # + digits + whitespace 10 | return "".join([c for c in text if c in allowed_chars]) 11 | 12 | 13 | with open("vocab.lab", "r") as r: 14 | lines = r.read() 15 | lines = sorted([strip(l) for l in lines.split("\n")], key=len) 16 | 17 | with open("./vocab_clean.txt", "w") as f: 18 | for text in lines: 19 | f.write(text + "\n") 20 | -------------------------------------------------------------------------------- /data_utils/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | # traverse root directory, and list directories as dirs and files as files 5 | 6 | FINAL_DIR = "./ailabs_speaker" 7 | 8 | os.makedirs(FINAL_DIR, exist_ok=True) 9 | 10 | 11 | def csv_dict(path): 12 | with open(path) as f: 13 | lines = f.readlines() 14 | 15 | for line in lines: 16 | line = line.split("|") 17 | yield line[0], line[2].lower() 18 | 19 | 20 | def make_record(f_path, name, text, speaker): 21 | DIR = os.path.join(FINAL_DIR, speaker) 22 | os.makedirs(DIR, exist_ok=True) 23 | destination_wav = os.path.join(DIR, name + ".wav") 24 | destination_lab = os.path.join(DIR, name + ".lab") 25 | shutil.copy(f_path, destination_wav) 26 | with open(destination_lab, "w") as f: 27 | f.write(text) 28 | 29 | 30 | texts = [] 31 | for root, dirs, files in os.walk("."): 32 | path = root.split(os.sep) 33 | if "metadata.csv" in files: 34 | csv_path = os.path.join(root, "metadata.csv") 35 | for name, text in csv_dict(csv_path): 36 | file_path = os.path.join(root, "wavs", name + ".wav") 37 | speaker = root.split("/")[-2] 38 | text = text.replace("ё", "йо") 39 | # make_record(file_path, name, text, speaker) 40 | texts = texts + text.split(" ") 41 | 42 | print(len(set(texts))) 43 | with open("./vocab.lab", "w") as f: 44 | for text in set(texts): 45 | f.write(text + "\n") 46 | -------------------------------------------------------------------------------- 
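Both dataset.py above and dataset_w_stats.py below read an LJSpeech-style metadata.csv whose rows are pipe-separated: a record name followed by one or two text fields. A quick way to spot malformed rows before running those scripts is a check along these lines (a minimal sketch, not part of the repository; the metadata path argument is a placeholder):

```
import sys

# Flag rows that do not match the "name|text" or "name|text|normalized_text"
# layout expected by csv_dict() in dataset.py and dataset_w_stats.py.
metadata_path = sys.argv[1] if len(sys.argv) > 1 else "metadata.csv"

with open(metadata_path, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.rstrip("\n").split("|")
        if len(fields) not in (2, 3) or not fields[0].strip():
            print(f"line {i}: unexpected row -> {line.rstrip()}")
```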
/data_utils/dataset_w_stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | from string import ascii_letters, digits, whitespace 5 | 6 | cyrillic_letters = ( 7 | "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" 8 | ) 9 | 10 | 11 | def my_strip(text): 12 | allowed_chars = cyrillic_letters + digits + whitespace 13 | return "".join([c for c in text if c in allowed_chars]).replace("\n", "") 14 | 15 | 16 | # traverse root directory, and list directories as dirs and files as files 17 | 18 | SOURCE_DIR = "./dataset_main/speakers/" 19 | 20 | 21 | class SpeakerStat: 22 | def __init__(self): 23 | self.speakers = dict() 24 | 25 | def add(self, name): 26 | self.speakers[name] = [0, 0, ""] 27 | 28 | def update(self, name, text): 29 | len_words = len(text.split(" ")) 30 | self.speakers[name][0] += 1 31 | self.speakers[name][1] += len_words 32 | self.speakers[name][2] += " " + text 33 | 34 | def make_csv(self, file_path): 35 | csv_records = ["source_name|speaker_id|num_sentences|len_words"] 36 | for speaker in self.speakers: 37 | dataset_name = speaker.split("_")[-1] 38 | num_sentences = self.speakers[speaker][0] 39 | len_words = self.speakers[speaker][1] 40 | string = f"{dataset_name}|{speaker}|{num_sentences}|{len_words}" 41 | csv_records.append(string) 42 | 43 | self.save(file_path, csv_records) 44 | 45 | def save(self, file_path, records): 46 | with open(file_path, "w") as f: 47 | for text in records: 48 | f.write(text + "\n") 49 | 50 | def save_vocab(self, file_path): 51 | words = [] 52 | for speaker in self.speakers: 53 | sp_words = self.speakers[speaker][2].split(" ") 54 | sp_words = [w for w in sp_words if len(w) > 0] 55 | words += sp_words 56 | words = list(set(words)) 57 | words = sorted(words, key=len) 58 | print(f"unique words: {len(words)}") 59 | self.save(file_path, words) 60 | 61 | 62 | def csv_dict(path): 63 | with open(path) as f: 64 | lines = f.readlines() 65 | 66 | for line in lines: 67 | line = line.split("|") 68 | if len(line) == 3: 69 | yield line[0], line[2].lower() 70 | if len(line) == 2: 71 | yield line[0], line[1].lower() 72 | 73 | 74 | def make_record(f_path, text): 75 | with open(f_path, "w") as f: 76 | f.write(text) 77 | 78 | 79 | # def clean(s): 80 | # exclude = set( 81 | # list(string.punctuation) + ["", "_", "\n", "...", "..", "«", "»"] 82 | # ) 83 | # return my_strip("".join(ch for ch in s if ch not in exclude)) 84 | 85 | 86 | def main(): 87 | speakers_lib = SpeakerStat() 88 | 89 | for directory in os.listdir(SOURCE_DIR): 90 | full_directory = os.path.join(SOURCE_DIR, directory) 91 | speakers_lib.add(directory) 92 | csv_path = os.path.join(full_directory, "metadata.csv") 93 | for name, text in csv_dict(csv_path): 94 | text = my_strip(text) 95 | speakers_lib.update(directory, text) 96 | file_path = os.path.join(full_directory, name + ".txt") 97 | make_record(file_path, text) 98 | make_record(file_path.replace("txt", "lab"), text) 99 | 100 | speakers_lib.make_csv("./speaker_stats.csv") 101 | speakers_lib.save_vocab("./vocab.lab") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /data_utils/makecsv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | # traverse root directory, and list directories as dirs and files as files 5 | 6 | SOURCE_DIR = "./dataset_main/speakers/amed_shaman/" 7 | 8 | # russian_single 9 | # 
noname_opentts
10 |
11 | csv_records = []
12 | for file in os.listdir(SOURCE_DIR):
13 |     if ".lab" in file:
14 |         txt_path = os.path.join(SOURCE_DIR, file)
15 |         with open(txt_path, "r") as f:
16 |             text = f.read().replace("\n", "")
17 |         # strip the ".lab" extension so metadata.csv keeps only the record name
18 |         string = f"{file.replace('.lab','')}|{text}|{text}"
19 |         csv_records.append(string)
20 |
21 | final_path = os.path.join(SOURCE_DIR, "metadata.csv")
22 | with open(final_path, "w") as f:
23 |     for text in set(csv_records):
24 |         f.write(text + "\n")
25 |
--------------------------------------------------------------------------------
/data_utils/remove_bad_grid.py:
--------------------------------------------------------------------------------
 1 | import os
 2 |
 3 | FOLDER = '/home/dev/other/fsp/data/dataset_main/speakers/'
 4 |
 5 | BAD_SANTA_LIST = '/home/dev/other/fsp/data/dataset_main/aligner/prev_unaligned.txt'
 6 |
 7 | def cat(f1, f2):
 8 |     return os.path.join(f1, f2)
 9 |
10 | def make_key(path):
11 |     path = path.split('.')[0]
12 |     return '_'.join(path.split('/'))
13 |
14 | def get_path_dict(folder):
15 |     path_dict = dict()
16 |     for speaker in os.listdir(folder):
17 |         if 'txt' in speaker:
18 |             continue
19 |         full_speaker = cat(folder, speaker)
20 |         for rec in os.listdir(full_speaker):
21 |             full_rec = cat(full_speaker, rec)
22 |             key = make_key(cat(speaker, rec))
23 |             path_dict[key] = full_rec.split('.')[0]
24 |
25 |     return path_dict
26 |
27 | def get_keys(bad_list):
28 |     names = []
29 |     with open(bad_list) as f:
30 |         names_list = f.read()
31 |
32 |     names = [n.split(' ')[0].split('\t')[0] for n in names_list.split('\n')]
33 |     print(f'found {len(names)} bad records')
34 |     return names
35 |
36 |
37 | if __name__ == '__main__':
38 |     path_dict = get_path_dict(FOLDER)
39 |     names = get_keys(BAD_SANTA_LIST)
40 |     for i, name in enumerate(names):
41 |         if name in path_dict:
42 |             path_to_remove = path_dict[name]
43 |         else:
44 |             continue
45 |         try:
46 |             os.remove(path_to_remove + '.wav')
47 |             os.remove(path_to_remove + '.txt')
48 |             os.remove(path_to_remove + '.lab')
49 |         except Exception as e:
50 |             print(e)
51 |         print(f'{i+1} Removed {path_to_remove}')
52 |
--------------------------------------------------------------------------------
/data_utils/replace.sh:
--------------------------------------------------------------------------------
1 | for folder in *mozilla*; do
2 |     cd $folder
3 |     for f in *.wav.*; do
4 |         mv -- "$f" "${f/.wav*/}.lab"
5 |     done
6 |     cd ..
7 | done 8 | -------------------------------------------------------------------------------- /dataset_review/hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/hist.png -------------------------------------------------------------------------------- /dataset_review/least20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/least20.png -------------------------------------------------------------------------------- /dataset_review/speakers_short.json: -------------------------------------------------------------------------------- 1 | {"hajdurova_ailab": 0, "user17_mozilla": 1, "user20_mozilla": 2, "mar_abooks_voxforge": 3, "Vorobjeva_Irina_abooks_voxforge": 4, "Litvinov_I_abooks_voxforge": 5, "Rezalin_Aleksandr_abooks_voxforge": 6, "user6_mozilla": 7, "Vihrov_V_abooks_voxforge": 8, "user11_mozilla": 9, "morti_shaman": 10, "joh_abooks_voxforge": 11, "Kononov_Mikhail_abooks_voxforge": 12, "Sushkov_Vladimir_abooks_voxforge": 13, "Markin_Petr_abooks_voxforge": 14, "Stukalov_Vladimir_abooks_voxforge": 15, "user26_mozilla": 16, "june_shaman": 17, "user12_mozilla": 18, "Kuznetsov_Vsevolod_abooks_voxforge": 19, "vsh_abooks_voxforge": 20, "Schirvind_A_abooks_voxforge": 21, "Vasiljev_Y_abooks_voxforge": 22, "Goblin_abooks_voxforge": 23, "Tarinicheva_Tatjana_abooks_voxforge": 24, "Larionov_Vsevolod_abooks_voxforge": 25, "Kaljagin_A_abooks_voxforge": 26, "Terenkov_Alexandr_abooks_voxforge": 27, "Kuznetsov_Alexei_abooks_voxforge": 28, "Rosljakov_Mixail_abooks_voxforge": 29, "Kvasha_Igor_abooks_voxforge": 30, "user1_mozilla": 31, "russian_single": 32, "Zozulin_Viktor_abooks_voxforge": 33, "Zhirnov_Sergey_abooks_voxforge": 34, "Vesnik_E_abooks_voxforge": 35, "Pokrovsky_Boris_abooks_voxforge": 36, "Martjanov_O_abooks_voxforge": 37, "len_shaman": 38, "Trifilov_Nikolai_abooks_voxforge": 39, "Taratorkin_Georgiy_abooks_voxforge": 40, "user7_mozilla": 41, "Kovaleva_Anna_abooks_voxforge": 42, "Sytnik_I_abooks_voxforge": 43, "noname_opentts": 44, "user8_mozilla": 45, "ira_abooks_voxforge": 46, "Bolshakova_Ksenija_abooks_voxforge": 47, "Muhametzyanov_Radik_abooks_voxforge": 48, "Grigorjev_Yurii_abooks_voxforge": 49, "user5_mozilla": 50, "Efremov_Oleg_abooks_voxforge": 51, "Chebaturkina_Elena_abooks_voxforge": 52, "nikolaev_ailab": 53, "user4_mozilla": 54, "Kotov_Alexandr_abooks_voxforge": 55, "Arhipova_Natalja_abooks_voxforge": 56, "Suetin_Pavel_abooks_voxforge": 57, "Medvedeva_Galcova_Olga_abooks_voxforge": 58, "Airapetova_Darja_abooks_voxforge": 59, "Popova_Alevtina_abooks_voxforge": 60, "Konjahin_V_abooks_voxforge": 61, "DrLutz_abooks_voxforge": 62, "Karpov_N_abooks_voxforge": 63, "Larionova-Ludm_abooks_voxforge": 64, "minaev_ailab": 65} -------------------------------------------------------------------------------- /dataset_review/speakers_to_remove.txt: -------------------------------------------------------------------------------- 1 | Ljubimcev_Pavel_abooks_voxforge 2 | Zemcov_D_abooks_voxforge 3 | Gerasimov_Vladimir_abooks_voxforge 4 | Time_Elizaveta_abooks_voxforge 5 | Lazarev_Yurii_abooks_voxforge 6 | Ljelikova_Lidija_abooks_voxforge 7 | Podoruga_Alexander_abooks_voxforge 8 | Erisanova_I_abooks_voxforge 9 | Kupriyanov_Vasilij_abooks_voxforge 10 | Kocharjan_Suren_abooks_voxforge 11 | Chonishvili_S_abooks_voxforge 12 | 
ana_shaman 13 | Zjuzina_O_abooks_voxforge 14 | Myagkov_Andrey_abooks_voxforge 15 | Korolev_Vladimir_abooks_voxforge 16 | mat_abooks_voxforge 17 | Mihailovskii_abooks_voxforge 18 | Malyshkina_I_abooks_voxforge 19 | Savitskij_Nikolai_abooks_voxforge 20 | Rosenberg_Mikhail_abooks_voxforge 21 | Fedosov_S_abooks_voxforge 22 | Evstigneev_E_abooks_voxforge 23 | rio_shaman 24 | user22_mozilla 25 | Sidoruk_Al_abooks_voxforge 26 | Maksimov_V_abooks_voxforge 27 | Gubenko_N_abooks_voxforge 28 | Mushatin_Igor_abooks_voxforge 29 | Ivanova_M_abooks_voxforge 30 | Verovoi_Denis_abooks_voxforge 31 | Zamorev_Sergei_abooks_voxforge 32 | user9_mozilla 33 | Murasko_Igor_abooks_voxforge 34 | Nevinniy_Vyacheslav_abooks_voxforge 35 | user13_mozilla 36 | Koksharov_Aleksadr_abooks_voxforge 37 | user30_mozilla 38 | Ranevskaya_F_abooks_voxforge 39 | Basov_Ivan_abooks_voxforge 40 | Telegina_T_abooks_voxforge 41 | Kulagin_L_abooks_voxforge 42 | nsh_abooks_voxforge 43 | Klyukvin_A_abooks_voxforge 44 | Platonov_Maksim_abooks_voxforge 45 | Rajkin_Arkadij_abooks_voxforge 46 | Brockaja_Leontina_abooks_voxforge 47 | Ziganshina_Era_abooks_voxforge 48 | Zadvornih_Vyacheslav_abooks_voxforge 49 | Kurilov_Andrey_abooks_voxforge 50 | Burdelov_O_abooks_voxforge 51 | Kulyutnikov_abooks_voxforge 52 | user25_mozilla 53 | Tolubeev_V_abooks_voxforge 54 | Papanov_Anatoliy_abooks_voxforge 55 | Sergey_Shakurov_abooks_voxforge 56 | Gorbunov_S_abooks_voxforge 57 | Vjalikova_O_abooks_voxforge 58 | Samoylov_Oleg_abooks_voxforge 59 | Rjabcev_E_abooks_voxforge 60 | user10_mozilla 61 | Dubina_A_abooks_voxforge 62 | Cherhjak_M_abooks_voxforge 63 | Borisov_O_abooks_voxforge 64 | tray_shaman 65 | Levina_L_abooks_voxforge 66 | Korneva_Natalja_abooks_voxforge 67 | Solomin_Vitaliy_abooks_voxforge 68 | Osobik_Vladimir_abooks_voxforge 69 | yo_shaman 70 | Shishkin_O_abooks_voxforge 71 | Aroseva_O_abooks_voxforge 72 | Kornizkaja_Evgenija_abooks_voxforge 73 | Ternovskii_E_abooks_voxforge 74 | Bronevoy_L_abooks_voxforge 75 | Maretskaja_Vera_abooks_voxforge 76 | Golovataja_Lidija_abooks_voxforge 77 | Lanovoy_Vasiliy_abooks_voxforge 78 | user29_mozilla 79 | Sevjakov_V_abooks_voxforge 80 | user2_mozilla 81 | user15_mozilla 82 | Valijev_German_abooks_voxforge 83 | user23_mozilla 84 | Zaborovskii_J_abooks_voxforge 85 | Boris_Plotnikov_abooks_voxforge 86 | Skljar_Al_abooks_voxforge 87 | Kazakov_Alexei_abooks_voxforge 88 | Kolpakov_Artem_abooks_voxforge 89 | Isakov_Nikolai_abooks_voxforge 90 | Rossoshanskij_Aleksei_abooks_voxforge 91 | Smoktunovskiy_Innokentiy_abooks_voxforge 92 | Batalov_Alexey_abooks_voxforge 93 | Samoilov_V_abooks_voxforge 94 | Rovinskij_Vladimir_abooks_voxforge 95 | Pinsker_M_abooks_voxforge 96 | Bobylev_Ilia_abooks_voxforge 97 | Golub_Oleg_abooks_voxforge 98 | Zareckii_A_abooks_voxforge 99 | Kukushkin_A_abooks_voxforge 100 | Balakirev_A_abooks_voxforge 101 | Petrov_Victor_abooks_voxforge 102 | svu_abooks_voxforge 103 | Rudnichenko_V_abooks_voxforge 104 | Starchikov_S_abooks_voxforge 105 | user21_mozilla 106 | Lazarev_Al_abooks_voxforge 107 | Borzunov_A_abooks_voxforge 108 | Lebedeva_V_abooks_voxforge 109 | Vitorgan_E_abooks_voxforge 110 | Prudovskiy_Ilja_abooks_voxforge 111 | user19_mozilla 112 | Pozdnjakov_M_abooks_voxforge 113 | user18_mozilla 114 | Yankovsky_Oleg_abooks_voxforge 115 | user16_mozilla 116 | user28_mozilla 117 | Alexandr_Slobodskoy_abooks_voxforge 118 | Andrienko_A_abooks_voxforge 119 | evg_abooks_voxforge 120 | Martynyuk_Yu_abooks_voxforge 121 | user14_mozilla 122 | Hazov_Evgeniy_abooks_voxforge 123 | 
Mironov_Evgeniy_abooks_voxforge 124 | user3_mozilla 125 | Kiseljev_R_abooks_voxforge 126 | Malishevskiy_Evgeniy_abooks_voxforge 127 | ruslan_ruslan 128 | Samoedov_E_abooks_voxforge 129 | Baljan_Georgiy_abooks_voxforge 130 | esh_abooks_voxforge 131 | sve_abooks_voxforge 132 | Petrov_K_abooks_voxforge 133 | Putin_abooks_voxforge 134 | amed_shaman 135 | Kazarinova_Elena_abooks_voxforge 136 | Sitnik_Stanislav_abooks_voxforge 137 | mgn_abooks_voxforge 138 | Kuzmina_S_abooks_voxforge 139 | Kozii_N_abooks_voxforge 140 | Burlak_Vadim_abooks_voxforge 141 | user27_mozilla 142 | urp_abooks_voxforge 143 | Prohoda_Andrey_abooks_voxforge 144 | Mironov_A_abooks_voxforge 145 | Tabakov_Oleg_abooks_voxforge 146 | Sazykin_Ilja_abooks_voxforge 147 | Gabidulin_Ruslan_abooks_voxforge 148 | Gorelik_Tamara_abooks_voxforge 149 | Kolygo_Dmitrii_abooks_voxforge 150 | Staburov_Roman_abooks_voxforge 151 | Smehov_Veniamin_abooks_voxforge 152 | sun_abooks_voxforge 153 | Jurskii_S_abooks_voxforge 154 | Antonik_abooks_voxforge 155 | Plyatt_R_abooks_voxforge 156 | Perov_Danila_abooks_voxforge 157 | ski_abooks_voxforge 158 | Muravjeva_I_abooks_voxforge 159 | Gerd_Z_abooks_voxforge 160 | Gusev_A_abooks_voxforge 161 | Uryupin_Dmitii_abooks_voxforge 162 | Raschkin_Jrij_abooks_voxforge 163 | Andriyanov_AL_abooks_voxforge 164 | Ilinsky_Ig_abooks_voxforge 165 | Podlesny_Mark_abooks_voxforge 166 | Ktorov_Anatoliy_abooks_voxforge 167 | Kuznetsova_Valentina_abooks_voxforge 168 | Litvinova_N_abooks_voxforge 169 | Ulyanov_M_abooks_voxforge 170 | Zuravljev_Dmitriy_abooks_voxforge 171 | len_abooks_voxforge 172 | Bykov_Alexandr_abooks_voxforge 173 | Golubkina_Marija_abooks_voxforge 174 | Gaft_Valentin_abooks_voxforge 175 | Utochkina_O_abooks_voxforge 176 | Kindinov_Evgeniy_abooks_voxforge 177 | user24_mozilla 178 | Jakovlev_abooks_voxforge 179 | Lapkin_Ignatii_abooks_voxforge 180 | Smarzevskaja_Tatjana_abooks_voxforge 181 | Borisov_Grigorii_abooks_voxforge 182 | Koretskij_Vladimir_abooks_voxforge 183 | Semenova_Ekaterina_abooks_voxforge 184 | -------------------------------------------------------------------------------- /dataset_review/top20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/top20.png -------------------------------------------------------------------------------- /examples/Airapetova_Darja_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Airapetova_Darja_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Arhipova_Natalja_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Arhipova_Natalja_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Bolshakova_Ksenija_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Bolshakova_Ksenija_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Chebaturkina_Elena_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Chebaturkina_Elena_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/DrLutz_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/DrLutz_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Efremov_Oleg_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Efremov_Oleg_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Goblin_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Goblin_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Goblin_dance.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Goblin_dance.wav -------------------------------------------------------------------------------- /examples/Grigorjev_Yurii_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Grigorjev_Yurii_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kaljagin_A_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kaljagin_A_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Karpov_N_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Karpov_N_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Konjahin_V_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Konjahin_V_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kononov_Mikhail_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kononov_Mikhail_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kotov_Alexandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kotov_Alexandr_abooks_voxforge.wav 
-------------------------------------------------------------------------------- /examples/Kovaleva_Anna_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kovaleva_Anna_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kuznetsov_Alexei_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kuznetsov_Alexei_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kuznetsov_Vsevolod_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kuznetsov_Vsevolod_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kvasha_Igor_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kvasha_Igor_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Larionov_Vsevolod_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Larionov_Vsevolod_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Larionova-Ludm_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Larionova-Ludm_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Litvinov_I_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Litvinov_I_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Markin_Petr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Markin_Petr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Martjanov_O_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Martjanov_O_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Medvedeva_Galcova_Olga_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Medvedeva_Galcova_Olga_abooks_voxforge.wav -------------------------------------------------------------------------------- 
/examples/Muhametzyanov_Radik_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Muhametzyanov_Radik_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Pokrovsky_Boris_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Pokrovsky_Boris_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Popova_Alevtina_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Popova_Alevtina_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Rezalin_Aleksandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Rezalin_Aleksandr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Rosljakov_Mixail_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Rosljakov_Mixail_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Schirvind_A_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Schirvind_A_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Stukalov_Vladimir_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Stukalov_Vladimir_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Suetin_Pavel_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Suetin_Pavel_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Sushkov_Vladimir_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Sushkov_Vladimir_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Sytnik_I_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Sytnik_I_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Taratorkin_Georgiy_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Taratorkin_Georgiy_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Tarinicheva_Tatjana_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Tarinicheva_Tatjana_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Terenkov_Alexandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Terenkov_Alexandr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Trifilov_Nikolai_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Trifilov_Nikolai_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vasiljev_Y_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vasiljev_Y_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vesnik_E_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vesnik_E_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vihrov_V_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vihrov_V_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vorobjeva_Irina_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vorobjeva_Irina_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Zhirnov_Sergey_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Zhirnov_Sergey_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Zozulin_Viktor_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Zozulin_Viktor_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/goblin_opentts.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/goblin_opentts.wav -------------------------------------------------------------------------------- /examples/hajdurova_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/hajdurova_ailab.wav -------------------------------------------------------------------------------- /examples/ira_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/ira_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/joh_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/joh_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/june_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/june_shaman.wav -------------------------------------------------------------------------------- /examples/len_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/len_shaman.wav -------------------------------------------------------------------------------- /examples/mar_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/mar_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/minaev_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/minaev_ailab.wav -------------------------------------------------------------------------------- /examples/morti_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/morti_shaman.wav -------------------------------------------------------------------------------- /examples/nikolaev_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/nikolaev_ailab.wav -------------------------------------------------------------------------------- /examples/noname_opentts.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/noname_opentts.wav -------------------------------------------------------------------------------- /examples/russian_single.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/russian_single.wav 
-------------------------------------------------------------------------------- /examples/user11_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user11_mozilla.wav -------------------------------------------------------------------------------- /examples/user12_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user12_mozilla.wav -------------------------------------------------------------------------------- /examples/user17_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user17_mozilla.wav -------------------------------------------------------------------------------- /examples/user1_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user1_mozilla.wav -------------------------------------------------------------------------------- /examples/user20_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user20_mozilla.wav -------------------------------------------------------------------------------- /examples/user26_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user26_mozilla.wav -------------------------------------------------------------------------------- /examples/user4_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user4_mozilla.wav -------------------------------------------------------------------------------- /examples/user5_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user5_mozilla.wav -------------------------------------------------------------------------------- /examples/user6_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user6_mozilla.wav -------------------------------------------------------------------------------- /examples/user7_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user7_mozilla.wav -------------------------------------------------------------------------------- /examples/user8_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user8_mozilla.wav -------------------------------------------------------------------------------- /examples/vsh_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/vsh_abooks_voxforge.wav -------------------------------------------------------------------------------- /fs_two/README.md: -------------------------------------------------------------------------------- 1 | # FastSpeech 2 - PyTorch Implementation 2 | 3 | This is a PyTorch implementation of Microsoft's text-to-speech system [**FastSpeech 2: Fast and High-Quality End-to-End Text to Speech**](https://arxiv.org/abs/2006.04558v1). 4 | This project is based on [xcmyz's implementation](https://github.com/xcmyz/FastSpeech) of FastSpeech. Feel free to use/modify the code. 5 | 6 | There are several versions of FastSpeech 2. 7 | This implementation is more similar to [version 1](https://arxiv.org/abs/2006.04558v1), which uses F0 values as the pitch features. 8 | On the other hand, pitch spectrograms extracted by continuous wavelet transform are used as the pitch features in the [later versions](https://arxiv.org/abs/2006.04558). 9 | 10 | ![](./img/model.png) 11 | 12 | # Updates 13 | - 2021/2/26: Support English and Mandarin TTS 14 | - 2021/2/26: Support multi-speaker TTS (AISHELL-3 and LibriTTS) 15 | - 2021/2/26: Support MelGAN and HiFi-GAN vocoder 16 | 17 | # Audio Samples 18 | Audio samples generated by this implementation can be found [here](https://ming024.github.io/FastSpeech2/). 19 | 20 | # Quickstart 21 | 22 | ## Dependencies 23 | You can install the Python dependencies with 24 | ``` 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | ## Inference 29 | 30 | You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/`` or ``output/ckpt/AISHELL3``. 31 | 32 | For English single-speaker TTS, run 33 | ``` 34 | python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 35 | ``` 36 | 37 | For Mandarin multi-speaker TTS, try 38 | ``` 39 | python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 40 | ``` 41 | 42 | The generated utterances will be put in ``output/result/``. 43 | 44 | Here is an example of synthesized mel-spectrogram of the sentence "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition", with the English single-speaker TTS model. 45 | ![](./img/synthesized_melspectrogram.png) 46 | 47 | ## Batch Inference 48 | Batch inference is also supported, try 49 | 50 | ``` 51 | python3 synthesize.py --source preprocessed_data/LJSpeech/val.txt --restore_step 900000 --mode batch -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 52 | ``` 53 | to synthesize all utterances in ``preprocessed_data/LJSpeech/val.txt`` 54 | 55 | ## Controllability 56 | The pitch/volume/speaking rate of the synthesized utterances can be controlled by specifying the desired pitch/energy/duration ratios. 
57 | For example, one can increase the speaking rate by 20 % and decrease the volume by 20 % by 58 | 59 | ``` 60 | python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml --duration_control 0.8 --energy_control 0.8 61 | ``` 62 | 63 | # Training 64 | 65 | ## Datasets 66 | 67 | The supported datasets are 68 | 69 | - [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): a single-speaker English dataset consists of 13100 short audio clips of a female speaker reading passages from 7 non-fiction books, approximately 24 hours in total. 70 | - [AISHELL-3](http://www.aishelltech.com/aishell_3): a Mandarin TTS dataset with 218 male and female speakers, roughly 85 hours in total. 71 | - [LibriTTS](https://research.google/tools/datasets/libri-tts/): a multi-speaker English dataset containing 585 hours of speech by 2456 speakers. 72 | 73 | We take LJSpeech as an example hereafter. 74 | 75 | ## Preprocessing 76 | 77 | First, run 78 | ``` 79 | python3 prepare_align.py config/LJSpeech/preprocess.yaml 80 | ``` 81 | for some preparations. 82 | 83 | As described in the paper, [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) (MFA) is used to obtain the alignments between the utterances and the phoneme sequences. 84 | Alignments for the LJSpeech and AISHELL-3 datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing). 85 | You have to unzip the files in ``preprocessed_data/LJSpeech/TextGrid/``. 86 | 87 | After that, run the preprocessing script by 88 | ``` 89 | python3 preprocess.py config/LJSpeech/preprocess.yaml 90 | ``` 91 | 92 | Alternately, you can align the corpus by yourself. 93 | Download the official MFA package and run 94 | ``` 95 | ./montreal-forced-aligner/bin/mfa_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt english preprocessed_data/LJSpeech 96 | ``` 97 | or 98 | ``` 99 | ./montreal-forced-aligner/bin/mfa_train_and_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt preprocessed_data/LJSpeech 100 | ``` 101 | 102 | to align the corpus and then run the preprocessing script. 103 | ``` 104 | python3 preprocess.py config/LJSpeech/preprocess.yaml 105 | ``` 106 | 107 | ## Training 108 | 109 | Train your model with 110 | ``` 111 | python3 train.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 112 | ``` 113 | 114 | The model takes less than 10k steps (less than 1 hour on my GTX1080Ti GPU) of training to generate audio samples with acceptable quality, which is much more efficient than the autoregressive models such as Tacotron2. 115 | 116 | # TensorBoard 117 | 118 | Use 119 | ``` 120 | tensorboard --logdir output/log/LJSpeech 121 | ``` 122 | 123 | to serve TensorBoard on your localhost. 124 | The loss curves, synthesized mel-spectrograms, and audios are shown. 125 | 126 | ![](./img/tensorboard_loss.png) 127 | ![](./img/tensorboard_spec.png) 128 | ![](./img/tensorboard_audio.png) 129 | 130 | # Implementation Issues 131 | 132 | - Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Postnet after the decoder, which is not used in the original paper. 133 | - Gradient clipping is used in the training. 
134 | - In my experience, using phoneme-level pitch and energy prediction instead of frame-level prediction results in much better prosody, and normalizing the pitch and energy features also helps. Please refer to ``config/README.md`` for more details. 135 | 136 | Please inform me if you find any mistakes in this repo, or any useful tips to train the FastSpeech 2 model. 137 | 138 | # References 139 | - [FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558), Y. Ren, *et al*. 140 | - [xcmyz's FastSpeech implementation](https://github.com/xcmyz/FastSpeech) 141 | - [TensorSpeech's FastSpeech 2 implementation](https://github.com/TensorSpeech/TensorflowTTS) 142 | - [rishikksh20's FastSpeech 2 implementation](https://github.com/rishikksh20/FastSpeech2) 143 | -------------------------------------------------------------------------------- /fs_two/audio/__init__.py: -------------------------------------------------------------------------------- 1 | import fs_two.audio.tools 2 | import fs_two.audio.stft 3 | import fs_two.audio.audio_processing 4 | -------------------------------------------------------------------------------- /fs_two/audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import librosa.util as librosa_util 4 | from scipy.signal import get_window 5 | 6 | 7 | def window_sumsquare( 8 | window, 9 | n_frames, 10 | hop_length, 11 | win_length, 12 | n_fft, 13 | dtype=np.float32, 14 | norm=None, 15 | ): 16 | """ 17 | # from librosa 0.6 18 | Compute the sum-square envelope of a window function at a given hop length. 19 | 20 | This is used to estimate modulation effects induced by windowing 21 | observations in short-time fourier transforms. 22 | 23 | Parameters 24 | ---------- 25 | window : string, tuple, number, callable, or list-like 26 | Window specification, as in `get_window` 27 | 28 | n_frames : int > 0 29 | The number of analysis frames 30 | 31 | hop_length : int > 0 32 | The number of samples to advance between frames 33 | 34 | win_length : [optional] 35 | The length of the window function. By default, this matches `n_fft`. 36 | 37 | n_fft : int > 0 38 | The length of each analysis frame. 
39 | 40 | dtype : np.dtype 41 | The data type of the output 42 | 43 | Returns 44 | ------- 45 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 46 | The sum-squared envelope of the window function 47 | """ 48 | if win_length is None: 49 | win_length = n_fft 50 | 51 | n = n_fft + hop_length * (n_frames - 1) 52 | x = np.zeros(n, dtype=dtype) 53 | 54 | # Compute the squared window at the desired length 55 | win_sq = get_window(window, win_length, fftbins=True) 56 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 57 | win_sq = librosa_util.pad_center(win_sq, n_fft) 58 | 59 | # Fill the envelope 60 | for i in range(n_frames): 61 | sample = i * hop_length 62 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 63 | return x 64 | 65 | 66 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 67 | """ 68 | PARAMS 69 | ------ 70 | magnitudes: spectrogram magnitudes 71 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 72 | """ 73 | 74 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 75 | angles = angles.astype(np.float32) 76 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | 79 | for i in range(n_iters): 80 | _, angles = stft_fn.transform(signal) 81 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 82 | return signal 83 | 84 | 85 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 86 | """ 87 | PARAMS 88 | ------ 89 | C: compression factor 90 | """ 91 | return torch.log(torch.clamp(x, min=clip_val) * C) 92 | 93 | 94 | def dynamic_range_decompression(x, C=1): 95 | """ 96 | PARAMS 97 | ------ 98 | C: compression factor used to compress 99 | """ 100 | return torch.exp(x) / C 101 | -------------------------------------------------------------------------------- /fs_two/audio/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy.signal import get_window 5 | from librosa.util import pad_center, tiny 6 | from librosa.filters import mel as librosa_mel_fn 7 | 8 | from fs_two.audio.audio_processing import ( 9 | dynamic_range_compression, 10 | dynamic_range_decompression, 11 | window_sumsquare, 12 | ) 13 | 14 | DEVICE = 3 15 | 16 | 17 | class STFT(torch.nn.Module): 18 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 19 | 20 | def __init__(self, filter_length, hop_length, win_length, window="hann"): 21 | super(STFT, self).__init__() 22 | self.filter_length = filter_length 23 | self.hop_length = hop_length 24 | self.win_length = win_length 25 | self.window = window 26 | self.forward_transform = None 27 | scale = self.filter_length / self.hop_length 28 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 29 | 30 | cutoff = int((self.filter_length / 2 + 1)) 31 | fourier_basis = np.vstack( 32 | [ 33 | np.real(fourier_basis[:cutoff, :]), 34 | np.imag(fourier_basis[:cutoff, :]), 35 | ] 36 | ) 37 | 38 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 39 | inverse_basis = torch.FloatTensor( 40 | np.linalg.pinv(scale * fourier_basis).T[:, None, :] 41 | ) 42 | 43 | if window is not None: 44 | assert filter_length >= win_length 45 | # get window and zero center pad it to filter_length 46 | fft_window = get_window(window, win_length, fftbins=True) 47 | fft_window = pad_center(fft_window, filter_length) 48 | fft_window = torch.from_numpy(fft_window).float() 49 | 50 | # 
window the bases 51 | forward_basis *= fft_window 52 | inverse_basis *= fft_window 53 | 54 | self.register_buffer("forward_basis", forward_basis.float()) 55 | self.register_buffer("inverse_basis", inverse_basis.float()) 56 | 57 | def transform(self, input_data): 58 | num_batches = input_data.size(0) 59 | num_samples = input_data.size(1) 60 | 61 | self.num_samples = num_samples 62 | 63 | # similar to librosa, reflect-pad the input 64 | input_data = input_data.view(num_batches, 1, num_samples) 65 | input_data = F.pad( 66 | input_data.unsqueeze(1), 67 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 68 | mode="reflect", 69 | ) 70 | input_data = input_data.squeeze(1) 71 | 72 | forward_transform = F.conv1d( 73 | input_data.cuda(DEVICE), 74 | torch.autograd.Variable( 75 | self.forward_basis, requires_grad=False 76 | ).cuda(DEVICE), 77 | stride=self.hop_length, 78 | padding=0, 79 | ).cpu() 80 | 81 | cutoff = int((self.filter_length / 2) + 1) 82 | real_part = forward_transform[:, :cutoff, :] 83 | imag_part = forward_transform[:, cutoff:, :] 84 | 85 | magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) 86 | phase = torch.autograd.Variable( 87 | torch.atan2(imag_part.data, real_part.data) 88 | ) 89 | 90 | return magnitude, phase 91 | 92 | def inverse(self, magnitude, phase): 93 | recombine_magnitude_phase = torch.cat( 94 | [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 95 | ) 96 | 97 | inverse_transform = F.conv_transpose1d( 98 | recombine_magnitude_phase, 99 | torch.autograd.Variable(self.inverse_basis, requires_grad=False), 100 | stride=self.hop_length, 101 | padding=0, 102 | ) 103 | 104 | if self.window is not None: 105 | window_sum = window_sumsquare( 106 | self.window, 107 | magnitude.size(-1), 108 | hop_length=self.hop_length, 109 | win_length=self.win_length, 110 | n_fft=self.filter_length, 111 | dtype=np.float32, 112 | ) 113 | # remove modulation effects 114 | approx_nonzero_indices = torch.from_numpy( 115 | np.where(window_sum > tiny(window_sum))[0] 116 | ) 117 | window_sum = torch.autograd.Variable( 118 | torch.from_numpy(window_sum), requires_grad=False 119 | ) 120 | window_sum = ( 121 | window_sum.cuda(DEVICE) if magnitude.is_cuda else window_sum 122 | ) 123 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ 124 | approx_nonzero_indices 125 | ] 126 | 127 | # scale by hop ratio 128 | inverse_transform *= float(self.filter_length) / self.hop_length 129 | 130 | inverse_transform = inverse_transform[ 131 | :, :, int(self.filter_length / 2) : 132 | ] 133 | inverse_transform = inverse_transform[ 134 | :, :, : -int(self.filter_length / 2) : 135 | ] 136 | 137 | return inverse_transform 138 | 139 | def forward(self, input_data): 140 | self.magnitude, self.phase = self.transform(input_data) 141 | reconstruction = self.inverse(self.magnitude, self.phase) 142 | return reconstruction 143 | 144 | 145 | class TacotronSTFT(torch.nn.Module): 146 | def __init__( 147 | self, 148 | filter_length, 149 | hop_length, 150 | win_length, 151 | n_mel_channels, 152 | sampling_rate, 153 | mel_fmin, 154 | mel_fmax, 155 | ): 156 | super(TacotronSTFT, self).__init__() 157 | self.n_mel_channels = n_mel_channels 158 | self.sampling_rate = sampling_rate 159 | self.stft_fn = STFT(filter_length, hop_length, win_length) 160 | mel_basis = librosa_mel_fn( 161 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax 162 | ) 163 | mel_basis = torch.from_numpy(mel_basis).float() 164 | self.register_buffer("mel_basis", mel_basis) 165 | 166 | def 
spectral_normalize(self, magnitudes): 167 | output = dynamic_range_compression(magnitudes) 168 | return output 169 | 170 | def spectral_de_normalize(self, magnitudes): 171 | output = dynamic_range_decompression(magnitudes) 172 | return output 173 | 174 | def mel_spectrogram(self, y): 175 | """Computes mel-spectrograms from a batch of waves 176 | PARAMS 177 | ------ 178 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 179 | 180 | RETURNS 181 | ------- 182 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 183 | """ 184 | assert torch.min(y.data) >= -1 185 | assert torch.max(y.data) <= 1 186 | 187 | magnitudes, phases = self.stft_fn.transform(y) 188 | magnitudes = magnitudes.data 189 | mel_output = torch.matmul(self.mel_basis, magnitudes) 190 | mel_output = self.spectral_normalize(mel_output) 191 | energy = torch.norm(magnitudes, dim=1) 192 | 193 | return mel_output, energy 194 | -------------------------------------------------------------------------------- /fs_two/audio/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile import write 4 | 5 | from fs_two.audio.audio_processing import griffin_lim 6 | 7 | 8 | def get_mel_from_wav(audio, _stft): 9 | audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1) 10 | audio = torch.autograd.Variable(audio, requires_grad=False) 11 | melspec, energy = _stft.mel_spectrogram(audio) 12 | melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32) 13 | energy = torch.squeeze(energy, 0).numpy().astype(np.float32) 14 | 15 | return melspec, energy 16 | 17 | 18 | def inv_mel_spec(mel, out_filename, _stft, griffin_iters=60): 19 | mel = torch.stack([mel]) 20 | mel_decompress = _stft.spectral_de_normalize(mel) 21 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 22 | spec_from_mel_scaling = 1000 23 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 24 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 25 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 26 | 27 | audio = griffin_lim( 28 | torch.autograd.Variable(spec_from_mel[:, :, :-1]), _stft._stft_fn, griffin_iters 29 | ) 30 | 31 | audio = audio.squeeze() 32 | audio = audio.cpu().numpy() 33 | audio_path = out_filename 34 | write(audio_path, _stft.sampling_rate, audio) 35 | -------------------------------------------------------------------------------- /fs_two/cwt/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2017 Tom Runia 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to conditions. 
11 | # 12 | # Author: Tom Runia 13 | # Date Created: 2018-04-16 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | -------------------------------------------------------------------------------- /fs_two/cwt/cwt_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pycwt as wavelet 4 | from sklearn import preprocessing 5 | 6 | 7 | def mse(a, b): 8 | return ((a - b) ** 2).mean() 9 | 10 | 11 | # PREPROCESSING 12 | 13 | 14 | def transform_cwt(lf0, J=10): 15 | mother = wavelet.MexicanHat() 16 | dt = 0.005 17 | dj = 1 18 | s0 = dt * 2 19 | # Returns J + 1 scales 20 | Wavelet_lf0, scales, freqs, coi, fft, fftfreqs = wavelet.cwt( 21 | np.squeeze(lf0), dt, dj, s0, J, mother 22 | ) 23 | Wavelet_lf0 = np.real(Wavelet_lf0).T 24 | return Wavelet_lf0 25 | 26 | 27 | def inverse_cwt(wavelet_coefs, num_scales=10): 28 | lf0_rec = np.zeros([wavelet_coefs.shape[0], num_scales]) 29 | for i in range(0, num_scales): 30 | lf0_rec[:, i] = wavelet_coefs[:, i] * ((i + 1 + 2.5) ** (-2.5)) 31 | lf0_rec_sum = np.sum(lf0_rec, axis=1) 32 | lf0_rec_sum = preprocessing.scale(lf0_rec_sum) 33 | return lf0_rec_sum 34 | 35 | 36 | # TO REVERSE ADD 37 | # reverse = inverse_batch_cwt(wavelet_coefs, scales=10)*std + mean 38 | 39 | 40 | class TorchStandardScaler: 41 | def fit(self, x): 42 | self.mean = x.mean(0, keepdim=True) 43 | self.std = x.std(0, unbiased=False, keepdim=True) 44 | 45 | def transform(self, x): 46 | x -= self.mean 47 | x /= self.std + 1e-12 48 | return x 49 | 50 | 51 | scaler_tc = TorchStandardScaler() 52 | 53 | 54 | def inverse_batch_cwt(wavelet_coefs, num_scales=10): 55 | batch_size = wavelet_coefs.shape[0] 56 | length = wavelet_coefs.shape[1] 57 | lf0_rec = torch.zeros([batch_size, length, num_scales], dtype=torch.float32).to(wavelet_coefs.device) 58 | for i in range(0, num_scales): 59 | lf0_rec[:, :, i] = wavelet_coefs[:, :, i] * ((i + 1 + 2.5) ** (-2.5)) 60 | lf0_rec_sum = torch.sum(lf0_rec, axis=-1) 61 | # lf0_rec_sum = scaler(lf0_rec_sum) 62 | scaler_tc.fit(lf0_rec_sum) 63 | lf0_rec_sum = scaler_tc.transform(lf0_rec_sum) 64 | 65 | torch.nan_to_num(lf0_rec_sum, nan=0.0) 66 | return lf0_rec_sum 67 | -------------------------------------------------------------------------------- /fs_two/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | 5 | import random 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | from fs_two.text import text_to_sequence 10 | from fs_two.utils.tools import pad_1D, pad_2D 11 | from fs_two.text.symbols import _mask, _silences 12 | 13 | 14 | def random_mask(text, _silences, max_masks_per_sentence, _mask): 15 | # randonly mask some sentences 16 | # we do not want to mask short sentences 17 | 18 | text = text.split(" ") 19 | max_len = len(text) 20 | masks_count = int( 21 | max_masks_per_sentence * max_len 22 | ) # max_masks_per_sentence = 0.15 23 | if masks_count == 0: 24 | return text 25 | mask_indexes = random.choices(list(range(max_len)), k=masks_count) 26 | for ind in mask_indexes: 27 | if not text[ind] in _silences: 28 | text[ind] = _mask 29 | return " ".join(text) 30 | 31 | 32 | class Dataset(Dataset): 33 | def __init__( 34 | self, 35 | filename, 36 | preprocess_config, 37 | train_config, 38 | sort=False, 39 | drop_last=True, 40 | ): 41 | self._silences = [s.replace("@", "") for s in _silences] 42 | 
self.max_masks_per_sentence = train_config.max_masks_per_sentence 43 | self.dataset_name = preprocess_config["dataset"] 44 | self.preprocessed_path = preprocess_config["path"]["preprocessed_path"] 45 | self.cleaners = preprocess_config["preprocessing"]["text"][ 46 | "text_cleaners" 47 | ] 48 | self.batch_size = train_config["optimizer"]["batch_size"] 49 | 50 | ( 51 | self.basename, 52 | self.speaker, 53 | self.text, 54 | self.raw_text, 55 | ) = self.process_meta(filename) 56 | with open(os.path.join(self.preprocessed_path, "speakers.json")) as f: 57 | self.speaker_map = json.load(f) 58 | self.sort = sort 59 | self.drop_last = drop_last 60 | 61 | def __len__(self): 62 | return len(self.text) 63 | 64 | def __getitem__(self, idx): 65 | basename = self.basename[idx] 66 | speaker = self.speaker[idx] 67 | speaker_id = self.speaker_map[speaker] 68 | raw_text = self.raw_text[idx] 69 | phone = np.array(text_to_sequence(self.text[idx], self.cleaners)) 70 | mel_path = os.path.join( 71 | self.preprocessed_path, 72 | "mel", 73 | "{}-mel-{}.npy".format(speaker, basename), 74 | ) 75 | mel = np.load(mel_path) 76 | 77 | energy_path = os.path.join( 78 | self.preprocessed_path, 79 | "energy", 80 | "{}-energy-{}.npy".format(speaker, basename), 81 | ) 82 | energy = np.load(energy_path) 83 | duration_path = os.path.join( 84 | self.preprocessed_path, 85 | "duration", 86 | "{}-duration-{}.npy".format(speaker, basename), 87 | ) 88 | duration = np.load(duration_path) 89 | 90 | pitch_cwt_path = os.path.join( 91 | self.preprocessed_path, 92 | "pitch", 93 | "{}-cwt-pitch-{}.npy".format(speaker, basename), 94 | ) 95 | 96 | pitch_path = os.path.join( 97 | self.preprocessed_path, 98 | "pitch", 99 | "{}-pitch-{}.npy".format(speaker, basename), 100 | ) 101 | 102 | pitch_raw = np.load(pitch_path) 103 | pitch_cwt = np.load(pitch_cwt_path) 104 | 105 | pitch_mean_path = os.path.join( 106 | self.preprocessed_path, 107 | "pitch", 108 | "{}-pitch-mean-{}.npy".format(speaker, basename), 109 | ) 110 | pitch_mean = np.load(pitch_mean_path) 111 | 112 | pitch_std_path = os.path.join( 113 | self.preprocessed_path, 114 | "pitch", 115 | "{}-pitch-std-{}.npy".format(speaker, basename), 116 | ) 117 | pitch_std = np.load(pitch_std_path) 118 | 119 | sample = { 120 | "id": basename, 121 | "speaker": speaker_id, 122 | "text": phone, 123 | "raw_text": raw_text, 124 | "mel": mel, 125 | "energy": energy, 126 | "duration": duration, 127 | "pitch_raw": pitch_raw, 128 | "pitch_mean": pitch_mean, 129 | "pitch_std": pitch_std, 130 | "pitch_cwt": pitch_cwt, 131 | } 132 | 133 | return sample 134 | 135 | def process_meta(self, filename): 136 | with open( 137 | os.path.join(self.preprocessed_path, filename), 138 | "r", 139 | encoding="utf-8", 140 | ) as f: 141 | name = [] 142 | speaker = [] 143 | text = [] 144 | raw_text = [] 145 | for line in f.readlines(): 146 | n, s, t, r = line.strip("\n").split("|") 147 | name.append(n) 148 | speaker.append(s) 149 | if self.max_masks_per_sentence > 1: 150 | t = random_mask( 151 | t, self._silences, self.max_masks_per_sentence, _mask 152 | ) 153 | text.append(t) 154 | raw_text.append(r) 155 | 156 | return name, speaker, text, raw_text 157 | 158 | def reprocess(self, data, idxs): 159 | ids = [data[idx]["id"] for idx in idxs] 160 | speakers = [data[idx]["speaker"] for idx in idxs] 161 | texts = [data[idx]["text"] for idx in idxs] 162 | raw_texts = [data[idx]["raw_text"] for idx in idxs] 163 | mels = [data[idx]["mel"] for idx in idxs] 164 | 165 | pitches_mean = [data[idx]["pitch_mean"] for idx in idxs] 166 | pitches_std 
= [data[idx]["pitch_std"] for idx in idxs] 167 | pitches_cwt = [data[idx]["pitch_cwt"] for idx in idxs] 168 | pitches_raw = [data[idx]["pitch_raw"] for idx in idxs] 169 | 170 | energies = [data[idx]["energy"] for idx in idxs] 171 | durations = [data[idx]["duration"] for idx in idxs] 172 | 173 | text_lens = np.array([text.shape[0] for text in texts]) 174 | mel_lens = np.array([mel.shape[0] for mel in mels]) 175 | 176 | speakers = np.array(speakers) 177 | pitches_mean = np.array(pitches_mean) 178 | pitches_std = np.array(pitches_std) 179 | 180 | texts = pad_1D(texts) 181 | mels = pad_2D(mels) 182 | energies = pad_1D(energies) 183 | pitches_raw = pad_1D(pitches_raw) 184 | durations = pad_1D(durations) 185 | 186 | pitches_cwt = pad_2D(pitches_cwt) 187 | 188 | return ( 189 | ids, 190 | raw_texts, 191 | speakers, 192 | texts, 193 | text_lens, 194 | max(text_lens), 195 | mels, 196 | mel_lens, 197 | max(mel_lens), 198 | energies, 199 | durations, 200 | pitches_raw, 201 | pitches_cwt, 202 | pitches_mean, 203 | pitches_std, 204 | ) 205 | 206 | def collate_fn(self, data): 207 | data_size = len(data) 208 | 209 | if self.sort: 210 | len_arr = np.array([d["text"].shape[0] for d in data]) 211 | idx_arr = np.argsort(-len_arr) 212 | else: 213 | idx_arr = np.arange(data_size) 214 | 215 | tail = idx_arr[len(idx_arr) - (len(idx_arr) % self.batch_size) :] 216 | idx_arr = idx_arr[: len(idx_arr) - (len(idx_arr) % self.batch_size)] 217 | idx_arr = idx_arr.reshape((-1, self.batch_size)).tolist() 218 | if not self.drop_last and len(tail) > 0: 219 | idx_arr += [tail.tolist()] 220 | 221 | output = list() 222 | for idx in idx_arr: 223 | output.append(self.reprocess(data, idx)) 224 | 225 | return output 226 | -------------------------------------------------------------------------------- /fs_two/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | import torch 6 | import yaml 7 | import torch.nn as nn 8 | from torch.utils.data import DataLoader 9 | 10 | from fs_two.utils.model import get_model, get_vocoder 11 | from fs_two.utils.tools import to_device, log, synth_one_sample 12 | from fs_two.model import FastSpeech2Loss 13 | from fs_two.dataset import Dataset 14 | 15 | # TODO SET device via config 16 | 17 | 18 | def evaluate( 19 | model, step, cfg, logger=None, train_val="val", vocoder=None, device=0 20 | ): 21 | dataset = Dataset( 22 | "val.txt", 23 | cfg.preprocess_config, 24 | cfg.train_config, 25 | sort=False, 26 | drop_last=False, 27 | ) 28 | batch_size = cfg.train_config["optimizer"]["batch_size"] 29 | loader = DataLoader( 30 | dataset, 31 | batch_size=batch_size, 32 | shuffle=False, 33 | collate_fn=dataset.collate_fn, 34 | ) 35 | 36 | # Get loss function 37 | Loss = FastSpeech2Loss(cfg.preprocess_config, cfg.model_config) 38 | 39 | # Evaluation 40 | loss_sums = [0 for _ in range(6)] 41 | for batchs in loader: 42 | for batch in batchs: 43 | batch = to_device(batch, device) 44 | with torch.no_grad(): 45 | # Forward 46 | output = model(*(batch[2:])) 47 | 48 | # Cal Loss 49 | losses = Loss(batch, output) 50 | 51 | for i in range(1, len(losses)): 52 | loss_sums[i - 1] += losses[i].item() * len(batch[0]) 53 | 54 | loss_means = [loss_sum / len(dataset) for loss_sum in loss_sums] 55 | loss_means = [sum(loss_means)] + loss_means 56 | loss_logs = [step] + loss_means 57 | 58 | message = """Validation Step {}, 59 | Total Loss: {:.4f}, 60 | Mel Loss: {:.4f}, 61 | Pitch Loss: {:.4f}, 62 | Mean pitch {:.4f}, 63 | Std pitch {:.4f}""".format( 
64 | *loss_logs 65 | ) 66 | 67 | if logger is not None: 68 | fig, wav_reconstruction, wav_prediction, tag = synth_one_sample( 69 | batch, 70 | output, 71 | vocoder, 72 | cfg.model_config, 73 | cfg.preprocess_config, 74 | ) 75 | 76 | log(logger, "val", step, losses=loss_means) 77 | log( 78 | logger, 79 | "val", 80 | fig=fig, 81 | tag="Validation/step_{}_{}".format(step, tag), 82 | ) 83 | sampling_rate = cfg.preprocess_config["preprocessing"]["audio"][ 84 | "sampling_rate" 85 | ] 86 | log( 87 | logger, 88 | "val", 89 | audio=wav_reconstruction, 90 | sampling_rate=sampling_rate, 91 | tag="Validation/step_{}_{}_reconstructed".format(step, tag), 92 | ) 93 | log( 94 | logger, 95 | "val", 96 | audio=wav_prediction, 97 | sampling_rate=sampling_rate, 98 | tag="Validation/step_{}_{}_synthesized".format(step, tag), 99 | ) 100 | 101 | return message 102 | 103 | 104 | if __name__ == "__main__": 105 | device = 0 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument("--restore_step", type=int, default=30000) 108 | parser.add_argument( 109 | "-p", 110 | "--preprocess_config", 111 | type=str, 112 | required=True, 113 | help="path to preprocess.yaml", 114 | ) 115 | parser.add_argument( 116 | "-m", 117 | "--model_config", 118 | type=str, 119 | required=True, 120 | help="path to model.yaml", 121 | ) 122 | parser.add_argument( 123 | "-t", 124 | "--train_config", 125 | type=str, 126 | required=True, 127 | help="path to train.yaml", 128 | ) 129 | args = parser.parse_args() 130 | 131 | # Read Config 132 | preprocess_config = yaml.load( 133 | open(args.preprocess_config, "r"), Loader=yaml.FullLoader 134 | ) 135 | model_config = yaml.load( 136 | open(args.model_config, "r"), Loader=yaml.FullLoader 137 | ) 138 | train_config = yaml.load( 139 | open(args.train_config, "r"), Loader=yaml.FullLoader 140 | ) 141 | configs = (preprocess_config, model_config, train_config) 142 | 143 | # Get model 144 | model = get_model(args, configs, device, train=False).to(device) 145 | 146 | message = evaluate(model, args.restore_step, configs) 147 | print(message) 148 | -------------------------------------------------------------------------------- /fs_two/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import FastSpeech2 2 | from .loss import FastSpeech2Loss 3 | from .optimizer import ScheduledOptim 4 | -------------------------------------------------------------------------------- /fs_two/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/fastspeech2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/fastspeech2.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- 
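A minimal, self-contained sketch (not code from this repo; the function name `weighted_loss_means` and the batch sizes and loss values are made up for illustration) of the sample-weighted averaging that `evaluate()` above performs: each component loss is weighted by its batch size, summed over batches, and divided by the dataset length, and the reported total is the sum of the component means.

```
def weighted_loss_means(batch_results, dataset_len, n_components=6):
    """batch_results: iterable of (batch_size, losses), where losses[0] is the total loss."""
    loss_sums = [0.0] * n_components
    for batch_size, losses in batch_results:
        # Skip losses[0]: only the component losses are accumulated.
        for i in range(1, len(losses)):
            loss_sums[i - 1] += float(losses[i]) * batch_size
    loss_means = [s / dataset_len for s in loss_sums]
    # As in evaluate(): the reported "Total Loss" is the sum of the component means.
    return [sum(loss_means)] + loss_means

# Two made-up batches of 4 and 2 samples, each with
# (total, mel, pitch, energy, duration, pitch_mean, pitch_std) losses:
fake = [(4, (3.0, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)),
        (2, (2.4, 0.3, 0.4, 0.5, 0.6, 0.2, 0.4))]
print(weighted_loss_means(fake, dataset_len=6))
# ~ [2.2, 0.5, 0.467, 0.433, 0.4, 0.2, 0.2]
```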
/fs_two/model/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/optimizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/optimizer.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/fastspeech2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from fs_two.transformer import Encoder, Decoder, PostNet 8 | from .modules import VarianceAdaptor 9 | from fs_two.utils.tools import get_mask_from_lengths 10 | 11 | 12 | class FastSpeech2(nn.Module): 13 | """FastSpeech2""" 14 | 15 | def __init__( 16 | self, preprocess_config, model_config, n_speakers=None, device="cpu" 17 | ): 18 | super(FastSpeech2, self).__init__() 19 | self.model_config = model_config 20 | 21 | self.encoder = Encoder(model_config) 22 | self.variance_adaptor = VarianceAdaptor( 23 | preprocess_config, model_config, device 24 | ) 25 | self.decoder = Decoder(model_config) 26 | self.mel_linear = nn.Linear( 27 | model_config["transformer"]["decoder_hidden"], 28 | preprocess_config["preprocessing"]["mel"]["n_mel_channels"], 29 | ) 30 | nn.init.xavier_normal_(self.mel_linear.weight) 31 | self.speaker_emb = None 32 | 33 | if model_config["multi_speaker"]: 34 | if n_speakers is None: 35 | n_speakers = get_speakers_number(preprocess_config) 36 | self.speaker_emb = nn.Embedding( 37 | n_speakers, 38 | model_config["transformer"]["encoder_hidden"], 39 | ) 40 | 41 | self.postnet = PostNet() 42 | 43 | def forward( 44 | self, 45 | speakers, 46 | texts, 47 | src_lens, 48 | max_src_len, 49 | mels=None, 50 | mel_lens=None, 51 | max_mel_len=None, 52 | e_targets=None, 53 | d_targets=None, 54 | pitches_raw=None, 55 | pitches_cwt=None, 56 | pitches_mean=None, 57 | pitches_std=None, 58 | p_control=1.0, 59 | e_control=1.0, 60 | d_control=1.0, 61 | ): 62 | src_masks = get_mask_from_lengths( 63 | src_lens, max_src_len, device=texts.device 64 | ) 65 | mel_masks = ( 66 | get_mask_from_lengths(mel_lens, max_mel_len, device=texts.device) 67 | if mel_lens is not None 68 | else None 69 | ) 70 | 71 | output = self.encoder(texts, src_masks) 72 | if self.speaker_emb is not None: 73 | embedding = ( 74 | self.speaker_emb(speakers).unsqueeze(1).expand(-1, 1, -1) 75 | ) 76 | ( 77 | output, 78 | pitch_prediction, 79 | e_predictions, 80 | log_d_predictions, 81 | d_rounded, 82 | mel_lens, 83 | mel_masks, 84 | pitch_mean, 85 | pitch_std, 86 | ) = self.variance_adaptor( 87 | output, 88 | embedding, 89 | src_masks, 90 | mel_masks, 91 | max_mel_len, 92 | pitches_raw, 93 | pitches_cwt, 94 | e_targets, 95 | d_targets, 96 | p_control, 97 | e_control, 98 | d_control, 99 | ) 100 | 101 | output, mel_masks = self.decoder(output, mel_masks) 102 | output = self.mel_linear(output) 103 | 104 | postnet_output = self.postnet(output) + output 105 | 106 | return ( 107 | output, 108 | pitch_prediction, 109 | e_predictions, 110 | log_d_predictions, 111 | d_rounded, 112 | src_masks, 113 | mel_masks, 114 | src_lens, 115 | mel_lens, 
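# (added comment) remaining outputs: the Postnet-refined mel prediction, plus the
# predicted pitch statistics (pitch_mean / pitch_std are None unless use_cwt is set),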
116 | postnet_output, 117 | pitch_mean, 118 | pitch_std, 119 | ) 120 | 121 | 122 | def get_speakers_number(preprocess_config): 123 | speaker_json = os.path.join( 124 | preprocess_config["path"]["preprocessed_path"], 125 | "speakers.json", 126 | ) 127 | if os.path.exists(speaker_json): 128 | with open( 129 | speaker_json, 130 | "r", 131 | ) as f: 132 | n_speakers = len(json.load(f)) 133 | else: 134 | raise Exception( 135 | "Model is multispeaker but number of speakers was not provided explicitly" 136 | ) 137 | return n_speakers 138 | -------------------------------------------------------------------------------- /fs_two/model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FastSpeech2Loss(nn.Module): 6 | """FastSpeech2 Loss""" 7 | 8 | def __init__(self, preprocess_config, model_config): 9 | super(FastSpeech2Loss, self).__init__() 10 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 11 | "feature" 12 | ] 13 | self.energy_feature_level = preprocess_config["preprocessing"][ 14 | "energy" 15 | ]["feature"] 16 | self.mse_loss = nn.MSELoss() 17 | self.mae_loss = nn.L1Loss() 18 | 19 | if model_config.use_cwt: 20 | self.use_cwt = True 21 | else: 22 | self.use_cwt = False 23 | 24 | def forward(self, inputs, predictions): 25 | # TARGETS 26 | speaker_targets = inputs[2] 27 | ( 28 | mel_targets, 29 | _, 30 | _, 31 | energy_targets, 32 | duration_targets, 33 | pitches_raw_targets, 34 | pitches_cwt_targets, 35 | pitch_mean, 36 | pitch_std, 37 | ) = inputs[6:] 38 | 39 | # PREDICTIONS 40 | ( 41 | mel_predictions, 42 | pitch_predictions, 43 | energy_predictions, 44 | log_duration_predictions, 45 | _, 46 | src_masks, 47 | mel_masks, 48 | _, 49 | _, 50 | postnet_mel_predictions, 51 | pitch_mean_pred, 52 | pitch_std_pred, 53 | ) = predictions 54 | src_masks = ~src_masks 55 | mel_masks = ~mel_masks 56 | log_duration_targets = torch.log(duration_targets.float() + 1) 57 | mel_targets = mel_targets[:, : mel_masks.shape[1], :] 58 | mel_masks = mel_masks[:, : mel_masks.shape[1]] 59 | 60 | log_duration_targets.requires_grad = False 61 | pitches_raw_targets.requires_grad = False 62 | pitches_cwt_targets.requires_grad = False 63 | energy_targets.requires_grad = False 64 | mel_targets.requires_grad = False 65 | 66 | if self.use_cwt: 67 | pitch_mask = src_masks.unsqueeze(2) 68 | pitch_mask = pitch_mask.repeat(1, 1, 11) 69 | pitch_predictions = pitch_predictions.masked_select(pitch_mask) 70 | pitch_targets = pitches_cwt_targets.masked_select(pitch_mask) 71 | else: 72 | pitch_predictions = pitch_predictions.masked_select(src_masks) 73 | pitch_targets = pitches_raw_targets.masked_select(src_masks) 74 | 75 | energy_predictions = energy_predictions.masked_select(src_masks) 76 | energy_targets = energy_targets.masked_select(src_masks) 77 | 78 | log_duration_predictions = log_duration_predictions.masked_select( 79 | src_masks 80 | ) 81 | log_duration_targets = log_duration_targets.masked_select(src_masks) 82 | 83 | mel_predictions = mel_predictions * mel_masks.unsqueeze(-1) 84 | # mel_predictions = mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 85 | postnet_mel_predictions = postnet_mel_predictions * mel_masks.unsqueeze( 86 | -1 87 | ) 88 | # postnet_mel_predictions = postnet_mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 89 | 90 | # mel_targets = mel_targets.masked_select(mel_masks.unsqueeze(-1)) 91 | mel_targets = mel_targets * mel_masks.unsqueeze(-1) 92 | 93 | mel_loss = 
self.mse_loss(mel_predictions, mel_targets) 94 | mel_loss_mae = self.mae_loss(mel_predictions, mel_targets) 95 | postnet_mel_loss = self.mae_loss(postnet_mel_predictions, mel_targets) 96 | total_mel_loss = mel_loss + mel_loss_mae + postnet_mel_loss 97 | 98 | pitch_loss = self.mse_loss(pitch_predictions, pitch_targets) 99 | 100 | energy_loss = self.mse_loss(energy_predictions, energy_targets) 101 | duration_loss = self.mse_loss( 102 | log_duration_predictions, log_duration_targets 103 | ) 104 | 105 | if self.use_cwt: 106 | std_pitch_loss = self.mse_loss( 107 | pitch_std_pred, pitch_std.unsqueeze(1) 108 | ) 109 | mean_pitch_loss = self.mse_loss( 110 | pitch_mean_pred, pitch_mean.unsqueeze(1) 111 | ) 112 | else: 113 | # std and mean are used only for CWT prediction 114 | std_pitch_loss = torch.tensor([0]).to(pitch_loss.device) 115 | mean_pitch_loss = torch.tensor([0]).to(pitch_loss.device) 116 | 117 | total_loss = ( 118 | total_mel_loss 119 | + duration_loss 120 | + pitch_loss 121 | + energy_loss 122 | + mean_pitch_loss 123 | + std_pitch_loss 124 | ) 125 | 126 | return ( 127 | total_loss, 128 | total_mel_loss, 129 | pitch_loss, 130 | energy_loss, 131 | duration_loss, 132 | mean_pitch_loss, 133 | std_pitch_loss, 134 | ) 135 | -------------------------------------------------------------------------------- /fs_two/model/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import OrderedDict 4 | 5 | import torch 6 | import torch.nn as nn 7 | import numpy as np 8 | from torch.autograd import Function 9 | 10 | from fs_two.utils.tools import get_mask_from_lengths, pad 11 | from fs_two.cwt.cwt_utils import inverse_batch_cwt 12 | 13 | 14 | class VarianceAdaptor(nn.Module): 15 | """ Variance Adaptor """ 16 | def __init__(self, preprocess_config, model_config, device): 17 | super(VarianceAdaptor, self).__init__() 18 | self.device = device 19 | 20 | hidden_size = model_config["transformer"]["variance_hidden"] 21 | 22 | self.duration_predictor = VariancePredictor(model_config) 23 | self.length_regulator = LengthRegulator() 24 | 25 | if model_config.use_cwt: 26 | self.use_cwt = True 27 | self.pitch_predictor = VariancePredictor(model_config, 28 | output_size=11, 29 | dropout=0.1) 30 | else: 31 | self.use_cwt = False 32 | self.pitch_predictor = VariancePredictor(model_config) 33 | 34 | # PitchPredictor(hidden_size, cwt_size=11) 35 | 36 | self.energy_predictor = VariancePredictor(model_config) 37 | 38 | self.pitch_mean = CNNscalar(size_one=hidden_size, size_two=11) 39 | self.pitch_std = CNNscalar(size_one=hidden_size, size_two=11) 40 | 41 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 42 | "feature"] 43 | self.energy_feature_level = preprocess_config["preprocessing"][ 44 | "energy"]["feature"] 45 | assert self.pitch_feature_level in ["phoneme_level", "frame_level"] 46 | assert self.energy_feature_level in ["phoneme_level", "frame_level"] 47 | 48 | pitch_quantization = model_config["variance_embedding"][ 49 | "pitch_quantization"] 50 | energy_quantization = model_config["variance_embedding"][ 51 | "energy_quantization"] 52 | n_bins = model_config["variance_embedding"]["n_bins"] 53 | assert pitch_quantization in ["linear", "log"] 54 | assert energy_quantization in ["linear", "log"] 55 | with open( 56 | os.path.join(preprocess_config["path"]["preprocessed_path"], 57 | "stats.json")) as f: 58 | stats = json.load(f) 59 | pitch_min, pitch_max = stats["pitch"][:2] 60 | energy_min, energy_max = 
stats["energy"][:2] 61 | 62 | if pitch_quantization == "log": 63 | self.pitch_bins = nn.Parameter( 64 | torch.exp( 65 | torch.linspace(np.log(pitch_min), np.log(pitch_max), 66 | n_bins - 1)), 67 | requires_grad=False, 68 | ) 69 | else: 70 | self.pitch_bins = nn.Parameter( 71 | torch.linspace(pitch_min, pitch_max, n_bins - 1), 72 | requires_grad=False, 73 | ) 74 | if energy_quantization == "log": 75 | self.energy_bins = nn.Parameter( 76 | torch.exp( 77 | torch.linspace(np.log(energy_min), np.log(energy_max), 78 | n_bins - 1)), 79 | requires_grad=False, 80 | ) 81 | else: 82 | self.energy_bins = nn.Parameter( 83 | torch.linspace(energy_min, energy_max, n_bins - 1), 84 | requires_grad=False, 85 | ) 86 | 87 | self.pitch_embedding = nn.Embedding( 88 | n_bins, model_config["transformer"]["encoder_hidden"]) 89 | self.energy_embedding = nn.Embedding( 90 | n_bins, model_config["transformer"]["encoder_hidden"]) 91 | 92 | def get_pitch_embedding_normal(self, x, target, mask, control=1): 93 | prediction = self.pitch_predictor(x, mask) 94 | if target is not None: 95 | embedding = self.pitch_embedding( 96 | torch.bucketize(target, self.pitch_bins)) 97 | else: 98 | prediction = prediction * control 99 | embedding = self.pitch_embedding( 100 | torch.bucketize(prediction, self.pitch_bins)) 101 | return prediction, embedding 102 | 103 | def get_pitch_embedding_cwt(self, x, pitch_target_cwt, mask, control=1): 104 | # batch, seq_len, 10 -> batch, 10 -> batch, 1 105 | mask = mask.unsqueeze(2) 106 | mask = mask.repeat(1, 1, 11) 107 | pitch_cwt_prediction = self.pitch_predictor(x, mask) 108 | 109 | # NOTE: Might be more stable if train on Ground Truth 110 | # if pitch_target_cwt is None: 111 | # pitch_cwt = pitch_cwt_prediction 112 | # else: 113 | # pitch_cwt = pitch_target_cwt 114 | 115 | pitch_cwt = pitch_cwt_prediction 116 | 117 | pitch_mean = self.pitch_mean(x.detach(), pitch_cwt.detach()) 118 | pitch_std = self.pitch_std(x.detach(), pitch_cwt.detach()) 119 | 120 | pitch = inverse_batch_cwt(pitch_cwt) 121 | 122 | # print(pitch.shape) 123 | # print(pitch_std.shape) 124 | # print(pitch_mean.shape) 125 | pitch = (pitch * pitch_std) + pitch_mean 126 | 127 | pitch_embedding = self.pitch_embedding( 128 | torch.bucketize(pitch * control, self.pitch_bins)) 129 | return pitch_cwt_prediction, pitch_embedding, pitch_mean, pitch_std 130 | 131 | def get_energy_embedding(self, x, target, mask, control): 132 | prediction = self.energy_predictor(x, mask) 133 | if target is not None: 134 | embedding = self.energy_embedding( 135 | torch.bucketize(target, self.energy_bins)) 136 | else: 137 | prediction = prediction * control 138 | embedding = self.energy_embedding( 139 | torch.bucketize(prediction, self.energy_bins)) 140 | return prediction, embedding 141 | 142 | def forward( 143 | self, 144 | x, 145 | embedding, 146 | src_mask, 147 | mel_mask=None, 148 | max_len=None, 149 | pitch_raw_target=None, 150 | pitch_cwt_target=None, 151 | energy_target=None, 152 | duration_target=None, 153 | p_control=1.0, 154 | e_control=1.0, 155 | d_control=1.0, 156 | ): 157 | 158 | log_duration_prediction = self.duration_predictor(x, src_mask) 159 | x = x + embedding 160 | if self.use_cwt: 161 | ( 162 | pitch_prediction, 163 | pitch_embedding, 164 | pitch_mean, 165 | pitch_std, 166 | ) = self.get_pitch_embedding_cwt( 167 | x, 168 | pitch_cwt_target, 169 | src_mask, 170 | p_control, 171 | ) 172 | else: 173 | ( 174 | pitch_prediction, 175 | pitch_embedding, 176 | ) = self.get_pitch_embedding_normal( 177 | x, 178 | pitch_raw_target, 179 | src_mask, 
180 | p_control, 181 | ) 182 | pitch_mean = None 183 | pitch_std = None 184 | 185 | x = x + pitch_embedding 186 | 187 | energy_prediction, energy_embedding = self.get_energy_embedding( 188 | x, 189 | energy_target, 190 | src_mask, 191 | e_control, 192 | ) 193 | x = x + energy_embedding 194 | 195 | if duration_target is not None: 196 | x, mel_len = self.length_regulator(x, duration_target, max_len) 197 | duration_rounded = duration_target 198 | else: 199 | duration_rounded = torch.clamp( 200 | (torch.round(torch.exp(log_duration_prediction) - 1) * 201 | d_control), 202 | min=0, 203 | ) 204 | x, mel_len = self.length_regulator(x, duration_rounded, max_len) 205 | mel_mask = get_mask_from_lengths(mel_len, device=self.device) 206 | 207 | return ( 208 | x, 209 | pitch_prediction, 210 | energy_prediction, 211 | log_duration_prediction, 212 | duration_rounded, 213 | mel_len, 214 | mel_mask, 215 | pitch_mean, 216 | pitch_std, 217 | ) 218 | 219 | 220 | class LengthRegulator(nn.Module): 221 | """ Length Regulator """ 222 | def __init__(self): 223 | super(LengthRegulator, self).__init__() 224 | 225 | def LR(self, x, duration, max_len): 226 | output = list() 227 | mel_len = list() 228 | for batch, expand_target in zip(x, duration): 229 | expanded = self.expand(batch, expand_target) 230 | output.append(expanded) 231 | mel_len.append(expanded.shape[0]) 232 | 233 | if max_len is not None: 234 | output = pad(output, max_len) 235 | else: 236 | output = pad(output) 237 | 238 | return output, torch.LongTensor(mel_len).to(x.device) 239 | 240 | def expand(self, batch, predicted): 241 | out = list() 242 | 243 | for i, vec in enumerate(batch): 244 | expand_size = predicted[i].item() 245 | out.append(vec.expand(max(int(expand_size), 0), -1)) 246 | out = torch.cat(out, 0) 247 | 248 | return out 249 | 250 | def forward(self, x, duration, max_len): 251 | output, mel_len = self.LR(x, duration, max_len) 252 | return output, mel_len 253 | 254 | 255 | class VariancePredictor(nn.Module): 256 | """ Duration, Pitch and Energy Predictor """ 257 | def __init__(self, model_config, output_size=1, dropout=None): 258 | super(VariancePredictor, self).__init__() 259 | 260 | self.input_size = model_config["transformer"]["variance_hidden"] 261 | self.filter_size = model_config["variance_predictor"]["filter_size"] 262 | self.kernel = model_config["variance_predictor"]["kernel_size"] 263 | self.conv_output_size = model_config["variance_predictor"][ 264 | "filter_size"] 265 | if dropout is None: 266 | self.dropout = model_config["variance_predictor"]["dropout"] 267 | else: 268 | self.dropout = dropout 269 | 270 | self.conv_layer = nn.Sequential( 271 | OrderedDict([ 272 | ( 273 | "conv1d_1", 274 | Conv( 275 | self.input_size, 276 | self.filter_size, 277 | kernel_size=self.kernel, 278 | padding=(self.kernel - 1) // 2, 279 | ), 280 | ), 281 | ("relu_1", nn.ReLU()), 282 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 283 | ("dropout_1", nn.Dropout(self.dropout)), 284 | ( 285 | "conv1d_2", 286 | Conv( 287 | self.filter_size, 288 | self.filter_size, 289 | kernel_size=self.kernel, 290 | padding=1, 291 | ), 292 | ), 293 | ("relu_2", nn.ReLU()), 294 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 295 | ("dropout_2", nn.Dropout(self.dropout)), 296 | ])) 297 | 298 | self.linear_layer = nn.Linear(self.conv_output_size, output_size) 299 | nn.init.xavier_normal_(self.linear_layer.weight) 300 | 301 | def forward(self, encoder_output, mask): 302 | out = self.conv_layer(encoder_output) 303 | out = self.linear_layer(out) 304 | out = 
out.squeeze(-1) 305 | 306 | if mask is not None: 307 | out = out.masked_fill(mask, 0.0) 308 | 309 | return out 310 | 311 | 312 | class Conv(nn.Module): 313 | """ 314 | Convolution Module 315 | """ 316 | def __init__( 317 | self, 318 | in_channels, 319 | out_channels, 320 | kernel_size=1, 321 | stride=1, 322 | padding=0, 323 | dilation=1, 324 | bias=True, 325 | w_init="linear", 326 | ): 327 | """ 328 | :param in_channels: dimension of input 329 | :param out_channels: dimension of output 330 | :param kernel_size: size of kernel 331 | :param stride: size of stride 332 | :param padding: size of padding 333 | :param dilation: dilation rate 334 | :param bias: boolean. if True, bias is included. 335 | :param w_init: str. weight inits with xavier initialization. 336 | """ 337 | super(Conv, self).__init__() 338 | 339 | self.conv = nn.Conv1d( 340 | in_channels, 341 | out_channels, 342 | kernel_size=kernel_size, 343 | stride=stride, 344 | padding=padding, 345 | dilation=dilation, 346 | bias=bias, 347 | ) 348 | nn.init.kaiming_normal_(self.conv.weight, nonlinearity="relu") 349 | 350 | def forward(self, x): 351 | x = x.contiguous().transpose(1, 2) 352 | x = self.conv(x) 353 | x = x.contiguous().transpose(1, 2) 354 | 355 | return x 356 | 357 | 358 | class CNNflat(nn.Module): 359 | def __init__(self, size, reduce=30): 360 | super(CNNflat, self).__init__() 361 | self.net = nn.Sequential( 362 | nn.Conv1d(size, 1, 1), 363 | nn.AdaptiveAvgPool1d(reduce), 364 | nn.LayerNorm(reduce), 365 | nn.ReLU(), 366 | ) 367 | 368 | def forward(self, x): 369 | x = x.transpose(1, 2) 370 | return self.net(x) 371 | 372 | 373 | class CNNscalar(nn.Module): 374 | def __init__(self, size_one, size_two, reduce=30): 375 | super(CNNscalar, self).__init__() 376 | self.flat_one = CNNflat(size_one, reduce) 377 | self.flat_two = CNNflat(size_two, reduce) 378 | self.linear = nn.Linear(reduce, 1) 379 | self.relu = nn.ReLU() 380 | 381 | def forward(self, x_one, x_two): 382 | x_one = self.flat_one(x_one) 383 | x_two = self.flat_two(x_two) 384 | out = self.linear(x_one + x_two) 385 | return self.relu(out).squeeze(1) 386 | -------------------------------------------------------------------------------- /fs_two/model/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ScheduledOptim: 6 | """ A simple wrapper class for learning rate scheduling """ 7 | 8 | def __init__(self, model, train_config, model_config, current_step): 9 | 10 | self._optimizer = torch.optim.Adam( 11 | model.parameters(), 12 | betas=train_config["optimizer"]["betas"], 13 | eps=train_config["optimizer"]["eps"], 14 | weight_decay=train_config["optimizer"]["weight_decay"], 15 | ) 16 | self.n_warmup_steps = train_config["optimizer"]["warm_up_step"] 17 | self.anneal_steps = train_config["optimizer"]["anneal_steps"] 18 | self.anneal_rate = train_config["optimizer"]["anneal_rate"] 19 | self.current_step = current_step 20 | self.init_lr = np.power( 21 | model_config["transformer"]["encoder_hidden"], -0.5 22 | ) 23 | 24 | def step_and_update_lr(self): 25 | self._update_learning_rate() 26 | self._optimizer.step() 27 | 28 | def zero_grad(self): 29 | # print(self.init_lr) 30 | self._optimizer.zero_grad() 31 | 32 | def load_state_dict(self, path): 33 | self._optimizer.load_state_dict(path) 34 | 35 | def _get_lr_scale(self): 36 | lr = np.min( 37 | [ 38 | np.power(self.current_step, -0.5), 39 | np.power(self.n_warmup_steps, -1.5) * self.current_step, 40 | ] 41 | ) 42 | for s in 
self.anneal_steps: 43 | if self.current_step > s: 44 | lr = lr * self.anneal_rate 45 | return lr 46 | 47 | def _update_learning_rate(self): 48 | """ Learning rate scheduling per step """ 49 | self.current_step += 1 50 | lr = self.init_lr * self._get_lr_scale() 51 | 52 | for param_group in self._optimizer.param_groups: 53 | param_group["lr"] = lr -------------------------------------------------------------------------------- /fs_two/prepare_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | from preprocessor import ljspeech, aishell3, libritts 6 | 7 | 8 | def main(config): 9 | if "LJSpeech" in config["dataset"]: 10 | ljspeech.prepare_align(config) 11 | if "AISHELL3" in config["dataset"]: 12 | aishell3.prepare_align(config) 13 | if "LibriTTS" in config["dataset"]: 14 | libritts.prepare_align(config) 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("config", type=str, help="path to preprocess.yaml") 20 | args = parser.parse_args() 21 | 22 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 23 | main(config) 24 | -------------------------------------------------------------------------------- /fs_two/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | from fs_two.preprocessor.preprocessor import Preprocessor 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("config", type=str, help="path to preprocess.yaml") 11 | args = parser.parse_args() 12 | 13 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 14 | preprocessor = Preprocessor(config) 15 | preprocessor.build_from_path() 16 | -------------------------------------------------------------------------------- /fs_two/preprocessor/common_multi.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | from scipy.io import wavfile 6 | from tqdm import tqdm 7 | 8 | 9 | def prepare_align(config): 10 | in_dir = config["path"]["corpus_path"] 11 | out_dir = config["path"]["raw_path"] 12 | sampling_rate = config["preprocessing"]["audio"]["sampling_rate"] 13 | max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] 14 | for dataset in ["train", "test"]: 15 | print("Processing {}ing set...".format(dataset)) 16 | with open( 17 | os.path.join(in_dir, dataset, "content.txt"), encoding="utf-8" 18 | ) as f: 19 | for line in tqdm(f): 20 | wav_name, text = line.strip("\n").split("\t") 21 | speaker = wav_name[:7] 22 | text = text.split(" ")[1::2] 23 | wav_path = os.path.join( 24 | in_dir, dataset, "wav", speaker, wav_name 25 | ) 26 | if os.path.exists(wav_path): 27 | os.makedirs(os.path.join(out_dir, speaker), exist_ok=True) 28 | wav, _ = librosa.load(wav_path, sampling_rate) 29 | 30 | # ADD TO LOADER !!! 
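# (added comment) The lines below peak-normalize the waveform to max_wav_value and
# write it out as 16-bit PCM, so every clip handed to the aligner has a consistent scale.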
31 | 32 | wav = wav / max(abs(wav)) * max_wav_value 33 | wavfile.write( 34 | os.path.join(out_dir, speaker, wav_name), 35 | sampling_rate, 36 | wav.astype(np.int16), 37 | ) 38 | with open( 39 | os.path.join( 40 | out_dir, speaker, "{}.lab".format(wav_name[:11]) 41 | ), 42 | "w", 43 | ) as f1: 44 | f1.write(" ".join(text)) -------------------------------------------------------------------------------- /fs_two/synthesize.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | from string import punctuation 4 | 5 | import torch 6 | import yaml 7 | import numpy as np 8 | from torch.utils.data import DataLoader 9 | from g2p_en import G2p 10 | from pypinyin import pinyin, Style 11 | 12 | from utils.model import get_model, get_vocoder 13 | from utils.tools import to_device, synth_samples 14 | from dataset import TextDataset 15 | from text import text_to_sequence 16 | 17 | torch.cuda.set_device(0) 18 | device = 0 19 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 20 | 21 | 22 | def read_lexicon(lex_path): 23 | lexicon = {} 24 | with open(lex_path) as f: 25 | for line in f: 26 | temp = re.split(r"\s+", line.strip("\n")) 27 | word = temp[0] 28 | phones = temp[1:] 29 | if word.lower() not in lexicon: 30 | lexicon[word.lower()] = phones 31 | return lexicon 32 | 33 | 34 | def preprocess_english(text, preprocess_config): 35 | text = text.rstrip(punctuation) 36 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 37 | 38 | g2p = G2p() 39 | phones = [] 40 | words = re.split(r"([,;.\-\?\!\s+])", text) 41 | for w in words: 42 | if w.lower() in lexicon: 43 | phones += lexicon[w.lower()] 44 | else: 45 | phones += list(filter(lambda p: p != " ", g2p(w))) 46 | phones = "{" + "}{".join(phones) + "}" 47 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 48 | phones = phones.replace("}{", " ") 49 | 50 | print("Raw Text Sequence: {}".format(text)) 51 | print("Phoneme Sequence: {}".format(phones)) 52 | sequence = np.array( 53 | text_to_sequence( 54 | phones, preprocess_config["preprocessing"]["text"]["text_cleaners"] 55 | ) 56 | ) 57 | 58 | return np.array(sequence) 59 | 60 | 61 | def preprocess_mandarin(text, preprocess_config): 62 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 63 | 64 | phones = [] 65 | pinyins = [ 66 | p[0] 67 | for p in pinyin( 68 | text, style=Style.TONE3, strict=False, neutral_tone_with_five=True 69 | ) 70 | ] 71 | for p in pinyins: 72 | if p in lexicon: 73 | phones += lexicon[p] 74 | else: 75 | phones.append("sp") 76 | 77 | phones = "{" + " ".join(phones) + "}" 78 | print("Raw Text Sequence: {}".format(text)) 79 | print("Phoneme Sequence: {}".format(phones)) 80 | sequence = np.array( 81 | text_to_sequence( 82 | phones, preprocess_config["preprocessing"]["text"]["text_cleaners"] 83 | ) 84 | ) 85 | 86 | return np.array(sequence) 87 | 88 | 89 | def synthesize(model, step, configs, vocoder, batchs, control_values): 90 | preprocess_config, model_config, train_config = configs 91 | pitch_control, energy_control, duration_control = control_values 92 | 93 | for batch in batchs: 94 | batch = to_device(batch, device) 95 | with torch.no_grad(): 96 | # Forward 97 | output = model( 98 | *(batch[2:]), 99 | p_control=pitch_control, 100 | e_control=energy_control, 101 | d_control=duration_control 102 | ) 103 | synth_samples( 104 | batch, 105 | output, 106 | vocoder, 107 | model_config, 108 | preprocess_config, 109 | train_config["path"]["result_path"], 110 | ) 111 | 112 | 113 | if 
__name__ == "__main__": 114 | 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--restore_step", type=int, required=True) 117 | parser.add_argument( 118 | "--mode", 119 | type=str, 120 | choices=["batch", "single"], 121 | required=True, 122 | help="Synthesize a whole dataset or a single sentence", 123 | ) 124 | parser.add_argument( 125 | "--source", 126 | type=str, 127 | default=None, 128 | help="path to a source file with format like train.txt and val.txt, for batch mode only", 129 | ) 130 | parser.add_argument( 131 | "--text", 132 | type=str, 133 | default=None, 134 | help="raw text to synthesize, for single-sentence mode only", 135 | ) 136 | parser.add_argument( 137 | "--speaker_id", 138 | type=int, 139 | default=0, 140 | help="speaker ID for multi-speaker synthesis, for single-sentence mode only", 141 | ) 142 | parser.add_argument( 143 | "-p", 144 | "--preprocess_config", 145 | type=str, 146 | required=True, 147 | help="path to preprocess.yaml", 148 | ) 149 | parser.add_argument( 150 | "-m", 151 | "--model_config", 152 | type=str, 153 | required=True, 154 | help="path to model.yaml", 155 | ) 156 | parser.add_argument( 157 | "-t", 158 | "--train_config", 159 | type=str, 160 | required=True, 161 | help="path to train.yaml", 162 | ) 163 | parser.add_argument( 164 | "--pitch_control", 165 | type=float, 166 | default=1.0, 167 | help="control the pitch of the whole utterance, larger value for higher pitch", 168 | ) 169 | parser.add_argument( 170 | "--energy_control", 171 | type=float, 172 | default=1.0, 173 | help="control the energy of the whole utterance, larger value for larger volume", 174 | ) 175 | parser.add_argument( 176 | "--duration_control", 177 | type=float, 178 | default=1.0, 179 | help="control the speed of the whole utterance, larger value for slower speaking rate", 180 | ) 181 | args = parser.parse_args() 182 | 183 | # Check source texts 184 | if args.mode == "batch": 185 | assert args.source is not None and args.text is None 186 | if args.mode == "single": 187 | assert args.source is None and args.text is not None 188 | 189 | # Read Config 190 | preprocess_config = yaml.load( 191 | open(args.preprocess_config, "r"), Loader=yaml.FullLoader 192 | ) 193 | model_config = yaml.load( 194 | open(args.model_config, "r"), Loader=yaml.FullLoader 195 | ) 196 | train_config = yaml.load( 197 | open(args.train_config, "r"), Loader=yaml.FullLoader 198 | ) 199 | configs = (preprocess_config, model_config, train_config) 200 | 201 | # Get model 202 | model = get_model(args, configs, device, train=False) 203 | 204 | # Load vocoder 205 | vocoder = get_vocoder(model_config, device) 206 | 207 | # Preprocess texts 208 | if args.mode == "batch": 209 | # Get dataset 210 | dataset = TextDataset(args.source, preprocess_config) 211 | batchs = DataLoader( 212 | dataset, 213 | batch_size=8, 214 | collate_fn=dataset.collate_fn, 215 | ) 216 | if args.mode == "single": 217 | ids = raw_texts = [args.text[:100]] 218 | speakers = np.array([args.speaker_id]) 219 | if preprocess_config["preprocessing"]["text"]["language"] == "en": 220 | texts = np.array([preprocess_english(args.text, preprocess_config)]) 221 | elif preprocess_config["preprocessing"]["text"]["language"] == "zh": 222 | texts = np.array( 223 | [preprocess_mandarin(args.text, preprocess_config)] 224 | ) 225 | text_lens = np.array([len(texts[0])]) 226 | batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))] 227 | 228 | control_values = ( 229 | args.pitch_control, 230 | args.energy_control, 231 | 
args.duration_control, 232 | ) 233 | 234 | synthesize( 235 | model, args.restore_step, configs, vocoder, batchs, control_values 236 | ) 237 | -------------------------------------------------------------------------------- /fs_two/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from fs_two.text import cleaners 4 | from fs_two.text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | """ 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | 34 | if not m: 35 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 36 | break 37 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 38 | sequence += _arpabet_to_sequence(m.group(2)) 39 | text = m.group(3) 40 | 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | """Converts a sequence of IDs back to a string""" 46 | result = "" 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == "@": 52 | s = "{%s}" % s[1:] 53 | result += s 54 | return result.replace("}{", " ") 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception("Unknown cleaner: %s" % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(["@" + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s != "_" and s != "~" 76 | -------------------------------------------------------------------------------- /fs_two/text/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/cleaners.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/cleaners.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/cmudict.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/cmudict.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/numbers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/numbers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/pinyin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/pinyin.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/russian.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/russian.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/symbols.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/symbols.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | import re 18 | from unidecode import unidecode 19 | from .numbers import normalize_numbers 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 69 | text = lowercase(text) 70 | text = collapse_whitespace(text) 71 | return text 72 | 73 | 74 | def transliteration_cleaners(text): 75 | '''Pipeline for non-English text that transliterates to ASCII.''' 76 | text = convert_to_ascii(text) 77 | text = lowercase(text) 78 | text = collapse_whitespace(text) 79 | return text 80 | 81 | 82 | def english_cleaners(text): 83 | '''Pipeline for English text, including number and abbreviation expansion.''' 84 | text = convert_to_ascii(text) 85 | text = lowercase(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /fs_two/text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | "AA", 8 | "AA0", 9 | "AA1", 10 | "AA2", 11 | "AE", 12 | "AE0", 13 | "AE1", 14 | "AE2", 15 | "AH", 16 | "AH0", 17 | "AH1", 18 | "AH2", 19 | "AO", 20 | "AO0", 21 | "AO1", 22 | "AO2", 23 | "AW", 24 | "AW0", 25 | "AW1", 26 | "AW2", 27 | "AY", 28 | "AY0", 29 | "AY1", 30 | "AY2", 31 | "B", 32 | "CH", 33 | "D", 34 | "DH", 35 | "EH", 36 | "EH0", 37 | "EH1", 38 | "EH2", 39 | "ER", 40 | "ER0", 41 | "ER1", 42 | "ER2", 43 | "EY", 44 | "EY0", 45 | "EY1", 46 | "EY2", 47 | "F", 48 | "G", 49 | "HH", 50 | "IH", 51 | "IH0", 52 | "IH1", 53 | "IH2", 54 | "IY", 55 | "IY0", 56 | "IY1", 57 | "IY2", 58 | "JH", 59 | "K", 60 | "L", 61 | "M", 62 | "N", 63 | "NG", 64 | "OW", 65 | "OW0", 66 | "OW1", 67 | "OW2", 68 | "OY", 69 | "OY0", 70 | "OY1", 71 | "OY2", 72 | "P", 73 | "R", 74 | "S", 75 | "SH", 76 | "T", 77 | "TH", 78 | "UH", 79 | "UH0", 80 | "UH1", 81 | "UH2", 82 | "UW", 83 | "UW0", 84 | "UW1", 85 | "UW2", 86 | "V", 87 | "W", 88 | "Y", 89 | "Z", 90 | "ZH", 91 | ] 92 | 93 | _valid_symbol_set = set(valid_symbols) 94 | 95 | 96 | class CMUDict: 97 | """Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 98 | 99 | def __init__(self, file_or_path, keep_ambiguous=True): 100 | if isinstance(file_or_path, str): 101 | with open(file_or_path, encoding="latin-1") as f: 102 | entries = _parse_cmudict(f) 103 | else: 104 | entries = _parse_cmudict(file_or_path) 105 | if not keep_ambiguous: 106 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 107 | self._entries = entries 108 | 109 | def __len__(self): 110 | return len(self._entries) 111 | 112 | def lookup(self, word): 113 | """Returns list of ARPAbet pronunciations of the given word.""" 114 | return self._entries.get(word.upper()) 115 | 116 | 117 | _alt_re = re.compile(r"\([0-9]+\)") 118 | 119 | 120 | def _parse_cmudict(file): 121 | cmudict = {} 122 | for line in file: 123 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 124 | parts = line.split(" ") 125 | word = re.sub(_alt_re, "", parts[0]) 126 | pronunciation = _get_pronunciation(parts[1]) 127 | if pronunciation: 128 | if word in cmudict: 129 | cmudict[word].append(pronunciation) 130 | else: 131 | cmudict[word] = [pronunciation] 132 | return cmudict 133 | 134 | 135 | def _get_pronunciation(s): 136 | parts = s.strip().split(" ") 137 | for part in parts: 138 | if part not in _valid_symbol_set: 139 | return None 140 | return " ".join(parts) 141 | -------------------------------------------------------------------------------- /fs_two/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return "%s %s" % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return "%s %s" % (cents, cent_unit) 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words( 60 | num, andword="", zero="oh", group=2 61 | ).replace(", ", " ") 62 | else: 63 | return _inflect.number_to_words(num, andword="") 64 | 65 | 66 | def 
normalize_numbers(text): 67 | text = re.sub(_comma_number_re, _remove_commas, text) 68 | text = re.sub(_pounds_re, r"\1 pounds", text) 69 | text = re.sub(_dollars_re, _expand_dollars, text) 70 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 71 | text = re.sub(_ordinal_re, _expand_ordinal, text) 72 | text = re.sub(_number_re, _expand_number, text) 73 | return text 74 | -------------------------------------------------------------------------------- /fs_two/text/pinyin.py: -------------------------------------------------------------------------------- 1 | initials = [ 2 | "b", 3 | "c", 4 | "ch", 5 | "d", 6 | "f", 7 | "g", 8 | "h", 9 | "j", 10 | "k", 11 | "l", 12 | "m", 13 | "n", 14 | "p", 15 | "q", 16 | "r", 17 | "s", 18 | "sh", 19 | "t", 20 | "w", 21 | "x", 22 | "y", 23 | "z", 24 | "zh", 25 | ] 26 | finals = [ 27 | "a1", 28 | "a2", 29 | "a3", 30 | "a4", 31 | "a5", 32 | "ai1", 33 | "ai2", 34 | "ai3", 35 | "ai4", 36 | "ai5", 37 | "an1", 38 | "an2", 39 | "an3", 40 | "an4", 41 | "an5", 42 | "ang1", 43 | "ang2", 44 | "ang3", 45 | "ang4", 46 | "ang5", 47 | "ao1", 48 | "ao2", 49 | "ao3", 50 | "ao4", 51 | "ao5", 52 | "e1", 53 | "e2", 54 | "e3", 55 | "e4", 56 | "e5", 57 | "ei1", 58 | "ei2", 59 | "ei3", 60 | "ei4", 61 | "ei5", 62 | "en1", 63 | "en2", 64 | "en3", 65 | "en4", 66 | "en5", 67 | "eng1", 68 | "eng2", 69 | "eng3", 70 | "eng4", 71 | "eng5", 72 | "er1", 73 | "er2", 74 | "er3", 75 | "er4", 76 | "er5", 77 | "i1", 78 | "i2", 79 | "i3", 80 | "i4", 81 | "i5", 82 | "ia1", 83 | "ia2", 84 | "ia3", 85 | "ia4", 86 | "ia5", 87 | "ian1", 88 | "ian2", 89 | "ian3", 90 | "ian4", 91 | "ian5", 92 | "iang1", 93 | "iang2", 94 | "iang3", 95 | "iang4", 96 | "iang5", 97 | "iao1", 98 | "iao2", 99 | "iao3", 100 | "iao4", 101 | "iao5", 102 | "ie1", 103 | "ie2", 104 | "ie3", 105 | "ie4", 106 | "ie5", 107 | "ii1", 108 | "ii2", 109 | "ii3", 110 | "ii4", 111 | "ii5", 112 | "iii1", 113 | "iii2", 114 | "iii3", 115 | "iii4", 116 | "iii5", 117 | "in1", 118 | "in2", 119 | "in3", 120 | "in4", 121 | "in5", 122 | "ing1", 123 | "ing2", 124 | "ing3", 125 | "ing4", 126 | "ing5", 127 | "iong1", 128 | "iong2", 129 | "iong3", 130 | "iong4", 131 | "iong5", 132 | "iou1", 133 | "iou2", 134 | "iou3", 135 | "iou4", 136 | "iou5", 137 | "o1", 138 | "o2", 139 | "o3", 140 | "o4", 141 | "o5", 142 | "ong1", 143 | "ong2", 144 | "ong3", 145 | "ong4", 146 | "ong5", 147 | "ou1", 148 | "ou2", 149 | "ou3", 150 | "ou4", 151 | "ou5", 152 | "u1", 153 | "u2", 154 | "u3", 155 | "u4", 156 | "u5", 157 | "ua1", 158 | "ua2", 159 | "ua3", 160 | "ua4", 161 | "ua5", 162 | "uai1", 163 | "uai2", 164 | "uai3", 165 | "uai4", 166 | "uai5", 167 | "uan1", 168 | "uan2", 169 | "uan3", 170 | "uan4", 171 | "uan5", 172 | "uang1", 173 | "uang2", 174 | "uang3", 175 | "uang4", 176 | "uang5", 177 | "uei1", 178 | "uei2", 179 | "uei3", 180 | "uei4", 181 | "uei5", 182 | "uen1", 183 | "uen2", 184 | "uen3", 185 | "uen4", 186 | "uen5", 187 | "uo1", 188 | "uo2", 189 | "uo3", 190 | "uo4", 191 | "uo5", 192 | "v1", 193 | "v2", 194 | "v3", 195 | "v4", 196 | "v5", 197 | "van1", 198 | "van2", 199 | "van3", 200 | "van4", 201 | "van5", 202 | "ve1", 203 | "ve2", 204 | "ve3", 205 | "ve4", 206 | "ve5", 207 | "vn1", 208 | "vn2", 209 | "vn3", 210 | "vn4", 211 | "vn5", 212 | ] 213 | valid_symbols = initials + finals + ["rr"] -------------------------------------------------------------------------------- /fs_two/text/russian.py: -------------------------------------------------------------------------------- 1 | valid_symbols = [ 2 | "A", 3 | "A0", 4 | "B", 5 | "B0", 6 | "D", 7 | 
"D0", 8 | "DZ", 9 | "DZ0", 10 | "DZH", 11 | "DZH0", 12 | "E0", 13 | "F", 14 | "F0", 15 | "G", 16 | "G0", 17 | "GH", 18 | "I", 19 | "I0", 20 | "J0", 21 | "K", 22 | "K0", 23 | "KH", 24 | "KH0", 25 | "L", 26 | "L0", 27 | "M", 28 | "M0", 29 | "N", 30 | "N0", 31 | "O", 32 | "O0", 33 | "P", 34 | "P0", 35 | "R", 36 | "R0", 37 | "S", 38 | "S0", 39 | "SH", 40 | "SH0", 41 | "T", 42 | "T0", 43 | "TS", 44 | "TS0", 45 | "TSH", 46 | "TSH0", 47 | "U", 48 | "U0", 49 | "V", 50 | "V0", 51 | "Y", 52 | "Y0", 53 | "Z", 54 | "Z0", 55 | "ZH", 56 | ] 57 | 58 | old_valid_symbols = [ 59 | "S", 60 | "Sj", 61 | "StS", 62 | "StSj", 63 | "Z", 64 | "Zj", 65 | "a", 66 | "b", 67 | "bj", 68 | "d", 69 | "dj", 70 | "e", 71 | "f", 72 | "g", 73 | "hrd", 74 | "i", 75 | "i2", 76 | "j", 77 | "jA", 78 | "jE", 79 | "jO", 80 | "jU", 81 | "k", 82 | "l", 83 | "lj", 84 | "m", 85 | "mj", 86 | "n", 87 | "nj", 88 | "o", 89 | "p", 90 | "pj", 91 | "r", 92 | "rj", 93 | "s", 94 | "sj", 95 | "t", 96 | "tS", 97 | "tSj", 98 | "tj", 99 | "ts", 100 | "u", 101 | "v", 102 | "vj", 103 | "x", 104 | "z", 105 | "zj", 106 | ] 107 | -------------------------------------------------------------------------------- /fs_two/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | """ 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """ 7 | 8 | from fs_two.text import cmudict, pinyin, russian 9 | 10 | _pad = "_" 11 | _mask = "mask" 12 | _punctuation = "!'(),.:;? " 13 | _special = "-" 14 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 15 | _silences = ["@sp", "@spn", "@sil"] 16 | 17 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 18 | _arpabet = ["@" + s for s in cmudict.valid_symbols] 19 | _pinyin = ["@" + s for s in pinyin.valid_symbols] 20 | _russian = ["@" + s for s in russian.valid_symbols + [_mask]] 21 | 22 | # Export all symbols: 23 | symbols = ( 24 | [_pad] 25 | + list(_special) 26 | + list(_punctuation) 27 | + list(_letters) 28 | + _arpabet 29 | # + _pinyin 30 | + _silences 31 | + _russian 32 | ) 33 | -------------------------------------------------------------------------------- /fs_two/transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = "" 7 | UNK_WORD = "" 8 | BOS_WORD = "" 9 | EOS_WORD = "" 10 | -------------------------------------------------------------------------------- /fs_two/transformer/Layers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from torch.nn import functional as F 7 | 8 | from .SubLayers import MultiHeadAttention, PositionwiseFeedForward 9 | 10 | 11 | class FFTBlock(torch.nn.Module): 12 | """FFT Block""" 13 | 14 | def __init__( 15 | self, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1 16 | ): 17 | super(FFTBlock, self).__init__() 18 | self.slf_attn = MultiHeadAttention( 19 | n_head, d_model, d_k, d_v, dropout=dropout 20 | ) 21 | self.pos_ffn = PositionwiseFeedForward( 22 | d_model, d_inner, kernel_size, dropout=dropout 23 | ) 24 | 25 | def forward(self, enc_input, 
mask=None, slf_attn_mask=None): 26 | enc_output, enc_slf_attn = self.slf_attn( 27 | enc_input, enc_input, enc_input, mask=slf_attn_mask 28 | ) 29 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 30 | 31 | enc_output = self.pos_ffn(enc_output) 32 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 33 | 34 | return enc_output, enc_slf_attn 35 | 36 | 37 | class ConvNorm(torch.nn.Module): 38 | def __init__( 39 | self, 40 | in_channels, 41 | out_channels, 42 | kernel_size=1, 43 | stride=1, 44 | padding=None, 45 | dilation=1, 46 | bias=True, 47 | w_init_gain="linear", 48 | ): 49 | super(ConvNorm, self).__init__() 50 | 51 | if padding is None: 52 | assert kernel_size % 2 == 1 53 | padding = int(dilation * (kernel_size - 1) / 2) 54 | 55 | self.conv = torch.nn.Conv1d( 56 | in_channels, 57 | out_channels, 58 | kernel_size=kernel_size, 59 | stride=stride, 60 | padding=padding, 61 | dilation=dilation, 62 | bias=bias, 63 | ) 64 | 65 | def forward(self, signal): 66 | conv_signal = self.conv(signal) 67 | 68 | return conv_signal 69 | 70 | 71 | class PostNet(nn.Module): 72 | """ 73 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 74 | """ 75 | 76 | def __init__( 77 | self, 78 | n_mel_channels=80, 79 | postnet_embedding_dim=512, 80 | postnet_kernel_size=5, 81 | postnet_n_convolutions=5, 82 | ): 83 | 84 | super(PostNet, self).__init__() 85 | self.convolutions = nn.ModuleList() 86 | 87 | self.convolutions.append( 88 | nn.Sequential( 89 | ConvNorm( 90 | n_mel_channels, 91 | postnet_embedding_dim, 92 | kernel_size=postnet_kernel_size, 93 | stride=1, 94 | padding=int((postnet_kernel_size - 1) / 2), 95 | dilation=1, 96 | w_init_gain="tanh", 97 | ), 98 | nn.BatchNorm1d(postnet_embedding_dim), 99 | ) 100 | ) 101 | 102 | for i in range(1, postnet_n_convolutions - 1): 103 | self.convolutions.append( 104 | nn.Sequential( 105 | ConvNorm( 106 | postnet_embedding_dim, 107 | postnet_embedding_dim, 108 | kernel_size=postnet_kernel_size, 109 | stride=1, 110 | padding=int((postnet_kernel_size - 1) / 2), 111 | dilation=1, 112 | w_init_gain="tanh", 113 | ), 114 | nn.BatchNorm1d(postnet_embedding_dim), 115 | ) 116 | ) 117 | 118 | self.convolutions.append( 119 | nn.Sequential( 120 | ConvNorm( 121 | postnet_embedding_dim, 122 | n_mel_channels, 123 | kernel_size=postnet_kernel_size, 124 | stride=1, 125 | padding=int((postnet_kernel_size - 1) / 2), 126 | dilation=1, 127 | w_init_gain="linear", 128 | ), 129 | nn.BatchNorm1d(n_mel_channels), 130 | ) 131 | ) 132 | 133 | def forward(self, x): 134 | x = x.contiguous().transpose(1, 2) 135 | 136 | for i in range(len(self.convolutions) - 1): 137 | x = F.dropout( 138 | torch.tanh(self.convolutions[i](x)), 0.5, self.training 139 | ) 140 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 141 | 142 | x = x.contiguous().transpose(1, 2) 143 | return x -------------------------------------------------------------------------------- /fs_two/transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | import fs_two.transformer.Constants as Constants 6 | from .Layers import FFTBlock 7 | from fs_two.text.symbols import symbols 8 | 9 | 10 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 11 | """ Sinusoid position encoding table """ 12 | 13 | def cal_angle(position, hid_idx): 14 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 15 | 16 | def get_posi_angle_vec(position): 17 | return 
[cal_angle(position, hid_j) for hid_j in range(d_hid)] 18 | 19 | sinusoid_table = np.array( 20 | [get_posi_angle_vec(pos_i) for pos_i in range(n_position)] 21 | ) 22 | 23 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 24 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 25 | 26 | if padding_idx is not None: 27 | # zero vector for padding dimension 28 | sinusoid_table[padding_idx] = 0.0 29 | 30 | return torch.FloatTensor(sinusoid_table) 31 | 32 | 33 | class Encoder(nn.Module): 34 | """ Encoder """ 35 | 36 | def __init__(self, config): 37 | super(Encoder, self).__init__() 38 | 39 | n_position = config["max_seq_len"] + 1 40 | n_src_vocab = len(symbols) + 1 41 | d_word_vec = config["transformer"]["encoder_hidden"] 42 | n_layers = config["transformer"]["encoder_layer"] 43 | n_head = config["transformer"]["encoder_head"] 44 | d_k = d_v = ( 45 | config["transformer"]["encoder_hidden"] 46 | // config["transformer"]["encoder_head"] 47 | ) 48 | d_model = config["transformer"]["encoder_hidden"] 49 | d_inner = config["transformer"]["conv_filter_size"] 50 | kernel_size = config["transformer"]["conv_kernel_size"] 51 | dropout = config["transformer"]["encoder_dropout"] 52 | 53 | self.max_seq_len = config["max_seq_len"] 54 | self.d_model = d_model 55 | 56 | self.src_word_emb = nn.Embedding( 57 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD 58 | ) 59 | self.position_enc = nn.Parameter( 60 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 61 | requires_grad=False, 62 | ) 63 | 64 | self.layer_stack = nn.ModuleList( 65 | [ 66 | FFTBlock( 67 | d_model, 68 | n_head, 69 | d_k, 70 | d_v, 71 | d_inner, 72 | kernel_size, 73 | dropout=dropout, 74 | ) 75 | for _ in range(n_layers) 76 | ] 77 | ) 78 | 79 | def forward(self, src_seq, mask, return_attns=False): 80 | 81 | enc_slf_attn_list = [] 82 | batch_size, max_len = src_seq.shape[0], src_seq.shape[1] 83 | 84 | # -- Prepare masks 85 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 86 | 87 | # -- Forward 88 | if not self.training and src_seq.shape[1] > self.max_seq_len: 89 | enc_output = self.src_word_emb( 90 | src_seq 91 | ) + get_sinusoid_encoding_table(src_seq.shape[1], self.d_model)[ 92 | : src_seq.shape[1], : 93 | ].unsqueeze( 94 | 0 95 | ).expand( 96 | batch_size, -1, -1 97 | ).to( 98 | src_seq.device 99 | ) 100 | else: 101 | enc_output = self.src_word_emb(src_seq) + self.position_enc[ 102 | :, :max_len, : 103 | ].expand(batch_size, -1, -1) 104 | 105 | for enc_layer in self.layer_stack: 106 | enc_output, enc_slf_attn = enc_layer( 107 | enc_output, mask=mask, slf_attn_mask=slf_attn_mask 108 | ) 109 | if return_attns: 110 | enc_slf_attn_list += [enc_slf_attn] 111 | 112 | return enc_output 113 | 114 | 115 | class Decoder(nn.Module): 116 | """ Decoder """ 117 | 118 | def __init__(self, config): 119 | super(Decoder, self).__init__() 120 | 121 | n_position = config["max_seq_len"] + 1 122 | d_word_vec = config["transformer"]["decoder_hidden"] 123 | n_layers = config["transformer"]["decoder_layer"] 124 | n_head = config["transformer"]["decoder_head"] 125 | d_k = d_v = ( 126 | config["transformer"]["decoder_hidden"] 127 | // config["transformer"]["decoder_head"] 128 | ) 129 | d_model = config["transformer"]["decoder_hidden"] 130 | d_inner = config["transformer"]["conv_filter_size"] 131 | kernel_size = config["transformer"]["conv_kernel_size"] 132 | dropout = config["transformer"]["decoder_dropout"] 133 | 134 | self.max_seq_len = config["max_seq_len"] 135 | self.d_model = d_model 136 | 137 | 
self.position_enc = nn.Parameter( 138 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 139 | requires_grad=False, 140 | ) 141 | 142 | self.layer_stack = nn.ModuleList( 143 | [ 144 | FFTBlock( 145 | d_model, 146 | n_head, 147 | d_k, 148 | d_v, 149 | d_inner, 150 | kernel_size, 151 | dropout=dropout, 152 | ) 153 | for _ in range(n_layers) 154 | ] 155 | ) 156 | 157 | def forward(self, enc_seq, mask, return_attns=False): 158 | 159 | dec_slf_attn_list = [] 160 | batch_size, max_len = enc_seq.shape[0], enc_seq.shape[1] 161 | 162 | # -- Forward 163 | if not self.training and enc_seq.shape[1] > self.max_seq_len: 164 | # -- Prepare masks 165 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 166 | dec_output = enc_seq + get_sinusoid_encoding_table( 167 | enc_seq.shape[1], self.d_model 168 | )[: enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 169 | enc_seq.device 170 | ) 171 | else: 172 | max_len = min(max_len, self.max_seq_len) 173 | 174 | # -- Prepare masks 175 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 176 | dec_output = enc_seq[:, :max_len, :] + self.position_enc[ 177 | :, :max_len, : 178 | ].expand(batch_size, -1, -1) 179 | mask = mask[:, :max_len] 180 | slf_attn_mask = slf_attn_mask[:, :, :max_len] 181 | 182 | for dec_layer in self.layer_stack: 183 | dec_output, dec_slf_attn = dec_layer( 184 | dec_output, mask=mask, slf_attn_mask=slf_attn_mask 185 | ) 186 | if return_attns: 187 | dec_slf_attn_list += [dec_slf_attn] 188 | 189 | return dec_output, mask 190 | -------------------------------------------------------------------------------- /fs_two/transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | """ Scaled Dot-Product Attention """ 8 | 9 | def __init__(self, temperature): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | attn = torch.bmm(q, k.transpose(1, 2)) 16 | attn = attn / self.temperature 17 | 18 | if mask is not None: 19 | attn = attn.masked_fill(mask, -np.inf) 20 | 21 | attn = self.softmax(attn) 22 | output = torch.bmm(attn, v) 23 | 24 | return output, attn 25 | -------------------------------------------------------------------------------- /fs_two/transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from .Modules import ScaledDotProductAttention 6 | 7 | 8 | class MultiHeadAttention(nn.Module): 9 | """ Multi-Head Attention module """ 10 | 11 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 12 | super().__init__() 13 | 14 | self.n_head = n_head 15 | self.d_k = d_k 16 | self.d_v = d_v 17 | 18 | self.w_qs = nn.Linear(d_model, n_head * d_k) 19 | self.w_ks = nn.Linear(d_model, n_head * d_k) 20 | self.w_vs = nn.Linear(d_model, n_head * d_v) 21 | 22 | self.attention = ScaledDotProductAttention( 23 | temperature=np.power(d_k, 0.5) 24 | ) 25 | self.layer_norm = nn.LayerNorm(d_model) 26 | 27 | self.fc = nn.Linear(n_head * d_v, d_model) 28 | 29 | self.dropout = nn.Dropout(dropout) 30 | 31 | def forward(self, q, k, v, mask=None): 32 | 33 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 34 | 35 | sz_b, len_q, _ = q.size() 36 | sz_b, len_k, _ = k.size() 37 | sz_b, len_v, _ = v.size() 38 | 39 | residual 
= q 40 | 41 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 42 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 43 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 44 | q = ( 45 | q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) 46 | ) # (n*b) x lq x dk 47 | k = ( 48 | k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) 49 | ) # (n*b) x lk x dk 50 | v = ( 51 | v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) 52 | ) # (n*b) x lv x dv 53 | 54 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 55 | output, attn = self.attention(q, k, v, mask=mask) 56 | 57 | output = output.view(n_head, sz_b, len_q, d_v) 58 | output = ( 59 | output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) 60 | ) # b x lq x (n*dv) 61 | 62 | output = self.dropout(self.fc(output)) 63 | output = self.layer_norm(output + residual) 64 | 65 | return output, attn 66 | 67 | 68 | class PositionwiseFeedForward(nn.Module): 69 | """ A two-feed-forward-layer module """ 70 | 71 | def __init__(self, d_in, d_hid, kernel_size, dropout=0.1): 72 | super().__init__() 73 | 74 | # Use Conv1D 75 | # position-wise 76 | self.w_1 = nn.Conv1d( 77 | d_in, 78 | d_hid, 79 | kernel_size=kernel_size[0], 80 | padding=(kernel_size[0] - 1) // 2, 81 | ) 82 | # position-wise 83 | self.w_2 = nn.Conv1d( 84 | d_hid, 85 | d_in, 86 | kernel_size=kernel_size[1], 87 | padding=(kernel_size[1] - 1) // 2, 88 | ) 89 | 90 | self.layer_norm = nn.LayerNorm(d_in) 91 | self.dropout = nn.Dropout(dropout) 92 | 93 | def forward(self, x): 94 | residual = x 95 | output = x.transpose(1, 2) 96 | output = self.w_2(F.relu(self.w_1(output))) 97 | output = output.transpose(1, 2) 98 | output = self.dropout(output) 99 | output = self.layer_norm(output+residual) 100 | 101 | return output -------------------------------------------------------------------------------- /fs_two/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .Models import Encoder, Decoder 2 | from .Layers import PostNet -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Constants.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Layers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Layers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Models.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Modules.cpython-38.pyc -------------------------------------------------------------------------------- 
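Note on the transformer sublayers above: the Encoder/Decoder in Models.py feed each FFTBlock a boolean padding mask of shape [batch, seq_len] together with a self-attention mask expanded to [batch, seq_len, seq_len]. Below is a minimal, hypothetical usage sketch (not part of the repository) that only illustrates those shapes; the hyperparameters are illustrative rather than taken from model.yaml, and the import path assumes the package is importable as fs_two:

    import torch
    from fs_two.transformer.Layers import FFTBlock

    # Illustrative sizes only (assumption, not the repo's config values).
    d_model, n_head = 256, 2
    block = FFTBlock(d_model, n_head, d_k=128, d_v=128,
                     d_inner=1024, kernel_size=[9, 1])

    x = torch.randn(2, 50, d_model)                          # [batch, seq_len, hidden]
    lengths = torch.tensor([50, 35])
    mask = torch.arange(50)[None, :] >= lengths[:, None]     # True marks padded positions
    slf_attn_mask = mask.unsqueeze(1).expand(-1, 50, -1)     # [batch, seq_len, seq_len]

    out, attn = block(x, mask=mask, slf_attn_mask=slf_attn_mask)
    # out: [2, 50, 256] with padded positions zeroed out
    # attn: [n_head * batch, 50, 50] self-attention weights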
/fs_two/transformer/__pycache__/SubLayers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/SubLayers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/utils/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/utils/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/utils/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | # import fs_two.hifigan as hifigan 9 | from fs_two.model import FastSpeech2, ScheduledOptim 10 | 11 | 12 | def get_model(cfg, device, train=False, isModel=True, isEmbedding=True): 13 | 14 | model = FastSpeech2(cfg.preprocess_config, cfg.model_config, device=device) 15 | if cfg.tts.load_path: 16 | 17 | ckpt = torch.load(cfg.tts.load_path, map_location=torch.device("cpu")) 18 | if isModel: 19 | model.load_state_dict(ckpt["model"], strict=False) 20 | if isEmbedding: 21 | try: 22 | model.load_state_dict(ckpt["embedding"], strict=False) 23 | except: 24 | print("missing embedding") 25 | print("Loaded model from", cfg.tts.load_path) 26 | 27 | if train: 28 | # model = nn.DataParallel(model) 29 | model.to(device) 30 | model.train() 31 | scheduled_optim = ScheduledOptim( 32 | model, cfg.train_config, cfg.model_config, cfg.tts.restore_step 33 | ) 34 | return model, scheduled_optim 35 | model.to(device) 36 | model.eval() 37 | model.requires_grad_ = False 38 | return model 39 | 40 | 41 | def get_param_num(model): 42 | num_param = sum(param.numel() for param in model.parameters()) 43 | return num_param 44 | 45 | 46 | def get_vocoder(hifigan, config, device): 47 | if config["vocoder"]["use_cpu"]: 48 | device = "cpu" 49 | name = config["vocoder"]["model"] 50 | speaker = config["vocoder"]["speaker"] 51 | 52 | if name == "MelGAN": 53 | if speaker == "LJSpeech": 54 | vocoder = torch.hub.load( 55 | "descriptinc/melgan-neurips", "load_melgan", "linda_johnson" 56 | ) 57 | elif speaker == "universal": 58 | vocoder = torch.hub.load( 59 | "descriptinc/melgan-neurips", "load_melgan", "multi_speaker" 60 | ) 61 | vocoder.mel2wav.eval() 62 | vocoder.mel2wav.to(device) 63 | elif name == "HiFi-GAN": 64 | with open("./fs_two/hifigan/config.json", "r") as f: 65 | config = json.load(f) 66 | config = hifigan.AttrDict(config) 67 | vocoder = hifigan.Generator(config) 68 | if speaker == "LJSpeech": 69 | ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar") 70 | elif speaker == "universal": 71 | ckpt = torch.load( 72 | "/home/dev/other/fsp/weights/trained_original/hifi/generator_v1.pth", 73 | map_location="cpu", 74 | ) 75 | vocoder.load_state_dict(ckpt["generator"]) 76 | vocoder.eval() 77 | vocoder.remove_weight_norm() 78 | # vocoder = nn.DataParallel(vocoder) 
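        # Note (added comment): at this point in the HiFi-GAN branch the
        # generator has loaded its checkpoint, switched to eval mode and
        # removed weight norm for inference; the next line moves it to the
        # target device ("cpu" when config["vocoder"]["use_cpu"] is set above).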
79 | vocoder.to(device) 80 | 81 | return vocoder 82 | 83 | 84 | def vocoder_infer(mels, vocoder, model_config, preprocess_config, lengths=None): 85 | name = model_config["vocoder"]["model"] 86 | with torch.no_grad(): 87 | if name == "MelGAN": 88 | wavs = vocoder.inverse(mels / np.log(10)) 89 | elif name == "HiFi-GAN": 90 | wavs = vocoder(mels).squeeze(1) 91 | 92 | wavs = ( 93 | wavs.cpu().numpy() 94 | * preprocess_config["preprocessing"]["audio"]["max_wav_value"] 95 | ).astype("int16") 96 | wavs = [wav for wav in wavs] 97 | 98 | for i in range(len(mels)): 99 | if lengths is not None: 100 | wavs[i] = wavs[i][: lengths[i]] 101 | 102 | return wavs 103 | -------------------------------------------------------------------------------- /fs_two/utils/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | import numpy as np 8 | import matplotlib 9 | from scipy.io import wavfile 10 | from matplotlib import pyplot as plt 11 | 12 | matplotlib.use("Agg") 13 | 14 | 15 | def to_device(data, device="cpu"): 16 | if len(data) == 15: 17 | ( 18 | ids, 19 | raw_texts, 20 | speakers, 21 | texts, 22 | src_lens, 23 | max_src_len, 24 | mels, 25 | mel_lens, 26 | max_mel_len, 27 | energies, 28 | durations, 29 | pitches_raw, 30 | pitches_cwt, 31 | pitches_mean, 32 | pitches_std, 33 | ) = data 34 | 35 | speakers = torch.from_numpy(speakers).long().to(device) 36 | texts = torch.from_numpy(texts).long().to(device) 37 | src_lens = torch.from_numpy(src_lens).to(device) 38 | mels = torch.from_numpy(mels).float().to(device) 39 | mel_lens = torch.from_numpy(mel_lens).to(device) 40 | energies = torch.from_numpy(energies).to(device) 41 | durations = torch.from_numpy(durations).long().to(device) 42 | 43 | pitches_cwt = torch.from_numpy(pitches_cwt).float().to(device) 44 | pitches_cwt = torch.nan_to_num(pitches_cwt, nan=0.0) 45 | 46 | pitches_raw = torch.from_numpy(pitches_raw).float().to(device) 47 | pitches_mean = torch.from_numpy(pitches_mean).float().to(device) 48 | pitches_std = torch.from_numpy(pitches_std).float().to(device) 49 | 50 | return ( 51 | ids, 52 | raw_texts, 53 | speakers, 54 | texts, 55 | src_lens, 56 | max_src_len, 57 | mels, 58 | mel_lens, 59 | max_mel_len, 60 | energies, 61 | durations, 62 | pitches_raw, 63 | pitches_cwt, 64 | pitches_mean, 65 | pitches_std, 66 | ) 67 | 68 | if len(data) == 6: 69 | ( 70 | ids, 71 | raw_texts, 72 | speakers, 73 | texts, 74 | src_lens, 75 | max_src_len, 76 | # speakers_emb, 77 | ) = data 78 | 79 | speakers = torch.from_numpy(speakers).long().to(device) 80 | texts = torch.from_numpy(texts).long().to(device) 81 | src_lens = torch.from_numpy(src_lens).to(device) 82 | 83 | return (ids, raw_texts, speakers, texts, src_lens, max_src_len) 84 | 85 | 86 | def log( 87 | logger, 88 | train_val, 89 | step=None, 90 | losses=None, 91 | fig=None, 92 | audio=None, 93 | sampling_rate=22050, 94 | tag="", 95 | ): 96 | losses_names = [ 97 | "Loss/total_loss", 98 | "Loss/mel_loss", 99 | "Loss/pitch_loss", 100 | "Loss/energy_loss", 101 | "Loss/duration_loss ", 102 | "Mean pitch loss", 103 | "Std pitch loss", 104 | ] 105 | 106 | if losses is not None: 107 | log_message = { 108 | f"{losses_names[i]} {train_val.upper()}": losses[i] 109 | for i in range(len(losses)) 110 | } 111 | logger.log(log_message) 112 | 113 | if fig is not None: 114 | logger.log({f"Spec {train_val.upper()}": fig}) 115 | 116 | if audio is not None: 117 | a = [logger.Audio(audio / 
max(abs(audio)), sample_rate=sampling_rate)] 118 | logger.log({f"Audio {train_val.upper()}": a}) 119 | 120 | 121 | def get_mask_from_lengths(lengths, max_len=None, device="cpu"): 122 | batch_size = lengths.shape[0] 123 | if max_len is None: 124 | max_len = torch.max(lengths).item() 125 | 126 | ids = ( 127 | torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(device) 128 | ) 129 | mask = ids >= lengths.unsqueeze(1).float().expand(-1, max_len) 130 | 131 | return mask 132 | 133 | 134 | def expand(values, durations): 135 | out = list() 136 | for value, d in zip(values, durations): 137 | out += [value] * max(0, int(d)) 138 | return np.array(out) 139 | 140 | 141 | def synth_one_sample( 142 | targets, predictions, vocoder, model_config, preprocess_config 143 | ): 144 | b_size = len(targets[0]) 145 | rand_id = random.randint(0, b_size - 1) 146 | basename = targets[0][rand_id] 147 | src_len = predictions[7][rand_id].item() 148 | mel_len = predictions[8][rand_id].item() 149 | mel_target = targets[6][rand_id, :mel_len].detach().transpose(0, 1) 150 | mel_prediction = predictions[9][rand_id, :mel_len].detach().transpose(0, 1) 151 | duration = targets[10][rand_id, :src_len].detach().cpu().numpy() 152 | if ( 153 | preprocess_config["preprocessing"]["pitch"]["feature"] 154 | == "phoneme_level" 155 | ): 156 | pitch = targets[11][rand_id, :src_len].detach().cpu().numpy() 157 | pitch = expand(pitch, duration) 158 | else: 159 | pitch = targets[11][rand_id, :mel_len].detach().cpu().numpy() 160 | if ( 161 | preprocess_config["preprocessing"]["energy"]["feature"] 162 | == "phoneme_level" 163 | ): 164 | energy = targets[9][rand_id, :src_len].detach().cpu().numpy() 165 | energy = expand(energy, duration) 166 | else: 167 | energy = targets[9][rand_id, :mel_len].detach().cpu().numpy() 168 | 169 | with open( 170 | os.path.join( 171 | preprocess_config["path"]["preprocessed_path"], "stats.json" 172 | ) 173 | ) as f: 174 | stats = json.load(f) 175 | stats = stats["pitch"] + stats["energy"][:2] 176 | 177 | fig = plot_mel( 178 | [ 179 | (mel_prediction.cpu().numpy(), pitch, energy), 180 | (mel_target.cpu().numpy(), pitch, energy), 181 | ], 182 | stats, 183 | ["Synthetized Spectrogram", "Ground-Truth Spectrogram"], 184 | ) 185 | 186 | if vocoder is not None: 187 | from .model import vocoder_infer 188 | 189 | if model_config["vocoder"]["use_cpu"]: 190 | mel_target = mel_target.to("cpu") 191 | wav_reconstruction = vocoder_infer( 192 | mel_target.unsqueeze(0), 193 | vocoder, 194 | model_config, 195 | preprocess_config, 196 | )[0] 197 | wav_prediction = vocoder_infer( 198 | mel_prediction.unsqueeze(0), 199 | vocoder, 200 | model_config, 201 | preprocess_config, 202 | )[0] 203 | else: 204 | wav_reconstruction = wav_prediction = None 205 | 206 | return fig, wav_reconstruction, wav_prediction, basename 207 | 208 | 209 | def synth_samples( 210 | targets, predictions, vocoder, model_config, preprocess_config, path 211 | ): 212 | 213 | basenames = targets[0] 214 | for i in range(len(predictions[0])): 215 | basename = basenames[i] 216 | src_len = predictions[7][i].item() 217 | mel_len = predictions[8][i].item() 218 | mel_prediction = predictions[9][i, :mel_len].detach().transpose(0, 1) 219 | duration = predictions[4][i, :src_len].detach().cpu().numpy() 220 | if ( 221 | preprocess_config["preprocessing"]["pitch"]["feature"] 222 | == "phoneme_level" 223 | ): 224 | pitch = predictions[1][i, :src_len].detach().cpu().numpy() 225 | pitch = expand(pitch, duration) 226 | else: 227 | pitch = predictions[1][i, 
:mel_len].detach().cpu().numpy() 228 | if ( 229 | preprocess_config["preprocessing"]["energy"]["feature"] 230 | == "phoneme_level" 231 | ): 232 | energy = predictions[2][i, :src_len].detach().cpu().numpy() 233 | energy = expand(energy, duration) 234 | else: 235 | energy = predictions[2][i, :mel_len].detach().cpu().numpy() 236 | 237 | with open( 238 | os.path.join( 239 | preprocess_config["path"]["preprocessed_path"], "stats.json" 240 | ) 241 | ) as f: 242 | stats = json.load(f) 243 | stats = stats["pitch"] + stats["energy"][:2] 244 | 245 | fig = plot_mel( 246 | [ 247 | (mel_prediction.cpu().numpy(), pitch, energy), 248 | ], 249 | stats, 250 | ["Synthetized Spectrogram"], 251 | ) 252 | plt.savefig(os.path.join(path, "{}.png".format(basename))) 253 | plt.close() 254 | 255 | from .model import vocoder_infer 256 | 257 | mel_predictions = predictions[9].transpose(1, 2) 258 | lengths = ( 259 | predictions[8] 260 | * preprocess_config["preprocessing"]["stft"]["hop_length"] 261 | ) 262 | wav_predictions = vocoder_infer( 263 | mel_predictions, 264 | vocoder, 265 | model_config, 266 | preprocess_config, 267 | lengths=lengths, 268 | ) 269 | 270 | sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"] 271 | for wav, basename in zip(wav_predictions, basenames): 272 | wavfile.write( 273 | os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav 274 | ) 275 | 276 | 277 | def plot_mel(data, stats, titles): 278 | fig, axes = plt.subplots(len(data), 1, squeeze=False) 279 | if titles is None: 280 | titles = [None for i in range(len(data))] 281 | pitch_min, pitch_max, pitch_mean, pitch_std, energy_min, energy_max = stats 282 | pitch_min = pitch_min * pitch_std + pitch_mean 283 | pitch_max = pitch_max * pitch_std + pitch_mean 284 | 285 | def add_axis(fig, old_ax): 286 | ax = fig.add_axes(old_ax.get_position(), anchor="W") 287 | ax.set_facecolor("None") 288 | return ax 289 | 290 | for i in range(len(data)): 291 | mel, pitch, energy = data[i] 292 | pitch = pitch * pitch_std + pitch_mean 293 | axes[i][0].imshow(mel, origin="lower") 294 | axes[i][0].set_aspect(2.5, adjustable="box") 295 | axes[i][0].set_ylim(0, mel.shape[0]) 296 | axes[i][0].set_title(titles[i], fontsize="medium") 297 | axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False) 298 | axes[i][0].set_anchor("W") 299 | 300 | ax1 = add_axis(fig, axes[i][0]) 301 | ax1.plot(pitch, color="tomato") 302 | ax1.set_xlim(0, mel.shape[1]) 303 | ax1.set_ylim(0, pitch_max) 304 | ax1.set_ylabel("F0", color="tomato") 305 | ax1.tick_params( 306 | labelsize="x-small", 307 | colors="tomato", 308 | bottom=False, 309 | labelbottom=False, 310 | ) 311 | 312 | ax2 = add_axis(fig, axes[i][0]) 313 | ax2.plot(energy, color="darkviolet") 314 | ax2.set_xlim(0, mel.shape[1]) 315 | ax2.set_ylim(energy_min, energy_max) 316 | ax2.set_ylabel("Energy", color="darkviolet") 317 | ax2.yaxis.set_label_position("right") 318 | ax2.tick_params( 319 | labelsize="x-small", 320 | colors="darkviolet", 321 | bottom=False, 322 | labelbottom=False, 323 | left=False, 324 | labelleft=False, 325 | right=True, 326 | labelright=True, 327 | ) 328 | 329 | return fig 330 | 331 | 332 | def pad_1D(inputs, PAD=0): 333 | def pad_data(x, length, PAD): 334 | x_padded = np.pad( 335 | x, (0, length - x.shape[0]), mode="constant", constant_values=PAD 336 | ) 337 | return x_padded 338 | 339 | max_len = max((len(x) for x in inputs)) 340 | padded = np.stack([pad_data(x, max_len, PAD) for x in inputs]) 341 | 342 | return padded 343 | 344 | 345 | def pad_2D(inputs, 
maxlen=None): 346 | def pad(x, max_len): 347 | PAD = 0 348 | if np.shape(x)[0] > max_len: 349 | raise ValueError("not max_len") 350 | 351 | s = np.shape(x)[1] 352 | x_padded = np.pad( 353 | x, 354 | (0, max_len - np.shape(x)[0]), 355 | mode="constant", 356 | constant_values=PAD, 357 | ) 358 | return x_padded[:, :s] 359 | 360 | if maxlen: 361 | output = np.stack([pad(x, maxlen) for x in inputs]) 362 | else: 363 | max_len = max(np.shape(x)[0] for x in inputs) 364 | output = np.stack([pad(x, max_len) for x in inputs]) 365 | 366 | return output 367 | 368 | 369 | def pad(input_ele, mel_max_length=None): 370 | if mel_max_length: 371 | max_len = mel_max_length 372 | else: 373 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 374 | 375 | out_list = list() 376 | for i, batch in enumerate(input_ele): 377 | if len(batch.shape) == 1: 378 | one_batch_padded = F.pad( 379 | batch, (0, max_len - batch.size(0)), "constant", 0.0 380 | ) 381 | elif len(batch.shape) == 2: 382 | one_batch_padded = F.pad( 383 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 384 | ) 385 | out_list.append(one_batch_padded) 386 | out_padded = torch.stack(out_list) 387 | return out_padded 388 | -------------------------------------------------------------------------------- /fsapi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import numpy as np 5 | 6 | from fs_two.model import FastSpeech2 7 | 8 | 9 | class FSTWOapi: 10 | def __init__(self, config, device=0): 11 | weights_path = config.tts.weights_path 12 | model_folder = "/".join(weights_path.split("/")[:-1]) 13 | config.preprocess_config.path.preprocessed_path = model_folder 14 | 15 | self.speakers_dict, self.speaker_names = load_speakers_json( 16 | config.preprocess_config.path.preprocessed_path 17 | ) 18 | 19 | self.model = FastSpeech2( 20 | config.preprocess_config, 21 | config.model_config, 22 | len(self.speaker_names), 23 | ).to(device) 24 | # Load checkpoint if exists 25 | self.weights_path = weights_path 26 | if weights_path is not None: 27 | checkpoint = torch.load(weights_path, map_location="cpu") 28 | state = checkpoint["model"] 29 | state['speaker_emb.weight'] = checkpoint["embedding"] 30 | self.model.load_state_dict(checkpoint["model"]) 31 | 32 | self.cfg = config 33 | self.device = device 34 | 35 | # TODO get the righ restore step 36 | self.restore_step = 0 37 | 38 | def generate( 39 | self, 40 | phonemes, 41 | duration_control=1.0, 42 | pitch_control=1.0, 43 | energy_control=1.0, 44 | speaker_name=None, 45 | ): 46 | 47 | if speaker_name is not None: 48 | if not speaker_name in self.speakers_dict: 49 | raise Exception( 50 | f"Speaker {speaker_name} was not found in speakers.json" 51 | ) 52 | speaker_id = self.speakers_dict[speaker_name] 53 | speaker = torch.tensor(speaker_id).long().unsqueeze(0) 54 | speaker = speaker.to(self.device) 55 | self.model.eval() 56 | src_len = np.array([len(phonemes[0])]) 57 | result = self.model( 58 | speaker, 59 | torch.from_numpy(phonemes).long().to(self.device), 60 | torch.from_numpy(src_len).to(self.device), 61 | max(src_len), 62 | d_control=duration_control, 63 | p_control=pitch_control, 64 | e_control=energy_control, 65 | ) 66 | 67 | ( 68 | output, 69 | p_predictions, 70 | e_predictions, 71 | log_d_predictions, 72 | d_rounded, 73 | src_masks, 74 | mel_masks, 75 | src_lens, 76 | mel_lens, 77 | postnet_output, 78 | pitch_mean, 79 | pitch_std, 80 | ) = result 81 | 82 | return postnet_output 83 | 84 | 85 | def 
load_speakers_json(dir_path): 86 | json_paht = os.path.join(dir_path, "speakers.json") 87 | if os.path.exists(json_paht): 88 | with open( 89 | json_paht, 90 | "r", 91 | ) as f: 92 | speakers = json.load(f) 93 | else: 94 | print(f'Did not find speakers.josn at {dir_path}') 95 | 96 | return speakers, list(speakers.keys()) 97 | -------------------------------------------------------------------------------- /hifi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__init__.py -------------------------------------------------------------------------------- /hifi/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) 16 | -------------------------------------------------------------------------------- /hifi/meldataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | import torch.utils.data 6 | import numpy as np 7 | from librosa.util import normalize 8 | from scipy.io.wavfile import read 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | MAX_WAV_VALUE = 32768.0 12 | 13 | 14 | def load_wav(full_path): 15 | sampling_rate, data = read(full_path) 16 | return data, sampling_rate 17 | 18 | 19 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 20 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 21 | 22 | 23 | def dynamic_range_decompression(x, C=1): 24 | return np.exp(x) / C 25 | 26 | 27 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 28 | return torch.log(torch.clamp(x, min=clip_val) * C) 29 | 30 | 31 | def dynamic_range_decompression_torch(x, C=1): 32 | return torch.exp(x) / C 33 | 34 | 35 | def spectral_normalize_torch(magnitudes): 36 | output = dynamic_range_compression_torch(magnitudes) 37 | return output 38 | 39 | 40 | def spectral_de_normalize_torch(magnitudes): 41 | output = dynamic_range_decompression_torch(magnitudes) 42 | return output 43 | 44 | 45 | mel_basis = {} 46 | hann_window = {} 47 | 48 | 49 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 50 | if torch.min(y) < -1.: 51 | print('min value is ', torch.min(y)) 52 | if torch.max(y) > 1.: 53 | print('max value is ', torch.max(y)) 54 | 55 | global mel_basis, hann_window 56 | if fmax not in mel_basis: 57 | mel = 
librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 58 | mel_basis[str(fmax)+'_'+str(y.device) 59 | ] = torch.from_numpy(mel).float().to(y.device) 60 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 61 | 62 | y = torch.nn.functional.pad(y.unsqueeze( 63 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad( 140 | audio, (0, self.segment_size - audio.size(1)), 'constant') 141 | 142 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 143 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 144 | center=False) 145 | else: 146 | mel = np.load( 147 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 148 | mel = 
torch.from_numpy(mel) 149 | 150 | if len(mel.shape) < 3: 151 | mel = mel.unsqueeze(0) 152 | 153 | if self.split: 154 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 155 | 156 | if audio.size(1) >= self.segment_size: 157 | mel_start = random.randint( 158 | 0, mel.size(2) - frames_per_seg - 1) 159 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 160 | audio = audio[:, mel_start * 161 | self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 162 | else: 163 | mel = torch.nn.functional.pad( 164 | mel, (0, frames_per_seg - mel.size(2)), 'constant') 165 | audio = torch.nn.functional.pad( 166 | audio, (0, self.segment_size - audio.size(1)), 'constant') 167 | 168 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 169 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 170 | center=False) 171 | 172 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 173 | 174 | def __len__(self): 175 | return len(self.audio_files) 176 | -------------------------------------------------------------------------------- /hifi/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 5 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 6 | from hifi.vocoder.utils import init_weights, get_padding 7 | 8 | 9 | LRELU_SLOPE = 0.1 10 | 11 | 12 | class ResBlock1(torch.nn.Module): 13 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 14 | super(ResBlock1, self).__init__() 15 | self.h = h 16 | self.convs1 = nn.ModuleList( 17 | [ 18 | weight_norm( 19 | Conv1d( 20 | channels, 21 | channels, 22 | kernel_size, 23 | 1, 24 | dilation=dilation[0], 25 | padding=get_padding(kernel_size, dilation[0]), 26 | ) 27 | ), 28 | weight_norm( 29 | Conv1d( 30 | channels, 31 | channels, 32 | kernel_size, 33 | 1, 34 | dilation=dilation[1], 35 | padding=get_padding(kernel_size, dilation[1]), 36 | ) 37 | ), 38 | weight_norm( 39 | Conv1d( 40 | channels, 41 | channels, 42 | kernel_size, 43 | 1, 44 | dilation=dilation[2], 45 | padding=get_padding(kernel_size, dilation[2]), 46 | ) 47 | ), 48 | ] 49 | ) 50 | self.convs1.apply(init_weights) 51 | 52 | self.convs2 = nn.ModuleList( 53 | [ 54 | weight_norm( 55 | Conv1d( 56 | channels, 57 | channels, 58 | kernel_size, 59 | 1, 60 | dilation=1, 61 | padding=get_padding(kernel_size, 1), 62 | ) 63 | ), 64 | weight_norm( 65 | Conv1d( 66 | channels, 67 | channels, 68 | kernel_size, 69 | 1, 70 | dilation=1, 71 | padding=get_padding(kernel_size, 1), 72 | ) 73 | ), 74 | weight_norm( 75 | Conv1d( 76 | channels, 77 | channels, 78 | kernel_size, 79 | 1, 80 | dilation=1, 81 | padding=get_padding(kernel_size, 1), 82 | ) 83 | ), 84 | ] 85 | ) 86 | self.convs2.apply(init_weights) 87 | 88 | def forward(self, x): 89 | for c1, c2 in zip(self.convs1, self.convs2): 90 | xt = F.leaky_relu(x, LRELU_SLOPE) 91 | xt = c1(xt) 92 | xt = F.leaky_relu(xt, LRELU_SLOPE) 93 | xt = c2(xt) 94 | x = xt + x 95 | return x 96 | 97 | def remove_weight_norm(self): 98 | for l in self.convs1: 99 | remove_weight_norm(l) 100 | for l in self.convs2: 101 | remove_weight_norm(l) 102 | 103 | 104 | class ResBlock2(torch.nn.Module): 105 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 106 | super(ResBlock2, self).__init__() 107 | self.h = h 108 | self.convs = nn.ModuleList( 109 | [ 110 | weight_norm( 111 | Conv1d( 112 | channels, 113 | 
channels, 114 | kernel_size, 115 | 1, 116 | dilation=dilation[0], 117 | padding=get_padding(kernel_size, dilation[0]), 118 | ) 119 | ), 120 | weight_norm( 121 | Conv1d( 122 | channels, 123 | channels, 124 | kernel_size, 125 | 1, 126 | dilation=dilation[1], 127 | padding=get_padding(kernel_size, dilation[1]), 128 | ) 129 | ), 130 | ] 131 | ) 132 | self.convs.apply(init_weights) 133 | 134 | def forward(self, x): 135 | for c in self.convs: 136 | xt = F.leaky_relu(x, LRELU_SLOPE) 137 | xt = c(xt) 138 | x = xt + x 139 | return x 140 | 141 | def remove_weight_norm(self): 142 | for l in self.convs: 143 | remove_weight_norm(l) 144 | 145 | 146 | class Generator(torch.nn.Module): 147 | def __init__(self, h): 148 | super(Generator, self).__init__() 149 | self.h = h 150 | self.num_kernels = len(h.resblock_kernel_sizes) 151 | self.num_upsamples = len(h.upsample_rates) 152 | self.conv_pre = weight_norm( 153 | Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) 154 | ) 155 | resblock = ResBlock1 if h.resblock == "1" else ResBlock2 156 | 157 | self.ups = nn.ModuleList() 158 | for i, (u, k) in enumerate( 159 | zip(h.upsample_rates, h.upsample_kernel_sizes) 160 | ): 161 | self.ups.append( 162 | weight_norm( 163 | ConvTranspose1d( 164 | h.upsample_initial_channel // (2 ** i), 165 | h.upsample_initial_channel // (2 ** (i + 1)), 166 | k, 167 | u, 168 | padding=(k - u) // 2, 169 | ) 170 | ) 171 | ) 172 | 173 | self.resblocks = nn.ModuleList() 174 | for i in range(len(self.ups)): 175 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 176 | for j, (k, d) in enumerate( 177 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 178 | ): 179 | self.resblocks.append(resblock(h, ch, k, d)) 180 | 181 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 182 | self.ups.apply(init_weights) 183 | self.conv_post.apply(init_weights) 184 | 185 | def forward(self, x): 186 | x = self.conv_pre(x) 187 | for i in range(self.num_upsamples): 188 | x = F.leaky_relu(x, LRELU_SLOPE) 189 | x = self.ups[i](x) 190 | xs = None 191 | for j in range(self.num_kernels): 192 | if xs is None: 193 | xs = self.resblocks[i * self.num_kernels + j](x) 194 | else: 195 | xs += self.resblocks[i * self.num_kernels + j](x) 196 | x = xs / self.num_kernels 197 | x = F.leaky_relu(x) 198 | x = self.conv_post(x) 199 | x = torch.tanh(x) 200 | 201 | return x 202 | 203 | def remove_weight_norm(self): 204 | print("Removing weight norm for inference HIFI GAN...") 205 | for l in self.ups: 206 | remove_weight_norm(l) 207 | for l in self.resblocks: 208 | l.remove_weight_norm() 209 | remove_weight_norm(self.conv_pre) 210 | remove_weight_norm(self.conv_post) 211 | 212 | 213 | class DiscriminatorP(torch.nn.Module): 214 | def __init__( 215 | self, period, kernel_size=5, stride=3, use_spectral_norm=False 216 | ): 217 | super(DiscriminatorP, self).__init__() 218 | self.period = period 219 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 220 | self.convs = nn.ModuleList( 221 | [ 222 | norm_f( 223 | Conv2d( 224 | 1, 225 | 32, 226 | (kernel_size, 1), 227 | (stride, 1), 228 | padding=(get_padding(5, 1), 0), 229 | ) 230 | ), 231 | norm_f( 232 | Conv2d( 233 | 32, 234 | 128, 235 | (kernel_size, 1), 236 | (stride, 1), 237 | padding=(get_padding(5, 1), 0), 238 | ) 239 | ), 240 | norm_f( 241 | Conv2d( 242 | 128, 243 | 512, 244 | (kernel_size, 1), 245 | (stride, 1), 246 | padding=(get_padding(5, 1), 0), 247 | ) 248 | ), 249 | norm_f( 250 | Conv2d( 251 | 512, 252 | 1024, 253 | (kernel_size, 1), 254 | (stride, 1), 255 | padding=(get_padding(5, 
1), 0), 256 | ) 257 | ), 258 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 259 | ] 260 | ) 261 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 262 | 263 | def forward(self, x): 264 | fmap = [] 265 | 266 | # 1d to 2d 267 | b, c, t = x.shape 268 | if t % self.period != 0: # pad first 269 | n_pad = self.period - (t % self.period) 270 | x = F.pad(x, (0, n_pad), "reflect") 271 | t = t + n_pad 272 | x = x.view(b, c, t // self.period, self.period) 273 | 274 | for l in self.convs: 275 | x = l(x) 276 | x = F.leaky_relu(x, LRELU_SLOPE) 277 | fmap.append(x) 278 | x = self.conv_post(x) 279 | fmap.append(x) 280 | x = torch.flatten(x, 1, -1) 281 | 282 | return x, fmap 283 | 284 | 285 | class MultiPeriodDiscriminator(torch.nn.Module): 286 | def __init__(self): 287 | super(MultiPeriodDiscriminator, self).__init__() 288 | self.discriminators = nn.ModuleList( 289 | [ 290 | DiscriminatorP(2), 291 | DiscriminatorP(3), 292 | DiscriminatorP(5), 293 | DiscriminatorP(7), 294 | DiscriminatorP(11), 295 | ] 296 | ) 297 | 298 | def forward(self, y, y_hat): 299 | y_d_rs = [] 300 | y_d_gs = [] 301 | fmap_rs = [] 302 | fmap_gs = [] 303 | for i, d in enumerate(self.discriminators): 304 | y_d_r, fmap_r = d(y) 305 | y_d_g, fmap_g = d(y_hat) 306 | y_d_rs.append(y_d_r) 307 | fmap_rs.append(fmap_r) 308 | y_d_gs.append(y_d_g) 309 | fmap_gs.append(fmap_g) 310 | 311 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 312 | 313 | 314 | class DiscriminatorS(torch.nn.Module): 315 | def __init__(self, use_spectral_norm=False): 316 | super(DiscriminatorS, self).__init__() 317 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 318 | self.convs = nn.ModuleList( 319 | [ 320 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 321 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 322 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 323 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 324 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 325 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 326 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 327 | ] 328 | ) 329 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 330 | 331 | def forward(self, x): 332 | fmap = [] 333 | for l in self.convs: 334 | x = l(x) 335 | x = F.leaky_relu(x, LRELU_SLOPE) 336 | fmap.append(x) 337 | x = self.conv_post(x) 338 | fmap.append(x) 339 | x = torch.flatten(x, 1, -1) 340 | 341 | return x, fmap 342 | 343 | 344 | class MultiScaleDiscriminator(torch.nn.Module): 345 | def __init__(self): 346 | super(MultiScaleDiscriminator, self).__init__() 347 | self.discriminators = nn.ModuleList( 348 | [ 349 | DiscriminatorS(use_spectral_norm=True), 350 | DiscriminatorS(), 351 | DiscriminatorS(), 352 | ] 353 | ) 354 | self.meanpools = nn.ModuleList( 355 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] 356 | ) 357 | 358 | def forward(self, y, y_hat): 359 | y_d_rs = [] 360 | y_d_gs = [] 361 | fmap_rs = [] 362 | fmap_gs = [] 363 | for i, d in enumerate(self.discriminators): 364 | if i != 0: 365 | y = self.meanpools[i - 1](y) 366 | y_hat = self.meanpools[i - 1](y_hat) 367 | y_d_r, fmap_r = d(y) 368 | y_d_g, fmap_g = d(y_hat) 369 | y_d_rs.append(y_d_r) 370 | fmap_rs.append(fmap_r) 371 | y_d_gs.append(y_d_g) 372 | fmap_gs.append(fmap_g) 373 | 374 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 375 | 376 | 377 | def feature_loss(fmap_r, fmap_g): 378 | loss = 0 379 | for dr, dg in zip(fmap_r, fmap_g): 380 | for rl, gl in zip(dr, dg): 381 | loss += torch.mean(torch.abs(rl - gl)) 382 | 383 | 
return loss * 2 384 | 385 | 386 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 387 | loss = 0 388 | r_losses = [] 389 | g_losses = [] 390 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 391 | r_loss = torch.mean((1 - dr) ** 2) 392 | g_loss = torch.mean(dg ** 2) 393 | loss += r_loss + g_loss 394 | r_losses.append(r_loss.item()) 395 | g_losses.append(g_loss.item()) 396 | 397 | return loss, r_losses, g_losses 398 | 399 | 400 | def generator_loss(disc_outputs): 401 | loss = 0 402 | gen_losses = [] 403 | for dg in disc_outputs: 404 | l = torch.mean((1 - dg) ** 2) 405 | gen_losses.append(l) 406 | loss += l 407 | 408 | return loss, gen_losses -------------------------------------------------------------------------------- /hifi/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pylab as plt 2 | import glob 3 | import os 4 | import matplotlib 5 | import torch 6 | from torch.nn.utils import weight_norm 7 | matplotlib.use("Agg") 8 | 9 | 10 | def plot_spectrogram(spectrogram): 11 | fig, ax = plt.subplots(figsize=(10, 2)) 12 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 13 | interpolation='none') 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size*dilation - dilation)/2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def scan_checkpoint(cp_dir, prefix): 53 | pattern = os.path.join(cp_dir, prefix + '????????') 54 | cp_list = glob.glob(pattern) 55 | if len(cp_list) == 0: 56 | return None 57 | return sorted(cp_list)[-1] 58 | -------------------------------------------------------------------------------- /hifi/vocoder/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/vocoder/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pylab as plt 2 | import glob 3 | import os 4 | import matplotlib 5 | import torch 6 | from torch.nn.utils import weight_norm 7 | 8 | matplotlib.use("Agg") 9 | 10 | 11 | def plot_spectrogram(spectrogram): 12 | fig, ax = plt.subplots(figsize=(10, 2)) 13 | im = ax.imshow( 14 | spectrogram, aspect="auto", origin="lower", interpolation="none" 15 | ) 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def 
apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def get_padding(kernel_size, dilation=1): 37 | return int((kernel_size * dilation - dilation) / 2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + "????????") 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] -------------------------------------------------------------------------------- /hifiapi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from hifi.models import Generator 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | class HIFIapi: 12 | def __init__(self, config, device="gpu"): 13 | if config.model_config["vocoder"]["use_cpu"]: 14 | device = "cpu" 15 | 16 | # Load checkpoint if exists 17 | weights_path = config.hifi.weights_path 18 | 19 | self.model = Generator(config.hifi) 20 | if weights_path is not None: 21 | checkpoint = torch.load(weights_path, map_location="cpu") 22 | self.model.load_state_dict(checkpoint["generator"]) 23 | 24 | self.cfg = config 25 | self.device = device 26 | 27 | self.model.to(device) 28 | self.model.remove_weight_norm() 29 | self.model.eval() 30 | 31 | # TODO: 32 | def train(self): 33 | raise NotImplementedError("Train for HiFi has not been implemented yet") 34 | 35 | def __call__(self, x): 36 | x = x.to(self.device) 37 | # use __call__ for compatibility with other vocoders or functions 38 | return self.model(x) 39 | 40 | def generate(self, mel_specs): 41 | """ 42 | Converts a batch of mel spectrograms into audio. 43 | Returns int16 audio on the CPU. 
44 | mel_specs - a batch of mel spectrograms 45 | """ 46 | 47 | self.model.eval() 48 | with torch.no_grad(): 49 | audio = self.model(mel_specs) 50 | audio = audio * self.cfg.hifi.MAX_WAV_VALUE 51 | audio = audio.cpu().numpy().astype("int16") 52 | return audio 53 | -------------------------------------------------------------------------------- /input_process.py: -------------------------------------------------------------------------------- 1 | import re 2 | from string import punctuation 3 | 4 | import numpy as np 5 | 6 | # from g2p_en import G2p  # needed by preprocess_eng; uncomment to enable English G2P 7 | from fs_two.text import text_to_sequence 8 | from russian_g2p.Transcription import Transcription 9 | 10 | # NO CLEANERS FOR RUSSIAN DATASET 11 | CLEANERS = [] 12 | transcriptor = Transcription() 13 | 14 | def read_lexicon(lex_path): 15 | lexicon = {} 16 | with open(lex_path) as f: 17 | for line in f: 18 | temp = re.split(r"\s+", line.strip("\n")) 19 | word = temp[0] 20 | phones = temp[1:] 21 | if word.lower() not in lexicon: 22 | lexicon[word.lower()] = phones 23 | return lexicon 24 | 25 | 26 | def preprocess_eng(text, preprocess_config): 27 | text = text.rstrip(punctuation) 28 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 29 | 30 | g2p = G2p() 31 | phones = [] 32 | words = re.split(r"([,;.\-\?\!\s+])", text) 33 | for w in words: 34 | if w.lower() in lexicon: 35 | phones += lexicon[w.lower()] 36 | else: 37 | phones += list(filter(lambda p: p != " ", g2p(w))) 38 | phones = "{" + "}{".join(phones) + "}" 39 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 40 | phones = phones.replace("}{", " ") 41 | 42 | print("Raw Text Sequence: {}".format(text)) 43 | print("Phoneme Sequence: {}".format(phones)) 44 | sequence = np.array(text_to_sequence(phones, CLEANERS)) 45 | 46 | return np.array(sequence) 47 | 48 | 49 | def preprocess_lang(text, preprocess_config): 50 | text = text.rstrip(punctuation) 51 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 52 | 53 | phones = [] 54 | words = re.split(r"([,;.\-\?\!\s+])", text) 55 | for w in words: 56 | if w.lower() in lexicon: 57 | phones += lexicon[w.lower()] 58 | else: 59 | phones += "." 
60 | phones = "{" + "}{".join(phones) + "}" 61 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 62 | phones = phones.replace("}{", " ") 63 | 64 | print("Raw Text Sequence: {}".format(text)) 65 | print("Phoneme Sequence: {}".format(phones)) 66 | sequence = np.array(text_to_sequence(phones, CLEANERS)) 67 | 68 | return np.array(sequence) 69 | 70 | 71 | def preprocess_rus(text): 72 | 73 | text = text.rstrip(punctuation) 74 | phones = [] 75 | words = re.split(r"([,;.\-\?\!\s+])", text) 76 | sentences = transcriptor.transcribe([text])[0] 77 | phones = [phoneme for s in sentences for phoneme in s+['sp']] 78 | phones = "{" + "}{".join(phones) + "}" 79 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 80 | phones = phones.replace("}{", " ") 81 | 82 | print("Raw Text Sequence: {}".format(text)) 83 | print("Phoneme Sequence: {}".format(phones)) 84 | sequence = np.array(text_to_sequence(phones, [])) 85 | 86 | return np.array(sequence) 87 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from fs_two.preprocessor.preprocessor import Preprocessor 3 | 4 | 5 | if __name__ == "__main__": 6 | preprocess_config = OmegaConf.load("./config.yaml")["preprocess_config"] 7 | preprocessor = Preprocessor(preprocess_config) 8 | preprocessor.build_from_path() 9 | 10 | -------------------------------------------------------------------------------- /pretrained/speakers.json: -------------------------------------------------------------------------------- 1 | {"Schirvind_A_abooks_voxforge": 0, "user1_mozilla": 1, "nikolaev_ailab": 2, "Litvinov_I_abooks_voxforge": 3, "user4_mozilla": 4, "mar_abooks_voxforge": 5, "Arhipova_Natalja_abooks_voxforge": 6, "Medvedeva_Galcova_Olga_abooks_voxforge": 7, "june_shaman": 8, "Tarinicheva_Tatjana_abooks_voxforge": 9, "Kvasha_Igor_abooks_voxforge": 10, "morti_shaman": 11, "Trifilov_Nikolai_abooks_voxforge": 12, "user17_mozilla": 13, "user26_mozilla": 14, "user12_mozilla": 15, "Sytnik_I_abooks_voxforge": 16, "user8_mozilla": 17, "Larionova-Ludm_abooks_voxforge": 18, "Bolshakova_Ksenija_abooks_voxforge": 19, "user5_mozilla": 20, "Kuznetsov_Vsevolod_abooks_voxforge": 21, "Kovaleva_Anna_abooks_voxforge": 22, "Suetin_Pavel_abooks_voxforge": 23, "user7_mozilla": 24, "Konjahin_V_abooks_voxforge": 25, "len_shaman": 26, "Stukalov_Vladimir_abooks_voxforge": 27, "user20_mozilla": 28, "Terenkov_Alexandr_abooks_voxforge": 29, "Taratorkin_Georgiy_abooks_voxforge": 30, "Vasiljev_Y_abooks_voxforge": 31, "Martjanov_O_abooks_voxforge": 32, "Chebaturkina_Elena_abooks_voxforge": 33, "Muhametzyanov_Radik_abooks_voxforge": 34, "Rezalin_Aleksandr_abooks_voxforge": 35, "russian_single": 36, "Zozulin_Viktor_abooks_voxforge": 37, "Zhirnov_Sergey_abooks_voxforge": 38, "hajdurova_ailab": 39, "user6_mozilla": 40, "Vesnik_E_abooks_voxforge": 41, "ira_abooks_voxforge": 42, "Kotov_Alexandr_abooks_voxforge": 43, "vsh_abooks_voxforge": 44, "minaev_ailab": 45, "joh_abooks_voxforge": 46, "Goblin_abooks_voxforge": 47, "Karpov_N_abooks_voxforge": 48, "user11_mozilla": 49, "Larionov_Vsevolod_abooks_voxforge": 50, "Kaljagin_A_abooks_voxforge": 51, "Vorobjeva_Irina_abooks_voxforge": 52, "Rosljakov_Mixail_abooks_voxforge": 53, "Kononov_Mikhail_abooks_voxforge": 54, "Efremov_Oleg_abooks_voxforge": 55, "Vihrov_V_abooks_voxforge": 56, "Pokrovsky_Boris_abooks_voxforge": 57, "noname_opentts": 58, "DrLutz_abooks_voxforge": 59, "Kuznetsov_Alexei_abooks_voxforge": 
60, "Sushkov_Vladimir_abooks_voxforge": 61, "Grigorjev_Yurii_abooks_voxforge": 62, "Markin_Petr_abooks_voxforge": 63, "Popova_Alevtina_abooks_voxforge": 64, "Airapetova_Darja_abooks_voxforge": 65} -------------------------------------------------------------------------------- /pretrained/stats.json: -------------------------------------------------------------------------------- 1 | {"pitch": [-7.016496333880942, 9.535745656686476, -0.03811425926007669, 0.9034625186368779], "energy": [-1.4277896881103516, 6.057352542877197, 58.567213377773356, 41.96484938662417]} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | g2p-en == 2.1.0 2 | inflect == 4.1.0 3 | librosa == 0.7.2 4 | matplotlib == 3.2.2 5 | numba == 0.48 6 | numpy == 1.19.0 7 | pypinyin==0.39.0 8 | pyworld == 0.2.10 9 | PyYAML==5.4.1 10 | scikit-learn==0.23.2 11 | scipy == 1.5.0 12 | soundfile==0.10.3.post1 13 | tgt == 1.4.4 14 | torch == 1.7.0 15 | tqdm==4.46.1 16 | unidecode == 1.1.1 17 | pycwt -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import math as m 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | from torch.utils.data import DataLoader 8 | 9 | # from torch.utils.tensorboard import SummaryWriter 10 | import wandb as logger 11 | 12 | from tqdm import tqdm 13 | from omegaconf import OmegaConf 14 | 15 | from hifiapi import HIFIapi 16 | 17 | from fs_two.utils.model import get_model, get_param_num 18 | from fs_two.utils.tools import to_device, log, synth_one_sample 19 | from fs_two.model import FastSpeech2Loss 20 | from fs_two.dataset import Dataset 21 | from fs_two.evaluate import evaluate 22 | 23 | 24 | def main_train_step( 25 | model, 26 | batch, 27 | step, 28 | optimizer, 29 | cfg, 30 | Loss, 31 | ): 32 | 33 | grad_acc_step = cfg.train_config["optimizer"]["grad_acc_step"] 34 | grad_clip_thresh = cfg.train_config["optimizer"]["grad_clip_thresh"] 35 | 36 | output = model(*(batch[2:])) 37 | 38 | losses = Loss(batch, output) 39 | total_loss = losses[0] 40 | 41 | # Backward 42 | 43 | total_loss = total_loss / grad_acc_step 44 | total_loss.backward() 45 | losses = [l.item() / grad_acc_step for l in losses[1:]] 46 | 47 | if step % grad_acc_step == 0: 48 | # Clipping gradients to avoid gradient explosion 49 | 50 | # Update weights 51 | # optimizer.update_lr() 52 | nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh) 53 | optimizer.step_and_update_lr() 54 | optimizer.zero_grad() 55 | 56 | return losses, output 57 | 58 | 59 | def train_logger(losses, step, total_step, outer_bar, log, logger): 60 | 61 | losses = [sum(losses)] + losses 62 | message1 = "Step {}/{}, ".format(step, total_step) 63 | message2 = """Total Loss: {:.4f}, 64 | Mel Loss: {:.4f}, 65 | Pitch Loss: {:.4f}, 66 | Energy Loss: {:.4f}, 67 | Duration Loss: {:.4f} 68 | Mean pitch: {:.4f} 69 | Std pitch: {:.4f} 70 | """.format( 71 | *losses 72 | ) 73 | 74 | outer_bar.write(message1 + message2) 75 | log(logger, "train", step, losses=losses) 76 | 77 | 78 | def main(cfg): 79 | print("Prepare training ...") 80 | 81 | device = cfg.gpu 82 | # Get dataset 83 | dataset = Dataset( 84 | "train.txt", 85 | cfg.preprocess_config, 86 | cfg.train_config, 87 | sort=True, 88 | drop_last=True, 89 | ) 90 | batch_size = cfg.train_config["optimizer"]["batch_size"] 91 | 
group_size = 4 # Set this larger than 1 to enable sorting in Dataset 92 | assert batch_size * group_size < len(dataset) 93 | loader = DataLoader( 94 | dataset, 95 | batch_size=batch_size * group_size, 96 | shuffle=True, 97 | collate_fn=dataset.collate_fn, 98 | num_workers=4, 99 | ) 100 | 101 | # Prepare model 102 | model, optimizer = get_model(cfg, device, train=True) 103 | 104 | # model = nn.DataParallel(model) 105 | num_param = get_param_num(model) 106 | Loss = FastSpeech2Loss(cfg.preprocess_config, cfg.model_config) 107 | print("Number of FastSpeech2 Parameters:", num_param) 108 | 109 | # Load vocoder 110 | vocoder = HIFIapi(cfg, cfg.gpu) 111 | 112 | # Init logger 113 | for p in cfg.train_config["path"].values(): 114 | os.makedirs(p, exist_ok=True) 115 | 116 | os.environ["WANDB_API_KEY"] = cfg.logger.wandb_key 117 | if cfg.logger.offline: 118 | os.environ["WANDB_MODE"] = "offline" 119 | 120 | logger.init(name=cfg.exp_name, project="FS2", reinit=True) 121 | 122 | # Training 123 | 124 | step = cfg.tts.restore_step + 1 125 | epoch = 1 126 | total_step = cfg.train_config["step"]["total_step"] 127 | outer_bar = tqdm(total=total_step, desc="Training", position=0) 128 | outer_bar.n = cfg.tts.restore_step 129 | outer_bar.update() 130 | 131 | if cfg.run_debug_eval: 132 | print("RUN SANITY CHECK EVAL:") 133 | message = evaluate(model, 0, cfg, logger, "val", vocoder, cfg.gpu) 134 | 135 | while True: 136 | inner_bar = tqdm( 137 | total=len(loader), desc="Epoch {}".format(epoch), position=1 138 | ) 139 | for batchs in loader: 140 | for batch in batchs: 141 | batch = to_device(batch, device) 142 | 143 | # Forward 144 | 145 | losses, output = main_train_step( 146 | model, 147 | batch, 148 | step, 149 | optimizer, 150 | cfg, 151 | Loss, 152 | ) 153 | 154 | if step % cfg.train_config.step.log_step == 0: 155 | train_logger( 156 | losses, 157 | step, 158 | total_step, 159 | outer_bar, 160 | log, 161 | logger, 162 | ) 163 | 164 | if step % cfg.train_config.step.synth_step == 0: 165 | ( 166 | fig, 167 | wav_reconstruction, 168 | wav_prediction, 169 | tag, 170 | ) = synth_one_sample( 171 | batch, 172 | output, 173 | vocoder, 174 | cfg.model_config, 175 | cfg.preprocess_config, 176 | ) 177 | log( 178 | logger, 179 | "train", 180 | fig=fig, 181 | tag="Training/step_{}_{}".format(step, tag), 182 | ) 183 | sampling_rate = cfg.preprocess_config["preprocessing"][ 184 | "audio" 185 | ]["sampling_rate"] 186 | log( 187 | logger, 188 | "train", 189 | audio=wav_reconstruction, 190 | sampling_rate=sampling_rate, 191 | tag="Training/step_{}_{}_reconstructed".format( 192 | step, tag 193 | ), 194 | ) 195 | log( 196 | logger, 197 | "train", 198 | audio=wav_prediction, 199 | sampling_rate=sampling_rate, 200 | tag="Training/step_{}_{}_synthesized".format(step, tag), 201 | ) 202 | 203 | if step % cfg.train_config.step.val_step == 0: 204 | model.eval() 205 | message = evaluate( 206 | model, step, cfg, logger, "val", vocoder, cfg.gpu 207 | ) 208 | outer_bar.write(message) 209 | 210 | model.train() 211 | 212 | if step % cfg.train_config.step.save_step == 0: 213 | model_weight = model.state_dict() 214 | embed_weight = model_weight["speaker_emb.weight"] 215 | del model_weight["speaker_emb.weight"] 216 | 217 | torch.save( 218 | { 219 | "model": model_weight, 220 | "embedding": embed_weight, 221 | "optimizer": optimizer._optimizer.state_dict(), 222 | }, 223 | os.path.join( 224 | cfg.train_config["path"]["ckpt_path"], 225 | "{}.pth.tar".format(step), 226 | ), 227 | ) 228 | 229 | if step == total_step: 230 | quit() 231 | step += 1 232 
| outer_bar.update(1) 233 | 234 | inner_bar.update(1) 235 | epoch += 1 236 | 237 | 238 | if __name__ == "__main__": 239 | 240 | configs = OmegaConf.load("./config.yaml") 241 | main(configs) 242 | -------------------------------------------------------------------------------- /tts_king.py: -------------------------------------------------------------------------------- 1 | # IMPORTS FOR PREPROCESS 2 | import os 3 | import torch 4 | import numpy as np 5 | from string import punctuation 6 | from fs_two.text import text_to_sequence 7 | 8 | # OTHER IMPORTS 9 | from omegaconf import OmegaConf 10 | from fsapi import FSTWOapi 11 | 12 | # from fs_two.preprocess import prepare_dataset_lj_speech 13 | from hifiapi import HIFIapi 14 | 15 | from input_process import preprocess_rus, preprocess_eng 16 | 17 | 18 | class TTSKing: 19 | def __init__(self, config_path="./config.yaml"): 20 | self.cfg = OmegaConf.load(config_path) 21 | self.tts = FSTWOapi(self.cfg, self.cfg.gpu) 22 | self.vocoder = HIFIapi(self.cfg, self.cfg.gpu) 23 | self.speakers = self.tts.speaker_names 24 | 25 | def generate_mel( 26 | self, 27 | text, 28 | duration_control=1.0, 29 | pitch_control=1.0, 30 | energy_control=1.0, 31 | speaker=0, 32 | ): 33 | 34 | phonemes = self.text_preprocess(text) 35 | 36 | result = self.tts.generate( 37 | phonemes, 38 | duration_control, 39 | pitch_control, 40 | energy_control, 41 | speaker_name=speaker, 42 | ) 43 | 44 | # mel, mel_postnet, log_duration_output, f0_output, energy_output 45 | return result 46 | 47 | def mel_to_wav(self, mel_spec): 48 | wav_cpu = self.vocoder.generate(mel_spec.transpose(1, 2)) 49 | return wav_cpu 50 | 51 | def speak( 52 | self, text, duration_control=1.0, pitch_control=1.0, energy_control=1.0, speaker=0 53 | ): 54 | result = self.generate_mel( 55 | text, duration_control, pitch_control, energy_control, speaker 56 | ) 57 | return self.mel_to_wav(result[1])  # result[1] is the postnet mel (see generate_mel) 58 | 59 | def text_preprocess(self, text): 60 | return np.array([preprocess_rus(text)]) 61 | 62 | def text_preprocess_eng(self, text): 63 | return np.array([preprocess_eng(text, self.cfg.preprocess_config)]) 64 | 65 | def to_torch_device(self, items): 66 | return [torch.tensor(t).to(self.cfg.gpu) for t in items] 67 | --------------------------------------------------------------------------------
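
A minimal end-to-end inference sketch for TTSKing, based on the classes above. It assumes config.yaml and the pretrained weights are set up as described in the README, that the sampling rate is read from preprocess_config (as train.py does), and that result[1] is the postnet mel, following the comment in generate_mel; the output file name is arbitrary.

    from scipy.io.wavfile import write
    from tts_king import TTSKing

    tts = TTSKing("./config.yaml")      # loads FastSpeech2 and HiFi-GAN from the config paths
    print(tts.speakers)                 # speaker names loaded from pretrained/speakers.json

    result = tts.generate_mel("Привет, мир!", speaker=0)
    wav = tts.mel_to_wav(result[1])     # HiFi-GAN wrapper returns int16 audio on the CPU
    sr = tts.cfg.preprocess_config["preprocessing"]["audio"]["sampling_rate"]
    write("output.wav", sr, wav.squeeze())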