├── README.md ├── audio_process.py ├── config.yaml ├── data_utils ├── clean.py ├── dataset.py ├── dataset_w_stats.py ├── makecsv.py ├── remove_bad_grid.py └── replace.sh ├── dataset_review ├── .ipynb_checkpoints │ └── speakers_stats-checkpoint.ipynb ├── filter_speakers.ipynb ├── hist.png ├── least20.png ├── short_train.txt ├── short_val.txt ├── speakers_short.json ├── speakers_stats.ipynb ├── speakers_to_remove.txt └── top20.png ├── examples.ipynb ├── examples ├── Airapetova_Darja_abooks_voxforge.wav ├── Arhipova_Natalja_abooks_voxforge.wav ├── Bolshakova_Ksenija_abooks_voxforge.wav ├── Chebaturkina_Elena_abooks_voxforge.wav ├── DrLutz_abooks_voxforge.wav ├── Efremov_Oleg_abooks_voxforge.wav ├── Goblin_abooks_voxforge.wav ├── Goblin_dance.wav ├── Grigorjev_Yurii_abooks_voxforge.wav ├── Kaljagin_A_abooks_voxforge.wav ├── Karpov_N_abooks_voxforge.wav ├── Konjahin_V_abooks_voxforge.wav ├── Kononov_Mikhail_abooks_voxforge.wav ├── Kotov_Alexandr_abooks_voxforge.wav ├── Kovaleva_Anna_abooks_voxforge.wav ├── Kuznetsov_Alexei_abooks_voxforge.wav ├── Kuznetsov_Vsevolod_abooks_voxforge.wav ├── Kvasha_Igor_abooks_voxforge.wav ├── Larionov_Vsevolod_abooks_voxforge.wav ├── Larionova-Ludm_abooks_voxforge.wav ├── Litvinov_I_abooks_voxforge.wav ├── Markin_Petr_abooks_voxforge.wav ├── Martjanov_O_abooks_voxforge.wav ├── Medvedeva_Galcova_Olga_abooks_voxforge.wav ├── Muhametzyanov_Radik_abooks_voxforge.wav ├── Pokrovsky_Boris_abooks_voxforge.wav ├── Popova_Alevtina_abooks_voxforge.wav ├── Rezalin_Aleksandr_abooks_voxforge.wav ├── Rosljakov_Mixail_abooks_voxforge.wav ├── Schirvind_A_abooks_voxforge.wav ├── Stukalov_Vladimir_abooks_voxforge.wav ├── Suetin_Pavel_abooks_voxforge.wav ├── Sushkov_Vladimir_abooks_voxforge.wav ├── Sytnik_I_abooks_voxforge.wav ├── Taratorkin_Georgiy_abooks_voxforge.wav ├── Tarinicheva_Tatjana_abooks_voxforge.wav ├── Terenkov_Alexandr_abooks_voxforge.wav ├── Trifilov_Nikolai_abooks_voxforge.wav ├── Vasiljev_Y_abooks_voxforge.wav ├── Vesnik_E_abooks_voxforge.wav ├── Vihrov_V_abooks_voxforge.wav ├── Vorobjeva_Irina_abooks_voxforge.wav ├── Zhirnov_Sergey_abooks_voxforge.wav ├── Zozulin_Viktor_abooks_voxforge.wav ├── goblin_opentts.wav ├── hajdurova_ailab.wav ├── ira_abooks_voxforge.wav ├── joh_abooks_voxforge.wav ├── june_shaman.wav ├── len_shaman.wav ├── mar_abooks_voxforge.wav ├── minaev_ailab.wav ├── morti_shaman.wav ├── nikolaev_ailab.wav ├── noname_opentts.wav ├── russian_single.wav ├── user11_mozilla.wav ├── user12_mozilla.wav ├── user17_mozilla.wav ├── user1_mozilla.wav ├── user20_mozilla.wav ├── user26_mozilla.wav ├── user4_mozilla.wav ├── user5_mozilla.wav ├── user6_mozilla.wav ├── user7_mozilla.wav ├── user8_mozilla.wav └── vsh_abooks_voxforge.wav ├── fs_two ├── README.md ├── audio │ ├── __init__.py │ ├── audio_processing.py │ ├── stft.py │ └── tools.py ├── cwt │ ├── __init__.py │ └── cwt_utils.py ├── dataset.py ├── evaluate.py ├── model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── fastspeech2.cpython-38.pyc │ │ ├── loss.cpython-38.pyc │ │ ├── modules.cpython-38.pyc │ │ └── optimizer.cpython-38.pyc │ ├── fastspeech2.py │ ├── loss.py │ ├── modules.py │ └── optimizer.py ├── prepare_align.py ├── preprocess.py ├── preprocessor │ ├── common_multi.py │ └── preprocessor.py ├── synthesize.py ├── text │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── cleaners.cpython-38.pyc │ │ ├── cmudict.cpython-38.pyc │ │ ├── numbers.cpython-38.pyc │ │ ├── pinyin.cpython-38.pyc │ │ ├── russian.cpython-38.pyc │ │ └── symbols.cpython-38.pyc │ 
├── cleaners.py │ ├── cmudict.py │ ├── numbers.py │ ├── pinyin.py │ ├── russian.py │ └── symbols.py ├── transformer │ ├── Constants.py │ ├── Layers.py │ ├── Models.py │ ├── Modules.py │ ├── SubLayers.py │ ├── __init__.py │ └── __pycache__ │ │ ├── Constants.cpython-38.pyc │ │ ├── Layers.cpython-38.pyc │ │ ├── Models.cpython-38.pyc │ │ ├── Modules.cpython-38.pyc │ │ ├── SubLayers.cpython-38.pyc │ │ └── __init__.cpython-38.pyc └── utils │ ├── __pycache__ │ └── tools.cpython-38.pyc │ ├── model.py │ └── tools.py ├── fsapi.py ├── hifi ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── models.cpython-38.pyc ├── env.py ├── meldataset.py ├── models.py ├── utils.py └── vocoder │ ├── __pycache__ │ └── utils.cpython-38.pyc │ └── utils.py ├── hifiapi.py ├── input_process.py ├── prepare_data.py ├── pretrained ├── rus_all.dict ├── speakers.json └── stats.json ├── requirements.txt ├── train.py ├── tts_king.py └── voice_over.ipynb
/README.md:
--------------------------------------------------------------------------------
 1 | #### A video example of TTS voice-over using several speakers:
 2 | [![Watch the video](https://img.youtube.com/vi/DB6pS-CoWVs/0.jpg)](https://www.youtube.com/watch?v=DB6pS-CoWVs&t)
 3 |
 4 |
 5 | ## Brief
 6 |
 7 | We started from this implementation: https://github.com/ming024/FastSpeech2
 8 |
 9 | However, we have made several changes, so the code is not identical.
10 |
11 | For example:
12 | - We use masking for input grapheme tokens during training;
13 | - CWT was implemented as in the original paper, but we did not observe any improvements. The final model was trained without CWT; you can still train a model on your own data with it via the use_cwt flag in the config;
14 | - Data preprocessing is slightly different, especially in the language-specific parts.
15 |
16 | ### Dataset:
17 |
18 | The Russian dataset was borrowed from https://github.com/vlomme/Multi-Tacotron-Voice-Cloning. We did not use all the speakers and filtered them based on length and recording quality. Only 65 speakers were used in the end. You can check all the examples in 'examples'.
19 |
20 | ### MFA:
21 |
22 | MFA was trained from scratch after preprocessing the text with russian_g2p. Using MFA might not be straightforward, so we refer to this manual: https://github.com/ivanvovk/DurIAN#6-how-to-align-your-own-data
23 |
24 |
25 |
26 | # Usage
27 |
28 | 1. We use russian_g2p, so you will need to install it first:
29 |
30 |        git init
31 |        git clone https://github.com/nsu-ai/russian_g2p.git
32 |        cd russian_g2p
33 |        pip3 install -r requirements.txt
34 |        pip install .
35 |
36 | 2. Then install the dependencies from requirements.txt.
37 |
38 | 3. Download the weights:
39 | https://drive.google.com/drive/folders/1dX7ELe9C9-ja_liYrgph3Uu5Z5EMljjh?usp=sharing
40 |
41 | - Move the HiFi-GAN and FS2 weights into 'pretrained';
42 | - Check that the paths in the config match:
43 |
44 | - tts.weights_path - path to the pretrained FastSpeech model;
45 | - add speakers_json to the same folder as the model weights (speaker names); it is already there for the pretrained model;
46 | - add stats_json to the same folder as the model weights (raw data pitch and energy stats);
47 | - hifi.weights_path - path to the pretrained HiFi-GAN.
48 |
49 |
50 | 4. If all of the above is set, check the notebook "examples.ipynb". A small path check is also sketched below.
51 |
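Before opening the notebook, you can sanity-check that the config actually points at the downloaded files. This is a minimal sketch, not part of the repository: it assumes PyYAML is installed, that you run it from the repository root, and that the speakers.json / stats.json file names match the ones shipped in the 'pretrained' folder.

```
import os
import yaml

# Load the top-level config and collect the files the README asks for.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

weights_dir = os.path.dirname(cfg["tts"]["weights_path"])
expected = [
    cfg["tts"]["weights_path"],                  # FastSpeech 2 checkpoint
    cfg["hifi"]["weights_path"],                 # HiFi-GAN checkpoint
    os.path.join(weights_dir, "speakers.json"),  # speaker names
    os.path.join(weights_dir, "stats.json"),     # raw pitch/energy stats
]
for path in expected:
    status = "ok" if os.path.isfile(path) else "MISSING"
    print(f"{status:8} {path}")
```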
52 | # Training your own model
53 |
54 | 1. We assume you have preprocessed the data with the MFA aligner. Your folder structure should be as follows:
55 |
56 |
57 | ```
58 | data
59 | ├── speaker_one
60 | │   ├── record_1.TextGrid # generated by MFA
61 | │   ├── record_1.wav
62 | │   └── record_1.lab # just a text file with a text string
63 | │
64 | └── speaker_two
65 |     ├── ...
66 |     └── ...
67 | ```
68 |
69 | 2. Once the data is organized and the path to it is set in the config key 'raw_path', run prepare_data.py.
70 |
71 | 3. prepare_data.py will generate additional files, such as energy and pitch values, in the folder set by 'preprocessed_path'.
72 |
73 | 4. Finally, set a path to a lexicon dict: words and their transliterations generated by russian_g2p. If you do not use russian_g2p, your dictionary will be different. An example can be found in the 'pretrained' folder.
74 |
75 | ## Have Fun!
--------------------------------------------------------------------------------
/audio_process.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import glob
 3 |
 4 |
 5 | def convert_mp3_to_wav(new_dir, filename, sr=None):
 6 |     # ffmpeg arguments must be passed as separate list items,
 7 |     # not as one long string.
 8 |     filename = filename.split(".mp3")[0]
 9 |     cmd = ["ffmpeg", "-i", f"{filename}.mp3"]
10 |     if sr:
11 |         cmd += ["-ar", str(sr)]
12 |     cmd.append(f"{new_dir}/{filename}.wav")
13 |     return subprocess.call(cmd)
14 |
15 |
16 | def convert_dataset(dir, new_dir, sr=None):
17 |     for filename in glob.glob(f"{dir}/**/*.mp3"):
18 |         convert_mp3_to_wav(new_dir, filename, sr)
19 |
20 |
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
 1 | exp_name: 'multi'
 2 | gpu: 'cpu'
 3 | run_debug_eval: false
 4 | logger:
 5 |   offline: false
 6 |   wandb_key:
 7 |
 8 |
 9 | tts:
10 |   weights_path: './pretrained/290000.pth.tar'
11 |   restore_step: 0
12 |
13 | hifi:
14 |   weights_path: './pretrained/hifi.pth'
15 |   MAX_WAV_VALUE: 32768
16 |   resblock: "1"
17 |   num_gpus: 0
18 |   batch_size: 8
19 |   learning_rate: 0.0002
20 |   adam_b1: 0.8
21 |   adam_b2: 0.99
22 |   lr_decay: 0.999
23 |   seed: 1234
24 |
25 |   upsample_rates: [8,8,2,2]
26 |   upsample_kernel_sizes: [16,16,4,4]
27 |   upsample_initial_channel: 512
28 |   resblock_kernel_sizes: [3,7,11]
29 |   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
30 |   resblock_initial_channel: 256
31 |
32 |   segment_size: 8192
33 |   num_mels: 80
34 |   num_freq: 1025
35 |   n_fft: 1024
36 |   hop_size: 256
37 |   win_size: 1024
38 |   sampling_rate: 22050
39 |
40 |
41 | train_config:
42 |   path:
43 |     ckpt_path: "../output/ckpt/multi_final"
44 |     result_path: "../output/result/multi_final"
45 |   optimizer:
46 |     batch_size: 16
47 |     betas: [0.95, 0.999]
48 |     eps: 0.00001
49 |     weight_decay: 0.0
50 |     grad_clip_thresh: 1.0
51 |     grad_acc_step: 4
52 |     warm_up_step: 4000
53 |     anneal_steps: [300000, 400000, 500000]
54 |     anneal_rate: 0.7
55 |   step:
56 |     total_step: 900000
57 |     log_step: 100
58 |     synth_step: 1000
59 |     val_step: 1000
60 |     save_step: 5000
61 |
62 |   max_masks_per_sentence: 0.15
63 |
64 | preprocess_config:
65 |   dataset: "MAIN"
66 |
67 |   path:
68 |     lexicon_path: "./rus_all.dict"
69 |     raw_path: "./speakers"
70 |     preprocessed_path: "./processed"
71 |
72 |
73 |   preprocessing:
74 |     val_size: 512
75 |     text:
76 |       text_cleaners: []
77 |       language: "ru"
78 |     audio:
79 |       sampling_rate: 22050
80 |       max_wav_value: 32768.0
81 |     stft:
82 |       filter_length: 1024
83 |       hop_length: 256
84 |       win_length: 1024
85 |     mel:
86 |       n_mel_channels: 80
87 |       mel_fmin: 0
88 |       mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
 89
| pitch: 90 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 91 | normalization: True 92 | energy: 93 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 94 | normalization: True 95 | 96 | model_config: 97 | transformer: 98 | encoder_layer: 4 99 | encoder_head: 2 100 | encoder_hidden: 256 101 | variance_hidden: 256 102 | decoder_layer: 6 103 | decoder_head: 2 104 | decoder_hidden: 256 105 | conv_filter_size: 1024 106 | conv_kernel_size: [9, 1] 107 | encoder_dropout: 0.2 108 | decoder_dropout: 0.2 109 | 110 | variance_predictor: 111 | filter_size: 256 112 | kernel_size: 3 113 | dropout: 0.5 114 | use_cwt: False 115 | variance_embedding: 116 | pitch_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing 117 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 118 | n_bins: 256 119 | 120 | multi_speaker: True 121 | 122 | max_seq_len: 1000 123 | 124 | vocoder: 125 | model: "HiFi-GAN" # support 'HiFi-GAN', 'MelGAN' 126 | speaker: "universal" # support 'LJSpeech', 'universal' 127 | use_cpu: true 128 | -------------------------------------------------------------------------------- /data_utils/clean.py: -------------------------------------------------------------------------------- 1 | from string import ascii_letters, digits, whitespace 2 | 3 | cyrillic_letters = ( 4 | "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя" 5 | ) 6 | 7 | 8 | def strip(text): 9 | allowed_chars = cyrillic_letters # + digits + whitespace 10 | return "".join([c for c in text if c in allowed_chars]) 11 | 12 | 13 | with open("vocab.lab", "r") as r: 14 | lines = r.read() 15 | lines = sorted([strip(l) for l in lines.split("\n")], key=len) 16 | 17 | with open("./vocab_clean.txt", "w") as f: 18 | for text in lines: 19 | f.write(text + "\n") 20 | -------------------------------------------------------------------------------- /data_utils/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | # traverse root directory, and list directories as dirs and files as files 5 | 6 | FINAL_DIR = "./ailabs_speaker" 7 | 8 | os.makedirs(FINAL_DIR, exist_ok=True) 9 | 10 | 11 | def csv_dict(path): 12 | with open(path) as f: 13 | lines = f.readlines() 14 | 15 | for line in lines: 16 | line = line.split("|") 17 | yield line[0], line[2].lower() 18 | 19 | 20 | def make_record(f_path, name, text, speaker): 21 | DIR = os.path.join(FINAL_DIR, speaker) 22 | os.makedirs(DIR, exist_ok=True) 23 | destination_wav = os.path.join(DIR, name + ".wav") 24 | destination_lab = os.path.join(DIR, name + ".lab") 25 | shutil.copy(f_path, destination_wav) 26 | with open(destination_lab, "w") as f: 27 | f.write(text) 28 | 29 | 30 | texts = [] 31 | for root, dirs, files in os.walk("."): 32 | path = root.split(os.sep) 33 | if "metadata.csv" in files: 34 | csv_path = os.path.join(root, "metadata.csv") 35 | for name, text in csv_dict(csv_path): 36 | file_path = os.path.join(root, "wavs", name + ".wav") 37 | speaker = root.split("/")[-2] 38 | text = text.replace("ё", "йо") 39 | # make_record(file_path, name, text, speaker) 40 | texts = texts + text.split(" ") 41 | 42 | print(len(set(texts))) 43 | with open("./vocab.lab", "w") as f: 44 | for text in set(texts): 45 | f.write(text + "\n") 46 | -------------------------------------------------------------------------------- 
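Both dataset.py above and dataset_w_stats.py below read an LJSpeech-style metadata.csv whose rows are pipe-separated: a record name followed by one or two text fields. A quick way to spot malformed rows before running those scripts is a check along these lines (a minimal sketch, not part of the repository; the metadata path argument is a placeholder):

```
import sys

# Flag rows that do not match the "name|text" or "name|text|normalized_text"
# layout expected by csv_dict() in dataset.py and dataset_w_stats.py.
metadata_path = sys.argv[1] if len(sys.argv) > 1 else "metadata.csv"

with open(metadata_path, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        fields = line.rstrip("\n").split("|")
        if len(fields) not in (2, 3) or not fields[0].strip():
            print(f"line {i}: unexpected row -> {line.rstrip()}")
```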
/data_utils/dataset_w_stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | 4 | from string import ascii_letters, digits, whitespace 5 | 6 | cyrillic_letters = ( 7 | "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" 8 | ) 9 | 10 | 11 | def my_strip(text): 12 | allowed_chars = cyrillic_letters + digits + whitespace 13 | return "".join([c for c in text if c in allowed_chars]).replace("\n", "") 14 | 15 | 16 | # traverse root directory, and list directories as dirs and files as files 17 | 18 | SOURCE_DIR = "./dataset_main/speakers/" 19 | 20 | 21 | class SpeakerStat: 22 | def __init__(self): 23 | self.speakers = dict() 24 | 25 | def add(self, name): 26 | self.speakers[name] = [0, 0, ""] 27 | 28 | def update(self, name, text): 29 | len_words = len(text.split(" ")) 30 | self.speakers[name][0] += 1 31 | self.speakers[name][1] += len_words 32 | self.speakers[name][2] += " " + text 33 | 34 | def make_csv(self, file_path): 35 | csv_records = ["source_name|speaker_id|num_sentences|len_words"] 36 | for speaker in self.speakers: 37 | dataset_name = speaker.split("_")[-1] 38 | num_sentences = self.speakers[speaker][0] 39 | len_words = self.speakers[speaker][1] 40 | string = f"{dataset_name}|{speaker}|{num_sentences}|{len_words}" 41 | csv_records.append(string) 42 | 43 | self.save(file_path, csv_records) 44 | 45 | def save(self, file_path, records): 46 | with open(file_path, "w") as f: 47 | for text in records: 48 | f.write(text + "\n") 49 | 50 | def save_vocab(self, file_path): 51 | words = [] 52 | for speaker in self.speakers: 53 | sp_words = self.speakers[speaker][2].split(" ") 54 | sp_words = [w for w in sp_words if len(w) > 0] 55 | words += sp_words 56 | words = list(set(words)) 57 | words = sorted(words, key=len) 58 | print(f"unique words: {len(words)}") 59 | self.save(file_path, words) 60 | 61 | 62 | def csv_dict(path): 63 | with open(path) as f: 64 | lines = f.readlines() 65 | 66 | for line in lines: 67 | line = line.split("|") 68 | if len(line) == 3: 69 | yield line[0], line[2].lower() 70 | if len(line) == 2: 71 | yield line[0], line[1].lower() 72 | 73 | 74 | def make_record(f_path, text): 75 | with open(f_path, "w") as f: 76 | f.write(text) 77 | 78 | 79 | # def clean(s): 80 | # exclude = set( 81 | # list(string.punctuation) + ["", "_", "\n", "...", "..", "«", "»"] 82 | # ) 83 | # return my_strip("".join(ch for ch in s if ch not in exclude)) 84 | 85 | 86 | def main(): 87 | speakers_lib = SpeakerStat() 88 | 89 | for directory in os.listdir(SOURCE_DIR): 90 | full_directory = os.path.join(SOURCE_DIR, directory) 91 | speakers_lib.add(directory) 92 | csv_path = os.path.join(full_directory, "metadata.csv") 93 | for name, text in csv_dict(csv_path): 94 | text = my_strip(text) 95 | speakers_lib.update(directory, text) 96 | file_path = os.path.join(full_directory, name + ".txt") 97 | make_record(file_path, text) 98 | make_record(file_path.replace("txt", "lab"), text) 99 | 100 | speakers_lib.make_csv("./speaker_stats.csv") 101 | speakers_lib.save_vocab("./vocab.lab") 102 | 103 | 104 | if __name__ == "__main__": 105 | main() 106 | -------------------------------------------------------------------------------- /data_utils/makecsv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | # traverse root directory, and list directories as dirs and files as files 5 | 6 | SOURCE_DIR = "./dataset_main/speakers/amed_shaman/" 7 | 8 | # russian_single 9 | # 
noname_opentts
10 |
11 | csv_records = []
12 | for file in os.listdir(SOURCE_DIR):
13 |     if ".lab" in file:
14 |         txt_path = os.path.join(SOURCE_DIR, file)
15 |         with open(txt_path, "r") as f:
16 |             text = f.read().replace("\n", "")
17 |         # strip the ".lab" extension so metadata.csv keeps only the record name
18 |         string = f"{file.replace('.lab','')}|{text}|{text}"
19 |         csv_records.append(string)
20 |
21 | final_path = os.path.join(SOURCE_DIR, "metadata.csv")
22 | with open(final_path, "w") as f:
23 |     for text in set(csv_records):
24 |         f.write(text + "\n")
25 |
--------------------------------------------------------------------------------
/data_utils/remove_bad_grid.py:
--------------------------------------------------------------------------------
 1 | import os
 2 |
 3 | FOLDER = '/home/dev/other/fsp/data/dataset_main/speakers/'
 4 |
 5 | BAD_SANTA_LIST = '/home/dev/other/fsp/data/dataset_main/aligner/prev_unaligned.txt'
 6 |
 7 | def cat(f1, f2):
 8 |     return os.path.join(f1, f2)
 9 |
10 | def make_key(path):
11 |     path = path.split('.')[0]
12 |     return '_'.join(path.split('/'))
13 |
14 | def get_path_dict(folder):
15 |     path_dict = dict()
16 |     for speaker in os.listdir(folder):
17 |         if 'txt' in speaker:
18 |             continue
19 |         full_speaker = cat(folder, speaker)
20 |         for rec in os.listdir(full_speaker):
21 |             full_rec = cat(full_speaker, rec)
22 |             key = make_key(cat(speaker, rec))
23 |             path_dict[key] = full_rec.split('.')[0]
24 |
25 |     return path_dict
26 |
27 | def get_keys(bad_list):
28 |     names = []
29 |     with open(bad_list) as f:
30 |         names_list = f.read()
31 |
32 |     names = [n.split(' ')[0].split('\t')[0] for n in names_list.split('\n')]
33 |     print(f'found {len(names)} bad records')
34 |     return names
35 |
36 |
37 | if __name__ == '__main__':
38 |     path_dict = get_path_dict(FOLDER)
39 |     names = get_keys(BAD_SANTA_LIST)
40 |     for i, name in enumerate(names):
41 |         if name in path_dict:
42 |             path_to_remove = path_dict[name]
43 |         else:
44 |             continue
45 |         try:
46 |             os.remove(path_to_remove + '.wav')
47 |             os.remove(path_to_remove + '.txt')
48 |             os.remove(path_to_remove + '.lab')
49 |         except Exception as e:
50 |             print(e)
51 |         print(f'{i+1} Removed {path_to_remove}')
52 |
--------------------------------------------------------------------------------
/data_utils/replace.sh:
--------------------------------------------------------------------------------
1 | for folder in *mozilla*; do
2 |     cd $folder
3 |     for f in *.wav.*; do
4 |         mv -- "$f" "${f/.wav*/}.lab"
5 |     done
6 |     cd ..
7 | done 8 | -------------------------------------------------------------------------------- /dataset_review/hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/hist.png -------------------------------------------------------------------------------- /dataset_review/least20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/least20.png -------------------------------------------------------------------------------- /dataset_review/speakers_short.json: -------------------------------------------------------------------------------- 1 | {"hajdurova_ailab": 0, "user17_mozilla": 1, "user20_mozilla": 2, "mar_abooks_voxforge": 3, "Vorobjeva_Irina_abooks_voxforge": 4, "Litvinov_I_abooks_voxforge": 5, "Rezalin_Aleksandr_abooks_voxforge": 6, "user6_mozilla": 7, "Vihrov_V_abooks_voxforge": 8, "user11_mozilla": 9, "morti_shaman": 10, "joh_abooks_voxforge": 11, "Kononov_Mikhail_abooks_voxforge": 12, "Sushkov_Vladimir_abooks_voxforge": 13, "Markin_Petr_abooks_voxforge": 14, "Stukalov_Vladimir_abooks_voxforge": 15, "user26_mozilla": 16, "june_shaman": 17, "user12_mozilla": 18, "Kuznetsov_Vsevolod_abooks_voxforge": 19, "vsh_abooks_voxforge": 20, "Schirvind_A_abooks_voxforge": 21, "Vasiljev_Y_abooks_voxforge": 22, "Goblin_abooks_voxforge": 23, "Tarinicheva_Tatjana_abooks_voxforge": 24, "Larionov_Vsevolod_abooks_voxforge": 25, "Kaljagin_A_abooks_voxforge": 26, "Terenkov_Alexandr_abooks_voxforge": 27, "Kuznetsov_Alexei_abooks_voxforge": 28, "Rosljakov_Mixail_abooks_voxforge": 29, "Kvasha_Igor_abooks_voxforge": 30, "user1_mozilla": 31, "russian_single": 32, "Zozulin_Viktor_abooks_voxforge": 33, "Zhirnov_Sergey_abooks_voxforge": 34, "Vesnik_E_abooks_voxforge": 35, "Pokrovsky_Boris_abooks_voxforge": 36, "Martjanov_O_abooks_voxforge": 37, "len_shaman": 38, "Trifilov_Nikolai_abooks_voxforge": 39, "Taratorkin_Georgiy_abooks_voxforge": 40, "user7_mozilla": 41, "Kovaleva_Anna_abooks_voxforge": 42, "Sytnik_I_abooks_voxforge": 43, "noname_opentts": 44, "user8_mozilla": 45, "ira_abooks_voxforge": 46, "Bolshakova_Ksenija_abooks_voxforge": 47, "Muhametzyanov_Radik_abooks_voxforge": 48, "Grigorjev_Yurii_abooks_voxforge": 49, "user5_mozilla": 50, "Efremov_Oleg_abooks_voxforge": 51, "Chebaturkina_Elena_abooks_voxforge": 52, "nikolaev_ailab": 53, "user4_mozilla": 54, "Kotov_Alexandr_abooks_voxforge": 55, "Arhipova_Natalja_abooks_voxforge": 56, "Suetin_Pavel_abooks_voxforge": 57, "Medvedeva_Galcova_Olga_abooks_voxforge": 58, "Airapetova_Darja_abooks_voxforge": 59, "Popova_Alevtina_abooks_voxforge": 60, "Konjahin_V_abooks_voxforge": 61, "DrLutz_abooks_voxforge": 62, "Karpov_N_abooks_voxforge": 63, "Larionova-Ludm_abooks_voxforge": 64, "minaev_ailab": 65} -------------------------------------------------------------------------------- /dataset_review/speakers_to_remove.txt: -------------------------------------------------------------------------------- 1 | Ljubimcev_Pavel_abooks_voxforge 2 | Zemcov_D_abooks_voxforge 3 | Gerasimov_Vladimir_abooks_voxforge 4 | Time_Elizaveta_abooks_voxforge 5 | Lazarev_Yurii_abooks_voxforge 6 | Ljelikova_Lidija_abooks_voxforge 7 | Podoruga_Alexander_abooks_voxforge 8 | Erisanova_I_abooks_voxforge 9 | Kupriyanov_Vasilij_abooks_voxforge 10 | Kocharjan_Suren_abooks_voxforge 11 | Chonishvili_S_abooks_voxforge 12 | 
ana_shaman 13 | Zjuzina_O_abooks_voxforge 14 | Myagkov_Andrey_abooks_voxforge 15 | Korolev_Vladimir_abooks_voxforge 16 | mat_abooks_voxforge 17 | Mihailovskii_abooks_voxforge 18 | Malyshkina_I_abooks_voxforge 19 | Savitskij_Nikolai_abooks_voxforge 20 | Rosenberg_Mikhail_abooks_voxforge 21 | Fedosov_S_abooks_voxforge 22 | Evstigneev_E_abooks_voxforge 23 | rio_shaman 24 | user22_mozilla 25 | Sidoruk_Al_abooks_voxforge 26 | Maksimov_V_abooks_voxforge 27 | Gubenko_N_abooks_voxforge 28 | Mushatin_Igor_abooks_voxforge 29 | Ivanova_M_abooks_voxforge 30 | Verovoi_Denis_abooks_voxforge 31 | Zamorev_Sergei_abooks_voxforge 32 | user9_mozilla 33 | Murasko_Igor_abooks_voxforge 34 | Nevinniy_Vyacheslav_abooks_voxforge 35 | user13_mozilla 36 | Koksharov_Aleksadr_abooks_voxforge 37 | user30_mozilla 38 | Ranevskaya_F_abooks_voxforge 39 | Basov_Ivan_abooks_voxforge 40 | Telegina_T_abooks_voxforge 41 | Kulagin_L_abooks_voxforge 42 | nsh_abooks_voxforge 43 | Klyukvin_A_abooks_voxforge 44 | Platonov_Maksim_abooks_voxforge 45 | Rajkin_Arkadij_abooks_voxforge 46 | Brockaja_Leontina_abooks_voxforge 47 | Ziganshina_Era_abooks_voxforge 48 | Zadvornih_Vyacheslav_abooks_voxforge 49 | Kurilov_Andrey_abooks_voxforge 50 | Burdelov_O_abooks_voxforge 51 | Kulyutnikov_abooks_voxforge 52 | user25_mozilla 53 | Tolubeev_V_abooks_voxforge 54 | Papanov_Anatoliy_abooks_voxforge 55 | Sergey_Shakurov_abooks_voxforge 56 | Gorbunov_S_abooks_voxforge 57 | Vjalikova_O_abooks_voxforge 58 | Samoylov_Oleg_abooks_voxforge 59 | Rjabcev_E_abooks_voxforge 60 | user10_mozilla 61 | Dubina_A_abooks_voxforge 62 | Cherhjak_M_abooks_voxforge 63 | Borisov_O_abooks_voxforge 64 | tray_shaman 65 | Levina_L_abooks_voxforge 66 | Korneva_Natalja_abooks_voxforge 67 | Solomin_Vitaliy_abooks_voxforge 68 | Osobik_Vladimir_abooks_voxforge 69 | yo_shaman 70 | Shishkin_O_abooks_voxforge 71 | Aroseva_O_abooks_voxforge 72 | Kornizkaja_Evgenija_abooks_voxforge 73 | Ternovskii_E_abooks_voxforge 74 | Bronevoy_L_abooks_voxforge 75 | Maretskaja_Vera_abooks_voxforge 76 | Golovataja_Lidija_abooks_voxforge 77 | Lanovoy_Vasiliy_abooks_voxforge 78 | user29_mozilla 79 | Sevjakov_V_abooks_voxforge 80 | user2_mozilla 81 | user15_mozilla 82 | Valijev_German_abooks_voxforge 83 | user23_mozilla 84 | Zaborovskii_J_abooks_voxforge 85 | Boris_Plotnikov_abooks_voxforge 86 | Skljar_Al_abooks_voxforge 87 | Kazakov_Alexei_abooks_voxforge 88 | Kolpakov_Artem_abooks_voxforge 89 | Isakov_Nikolai_abooks_voxforge 90 | Rossoshanskij_Aleksei_abooks_voxforge 91 | Smoktunovskiy_Innokentiy_abooks_voxforge 92 | Batalov_Alexey_abooks_voxforge 93 | Samoilov_V_abooks_voxforge 94 | Rovinskij_Vladimir_abooks_voxforge 95 | Pinsker_M_abooks_voxforge 96 | Bobylev_Ilia_abooks_voxforge 97 | Golub_Oleg_abooks_voxforge 98 | Zareckii_A_abooks_voxforge 99 | Kukushkin_A_abooks_voxforge 100 | Balakirev_A_abooks_voxforge 101 | Petrov_Victor_abooks_voxforge 102 | svu_abooks_voxforge 103 | Rudnichenko_V_abooks_voxforge 104 | Starchikov_S_abooks_voxforge 105 | user21_mozilla 106 | Lazarev_Al_abooks_voxforge 107 | Borzunov_A_abooks_voxforge 108 | Lebedeva_V_abooks_voxforge 109 | Vitorgan_E_abooks_voxforge 110 | Prudovskiy_Ilja_abooks_voxforge 111 | user19_mozilla 112 | Pozdnjakov_M_abooks_voxforge 113 | user18_mozilla 114 | Yankovsky_Oleg_abooks_voxforge 115 | user16_mozilla 116 | user28_mozilla 117 | Alexandr_Slobodskoy_abooks_voxforge 118 | Andrienko_A_abooks_voxforge 119 | evg_abooks_voxforge 120 | Martynyuk_Yu_abooks_voxforge 121 | user14_mozilla 122 | Hazov_Evgeniy_abooks_voxforge 123 | 
Mironov_Evgeniy_abooks_voxforge 124 | user3_mozilla 125 | Kiseljev_R_abooks_voxforge 126 | Malishevskiy_Evgeniy_abooks_voxforge 127 | ruslan_ruslan 128 | Samoedov_E_abooks_voxforge 129 | Baljan_Georgiy_abooks_voxforge 130 | esh_abooks_voxforge 131 | sve_abooks_voxforge 132 | Petrov_K_abooks_voxforge 133 | Putin_abooks_voxforge 134 | amed_shaman 135 | Kazarinova_Elena_abooks_voxforge 136 | Sitnik_Stanislav_abooks_voxforge 137 | mgn_abooks_voxforge 138 | Kuzmina_S_abooks_voxforge 139 | Kozii_N_abooks_voxforge 140 | Burlak_Vadim_abooks_voxforge 141 | user27_mozilla 142 | urp_abooks_voxforge 143 | Prohoda_Andrey_abooks_voxforge 144 | Mironov_A_abooks_voxforge 145 | Tabakov_Oleg_abooks_voxforge 146 | Sazykin_Ilja_abooks_voxforge 147 | Gabidulin_Ruslan_abooks_voxforge 148 | Gorelik_Tamara_abooks_voxforge 149 | Kolygo_Dmitrii_abooks_voxforge 150 | Staburov_Roman_abooks_voxforge 151 | Smehov_Veniamin_abooks_voxforge 152 | sun_abooks_voxforge 153 | Jurskii_S_abooks_voxforge 154 | Antonik_abooks_voxforge 155 | Plyatt_R_abooks_voxforge 156 | Perov_Danila_abooks_voxforge 157 | ski_abooks_voxforge 158 | Muravjeva_I_abooks_voxforge 159 | Gerd_Z_abooks_voxforge 160 | Gusev_A_abooks_voxforge 161 | Uryupin_Dmitii_abooks_voxforge 162 | Raschkin_Jrij_abooks_voxforge 163 | Andriyanov_AL_abooks_voxforge 164 | Ilinsky_Ig_abooks_voxforge 165 | Podlesny_Mark_abooks_voxforge 166 | Ktorov_Anatoliy_abooks_voxforge 167 | Kuznetsova_Valentina_abooks_voxforge 168 | Litvinova_N_abooks_voxforge 169 | Ulyanov_M_abooks_voxforge 170 | Zuravljev_Dmitriy_abooks_voxforge 171 | len_abooks_voxforge 172 | Bykov_Alexandr_abooks_voxforge 173 | Golubkina_Marija_abooks_voxforge 174 | Gaft_Valentin_abooks_voxforge 175 | Utochkina_O_abooks_voxforge 176 | Kindinov_Evgeniy_abooks_voxforge 177 | user24_mozilla 178 | Jakovlev_abooks_voxforge 179 | Lapkin_Ignatii_abooks_voxforge 180 | Smarzevskaja_Tatjana_abooks_voxforge 181 | Borisov_Grigorii_abooks_voxforge 182 | Koretskij_Vladimir_abooks_voxforge 183 | Semenova_Ekaterina_abooks_voxforge 184 | -------------------------------------------------------------------------------- /dataset_review/top20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/dataset_review/top20.png -------------------------------------------------------------------------------- /examples/Airapetova_Darja_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Airapetova_Darja_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Arhipova_Natalja_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Arhipova_Natalja_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Bolshakova_Ksenija_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Bolshakova_Ksenija_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Chebaturkina_Elena_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Chebaturkina_Elena_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/DrLutz_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/DrLutz_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Efremov_Oleg_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Efremov_Oleg_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Goblin_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Goblin_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Goblin_dance.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Goblin_dance.wav -------------------------------------------------------------------------------- /examples/Grigorjev_Yurii_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Grigorjev_Yurii_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kaljagin_A_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kaljagin_A_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Karpov_N_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Karpov_N_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Konjahin_V_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Konjahin_V_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kononov_Mikhail_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kononov_Mikhail_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kotov_Alexandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kotov_Alexandr_abooks_voxforge.wav 
-------------------------------------------------------------------------------- /examples/Kovaleva_Anna_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kovaleva_Anna_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kuznetsov_Alexei_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kuznetsov_Alexei_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kuznetsov_Vsevolod_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kuznetsov_Vsevolod_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Kvasha_Igor_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Kvasha_Igor_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Larionov_Vsevolod_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Larionov_Vsevolod_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Larionova-Ludm_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Larionova-Ludm_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Litvinov_I_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Litvinov_I_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Markin_Petr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Markin_Petr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Martjanov_O_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Martjanov_O_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Medvedeva_Galcova_Olga_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Medvedeva_Galcova_Olga_abooks_voxforge.wav -------------------------------------------------------------------------------- 
/examples/Muhametzyanov_Radik_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Muhametzyanov_Radik_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Pokrovsky_Boris_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Pokrovsky_Boris_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Popova_Alevtina_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Popova_Alevtina_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Rezalin_Aleksandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Rezalin_Aleksandr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Rosljakov_Mixail_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Rosljakov_Mixail_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Schirvind_A_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Schirvind_A_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Stukalov_Vladimir_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Stukalov_Vladimir_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Suetin_Pavel_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Suetin_Pavel_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Sushkov_Vladimir_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Sushkov_Vladimir_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Sytnik_I_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Sytnik_I_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Taratorkin_Georgiy_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Taratorkin_Georgiy_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Tarinicheva_Tatjana_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Tarinicheva_Tatjana_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Terenkov_Alexandr_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Terenkov_Alexandr_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Trifilov_Nikolai_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Trifilov_Nikolai_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vasiljev_Y_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vasiljev_Y_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vesnik_E_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vesnik_E_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vihrov_V_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vihrov_V_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Vorobjeva_Irina_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Vorobjeva_Irina_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Zhirnov_Sergey_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Zhirnov_Sergey_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/Zozulin_Viktor_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/Zozulin_Viktor_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/goblin_opentts.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/goblin_opentts.wav -------------------------------------------------------------------------------- /examples/hajdurova_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/hajdurova_ailab.wav -------------------------------------------------------------------------------- /examples/ira_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/ira_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/joh_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/joh_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/june_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/june_shaman.wav -------------------------------------------------------------------------------- /examples/len_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/len_shaman.wav -------------------------------------------------------------------------------- /examples/mar_abooks_voxforge.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/mar_abooks_voxforge.wav -------------------------------------------------------------------------------- /examples/minaev_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/minaev_ailab.wav -------------------------------------------------------------------------------- /examples/morti_shaman.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/morti_shaman.wav -------------------------------------------------------------------------------- /examples/nikolaev_ailab.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/nikolaev_ailab.wav -------------------------------------------------------------------------------- /examples/noname_opentts.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/noname_opentts.wav -------------------------------------------------------------------------------- /examples/russian_single.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/russian_single.wav 
-------------------------------------------------------------------------------- /examples/user11_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user11_mozilla.wav -------------------------------------------------------------------------------- /examples/user12_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user12_mozilla.wav -------------------------------------------------------------------------------- /examples/user17_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user17_mozilla.wav -------------------------------------------------------------------------------- /examples/user1_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user1_mozilla.wav -------------------------------------------------------------------------------- /examples/user20_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user20_mozilla.wav -------------------------------------------------------------------------------- /examples/user26_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user26_mozilla.wav -------------------------------------------------------------------------------- /examples/user4_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user4_mozilla.wav -------------------------------------------------------------------------------- /examples/user5_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user5_mozilla.wav -------------------------------------------------------------------------------- /examples/user6_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user6_mozilla.wav -------------------------------------------------------------------------------- /examples/user7_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user7_mozilla.wav -------------------------------------------------------------------------------- /examples/user8_mozilla.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/user8_mozilla.wav -------------------------------------------------------------------------------- /examples/vsh_abooks_voxforge.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/examples/vsh_abooks_voxforge.wav -------------------------------------------------------------------------------- /fs_two/README.md: -------------------------------------------------------------------------------- 1 | # FastSpeech 2 - PyTorch Implementation 2 | 3 | This is a PyTorch implementation of Microsoft's text-to-speech system [**FastSpeech 2: Fast and High-Quality End-to-End Text to Speech**](https://arxiv.org/abs/2006.04558v1). 4 | This project is based on [xcmyz's implementation](https://github.com/xcmyz/FastSpeech) of FastSpeech. Feel free to use/modify the code. 5 | 6 | There are several versions of FastSpeech 2. 7 | This implementation is more similar to [version 1](https://arxiv.org/abs/2006.04558v1), which uses F0 values as the pitch features. 8 | On the other hand, pitch spectrograms extracted by continuous wavelet transform are used as the pitch features in the [later versions](https://arxiv.org/abs/2006.04558). 9 | 10 | ![](./img/model.png) 11 | 12 | # Updates 13 | - 2021/2/26: Support English and Mandarin TTS 14 | - 2021/2/26: Support multi-speaker TTS (AISHELL-3 and LibriTTS) 15 | - 2021/2/26: Support MelGAN and HiFi-GAN vocoder 16 | 17 | # Audio Samples 18 | Audio samples generated by this implementation can be found [here](https://ming024.github.io/FastSpeech2/). 19 | 20 | # Quickstart 21 | 22 | ## Dependencies 23 | You can install the Python dependencies with 24 | ``` 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | ## Inference 29 | 30 | You have to download the [pretrained models](https://drive.google.com/drive/folders/1DOhZGlTLMbbAAFZmZGDdc77kz1PloS7F?usp=sharing) and put them in ``output/ckpt/LJSpeech/`` or ``output/ckpt/AISHELL3``. 31 | 32 | For English single-speaker TTS, run 33 | ``` 34 | python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 35 | ``` 36 | 37 | For Mandarin multi-speaker TTS, try 38 | ``` 39 | python3 synthesize.py --text "大家好" --speaker_id SPEAKER_ID --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 40 | ``` 41 | 42 | The generated utterances will be put in ``output/result/``. 43 | 44 | Here is an example of synthesized mel-spectrogram of the sentence "Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition", with the English single-speaker TTS model. 45 | ![](./img/synthesized_melspectrogram.png) 46 | 47 | ## Batch Inference 48 | Batch inference is also supported, try 49 | 50 | ``` 51 | python3 synthesize.py --source preprocessed_data/LJSpeech/val.txt --restore_step 900000 --mode batch -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 52 | ``` 53 | to synthesize all utterances in ``preprocessed_data/LJSpeech/val.txt`` 54 | 55 | ## Controllability 56 | The pitch/volume/speaking rate of the synthesized utterances can be controlled by specifying the desired pitch/energy/duration ratios. 
57 | For example, one can increase the speaking rate by 20 % and decrease the volume by 20 % by 58 | 59 | ``` 60 | python3 synthesize.py --text "YOUR_DESIRED_TEXT" --restore_step 900000 --mode single -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml --duration_control 0.8 --energy_control 0.8 61 | ``` 62 | 63 | # Training 64 | 65 | ## Datasets 66 | 67 | The supported datasets are 68 | 69 | - [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): a single-speaker English dataset consists of 13100 short audio clips of a female speaker reading passages from 7 non-fiction books, approximately 24 hours in total. 70 | - [AISHELL-3](http://www.aishelltech.com/aishell_3): a Mandarin TTS dataset with 218 male and female speakers, roughly 85 hours in total. 71 | - [LibriTTS](https://research.google/tools/datasets/libri-tts/): a multi-speaker English dataset containing 585 hours of speech by 2456 speakers. 72 | 73 | We take LJSpeech as an example hereafter. 74 | 75 | ## Preprocessing 76 | 77 | First, run 78 | ``` 79 | python3 prepare_align.py config/LJSpeech/preprocess.yaml 80 | ``` 81 | for some preparations. 82 | 83 | As described in the paper, [Montreal Forced Aligner](https://montreal-forced-aligner.readthedocs.io/en/latest/) (MFA) is used to obtain the alignments between the utterances and the phoneme sequences. 84 | Alignments for the LJSpeech and AISHELL-3 datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing). 85 | You have to unzip the files in ``preprocessed_data/LJSpeech/TextGrid/``. 86 | 87 | After that, run the preprocessing script by 88 | ``` 89 | python3 preprocess.py config/LJSpeech/preprocess.yaml 90 | ``` 91 | 92 | Alternately, you can align the corpus by yourself. 93 | Download the official MFA package and run 94 | ``` 95 | ./montreal-forced-aligner/bin/mfa_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt english preprocessed_data/LJSpeech 96 | ``` 97 | or 98 | ``` 99 | ./montreal-forced-aligner/bin/mfa_train_and_align raw_data/LJSpeech/ lexicon/librispeech-lexicon.txt preprocessed_data/LJSpeech 100 | ``` 101 | 102 | to align the corpus and then run the preprocessing script. 103 | ``` 104 | python3 preprocess.py config/LJSpeech/preprocess.yaml 105 | ``` 106 | 107 | ## Training 108 | 109 | Train your model with 110 | ``` 111 | python3 train.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml 112 | ``` 113 | 114 | The model takes less than 10k steps (less than 1 hour on my GTX1080Ti GPU) of training to generate audio samples with acceptable quality, which is much more efficient than the autoregressive models such as Tacotron2. 115 | 116 | # TensorBoard 117 | 118 | Use 119 | ``` 120 | tensorboard --logdir output/log/LJSpeech 121 | ``` 122 | 123 | to serve TensorBoard on your localhost. 124 | The loss curves, synthesized mel-spectrograms, and audios are shown. 125 | 126 | ![](./img/tensorboard_loss.png) 127 | ![](./img/tensorboard_spec.png) 128 | ![](./img/tensorboard_audio.png) 129 | 130 | # Implementation Issues 131 | 132 | - Following [xcmyz's implementation](https://github.com/xcmyz/FastSpeech), I use an additional Tacotron-2-styled Postnet after the decoder, which is not used in the original paper. 133 | - Gradient clipping is used in the training. 
134 | - In my experience, using phoneme-level pitch and energy prediction instead of frame-level prediction results in much better prosody, and normalizing the pitch and energy features also helps. Please refer to ``config/README.md`` for more details. 135 | 136 | Please inform me if you find any mistakes in this repo, or any useful tips to train the FastSpeech 2 model. 137 | 138 | # References 139 | - [FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558), Y. Ren, *et al*. 140 | - [xcmyz's FastSpeech implementation](https://github.com/xcmyz/FastSpeech) 141 | - [TensorSpeech's FastSpeech 2 implementation](https://github.com/TensorSpeech/TensorflowTTS) 142 | - [rishikksh20's FastSpeech 2 implementation](https://github.com/rishikksh20/FastSpeech2) 143 | -------------------------------------------------------------------------------- /fs_two/audio/__init__.py: -------------------------------------------------------------------------------- 1 | import fs_two.audio.tools 2 | import fs_two.audio.stft 3 | import fs_two.audio.audio_processing 4 | -------------------------------------------------------------------------------- /fs_two/audio/audio_processing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import librosa.util as librosa_util 4 | from scipy.signal import get_window 5 | 6 | 7 | def window_sumsquare( 8 | window, 9 | n_frames, 10 | hop_length, 11 | win_length, 12 | n_fft, 13 | dtype=np.float32, 14 | norm=None, 15 | ): 16 | """ 17 | # from librosa 0.6 18 | Compute the sum-square envelope of a window function at a given hop length. 19 | 20 | This is used to estimate modulation effects induced by windowing 21 | observations in short-time fourier transforms. 22 | 23 | Parameters 24 | ---------- 25 | window : string, tuple, number, callable, or list-like 26 | Window specification, as in `get_window` 27 | 28 | n_frames : int > 0 29 | The number of analysis frames 30 | 31 | hop_length : int > 0 32 | The number of samples to advance between frames 33 | 34 | win_length : [optional] 35 | The length of the window function. By default, this matches `n_fft`. 36 | 37 | n_fft : int > 0 38 | The length of each analysis frame. 
39 | 40 | dtype : np.dtype 41 | The data type of the output 42 | 43 | Returns 44 | ------- 45 | wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` 46 | The sum-squared envelope of the window function 47 | """ 48 | if win_length is None: 49 | win_length = n_fft 50 | 51 | n = n_fft + hop_length * (n_frames - 1) 52 | x = np.zeros(n, dtype=dtype) 53 | 54 | # Compute the squared window at the desired length 55 | win_sq = get_window(window, win_length, fftbins=True) 56 | win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 57 | win_sq = librosa_util.pad_center(win_sq, n_fft) 58 | 59 | # Fill the envelope 60 | for i in range(n_frames): 61 | sample = i * hop_length 62 | x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] 63 | return x 64 | 65 | 66 | def griffin_lim(magnitudes, stft_fn, n_iters=30): 67 | """ 68 | PARAMS 69 | ------ 70 | magnitudes: spectrogram magnitudes 71 | stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods 72 | """ 73 | 74 | angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) 75 | angles = angles.astype(np.float32) 76 | angles = torch.autograd.Variable(torch.from_numpy(angles)) 77 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 78 | 79 | for i in range(n_iters): 80 | _, angles = stft_fn.transform(signal) 81 | signal = stft_fn.inverse(magnitudes, angles).squeeze(1) 82 | return signal 83 | 84 | 85 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 86 | """ 87 | PARAMS 88 | ------ 89 | C: compression factor 90 | """ 91 | return torch.log(torch.clamp(x, min=clip_val) * C) 92 | 93 | 94 | def dynamic_range_decompression(x, C=1): 95 | """ 96 | PARAMS 97 | ------ 98 | C: compression factor used to compress 99 | """ 100 | return torch.exp(x) / C 101 | -------------------------------------------------------------------------------- /fs_two/audio/stft.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy.signal import get_window 5 | from librosa.util import pad_center, tiny 6 | from librosa.filters import mel as librosa_mel_fn 7 | 8 | from fs_two.audio.audio_processing import ( 9 | dynamic_range_compression, 10 | dynamic_range_decompression, 11 | window_sumsquare, 12 | ) 13 | 14 | DEVICE = 3 15 | 16 | 17 | class STFT(torch.nn.Module): 18 | """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" 19 | 20 | def __init__(self, filter_length, hop_length, win_length, window="hann"): 21 | super(STFT, self).__init__() 22 | self.filter_length = filter_length 23 | self.hop_length = hop_length 24 | self.win_length = win_length 25 | self.window = window 26 | self.forward_transform = None 27 | scale = self.filter_length / self.hop_length 28 | fourier_basis = np.fft.fft(np.eye(self.filter_length)) 29 | 30 | cutoff = int((self.filter_length / 2 + 1)) 31 | fourier_basis = np.vstack( 32 | [ 33 | np.real(fourier_basis[:cutoff, :]), 34 | np.imag(fourier_basis[:cutoff, :]), 35 | ] 36 | ) 37 | 38 | forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) 39 | inverse_basis = torch.FloatTensor( 40 | np.linalg.pinv(scale * fourier_basis).T[:, None, :] 41 | ) 42 | 43 | if window is not None: 44 | assert filter_length >= win_length 45 | # get window and zero center pad it to filter_length 46 | fft_window = get_window(window, win_length, fftbins=True) 47 | fft_window = pad_center(fft_window, filter_length) 48 | fft_window = torch.from_numpy(fft_window).float() 49 | 50 | # 
window the bases 51 | forward_basis *= fft_window 52 | inverse_basis *= fft_window 53 | 54 | self.register_buffer("forward_basis", forward_basis.float()) 55 | self.register_buffer("inverse_basis", inverse_basis.float()) 56 | 57 | def transform(self, input_data): 58 | num_batches = input_data.size(0) 59 | num_samples = input_data.size(1) 60 | 61 | self.num_samples = num_samples 62 | 63 | # similar to librosa, reflect-pad the input 64 | input_data = input_data.view(num_batches, 1, num_samples) 65 | input_data = F.pad( 66 | input_data.unsqueeze(1), 67 | (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), 68 | mode="reflect", 69 | ) 70 | input_data = input_data.squeeze(1) 71 | 72 | forward_transform = F.conv1d( 73 | input_data.cuda(DEVICE), 74 | torch.autograd.Variable( 75 | self.forward_basis, requires_grad=False 76 | ).cuda(DEVICE), 77 | stride=self.hop_length, 78 | padding=0, 79 | ).cpu() 80 | 81 | cutoff = int((self.filter_length / 2) + 1) 82 | real_part = forward_transform[:, :cutoff, :] 83 | imag_part = forward_transform[:, cutoff:, :] 84 | 85 | magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) 86 | phase = torch.autograd.Variable( 87 | torch.atan2(imag_part.data, real_part.data) 88 | ) 89 | 90 | return magnitude, phase 91 | 92 | def inverse(self, magnitude, phase): 93 | recombine_magnitude_phase = torch.cat( 94 | [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 95 | ) 96 | 97 | inverse_transform = F.conv_transpose1d( 98 | recombine_magnitude_phase, 99 | torch.autograd.Variable(self.inverse_basis, requires_grad=False), 100 | stride=self.hop_length, 101 | padding=0, 102 | ) 103 | 104 | if self.window is not None: 105 | window_sum = window_sumsquare( 106 | self.window, 107 | magnitude.size(-1), 108 | hop_length=self.hop_length, 109 | win_length=self.win_length, 110 | n_fft=self.filter_length, 111 | dtype=np.float32, 112 | ) 113 | # remove modulation effects 114 | approx_nonzero_indices = torch.from_numpy( 115 | np.where(window_sum > tiny(window_sum))[0] 116 | ) 117 | window_sum = torch.autograd.Variable( 118 | torch.from_numpy(window_sum), requires_grad=False 119 | ) 120 | window_sum = ( 121 | window_sum.cuda(DEVICE) if magnitude.is_cuda else window_sum 122 | ) 123 | inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ 124 | approx_nonzero_indices 125 | ] 126 | 127 | # scale by hop ratio 128 | inverse_transform *= float(self.filter_length) / self.hop_length 129 | 130 | inverse_transform = inverse_transform[ 131 | :, :, int(self.filter_length / 2) : 132 | ] 133 | inverse_transform = inverse_transform[ 134 | :, :, : -int(self.filter_length / 2) : 135 | ] 136 | 137 | return inverse_transform 138 | 139 | def forward(self, input_data): 140 | self.magnitude, self.phase = self.transform(input_data) 141 | reconstruction = self.inverse(self.magnitude, self.phase) 142 | return reconstruction 143 | 144 | 145 | class TacotronSTFT(torch.nn.Module): 146 | def __init__( 147 | self, 148 | filter_length, 149 | hop_length, 150 | win_length, 151 | n_mel_channels, 152 | sampling_rate, 153 | mel_fmin, 154 | mel_fmax, 155 | ): 156 | super(TacotronSTFT, self).__init__() 157 | self.n_mel_channels = n_mel_channels 158 | self.sampling_rate = sampling_rate 159 | self.stft_fn = STFT(filter_length, hop_length, win_length) 160 | mel_basis = librosa_mel_fn( 161 | sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax 162 | ) 163 | mel_basis = torch.from_numpy(mel_basis).float() 164 | self.register_buffer("mel_basis", mel_basis) 165 | 166 | def 
spectral_normalize(self, magnitudes): 167 | output = dynamic_range_compression(magnitudes) 168 | return output 169 | 170 | def spectral_de_normalize(self, magnitudes): 171 | output = dynamic_range_decompression(magnitudes) 172 | return output 173 | 174 | def mel_spectrogram(self, y): 175 | """Computes mel-spectrograms from a batch of waves 176 | PARAMS 177 | ------ 178 | y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] 179 | 180 | RETURNS 181 | ------- 182 | mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) 183 | """ 184 | assert torch.min(y.data) >= -1 185 | assert torch.max(y.data) <= 1 186 | 187 | magnitudes, phases = self.stft_fn.transform(y) 188 | magnitudes = magnitudes.data 189 | mel_output = torch.matmul(self.mel_basis, magnitudes) 190 | mel_output = self.spectral_normalize(mel_output) 191 | energy = torch.norm(magnitudes, dim=1) 192 | 193 | return mel_output, energy 194 | -------------------------------------------------------------------------------- /fs_two/audio/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from scipy.io.wavfile import write 4 | 5 | from fs_two.audio.audio_processing import griffin_lim 6 | 7 | 8 | def get_mel_from_wav(audio, _stft): 9 | audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1) 10 | audio = torch.autograd.Variable(audio, requires_grad=False) 11 | melspec, energy = _stft.mel_spectrogram(audio) 12 | melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32) 13 | energy = torch.squeeze(energy, 0).numpy().astype(np.float32) 14 | 15 | return melspec, energy 16 | 17 | 18 | def inv_mel_spec(mel, out_filename, _stft, griffin_iters=60): 19 | mel = torch.stack([mel]) 20 | mel_decompress = _stft.spectral_de_normalize(mel) 21 | mel_decompress = mel_decompress.transpose(1, 2).data.cpu() 22 | spec_from_mel_scaling = 1000 23 | spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis) 24 | spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) 25 | spec_from_mel = spec_from_mel * spec_from_mel_scaling 26 | 27 | audio = griffin_lim( 28 | torch.autograd.Variable(spec_from_mel[:, :, :-1]), _stft._stft_fn, griffin_iters 29 | ) 30 | 31 | audio = audio.squeeze() 32 | audio = audio.cpu().numpy() 33 | audio_path = out_filename 34 | write(audio_path, _stft.sampling_rate, audio) 35 | -------------------------------------------------------------------------------- /fs_two/cwt/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2017 Tom Runia 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to conditions. 
11 | # 12 | # Author: Tom Runia 13 | # Date Created: 2018-04-16 14 | 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | -------------------------------------------------------------------------------- /fs_two/cwt/cwt_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pycwt as wavelet 4 | from sklearn import preprocessing 5 | 6 | 7 | def mse(a, b): 8 | return ((a - b) ** 2).mean() 9 | 10 | 11 | # PREPROCESSING 12 | 13 | 14 | def transform_cwt(lf0, J=10): 15 | mother = wavelet.MexicanHat() 16 | dt = 0.005 17 | dj = 1 18 | s0 = dt * 2 19 | # Returns J + 1 scales 20 | Wavelet_lf0, scales, freqs, coi, fft, fftfreqs = wavelet.cwt( 21 | np.squeeze(lf0), dt, dj, s0, J, mother 22 | ) 23 | Wavelet_lf0 = np.real(Wavelet_lf0).T 24 | return Wavelet_lf0 25 | 26 | 27 | def inverse_cwt(wavelet_coefs, num_scales=10): 28 | lf0_rec = np.zeros([wavelet_coefs.shape[0], num_scales]) 29 | for i in range(0, num_scales): 30 | lf0_rec[:, i] = wavelet_coefs[:, i] * ((i + 1 + 2.5) ** (-2.5)) 31 | lf0_rec_sum = np.sum(lf0_rec, axis=1) 32 | lf0_rec_sum = preprocessing.scale(lf0_rec_sum) 33 | return lf0_rec_sum 34 | 35 | 36 | # TO REVERSE ADD 37 | # reverse = inverse_batch_cwt(wavelet_coefs, scales=10)*std + mean 38 | 39 | 40 | class TorchStandardScaler: 41 | def fit(self, x): 42 | self.mean = x.mean(0, keepdim=True) 43 | self.std = x.std(0, unbiased=False, keepdim=True) 44 | 45 | def transform(self, x): 46 | x -= self.mean 47 | x /= self.std + 1e-12 48 | return x 49 | 50 | 51 | scaler_tc = TorchStandardScaler() 52 | 53 | 54 | def inverse_batch_cwt(wavelet_coefs, num_scales=10): 55 | batch_size = wavelet_coefs.shape[0] 56 | length = wavelet_coefs.shape[1] 57 | lf0_rec = torch.zeros([batch_size, length, num_scales], dtype=torch.float32).to(wavelet_coefs.device) 58 | for i in range(0, num_scales): 59 | lf0_rec[:, :, i] = wavelet_coefs[:, :, i] * ((i + 1 + 2.5) ** (-2.5)) 60 | lf0_rec_sum = torch.sum(lf0_rec, axis=-1) 61 | # lf0_rec_sum = scaler(lf0_rec_sum) 62 | scaler_tc.fit(lf0_rec_sum) 63 | lf0_rec_sum = scaler_tc.transform(lf0_rec_sum) 64 | 65 | torch.nan_to_num(lf0_rec_sum, nan=0.0) 66 | return lf0_rec_sum 67 | -------------------------------------------------------------------------------- /fs_two/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | 5 | import random 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | from fs_two.text import text_to_sequence 10 | from fs_two.utils.tools import pad_1D, pad_2D 11 | from fs_two.text.symbols import _mask, _silences 12 | 13 | 14 | def random_mask(text, _silences, max_masks_per_sentence, _mask): 15 | # randonly mask some sentences 16 | # we do not want to mask short sentences 17 | 18 | text = text.split(" ") 19 | max_len = len(text) 20 | masks_count = int( 21 | max_masks_per_sentence * max_len 22 | ) # max_masks_per_sentence = 0.15 23 | if masks_count == 0: 24 | return text 25 | mask_indexes = random.choices(list(range(max_len)), k=masks_count) 26 | for ind in mask_indexes: 27 | if not text[ind] in _silences: 28 | text[ind] = _mask 29 | return " ".join(text) 30 | 31 | 32 | class Dataset(Dataset): 33 | def __init__( 34 | self, 35 | filename, 36 | preprocess_config, 37 | train_config, 38 | sort=False, 39 | drop_last=True, 40 | ): 41 | self._silences = [s.replace("@", "") for s in _silences] 42 | 
self.max_masks_per_sentence = train_config.max_masks_per_sentence 43 | self.dataset_name = preprocess_config["dataset"] 44 | self.preprocessed_path = preprocess_config["path"]["preprocessed_path"] 45 | self.cleaners = preprocess_config["preprocessing"]["text"][ 46 | "text_cleaners" 47 | ] 48 | self.batch_size = train_config["optimizer"]["batch_size"] 49 | 50 | ( 51 | self.basename, 52 | self.speaker, 53 | self.text, 54 | self.raw_text, 55 | ) = self.process_meta(filename) 56 | with open(os.path.join(self.preprocessed_path, "speakers.json")) as f: 57 | self.speaker_map = json.load(f) 58 | self.sort = sort 59 | self.drop_last = drop_last 60 | 61 | def __len__(self): 62 | return len(self.text) 63 | 64 | def __getitem__(self, idx): 65 | basename = self.basename[idx] 66 | speaker = self.speaker[idx] 67 | speaker_id = self.speaker_map[speaker] 68 | raw_text = self.raw_text[idx] 69 | phone = np.array(text_to_sequence(self.text[idx], self.cleaners)) 70 | mel_path = os.path.join( 71 | self.preprocessed_path, 72 | "mel", 73 | "{}-mel-{}.npy".format(speaker, basename), 74 | ) 75 | mel = np.load(mel_path) 76 | 77 | energy_path = os.path.join( 78 | self.preprocessed_path, 79 | "energy", 80 | "{}-energy-{}.npy".format(speaker, basename), 81 | ) 82 | energy = np.load(energy_path) 83 | duration_path = os.path.join( 84 | self.preprocessed_path, 85 | "duration", 86 | "{}-duration-{}.npy".format(speaker, basename), 87 | ) 88 | duration = np.load(duration_path) 89 | 90 | pitch_cwt_path = os.path.join( 91 | self.preprocessed_path, 92 | "pitch", 93 | "{}-cwt-pitch-{}.npy".format(speaker, basename), 94 | ) 95 | 96 | pitch_path = os.path.join( 97 | self.preprocessed_path, 98 | "pitch", 99 | "{}-pitch-{}.npy".format(speaker, basename), 100 | ) 101 | 102 | pitch_raw = np.load(pitch_path) 103 | pitch_cwt = np.load(pitch_cwt_path) 104 | 105 | pitch_mean_path = os.path.join( 106 | self.preprocessed_path, 107 | "pitch", 108 | "{}-pitch-mean-{}.npy".format(speaker, basename), 109 | ) 110 | pitch_mean = np.load(pitch_mean_path) 111 | 112 | pitch_std_path = os.path.join( 113 | self.preprocessed_path, 114 | "pitch", 115 | "{}-pitch-std-{}.npy".format(speaker, basename), 116 | ) 117 | pitch_std = np.load(pitch_std_path) 118 | 119 | sample = { 120 | "id": basename, 121 | "speaker": speaker_id, 122 | "text": phone, 123 | "raw_text": raw_text, 124 | "mel": mel, 125 | "energy": energy, 126 | "duration": duration, 127 | "pitch_raw": pitch_raw, 128 | "pitch_mean": pitch_mean, 129 | "pitch_std": pitch_std, 130 | "pitch_cwt": pitch_cwt, 131 | } 132 | 133 | return sample 134 | 135 | def process_meta(self, filename): 136 | with open( 137 | os.path.join(self.preprocessed_path, filename), 138 | "r", 139 | encoding="utf-8", 140 | ) as f: 141 | name = [] 142 | speaker = [] 143 | text = [] 144 | raw_text = [] 145 | for line in f.readlines(): 146 | n, s, t, r = line.strip("\n").split("|") 147 | name.append(n) 148 | speaker.append(s) 149 | if self.max_masks_per_sentence > 1: 150 | t = random_mask( 151 | t, self._silences, self.max_masks_per_sentence, _mask 152 | ) 153 | text.append(t) 154 | raw_text.append(r) 155 | 156 | return name, speaker, text, raw_text 157 | 158 | def reprocess(self, data, idxs): 159 | ids = [data[idx]["id"] for idx in idxs] 160 | speakers = [data[idx]["speaker"] for idx in idxs] 161 | texts = [data[idx]["text"] for idx in idxs] 162 | raw_texts = [data[idx]["raw_text"] for idx in idxs] 163 | mels = [data[idx]["mel"] for idx in idxs] 164 | 165 | pitches_mean = [data[idx]["pitch_mean"] for idx in idxs] 166 | pitches_std 
= [data[idx]["pitch_std"] for idx in idxs] 167 | pitches_cwt = [data[idx]["pitch_cwt"] for idx in idxs] 168 | pitches_raw = [data[idx]["pitch_raw"] for idx in idxs] 169 | 170 | energies = [data[idx]["energy"] for idx in idxs] 171 | durations = [data[idx]["duration"] for idx in idxs] 172 | 173 | text_lens = np.array([text.shape[0] for text in texts]) 174 | mel_lens = np.array([mel.shape[0] for mel in mels]) 175 | 176 | speakers = np.array(speakers) 177 | pitches_mean = np.array(pitches_mean) 178 | pitches_std = np.array(pitches_std) 179 | 180 | texts = pad_1D(texts) 181 | mels = pad_2D(mels) 182 | energies = pad_1D(energies) 183 | pitches_raw = pad_1D(pitches_raw) 184 | durations = pad_1D(durations) 185 | 186 | pitches_cwt = pad_2D(pitches_cwt) 187 | 188 | return ( 189 | ids, 190 | raw_texts, 191 | speakers, 192 | texts, 193 | text_lens, 194 | max(text_lens), 195 | mels, 196 | mel_lens, 197 | max(mel_lens), 198 | energies, 199 | durations, 200 | pitches_raw, 201 | pitches_cwt, 202 | pitches_mean, 203 | pitches_std, 204 | ) 205 | 206 | def collate_fn(self, data): 207 | data_size = len(data) 208 | 209 | if self.sort: 210 | len_arr = np.array([d["text"].shape[0] for d in data]) 211 | idx_arr = np.argsort(-len_arr) 212 | else: 213 | idx_arr = np.arange(data_size) 214 | 215 | tail = idx_arr[len(idx_arr) - (len(idx_arr) % self.batch_size) :] 216 | idx_arr = idx_arr[: len(idx_arr) - (len(idx_arr) % self.batch_size)] 217 | idx_arr = idx_arr.reshape((-1, self.batch_size)).tolist() 218 | if not self.drop_last and len(tail) > 0: 219 | idx_arr += [tail.tolist()] 220 | 221 | output = list() 222 | for idx in idx_arr: 223 | output.append(self.reprocess(data, idx)) 224 | 225 | return output 226 | -------------------------------------------------------------------------------- /fs_two/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | import torch 6 | import yaml 7 | import torch.nn as nn 8 | from torch.utils.data import DataLoader 9 | 10 | from fs_two.utils.model import get_model, get_vocoder 11 | from fs_two.utils.tools import to_device, log, synth_one_sample 12 | from fs_two.model import FastSpeech2Loss 13 | from fs_two.dataset import Dataset 14 | 15 | # TODO SET device via config 16 | 17 | 18 | def evaluate( 19 | model, step, cfg, logger=None, train_val="val", vocoder=None, device=0 20 | ): 21 | dataset = Dataset( 22 | "val.txt", 23 | cfg.preprocess_config, 24 | cfg.train_config, 25 | sort=False, 26 | drop_last=False, 27 | ) 28 | batch_size = cfg.train_config["optimizer"]["batch_size"] 29 | loader = DataLoader( 30 | dataset, 31 | batch_size=batch_size, 32 | shuffle=False, 33 | collate_fn=dataset.collate_fn, 34 | ) 35 | 36 | # Get loss function 37 | Loss = FastSpeech2Loss(cfg.preprocess_config, cfg.model_config) 38 | 39 | # Evaluation 40 | loss_sums = [0 for _ in range(6)] 41 | for batchs in loader: 42 | for batch in batchs: 43 | batch = to_device(batch, device) 44 | with torch.no_grad(): 45 | # Forward 46 | output = model(*(batch[2:])) 47 | 48 | # Cal Loss 49 | losses = Loss(batch, output) 50 | 51 | for i in range(1, len(losses)): 52 | loss_sums[i - 1] += losses[i].item() * len(batch[0]) 53 | 54 | loss_means = [loss_sum / len(dataset) for loss_sum in loss_sums] 55 | loss_means = [sum(loss_means)] + loss_means 56 | loss_logs = [step] + loss_means 57 | 58 | message = """Validation Step {}, 59 | Total Loss: {:.4f}, 60 | Mel Loss: {:.4f}, 61 | Pitch Loss: {:.4f}, 62 | Mean pitch {:.4f}, 63 | Std pitch {:.4f}""".format( 
64 | *loss_logs 65 | ) 66 | 67 | if logger is not None: 68 | fig, wav_reconstruction, wav_prediction, tag = synth_one_sample( 69 | batch, 70 | output, 71 | vocoder, 72 | cfg.model_config, 73 | cfg.preprocess_config, 74 | ) 75 | 76 | log(logger, "val", step, losses=loss_means) 77 | log( 78 | logger, 79 | "val", 80 | fig=fig, 81 | tag="Validation/step_{}_{}".format(step, tag), 82 | ) 83 | sampling_rate = cfg.preprocess_config["preprocessing"]["audio"][ 84 | "sampling_rate" 85 | ] 86 | log( 87 | logger, 88 | "val", 89 | audio=wav_reconstruction, 90 | sampling_rate=sampling_rate, 91 | tag="Validation/step_{}_{}_reconstructed".format(step, tag), 92 | ) 93 | log( 94 | logger, 95 | "val", 96 | audio=wav_prediction, 97 | sampling_rate=sampling_rate, 98 | tag="Validation/step_{}_{}_synthesized".format(step, tag), 99 | ) 100 | 101 | return message 102 | 103 | 104 | if __name__ == "__main__": 105 | device = 0 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument("--restore_step", type=int, default=30000) 108 | parser.add_argument( 109 | "-p", 110 | "--preprocess_config", 111 | type=str, 112 | required=True, 113 | help="path to preprocess.yaml", 114 | ) 115 | parser.add_argument( 116 | "-m", 117 | "--model_config", 118 | type=str, 119 | required=True, 120 | help="path to model.yaml", 121 | ) 122 | parser.add_argument( 123 | "-t", 124 | "--train_config", 125 | type=str, 126 | required=True, 127 | help="path to train.yaml", 128 | ) 129 | args = parser.parse_args() 130 | 131 | # Read Config 132 | preprocess_config = yaml.load( 133 | open(args.preprocess_config, "r"), Loader=yaml.FullLoader 134 | ) 135 | model_config = yaml.load( 136 | open(args.model_config, "r"), Loader=yaml.FullLoader 137 | ) 138 | train_config = yaml.load( 139 | open(args.train_config, "r"), Loader=yaml.FullLoader 140 | ) 141 | configs = (preprocess_config, model_config, train_config) 142 | 143 | # Get model 144 | model = get_model(args, configs, device, train=False).to(device) 145 | 146 | message = evaluate(model, args.restore_step, configs) 147 | print(message) 148 | -------------------------------------------------------------------------------- /fs_two/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import FastSpeech2 2 | from .loss import FastSpeech2Loss 3 | from .optimizer import ScheduledOptim 4 | -------------------------------------------------------------------------------- /fs_two/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/fastspeech2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/fastspeech2.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- 
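A minimal, self-contained sketch (not code from this repo; the function name `weighted_loss_means` and the batch sizes and loss values are made up for illustration) of the sample-weighted averaging that `evaluate()` above performs: each component loss is weighted by its batch size, summed over batches, and divided by the dataset length, and the reported total is the sum of the component means.

```
def weighted_loss_means(batch_results, dataset_len, n_components=6):
    """batch_results: iterable of (batch_size, losses), where losses[0] is the total loss."""
    loss_sums = [0.0] * n_components
    for batch_size, losses in batch_results:
        # Skip losses[0]: only the component losses are accumulated.
        for i in range(1, len(losses)):
            loss_sums[i - 1] += float(losses[i]) * batch_size
    loss_means = [s / dataset_len for s in loss_sums]
    # As in evaluate(): the reported "Total Loss" is the sum of the component means.
    return [sum(loss_means)] + loss_means

# Two made-up batches of 4 and 2 samples, each with
# (total, mel, pitch, energy, duration, pitch_mean, pitch_std) losses:
fake = [(4, (3.0, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)),
        (2, (2.4, 0.3, 0.4, 0.5, 0.6, 0.2, 0.4))]
print(weighted_loss_means(fake, dataset_len=6))
# ~ [2.2, 0.5, 0.467, 0.433, 0.4, 0.2, 0.2]
```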
/fs_two/model/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/__pycache__/optimizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/model/__pycache__/optimizer.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/model/fastspeech2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from fs_two.transformer import Encoder, Decoder, PostNet 8 | from .modules import VarianceAdaptor 9 | from fs_two.utils.tools import get_mask_from_lengths 10 | 11 | 12 | class FastSpeech2(nn.Module): 13 | """FastSpeech2""" 14 | 15 | def __init__( 16 | self, preprocess_config, model_config, n_speakers=None, device="cpu" 17 | ): 18 | super(FastSpeech2, self).__init__() 19 | self.model_config = model_config 20 | 21 | self.encoder = Encoder(model_config) 22 | self.variance_adaptor = VarianceAdaptor( 23 | preprocess_config, model_config, device 24 | ) 25 | self.decoder = Decoder(model_config) 26 | self.mel_linear = nn.Linear( 27 | model_config["transformer"]["decoder_hidden"], 28 | preprocess_config["preprocessing"]["mel"]["n_mel_channels"], 29 | ) 30 | nn.init.xavier_normal_(self.mel_linear.weight) 31 | self.speaker_emb = None 32 | 33 | if model_config["multi_speaker"]: 34 | if n_speakers is None: 35 | n_speakers = get_speakers_number(preprocess_config) 36 | self.speaker_emb = nn.Embedding( 37 | n_speakers, 38 | model_config["transformer"]["encoder_hidden"], 39 | ) 40 | 41 | self.postnet = PostNet() 42 | 43 | def forward( 44 | self, 45 | speakers, 46 | texts, 47 | src_lens, 48 | max_src_len, 49 | mels=None, 50 | mel_lens=None, 51 | max_mel_len=None, 52 | e_targets=None, 53 | d_targets=None, 54 | pitches_raw=None, 55 | pitches_cwt=None, 56 | pitches_mean=None, 57 | pitches_std=None, 58 | p_control=1.0, 59 | e_control=1.0, 60 | d_control=1.0, 61 | ): 62 | src_masks = get_mask_from_lengths( 63 | src_lens, max_src_len, device=texts.device 64 | ) 65 | mel_masks = ( 66 | get_mask_from_lengths(mel_lens, max_mel_len, device=texts.device) 67 | if mel_lens is not None 68 | else None 69 | ) 70 | 71 | output = self.encoder(texts, src_masks) 72 | if self.speaker_emb is not None: 73 | embedding = ( 74 | self.speaker_emb(speakers).unsqueeze(1).expand(-1, 1, -1) 75 | ) 76 | ( 77 | output, 78 | pitch_prediction, 79 | e_predictions, 80 | log_d_predictions, 81 | d_rounded, 82 | mel_lens, 83 | mel_masks, 84 | pitch_mean, 85 | pitch_std, 86 | ) = self.variance_adaptor( 87 | output, 88 | embedding, 89 | src_masks, 90 | mel_masks, 91 | max_mel_len, 92 | pitches_raw, 93 | pitches_cwt, 94 | e_targets, 95 | d_targets, 96 | p_control, 97 | e_control, 98 | d_control, 99 | ) 100 | 101 | output, mel_masks = self.decoder(output, mel_masks) 102 | output = self.mel_linear(output) 103 | 104 | postnet_output = self.postnet(output) + output 105 | 106 | return ( 107 | output, 108 | pitch_prediction, 109 | e_predictions, 110 | log_d_predictions, 111 | d_rounded, 112 | src_masks, 113 | mel_masks, 114 | src_lens, 115 | mel_lens, 
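# (added comment) remaining outputs: the Postnet-refined mel prediction, plus the
# predicted pitch statistics (pitch_mean / pitch_std are None unless use_cwt is set),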
116 | postnet_output, 117 | pitch_mean, 118 | pitch_std, 119 | ) 120 | 121 | 122 | def get_speakers_number(preprocess_config): 123 | speaker_json = os.path.join( 124 | preprocess_config["path"]["preprocessed_path"], 125 | "speakers.json", 126 | ) 127 | if os.path.exists(speaker_json): 128 | with open( 129 | speaker_json, 130 | "r", 131 | ) as f: 132 | n_speakers = len(json.load(f)) 133 | else: 134 | raise Exception( 135 | "Model is multispeaker but number of speakers was not provided explicitly" 136 | ) 137 | return n_speakers 138 | -------------------------------------------------------------------------------- /fs_two/model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FastSpeech2Loss(nn.Module): 6 | """FastSpeech2 Loss""" 7 | 8 | def __init__(self, preprocess_config, model_config): 9 | super(FastSpeech2Loss, self).__init__() 10 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 11 | "feature" 12 | ] 13 | self.energy_feature_level = preprocess_config["preprocessing"][ 14 | "energy" 15 | ]["feature"] 16 | self.mse_loss = nn.MSELoss() 17 | self.mae_loss = nn.L1Loss() 18 | 19 | if model_config.use_cwt: 20 | self.use_cwt = True 21 | else: 22 | self.use_cwt = False 23 | 24 | def forward(self, inputs, predictions): 25 | # TARGETS 26 | speaker_targets = inputs[2] 27 | ( 28 | mel_targets, 29 | _, 30 | _, 31 | energy_targets, 32 | duration_targets, 33 | pitches_raw_targets, 34 | pitches_cwt_targets, 35 | pitch_mean, 36 | pitch_std, 37 | ) = inputs[6:] 38 | 39 | # PREDICTIONS 40 | ( 41 | mel_predictions, 42 | pitch_predictions, 43 | energy_predictions, 44 | log_duration_predictions, 45 | _, 46 | src_masks, 47 | mel_masks, 48 | _, 49 | _, 50 | postnet_mel_predictions, 51 | pitch_mean_pred, 52 | pitch_std_pred, 53 | ) = predictions 54 | src_masks = ~src_masks 55 | mel_masks = ~mel_masks 56 | log_duration_targets = torch.log(duration_targets.float() + 1) 57 | mel_targets = mel_targets[:, : mel_masks.shape[1], :] 58 | mel_masks = mel_masks[:, : mel_masks.shape[1]] 59 | 60 | log_duration_targets.requires_grad = False 61 | pitches_raw_targets.requires_grad = False 62 | pitches_cwt_targets.requires_grad = False 63 | energy_targets.requires_grad = False 64 | mel_targets.requires_grad = False 65 | 66 | if self.use_cwt: 67 | pitch_mask = src_masks.unsqueeze(2) 68 | pitch_mask = pitch_mask.repeat(1, 1, 11) 69 | pitch_predictions = pitch_predictions.masked_select(pitch_mask) 70 | pitch_targets = pitches_cwt_targets.masked_select(pitch_mask) 71 | else: 72 | pitch_predictions = pitch_predictions.masked_select(src_masks) 73 | pitch_targets = pitches_raw_targets.masked_select(src_masks) 74 | 75 | energy_predictions = energy_predictions.masked_select(src_masks) 76 | energy_targets = energy_targets.masked_select(src_masks) 77 | 78 | log_duration_predictions = log_duration_predictions.masked_select( 79 | src_masks 80 | ) 81 | log_duration_targets = log_duration_targets.masked_select(src_masks) 82 | 83 | mel_predictions = mel_predictions * mel_masks.unsqueeze(-1) 84 | # mel_predictions = mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 85 | postnet_mel_predictions = postnet_mel_predictions * mel_masks.unsqueeze( 86 | -1 87 | ) 88 | # postnet_mel_predictions = postnet_mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 89 | 90 | # mel_targets = mel_targets.masked_select(mel_masks.unsqueeze(-1)) 91 | mel_targets = mel_targets * mel_masks.unsqueeze(-1) 92 | 93 | mel_loss = 
self.mse_loss(mel_predictions, mel_targets) 94 | mel_loss_mae = self.mae_loss(mel_predictions, mel_targets) 95 | postnet_mel_loss = self.mae_loss(postnet_mel_predictions, mel_targets) 96 | total_mel_loss = mel_loss + mel_loss_mae + postnet_mel_loss 97 | 98 | pitch_loss = self.mse_loss(pitch_predictions, pitch_targets) 99 | 100 | energy_loss = self.mse_loss(energy_predictions, energy_targets) 101 | duration_loss = self.mse_loss( 102 | log_duration_predictions, log_duration_targets 103 | ) 104 | 105 | if self.use_cwt: 106 | std_pitch_loss = self.mse_loss( 107 | pitch_std_pred, pitch_std.unsqueeze(1) 108 | ) 109 | mean_pitch_loss = self.mse_loss( 110 | pitch_mean_pred, pitch_mean.unsqueeze(1) 111 | ) 112 | else: 113 | # std and mean are used only for CWT prediction 114 | std_pitch_loss = torch.tensor([0]).to(pitch_loss.device) 115 | mean_pitch_loss = torch.tensor([0]).to(pitch_loss.device) 116 | 117 | total_loss = ( 118 | total_mel_loss 119 | + duration_loss 120 | + pitch_loss 121 | + energy_loss 122 | + mean_pitch_loss 123 | + std_pitch_loss 124 | ) 125 | 126 | return ( 127 | total_loss, 128 | total_mel_loss, 129 | pitch_loss, 130 | energy_loss, 131 | duration_loss, 132 | mean_pitch_loss, 133 | std_pitch_loss, 134 | ) 135 | -------------------------------------------------------------------------------- /fs_two/model/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import OrderedDict 4 | 5 | import torch 6 | import torch.nn as nn 7 | import numpy as np 8 | from torch.autograd import Function 9 | 10 | from fs_two.utils.tools import get_mask_from_lengths, pad 11 | from fs_two.cwt.cwt_utils import inverse_batch_cwt 12 | 13 | 14 | class VarianceAdaptor(nn.Module): 15 | """ Variance Adaptor """ 16 | def __init__(self, preprocess_config, model_config, device): 17 | super(VarianceAdaptor, self).__init__() 18 | self.device = device 19 | 20 | hidden_size = model_config["transformer"]["variance_hidden"] 21 | 22 | self.duration_predictor = VariancePredictor(model_config) 23 | self.length_regulator = LengthRegulator() 24 | 25 | if model_config.use_cwt: 26 | self.use_cwt = True 27 | self.pitch_predictor = VariancePredictor(model_config, 28 | output_size=11, 29 | dropout=0.1) 30 | else: 31 | self.use_cwt = False 32 | self.pitch_predictor = VariancePredictor(model_config) 33 | 34 | # PitchPredictor(hidden_size, cwt_size=11) 35 | 36 | self.energy_predictor = VariancePredictor(model_config) 37 | 38 | self.pitch_mean = CNNscalar(size_one=hidden_size, size_two=11) 39 | self.pitch_std = CNNscalar(size_one=hidden_size, size_two=11) 40 | 41 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 42 | "feature"] 43 | self.energy_feature_level = preprocess_config["preprocessing"][ 44 | "energy"]["feature"] 45 | assert self.pitch_feature_level in ["phoneme_level", "frame_level"] 46 | assert self.energy_feature_level in ["phoneme_level", "frame_level"] 47 | 48 | pitch_quantization = model_config["variance_embedding"][ 49 | "pitch_quantization"] 50 | energy_quantization = model_config["variance_embedding"][ 51 | "energy_quantization"] 52 | n_bins = model_config["variance_embedding"]["n_bins"] 53 | assert pitch_quantization in ["linear", "log"] 54 | assert energy_quantization in ["linear", "log"] 55 | with open( 56 | os.path.join(preprocess_config["path"]["preprocessed_path"], 57 | "stats.json")) as f: 58 | stats = json.load(f) 59 | pitch_min, pitch_max = stats["pitch"][:2] 60 | energy_min, energy_max = 
stats["energy"][:2] 61 | 62 | if pitch_quantization == "log": 63 | self.pitch_bins = nn.Parameter( 64 | torch.exp( 65 | torch.linspace(np.log(pitch_min), np.log(pitch_max), 66 | n_bins - 1)), 67 | requires_grad=False, 68 | ) 69 | else: 70 | self.pitch_bins = nn.Parameter( 71 | torch.linspace(pitch_min, pitch_max, n_bins - 1), 72 | requires_grad=False, 73 | ) 74 | if energy_quantization == "log": 75 | self.energy_bins = nn.Parameter( 76 | torch.exp( 77 | torch.linspace(np.log(energy_min), np.log(energy_max), 78 | n_bins - 1)), 79 | requires_grad=False, 80 | ) 81 | else: 82 | self.energy_bins = nn.Parameter( 83 | torch.linspace(energy_min, energy_max, n_bins - 1), 84 | requires_grad=False, 85 | ) 86 | 87 | self.pitch_embedding = nn.Embedding( 88 | n_bins, model_config["transformer"]["encoder_hidden"]) 89 | self.energy_embedding = nn.Embedding( 90 | n_bins, model_config["transformer"]["encoder_hidden"]) 91 | 92 | def get_pitch_embedding_normal(self, x, target, mask, control=1): 93 | prediction = self.pitch_predictor(x, mask) 94 | if target is not None: 95 | embedding = self.pitch_embedding( 96 | torch.bucketize(target, self.pitch_bins)) 97 | else: 98 | prediction = prediction * control 99 | embedding = self.pitch_embedding( 100 | torch.bucketize(prediction, self.pitch_bins)) 101 | return prediction, embedding 102 | 103 | def get_pitch_embedding_cwt(self, x, pitch_target_cwt, mask, control=1): 104 | # batch, seq_len, 10 -> batch, 10 -> batch, 1 105 | mask = mask.unsqueeze(2) 106 | mask = mask.repeat(1, 1, 11) 107 | pitch_cwt_prediction = self.pitch_predictor(x, mask) 108 | 109 | # NOTE: Might be more stable if train on Ground Truth 110 | # if pitch_target_cwt is None: 111 | # pitch_cwt = pitch_cwt_prediction 112 | # else: 113 | # pitch_cwt = pitch_target_cwt 114 | 115 | pitch_cwt = pitch_cwt_prediction 116 | 117 | pitch_mean = self.pitch_mean(x.detach(), pitch_cwt.detach()) 118 | pitch_std = self.pitch_std(x.detach(), pitch_cwt.detach()) 119 | 120 | pitch = inverse_batch_cwt(pitch_cwt) 121 | 122 | # print(pitch.shape) 123 | # print(pitch_std.shape) 124 | # print(pitch_mean.shape) 125 | pitch = (pitch * pitch_std) + pitch_mean 126 | 127 | pitch_embedding = self.pitch_embedding( 128 | torch.bucketize(pitch * control, self.pitch_bins)) 129 | return pitch_cwt_prediction, pitch_embedding, pitch_mean, pitch_std 130 | 131 | def get_energy_embedding(self, x, target, mask, control): 132 | prediction = self.energy_predictor(x, mask) 133 | if target is not None: 134 | embedding = self.energy_embedding( 135 | torch.bucketize(target, self.energy_bins)) 136 | else: 137 | prediction = prediction * control 138 | embedding = self.energy_embedding( 139 | torch.bucketize(prediction, self.energy_bins)) 140 | return prediction, embedding 141 | 142 | def forward( 143 | self, 144 | x, 145 | embedding, 146 | src_mask, 147 | mel_mask=None, 148 | max_len=None, 149 | pitch_raw_target=None, 150 | pitch_cwt_target=None, 151 | energy_target=None, 152 | duration_target=None, 153 | p_control=1.0, 154 | e_control=1.0, 155 | d_control=1.0, 156 | ): 157 | 158 | log_duration_prediction = self.duration_predictor(x, src_mask) 159 | x = x + embedding 160 | if self.use_cwt: 161 | ( 162 | pitch_prediction, 163 | pitch_embedding, 164 | pitch_mean, 165 | pitch_std, 166 | ) = self.get_pitch_embedding_cwt( 167 | x, 168 | pitch_cwt_target, 169 | src_mask, 170 | p_control, 171 | ) 172 | else: 173 | ( 174 | pitch_prediction, 175 | pitch_embedding, 176 | ) = self.get_pitch_embedding_normal( 177 | x, 178 | pitch_raw_target, 179 | src_mask, 
180 | p_control, 181 | ) 182 | pitch_mean = None 183 | pitch_std = None 184 | 185 | x = x + pitch_embedding 186 | 187 | energy_prediction, energy_embedding = self.get_energy_embedding( 188 | x, 189 | energy_target, 190 | src_mask, 191 | e_control, 192 | ) 193 | x = x + energy_embedding 194 | 195 | if duration_target is not None: 196 | x, mel_len = self.length_regulator(x, duration_target, max_len) 197 | duration_rounded = duration_target 198 | else: 199 | duration_rounded = torch.clamp( 200 | (torch.round(torch.exp(log_duration_prediction) - 1) * 201 | d_control), 202 | min=0, 203 | ) 204 | x, mel_len = self.length_regulator(x, duration_rounded, max_len) 205 | mel_mask = get_mask_from_lengths(mel_len, device=self.device) 206 | 207 | return ( 208 | x, 209 | pitch_prediction, 210 | energy_prediction, 211 | log_duration_prediction, 212 | duration_rounded, 213 | mel_len, 214 | mel_mask, 215 | pitch_mean, 216 | pitch_std, 217 | ) 218 | 219 | 220 | class LengthRegulator(nn.Module): 221 | """ Length Regulator """ 222 | def __init__(self): 223 | super(LengthRegulator, self).__init__() 224 | 225 | def LR(self, x, duration, max_len): 226 | output = list() 227 | mel_len = list() 228 | for batch, expand_target in zip(x, duration): 229 | expanded = self.expand(batch, expand_target) 230 | output.append(expanded) 231 | mel_len.append(expanded.shape[0]) 232 | 233 | if max_len is not None: 234 | output = pad(output, max_len) 235 | else: 236 | output = pad(output) 237 | 238 | return output, torch.LongTensor(mel_len).to(x.device) 239 | 240 | def expand(self, batch, predicted): 241 | out = list() 242 | 243 | for i, vec in enumerate(batch): 244 | expand_size = predicted[i].item() 245 | out.append(vec.expand(max(int(expand_size), 0), -1)) 246 | out = torch.cat(out, 0) 247 | 248 | return out 249 | 250 | def forward(self, x, duration, max_len): 251 | output, mel_len = self.LR(x, duration, max_len) 252 | return output, mel_len 253 | 254 | 255 | class VariancePredictor(nn.Module): 256 | """ Duration, Pitch and Energy Predictor """ 257 | def __init__(self, model_config, output_size=1, dropout=None): 258 | super(VariancePredictor, self).__init__() 259 | 260 | self.input_size = model_config["transformer"]["variance_hidden"] 261 | self.filter_size = model_config["variance_predictor"]["filter_size"] 262 | self.kernel = model_config["variance_predictor"]["kernel_size"] 263 | self.conv_output_size = model_config["variance_predictor"][ 264 | "filter_size"] 265 | if dropout is None: 266 | self.dropout = model_config["variance_predictor"]["dropout"] 267 | else: 268 | self.dropout = dropout 269 | 270 | self.conv_layer = nn.Sequential( 271 | OrderedDict([ 272 | ( 273 | "conv1d_1", 274 | Conv( 275 | self.input_size, 276 | self.filter_size, 277 | kernel_size=self.kernel, 278 | padding=(self.kernel - 1) // 2, 279 | ), 280 | ), 281 | ("relu_1", nn.ReLU()), 282 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 283 | ("dropout_1", nn.Dropout(self.dropout)), 284 | ( 285 | "conv1d_2", 286 | Conv( 287 | self.filter_size, 288 | self.filter_size, 289 | kernel_size=self.kernel, 290 | padding=1, 291 | ), 292 | ), 293 | ("relu_2", nn.ReLU()), 294 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 295 | ("dropout_2", nn.Dropout(self.dropout)), 296 | ])) 297 | 298 | self.linear_layer = nn.Linear(self.conv_output_size, output_size) 299 | nn.init.xavier_normal_(self.linear_layer.weight) 300 | 301 | def forward(self, encoder_output, mask): 302 | out = self.conv_layer(encoder_output) 303 | out = self.linear_layer(out) 304 | out = 
out.squeeze(-1) 305 | 306 | if mask is not None: 307 | out = out.masked_fill(mask, 0.0) 308 | 309 | return out 310 | 311 | 312 | class Conv(nn.Module): 313 | """ 314 | Convolution Module 315 | """ 316 | def __init__( 317 | self, 318 | in_channels, 319 | out_channels, 320 | kernel_size=1, 321 | stride=1, 322 | padding=0, 323 | dilation=1, 324 | bias=True, 325 | w_init="linear", 326 | ): 327 | """ 328 | :param in_channels: dimension of input 329 | :param out_channels: dimension of output 330 | :param kernel_size: size of kernel 331 | :param stride: size of stride 332 | :param padding: size of padding 333 | :param dilation: dilation rate 334 | :param bias: boolean. if True, bias is included. 335 | :param w_init: str. weight inits with xavier initialization. 336 | """ 337 | super(Conv, self).__init__() 338 | 339 | self.conv = nn.Conv1d( 340 | in_channels, 341 | out_channels, 342 | kernel_size=kernel_size, 343 | stride=stride, 344 | padding=padding, 345 | dilation=dilation, 346 | bias=bias, 347 | ) 348 | nn.init.kaiming_normal_(self.conv.weight, nonlinearity="relu") 349 | 350 | def forward(self, x): 351 | x = x.contiguous().transpose(1, 2) 352 | x = self.conv(x) 353 | x = x.contiguous().transpose(1, 2) 354 | 355 | return x 356 | 357 | 358 | class CNNflat(nn.Module): 359 | def __init__(self, size, reduce=30): 360 | super(CNNflat, self).__init__() 361 | self.net = nn.Sequential( 362 | nn.Conv1d(size, 1, 1), 363 | nn.AdaptiveAvgPool1d(reduce), 364 | nn.LayerNorm(reduce), 365 | nn.ReLU(), 366 | ) 367 | 368 | def forward(self, x): 369 | x = x.transpose(1, 2) 370 | return self.net(x) 371 | 372 | 373 | class CNNscalar(nn.Module): 374 | def __init__(self, size_one, size_two, reduce=30): 375 | super(CNNscalar, self).__init__() 376 | self.flat_one = CNNflat(size_one, reduce) 377 | self.flat_two = CNNflat(size_two, reduce) 378 | self.linear = nn.Linear(reduce, 1) 379 | self.relu = nn.ReLU() 380 | 381 | def forward(self, x_one, x_two): 382 | x_one = self.flat_one(x_one) 383 | x_two = self.flat_two(x_two) 384 | out = self.linear(x_one + x_two) 385 | return self.relu(out).squeeze(1) 386 | -------------------------------------------------------------------------------- /fs_two/model/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ScheduledOptim: 6 | """ A simple wrapper class for learning rate scheduling """ 7 | 8 | def __init__(self, model, train_config, model_config, current_step): 9 | 10 | self._optimizer = torch.optim.Adam( 11 | model.parameters(), 12 | betas=train_config["optimizer"]["betas"], 13 | eps=train_config["optimizer"]["eps"], 14 | weight_decay=train_config["optimizer"]["weight_decay"], 15 | ) 16 | self.n_warmup_steps = train_config["optimizer"]["warm_up_step"] 17 | self.anneal_steps = train_config["optimizer"]["anneal_steps"] 18 | self.anneal_rate = train_config["optimizer"]["anneal_rate"] 19 | self.current_step = current_step 20 | self.init_lr = np.power( 21 | model_config["transformer"]["encoder_hidden"], -0.5 22 | ) 23 | 24 | def step_and_update_lr(self): 25 | self._update_learning_rate() 26 | self._optimizer.step() 27 | 28 | def zero_grad(self): 29 | # print(self.init_lr) 30 | self._optimizer.zero_grad() 31 | 32 | def load_state_dict(self, path): 33 | self._optimizer.load_state_dict(path) 34 | 35 | def _get_lr_scale(self): 36 | lr = np.min( 37 | [ 38 | np.power(self.current_step, -0.5), 39 | np.power(self.n_warmup_steps, -1.5) * self.current_step, 40 | ] 41 | ) 42 | for s in 
self.anneal_steps: 43 | if self.current_step > s: 44 | lr = lr * self.anneal_rate 45 | return lr 46 | 47 | def _update_learning_rate(self): 48 | """ Learning rate scheduling per step """ 49 | self.current_step += 1 50 | lr = self.init_lr * self._get_lr_scale() 51 | 52 | for param_group in self._optimizer.param_groups: 53 | param_group["lr"] = lr -------------------------------------------------------------------------------- /fs_two/prepare_align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | from preprocessor import ljspeech, aishell3, libritts 6 | 7 | 8 | def main(config): 9 | if "LJSpeech" in config["dataset"]: 10 | ljspeech.prepare_align(config) 11 | if "AISHELL3" in config["dataset"]: 12 | aishell3.prepare_align(config) 13 | if "LibriTTS" in config["dataset"]: 14 | libritts.prepare_align(config) 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("config", type=str, help="path to preprocess.yaml") 20 | args = parser.parse_args() 21 | 22 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 23 | main(config) 24 | -------------------------------------------------------------------------------- /fs_two/preprocess.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | from fs_two.preprocessor.preprocessor import Preprocessor 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("config", type=str, help="path to preprocess.yaml") 11 | args = parser.parse_args() 12 | 13 | config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader) 14 | preprocessor = Preprocessor(config) 15 | preprocessor.build_from_path() 16 | -------------------------------------------------------------------------------- /fs_two/preprocessor/common_multi.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import librosa 4 | import numpy as np 5 | from scipy.io import wavfile 6 | from tqdm import tqdm 7 | 8 | 9 | def prepare_align(config): 10 | in_dir = config["path"]["corpus_path"] 11 | out_dir = config["path"]["raw_path"] 12 | sampling_rate = config["preprocessing"]["audio"]["sampling_rate"] 13 | max_wav_value = config["preprocessing"]["audio"]["max_wav_value"] 14 | for dataset in ["train", "test"]: 15 | print("Processing {}ing set...".format(dataset)) 16 | with open( 17 | os.path.join(in_dir, dataset, "content.txt"), encoding="utf-8" 18 | ) as f: 19 | for line in tqdm(f): 20 | wav_name, text = line.strip("\n").split("\t") 21 | speaker = wav_name[:7] 22 | text = text.split(" ")[1::2] 23 | wav_path = os.path.join( 24 | in_dir, dataset, "wav", speaker, wav_name 25 | ) 26 | if os.path.exists(wav_path): 27 | os.makedirs(os.path.join(out_dir, speaker), exist_ok=True) 28 | wav, _ = librosa.load(wav_path, sampling_rate) 29 | 30 | # ADD TO LOADER !!! 
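# (added comment) The lines below peak-normalize the waveform to max_wav_value and
# write it out as 16-bit PCM, so every clip handed to the aligner has a consistent scale.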
31 | 32 | wav = wav / max(abs(wav)) * max_wav_value 33 | wavfile.write( 34 | os.path.join(out_dir, speaker, wav_name), 35 | sampling_rate, 36 | wav.astype(np.int16), 37 | ) 38 | with open( 39 | os.path.join( 40 | out_dir, speaker, "{}.lab".format(wav_name[:11]) 41 | ), 42 | "w", 43 | ) as f1: 44 | f1.write(" ".join(text)) -------------------------------------------------------------------------------- /fs_two/synthesize.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | from string import punctuation 4 | 5 | import torch 6 | import yaml 7 | import numpy as np 8 | from torch.utils.data import DataLoader 9 | from g2p_en import G2p 10 | from pypinyin import pinyin, Style 11 | 12 | from utils.model import get_model, get_vocoder 13 | from utils.tools import to_device, synth_samples 14 | from dataset import TextDataset 15 | from text import text_to_sequence 16 | 17 | torch.cuda.set_device(0) 18 | device = 0 19 | # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 20 | 21 | 22 | def read_lexicon(lex_path): 23 | lexicon = {} 24 | with open(lex_path) as f: 25 | for line in f: 26 | temp = re.split(r"\s+", line.strip("\n")) 27 | word = temp[0] 28 | phones = temp[1:] 29 | if word.lower() not in lexicon: 30 | lexicon[word.lower()] = phones 31 | return lexicon 32 | 33 | 34 | def preprocess_english(text, preprocess_config): 35 | text = text.rstrip(punctuation) 36 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 37 | 38 | g2p = G2p() 39 | phones = [] 40 | words = re.split(r"([,;.\-\?\!\s+])", text) 41 | for w in words: 42 | if w.lower() in lexicon: 43 | phones += lexicon[w.lower()] 44 | else: 45 | phones += list(filter(lambda p: p != " ", g2p(w))) 46 | phones = "{" + "}{".join(phones) + "}" 47 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 48 | phones = phones.replace("}{", " ") 49 | 50 | print("Raw Text Sequence: {}".format(text)) 51 | print("Phoneme Sequence: {}".format(phones)) 52 | sequence = np.array( 53 | text_to_sequence( 54 | phones, preprocess_config["preprocessing"]["text"]["text_cleaners"] 55 | ) 56 | ) 57 | 58 | return np.array(sequence) 59 | 60 | 61 | def preprocess_mandarin(text, preprocess_config): 62 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 63 | 64 | phones = [] 65 | pinyins = [ 66 | p[0] 67 | for p in pinyin( 68 | text, style=Style.TONE3, strict=False, neutral_tone_with_five=True 69 | ) 70 | ] 71 | for p in pinyins: 72 | if p in lexicon: 73 | phones += lexicon[p] 74 | else: 75 | phones.append("sp") 76 | 77 | phones = "{" + " ".join(phones) + "}" 78 | print("Raw Text Sequence: {}".format(text)) 79 | print("Phoneme Sequence: {}".format(phones)) 80 | sequence = np.array( 81 | text_to_sequence( 82 | phones, preprocess_config["preprocessing"]["text"]["text_cleaners"] 83 | ) 84 | ) 85 | 86 | return np.array(sequence) 87 | 88 | 89 | def synthesize(model, step, configs, vocoder, batchs, control_values): 90 | preprocess_config, model_config, train_config = configs 91 | pitch_control, energy_control, duration_control = control_values 92 | 93 | for batch in batchs: 94 | batch = to_device(batch, device) 95 | with torch.no_grad(): 96 | # Forward 97 | output = model( 98 | *(batch[2:]), 99 | p_control=pitch_control, 100 | e_control=energy_control, 101 | d_control=duration_control 102 | ) 103 | synth_samples( 104 | batch, 105 | output, 106 | vocoder, 107 | model_config, 108 | preprocess_config, 109 | train_config["path"]["result_path"], 110 | ) 111 | 112 | 113 | if 
__name__ == "__main__": 114 | 115 | parser = argparse.ArgumentParser() 116 | parser.add_argument("--restore_step", type=int, required=True) 117 | parser.add_argument( 118 | "--mode", 119 | type=str, 120 | choices=["batch", "single"], 121 | required=True, 122 | help="Synthesize a whole dataset or a single sentence", 123 | ) 124 | parser.add_argument( 125 | "--source", 126 | type=str, 127 | default=None, 128 | help="path to a source file with format like train.txt and val.txt, for batch mode only", 129 | ) 130 | parser.add_argument( 131 | "--text", 132 | type=str, 133 | default=None, 134 | help="raw text to synthesize, for single-sentence mode only", 135 | ) 136 | parser.add_argument( 137 | "--speaker_id", 138 | type=int, 139 | default=0, 140 | help="speaker ID for multi-speaker synthesis, for single-sentence mode only", 141 | ) 142 | parser.add_argument( 143 | "-p", 144 | "--preprocess_config", 145 | type=str, 146 | required=True, 147 | help="path to preprocess.yaml", 148 | ) 149 | parser.add_argument( 150 | "-m", 151 | "--model_config", 152 | type=str, 153 | required=True, 154 | help="path to model.yaml", 155 | ) 156 | parser.add_argument( 157 | "-t", 158 | "--train_config", 159 | type=str, 160 | required=True, 161 | help="path to train.yaml", 162 | ) 163 | parser.add_argument( 164 | "--pitch_control", 165 | type=float, 166 | default=1.0, 167 | help="control the pitch of the whole utterance, larger value for higher pitch", 168 | ) 169 | parser.add_argument( 170 | "--energy_control", 171 | type=float, 172 | default=1.0, 173 | help="control the energy of the whole utterance, larger value for larger volume", 174 | ) 175 | parser.add_argument( 176 | "--duration_control", 177 | type=float, 178 | default=1.0, 179 | help="control the speed of the whole utterance, larger value for slower speaking rate", 180 | ) 181 | args = parser.parse_args() 182 | 183 | # Check source texts 184 | if args.mode == "batch": 185 | assert args.source is not None and args.text is None 186 | if args.mode == "single": 187 | assert args.source is None and args.text is not None 188 | 189 | # Read Config 190 | preprocess_config = yaml.load( 191 | open(args.preprocess_config, "r"), Loader=yaml.FullLoader 192 | ) 193 | model_config = yaml.load( 194 | open(args.model_config, "r"), Loader=yaml.FullLoader 195 | ) 196 | train_config = yaml.load( 197 | open(args.train_config, "r"), Loader=yaml.FullLoader 198 | ) 199 | configs = (preprocess_config, model_config, train_config) 200 | 201 | # Get model 202 | model = get_model(args, configs, device, train=False) 203 | 204 | # Load vocoder 205 | vocoder = get_vocoder(model_config, device) 206 | 207 | # Preprocess texts 208 | if args.mode == "batch": 209 | # Get dataset 210 | dataset = TextDataset(args.source, preprocess_config) 211 | batchs = DataLoader( 212 | dataset, 213 | batch_size=8, 214 | collate_fn=dataset.collate_fn, 215 | ) 216 | if args.mode == "single": 217 | ids = raw_texts = [args.text[:100]] 218 | speakers = np.array([args.speaker_id]) 219 | if preprocess_config["preprocessing"]["text"]["language"] == "en": 220 | texts = np.array([preprocess_english(args.text, preprocess_config)]) 221 | elif preprocess_config["preprocessing"]["text"]["language"] == "zh": 222 | texts = np.array( 223 | [preprocess_mandarin(args.text, preprocess_config)] 224 | ) 225 | text_lens = np.array([len(texts[0])]) 226 | batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))] 227 | 228 | control_values = ( 229 | args.pitch_control, 230 | args.energy_control, 231 | 
args.duration_control, 232 | ) 233 | 234 | synthesize( 235 | model, args.restore_step, configs, vocoder, batchs, control_values 236 | ) 237 | -------------------------------------------------------------------------------- /fs_two/text/__init__.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | import re 3 | from fs_two.text import cleaners 4 | from fs_two.text.symbols import symbols 5 | 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | """ 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | 34 | if not m: 35 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 36 | break 37 | sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) 38 | sequence += _arpabet_to_sequence(m.group(2)) 39 | text = m.group(3) 40 | 41 | return sequence 42 | 43 | 44 | def sequence_to_text(sequence): 45 | """Converts a sequence of IDs back to a string""" 46 | result = "" 47 | for symbol_id in sequence: 48 | if symbol_id in _id_to_symbol: 49 | s = _id_to_symbol[symbol_id] 50 | # Enclose ARPAbet back in curly braces: 51 | if len(s) > 1 and s[0] == "@": 52 | s = "{%s}" % s[1:] 53 | result += s 54 | return result.replace("}{", " ") 55 | 56 | 57 | def _clean_text(text, cleaner_names): 58 | for name in cleaner_names: 59 | cleaner = getattr(cleaners, name) 60 | if not cleaner: 61 | raise Exception("Unknown cleaner: %s" % name) 62 | text = cleaner(text) 63 | return text 64 | 65 | 66 | def _symbols_to_sequence(symbols): 67 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 68 | 69 | 70 | def _arpabet_to_sequence(text): 71 | return _symbols_to_sequence(["@" + s for s in text.split()]) 72 | 73 | 74 | def _should_keep_symbol(s): 75 | return s in _symbol_to_id and s != "_" and s != "~" 76 | -------------------------------------------------------------------------------- /fs_two/text/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/cleaners.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/cleaners.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/cmudict.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/cmudict.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/numbers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/numbers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/pinyin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/pinyin.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/russian.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/russian.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/__pycache__/symbols.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/text/__pycache__/symbols.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/text/cleaners.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | ''' 4 | Cleaners are transformations that run over the input text at both training and eval time. 5 | 6 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 7 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 8 | 1. "english_cleaners" for English text 9 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 10 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 11 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 12 | the symbols in symbols.py to match your data). 13 | ''' 14 | 15 | 16 | # Regular expression matching whitespace: 17 | import re 18 | from unidecode import unidecode 19 | from .numbers import normalize_numbers 20 | _whitespace_re = re.compile(r'\s+') 21 | 22 | # List of (regular expression, replacement) pairs for abbreviations: 23 | _abbreviations = [(re.compile('\\b%s\\.' 
% x[0], re.IGNORECASE), x[1]) for x in [ 24 | ('mrs', 'misess'), 25 | ('mr', 'mister'), 26 | ('dr', 'doctor'), 27 | ('st', 'saint'), 28 | ('co', 'company'), 29 | ('jr', 'junior'), 30 | ('maj', 'major'), 31 | ('gen', 'general'), 32 | ('drs', 'doctors'), 33 | ('rev', 'reverend'), 34 | ('lt', 'lieutenant'), 35 | ('hon', 'honorable'), 36 | ('sgt', 'sergeant'), 37 | ('capt', 'captain'), 38 | ('esq', 'esquire'), 39 | ('ltd', 'limited'), 40 | ('col', 'colonel'), 41 | ('ft', 'fort'), 42 | ]] 43 | 44 | 45 | def expand_abbreviations(text): 46 | for regex, replacement in _abbreviations: 47 | text = re.sub(regex, replacement, text) 48 | return text 49 | 50 | 51 | def expand_numbers(text): 52 | return normalize_numbers(text) 53 | 54 | 55 | def lowercase(text): 56 | return text.lower() 57 | 58 | 59 | def collapse_whitespace(text): 60 | return re.sub(_whitespace_re, ' ', text) 61 | 62 | 63 | def convert_to_ascii(text): 64 | return unidecode(text) 65 | 66 | 67 | def basic_cleaners(text): 68 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 69 | text = lowercase(text) 70 | text = collapse_whitespace(text) 71 | return text 72 | 73 | 74 | def transliteration_cleaners(text): 75 | '''Pipeline for non-English text that transliterates to ASCII.''' 76 | text = convert_to_ascii(text) 77 | text = lowercase(text) 78 | text = collapse_whitespace(text) 79 | return text 80 | 81 | 82 | def english_cleaners(text): 83 | '''Pipeline for English text, including number and abbreviation expansion.''' 84 | text = convert_to_ascii(text) 85 | text = lowercase(text) 86 | text = expand_numbers(text) 87 | text = expand_abbreviations(text) 88 | text = collapse_whitespace(text) 89 | return text 90 | -------------------------------------------------------------------------------- /fs_two/text/cmudict.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import re 4 | 5 | 6 | valid_symbols = [ 7 | "AA", 8 | "AA0", 9 | "AA1", 10 | "AA2", 11 | "AE", 12 | "AE0", 13 | "AE1", 14 | "AE2", 15 | "AH", 16 | "AH0", 17 | "AH1", 18 | "AH2", 19 | "AO", 20 | "AO0", 21 | "AO1", 22 | "AO2", 23 | "AW", 24 | "AW0", 25 | "AW1", 26 | "AW2", 27 | "AY", 28 | "AY0", 29 | "AY1", 30 | "AY2", 31 | "B", 32 | "CH", 33 | "D", 34 | "DH", 35 | "EH", 36 | "EH0", 37 | "EH1", 38 | "EH2", 39 | "ER", 40 | "ER0", 41 | "ER1", 42 | "ER2", 43 | "EY", 44 | "EY0", 45 | "EY1", 46 | "EY2", 47 | "F", 48 | "G", 49 | "HH", 50 | "IH", 51 | "IH0", 52 | "IH1", 53 | "IH2", 54 | "IY", 55 | "IY0", 56 | "IY1", 57 | "IY2", 58 | "JH", 59 | "K", 60 | "L", 61 | "M", 62 | "N", 63 | "NG", 64 | "OW", 65 | "OW0", 66 | "OW1", 67 | "OW2", 68 | "OY", 69 | "OY0", 70 | "OY1", 71 | "OY2", 72 | "P", 73 | "R", 74 | "S", 75 | "SH", 76 | "T", 77 | "TH", 78 | "UH", 79 | "UH0", 80 | "UH1", 81 | "UH2", 82 | "UW", 83 | "UW0", 84 | "UW1", 85 | "UW2", 86 | "V", 87 | "W", 88 | "Y", 89 | "Z", 90 | "ZH", 91 | ] 92 | 93 | _valid_symbol_set = set(valid_symbols) 94 | 95 | 96 | class CMUDict: 97 | """Thin wrapper around CMUDict data. 
http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" 98 | 99 | def __init__(self, file_or_path, keep_ambiguous=True): 100 | if isinstance(file_or_path, str): 101 | with open(file_or_path, encoding="latin-1") as f: 102 | entries = _parse_cmudict(f) 103 | else: 104 | entries = _parse_cmudict(file_or_path) 105 | if not keep_ambiguous: 106 | entries = {word: pron for word, pron in entries.items() if len(pron) == 1} 107 | self._entries = entries 108 | 109 | def __len__(self): 110 | return len(self._entries) 111 | 112 | def lookup(self, word): 113 | """Returns list of ARPAbet pronunciations of the given word.""" 114 | return self._entries.get(word.upper()) 115 | 116 | 117 | _alt_re = re.compile(r"\([0-9]+\)") 118 | 119 | 120 | def _parse_cmudict(file): 121 | cmudict = {} 122 | for line in file: 123 | if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): 124 | parts = line.split(" ") 125 | word = re.sub(_alt_re, "", parts[0]) 126 | pronunciation = _get_pronunciation(parts[1]) 127 | if pronunciation: 128 | if word in cmudict: 129 | cmudict[word].append(pronunciation) 130 | else: 131 | cmudict[word] = [pronunciation] 132 | return cmudict 133 | 134 | 135 | def _get_pronunciation(s): 136 | parts = s.strip().split(" ") 137 | for part in parts: 138 | if part not in _valid_symbol_set: 139 | return None 140 | return " ".join(parts) 141 | -------------------------------------------------------------------------------- /fs_two/text/numbers.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | import inflect 4 | import re 5 | 6 | 7 | _inflect = inflect.engine() 8 | _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") 9 | _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") 10 | _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") 11 | _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") 12 | _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") 13 | _number_re = re.compile(r"[0-9]+") 14 | 15 | 16 | def _remove_commas(m): 17 | return m.group(1).replace(",", "") 18 | 19 | 20 | def _expand_decimal_point(m): 21 | return m.group(1).replace(".", " point ") 22 | 23 | 24 | def _expand_dollars(m): 25 | match = m.group(1) 26 | parts = match.split(".") 27 | if len(parts) > 2: 28 | return match + " dollars" # Unexpected format 29 | dollars = int(parts[0]) if parts[0] else 0 30 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 31 | if dollars and cents: 32 | dollar_unit = "dollar" if dollars == 1 else "dollars" 33 | cent_unit = "cent" if cents == 1 else "cents" 34 | return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) 35 | elif dollars: 36 | dollar_unit = "dollar" if dollars == 1 else "dollars" 37 | return "%s %s" % (dollars, dollar_unit) 38 | elif cents: 39 | cent_unit = "cent" if cents == 1 else "cents" 40 | return "%s %s" % (cents, cent_unit) 41 | else: 42 | return "zero dollars" 43 | 44 | 45 | def _expand_ordinal(m): 46 | return _inflect.number_to_words(m.group(0)) 47 | 48 | 49 | def _expand_number(m): 50 | num = int(m.group(0)) 51 | if num > 1000 and num < 3000: 52 | if num == 2000: 53 | return "two thousand" 54 | elif num > 2000 and num < 2010: 55 | return "two thousand " + _inflect.number_to_words(num % 100) 56 | elif num % 100 == 0: 57 | return _inflect.number_to_words(num // 100) + " hundred" 58 | else: 59 | return _inflect.number_to_words( 60 | num, andword="", zero="oh", group=2 61 | ).replace(", ", " ") 62 | else: 63 | return _inflect.number_to_words(num, andword="") 64 | 65 | 66 | def 
normalize_numbers(text): 67 | text = re.sub(_comma_number_re, _remove_commas, text) 68 | text = re.sub(_pounds_re, r"\1 pounds", text) 69 | text = re.sub(_dollars_re, _expand_dollars, text) 70 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 71 | text = re.sub(_ordinal_re, _expand_ordinal, text) 72 | text = re.sub(_number_re, _expand_number, text) 73 | return text 74 | -------------------------------------------------------------------------------- /fs_two/text/pinyin.py: -------------------------------------------------------------------------------- 1 | initials = [ 2 | "b", 3 | "c", 4 | "ch", 5 | "d", 6 | "f", 7 | "g", 8 | "h", 9 | "j", 10 | "k", 11 | "l", 12 | "m", 13 | "n", 14 | "p", 15 | "q", 16 | "r", 17 | "s", 18 | "sh", 19 | "t", 20 | "w", 21 | "x", 22 | "y", 23 | "z", 24 | "zh", 25 | ] 26 | finals = [ 27 | "a1", 28 | "a2", 29 | "a3", 30 | "a4", 31 | "a5", 32 | "ai1", 33 | "ai2", 34 | "ai3", 35 | "ai4", 36 | "ai5", 37 | "an1", 38 | "an2", 39 | "an3", 40 | "an4", 41 | "an5", 42 | "ang1", 43 | "ang2", 44 | "ang3", 45 | "ang4", 46 | "ang5", 47 | "ao1", 48 | "ao2", 49 | "ao3", 50 | "ao4", 51 | "ao5", 52 | "e1", 53 | "e2", 54 | "e3", 55 | "e4", 56 | "e5", 57 | "ei1", 58 | "ei2", 59 | "ei3", 60 | "ei4", 61 | "ei5", 62 | "en1", 63 | "en2", 64 | "en3", 65 | "en4", 66 | "en5", 67 | "eng1", 68 | "eng2", 69 | "eng3", 70 | "eng4", 71 | "eng5", 72 | "er1", 73 | "er2", 74 | "er3", 75 | "er4", 76 | "er5", 77 | "i1", 78 | "i2", 79 | "i3", 80 | "i4", 81 | "i5", 82 | "ia1", 83 | "ia2", 84 | "ia3", 85 | "ia4", 86 | "ia5", 87 | "ian1", 88 | "ian2", 89 | "ian3", 90 | "ian4", 91 | "ian5", 92 | "iang1", 93 | "iang2", 94 | "iang3", 95 | "iang4", 96 | "iang5", 97 | "iao1", 98 | "iao2", 99 | "iao3", 100 | "iao4", 101 | "iao5", 102 | "ie1", 103 | "ie2", 104 | "ie3", 105 | "ie4", 106 | "ie5", 107 | "ii1", 108 | "ii2", 109 | "ii3", 110 | "ii4", 111 | "ii5", 112 | "iii1", 113 | "iii2", 114 | "iii3", 115 | "iii4", 116 | "iii5", 117 | "in1", 118 | "in2", 119 | "in3", 120 | "in4", 121 | "in5", 122 | "ing1", 123 | "ing2", 124 | "ing3", 125 | "ing4", 126 | "ing5", 127 | "iong1", 128 | "iong2", 129 | "iong3", 130 | "iong4", 131 | "iong5", 132 | "iou1", 133 | "iou2", 134 | "iou3", 135 | "iou4", 136 | "iou5", 137 | "o1", 138 | "o2", 139 | "o3", 140 | "o4", 141 | "o5", 142 | "ong1", 143 | "ong2", 144 | "ong3", 145 | "ong4", 146 | "ong5", 147 | "ou1", 148 | "ou2", 149 | "ou3", 150 | "ou4", 151 | "ou5", 152 | "u1", 153 | "u2", 154 | "u3", 155 | "u4", 156 | "u5", 157 | "ua1", 158 | "ua2", 159 | "ua3", 160 | "ua4", 161 | "ua5", 162 | "uai1", 163 | "uai2", 164 | "uai3", 165 | "uai4", 166 | "uai5", 167 | "uan1", 168 | "uan2", 169 | "uan3", 170 | "uan4", 171 | "uan5", 172 | "uang1", 173 | "uang2", 174 | "uang3", 175 | "uang4", 176 | "uang5", 177 | "uei1", 178 | "uei2", 179 | "uei3", 180 | "uei4", 181 | "uei5", 182 | "uen1", 183 | "uen2", 184 | "uen3", 185 | "uen4", 186 | "uen5", 187 | "uo1", 188 | "uo2", 189 | "uo3", 190 | "uo4", 191 | "uo5", 192 | "v1", 193 | "v2", 194 | "v3", 195 | "v4", 196 | "v5", 197 | "van1", 198 | "van2", 199 | "van3", 200 | "van4", 201 | "van5", 202 | "ve1", 203 | "ve2", 204 | "ve3", 205 | "ve4", 206 | "ve5", 207 | "vn1", 208 | "vn2", 209 | "vn3", 210 | "vn4", 211 | "vn5", 212 | ] 213 | valid_symbols = initials + finals + ["rr"] -------------------------------------------------------------------------------- /fs_two/text/russian.py: -------------------------------------------------------------------------------- 1 | valid_symbols = [ 2 | "A", 3 | "A0", 4 | "B", 5 | "B0", 6 | "D", 7 | 
"D0", 8 | "DZ", 9 | "DZ0", 10 | "DZH", 11 | "DZH0", 12 | "E0", 13 | "F", 14 | "F0", 15 | "G", 16 | "G0", 17 | "GH", 18 | "I", 19 | "I0", 20 | "J0", 21 | "K", 22 | "K0", 23 | "KH", 24 | "KH0", 25 | "L", 26 | "L0", 27 | "M", 28 | "M0", 29 | "N", 30 | "N0", 31 | "O", 32 | "O0", 33 | "P", 34 | "P0", 35 | "R", 36 | "R0", 37 | "S", 38 | "S0", 39 | "SH", 40 | "SH0", 41 | "T", 42 | "T0", 43 | "TS", 44 | "TS0", 45 | "TSH", 46 | "TSH0", 47 | "U", 48 | "U0", 49 | "V", 50 | "V0", 51 | "Y", 52 | "Y0", 53 | "Z", 54 | "Z0", 55 | "ZH", 56 | ] 57 | 58 | old_valid_symbols = [ 59 | "S", 60 | "Sj", 61 | "StS", 62 | "StSj", 63 | "Z", 64 | "Zj", 65 | "a", 66 | "b", 67 | "bj", 68 | "d", 69 | "dj", 70 | "e", 71 | "f", 72 | "g", 73 | "hrd", 74 | "i", 75 | "i2", 76 | "j", 77 | "jA", 78 | "jE", 79 | "jO", 80 | "jU", 81 | "k", 82 | "l", 83 | "lj", 84 | "m", 85 | "mj", 86 | "n", 87 | "nj", 88 | "o", 89 | "p", 90 | "pj", 91 | "r", 92 | "rj", 93 | "s", 94 | "sj", 95 | "t", 96 | "tS", 97 | "tSj", 98 | "tj", 99 | "ts", 100 | "u", 101 | "v", 102 | "vj", 103 | "x", 104 | "z", 105 | "zj", 106 | ] 107 | -------------------------------------------------------------------------------- /fs_two/text/symbols.py: -------------------------------------------------------------------------------- 1 | """ from https://github.com/keithito/tacotron """ 2 | 3 | """ 4 | Defines the set of symbols used in text input to the model. 5 | 6 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """ 7 | 8 | from fs_two.text import cmudict, pinyin, russian 9 | 10 | _pad = "_" 11 | _mask = "mask" 12 | _punctuation = "!'(),.:;? " 13 | _special = "-" 14 | _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 15 | _silences = ["@sp", "@spn", "@sil"] 16 | 17 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 18 | _arpabet = ["@" + s for s in cmudict.valid_symbols] 19 | _pinyin = ["@" + s for s in pinyin.valid_symbols] 20 | _russian = ["@" + s for s in russian.valid_symbols + [_mask]] 21 | 22 | # Export all symbols: 23 | symbols = ( 24 | [_pad] 25 | + list(_special) 26 | + list(_punctuation) 27 | + list(_letters) 28 | + _arpabet 29 | # + _pinyin 30 | + _silences 31 | + _russian 32 | ) 33 | -------------------------------------------------------------------------------- /fs_two/transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = "" 7 | UNK_WORD = "" 8 | BOS_WORD = "" 9 | EOS_WORD = "" 10 | -------------------------------------------------------------------------------- /fs_two/transformer/Layers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from torch.nn import functional as F 7 | 8 | from .SubLayers import MultiHeadAttention, PositionwiseFeedForward 9 | 10 | 11 | class FFTBlock(torch.nn.Module): 12 | """FFT Block""" 13 | 14 | def __init__( 15 | self, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1 16 | ): 17 | super(FFTBlock, self).__init__() 18 | self.slf_attn = MultiHeadAttention( 19 | n_head, d_model, d_k, d_v, dropout=dropout 20 | ) 21 | self.pos_ffn = PositionwiseFeedForward( 22 | d_model, d_inner, kernel_size, dropout=dropout 23 | ) 24 | 25 | def forward(self, enc_input, 
mask=None, slf_attn_mask=None): 26 | enc_output, enc_slf_attn = self.slf_attn( 27 | enc_input, enc_input, enc_input, mask=slf_attn_mask 28 | ) 29 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 30 | 31 | enc_output = self.pos_ffn(enc_output) 32 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 33 | 34 | return enc_output, enc_slf_attn 35 | 36 | 37 | class ConvNorm(torch.nn.Module): 38 | def __init__( 39 | self, 40 | in_channels, 41 | out_channels, 42 | kernel_size=1, 43 | stride=1, 44 | padding=None, 45 | dilation=1, 46 | bias=True, 47 | w_init_gain="linear", 48 | ): 49 | super(ConvNorm, self).__init__() 50 | 51 | if padding is None: 52 | assert kernel_size % 2 == 1 53 | padding = int(dilation * (kernel_size - 1) / 2) 54 | 55 | self.conv = torch.nn.Conv1d( 56 | in_channels, 57 | out_channels, 58 | kernel_size=kernel_size, 59 | stride=stride, 60 | padding=padding, 61 | dilation=dilation, 62 | bias=bias, 63 | ) 64 | 65 | def forward(self, signal): 66 | conv_signal = self.conv(signal) 67 | 68 | return conv_signal 69 | 70 | 71 | class PostNet(nn.Module): 72 | """ 73 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 74 | """ 75 | 76 | def __init__( 77 | self, 78 | n_mel_channels=80, 79 | postnet_embedding_dim=512, 80 | postnet_kernel_size=5, 81 | postnet_n_convolutions=5, 82 | ): 83 | 84 | super(PostNet, self).__init__() 85 | self.convolutions = nn.ModuleList() 86 | 87 | self.convolutions.append( 88 | nn.Sequential( 89 | ConvNorm( 90 | n_mel_channels, 91 | postnet_embedding_dim, 92 | kernel_size=postnet_kernel_size, 93 | stride=1, 94 | padding=int((postnet_kernel_size - 1) / 2), 95 | dilation=1, 96 | w_init_gain="tanh", 97 | ), 98 | nn.BatchNorm1d(postnet_embedding_dim), 99 | ) 100 | ) 101 | 102 | for i in range(1, postnet_n_convolutions - 1): 103 | self.convolutions.append( 104 | nn.Sequential( 105 | ConvNorm( 106 | postnet_embedding_dim, 107 | postnet_embedding_dim, 108 | kernel_size=postnet_kernel_size, 109 | stride=1, 110 | padding=int((postnet_kernel_size - 1) / 2), 111 | dilation=1, 112 | w_init_gain="tanh", 113 | ), 114 | nn.BatchNorm1d(postnet_embedding_dim), 115 | ) 116 | ) 117 | 118 | self.convolutions.append( 119 | nn.Sequential( 120 | ConvNorm( 121 | postnet_embedding_dim, 122 | n_mel_channels, 123 | kernel_size=postnet_kernel_size, 124 | stride=1, 125 | padding=int((postnet_kernel_size - 1) / 2), 126 | dilation=1, 127 | w_init_gain="linear", 128 | ), 129 | nn.BatchNorm1d(n_mel_channels), 130 | ) 131 | ) 132 | 133 | def forward(self, x): 134 | x = x.contiguous().transpose(1, 2) 135 | 136 | for i in range(len(self.convolutions) - 1): 137 | x = F.dropout( 138 | torch.tanh(self.convolutions[i](x)), 0.5, self.training 139 | ) 140 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 141 | 142 | x = x.contiguous().transpose(1, 2) 143 | return x -------------------------------------------------------------------------------- /fs_two/transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | import fs_two.transformer.Constants as Constants 6 | from .Layers import FFTBlock 7 | from fs_two.text.symbols import symbols 8 | 9 | 10 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 11 | """ Sinusoid position encoding table """ 12 | 13 | def cal_angle(position, hid_idx): 14 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 15 | 16 | def get_posi_angle_vec(position): 17 | return 
[cal_angle(position, hid_j) for hid_j in range(d_hid)] 18 | 19 | sinusoid_table = np.array( 20 | [get_posi_angle_vec(pos_i) for pos_i in range(n_position)] 21 | ) 22 | 23 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 24 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 25 | 26 | if padding_idx is not None: 27 | # zero vector for padding dimension 28 | sinusoid_table[padding_idx] = 0.0 29 | 30 | return torch.FloatTensor(sinusoid_table) 31 | 32 | 33 | class Encoder(nn.Module): 34 | """ Encoder """ 35 | 36 | def __init__(self, config): 37 | super(Encoder, self).__init__() 38 | 39 | n_position = config["max_seq_len"] + 1 40 | n_src_vocab = len(symbols) + 1 41 | d_word_vec = config["transformer"]["encoder_hidden"] 42 | n_layers = config["transformer"]["encoder_layer"] 43 | n_head = config["transformer"]["encoder_head"] 44 | d_k = d_v = ( 45 | config["transformer"]["encoder_hidden"] 46 | // config["transformer"]["encoder_head"] 47 | ) 48 | d_model = config["transformer"]["encoder_hidden"] 49 | d_inner = config["transformer"]["conv_filter_size"] 50 | kernel_size = config["transformer"]["conv_kernel_size"] 51 | dropout = config["transformer"]["encoder_dropout"] 52 | 53 | self.max_seq_len = config["max_seq_len"] 54 | self.d_model = d_model 55 | 56 | self.src_word_emb = nn.Embedding( 57 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD 58 | ) 59 | self.position_enc = nn.Parameter( 60 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 61 | requires_grad=False, 62 | ) 63 | 64 | self.layer_stack = nn.ModuleList( 65 | [ 66 | FFTBlock( 67 | d_model, 68 | n_head, 69 | d_k, 70 | d_v, 71 | d_inner, 72 | kernel_size, 73 | dropout=dropout, 74 | ) 75 | for _ in range(n_layers) 76 | ] 77 | ) 78 | 79 | def forward(self, src_seq, mask, return_attns=False): 80 | 81 | enc_slf_attn_list = [] 82 | batch_size, max_len = src_seq.shape[0], src_seq.shape[1] 83 | 84 | # -- Prepare masks 85 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 86 | 87 | # -- Forward 88 | if not self.training and src_seq.shape[1] > self.max_seq_len: 89 | enc_output = self.src_word_emb( 90 | src_seq 91 | ) + get_sinusoid_encoding_table(src_seq.shape[1], self.d_model)[ 92 | : src_seq.shape[1], : 93 | ].unsqueeze( 94 | 0 95 | ).expand( 96 | batch_size, -1, -1 97 | ).to( 98 | src_seq.device 99 | ) 100 | else: 101 | enc_output = self.src_word_emb(src_seq) + self.position_enc[ 102 | :, :max_len, : 103 | ].expand(batch_size, -1, -1) 104 | 105 | for enc_layer in self.layer_stack: 106 | enc_output, enc_slf_attn = enc_layer( 107 | enc_output, mask=mask, slf_attn_mask=slf_attn_mask 108 | ) 109 | if return_attns: 110 | enc_slf_attn_list += [enc_slf_attn] 111 | 112 | return enc_output 113 | 114 | 115 | class Decoder(nn.Module): 116 | """ Decoder """ 117 | 118 | def __init__(self, config): 119 | super(Decoder, self).__init__() 120 | 121 | n_position = config["max_seq_len"] + 1 122 | d_word_vec = config["transformer"]["decoder_hidden"] 123 | n_layers = config["transformer"]["decoder_layer"] 124 | n_head = config["transformer"]["decoder_head"] 125 | d_k = d_v = ( 126 | config["transformer"]["decoder_hidden"] 127 | // config["transformer"]["decoder_head"] 128 | ) 129 | d_model = config["transformer"]["decoder_hidden"] 130 | d_inner = config["transformer"]["conv_filter_size"] 131 | kernel_size = config["transformer"]["conv_kernel_size"] 132 | dropout = config["transformer"]["decoder_dropout"] 133 | 134 | self.max_seq_len = config["max_seq_len"] 135 | self.d_model = d_model 136 | 137 | 
self.position_enc = nn.Parameter( 138 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 139 | requires_grad=False, 140 | ) 141 | 142 | self.layer_stack = nn.ModuleList( 143 | [ 144 | FFTBlock( 145 | d_model, 146 | n_head, 147 | d_k, 148 | d_v, 149 | d_inner, 150 | kernel_size, 151 | dropout=dropout, 152 | ) 153 | for _ in range(n_layers) 154 | ] 155 | ) 156 | 157 | def forward(self, enc_seq, mask, return_attns=False): 158 | 159 | dec_slf_attn_list = [] 160 | batch_size, max_len = enc_seq.shape[0], enc_seq.shape[1] 161 | 162 | # -- Forward 163 | if not self.training and enc_seq.shape[1] > self.max_seq_len: 164 | # -- Prepare masks 165 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 166 | dec_output = enc_seq + get_sinusoid_encoding_table( 167 | enc_seq.shape[1], self.d_model 168 | )[: enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 169 | enc_seq.device 170 | ) 171 | else: 172 | max_len = min(max_len, self.max_seq_len) 173 | 174 | # -- Prepare masks 175 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 176 | dec_output = enc_seq[:, :max_len, :] + self.position_enc[ 177 | :, :max_len, : 178 | ].expand(batch_size, -1, -1) 179 | mask = mask[:, :max_len] 180 | slf_attn_mask = slf_attn_mask[:, :, :max_len] 181 | 182 | for dec_layer in self.layer_stack: 183 | dec_output, dec_slf_attn = dec_layer( 184 | dec_output, mask=mask, slf_attn_mask=slf_attn_mask 185 | ) 186 | if return_attns: 187 | dec_slf_attn_list += [dec_slf_attn] 188 | 189 | return dec_output, mask 190 | -------------------------------------------------------------------------------- /fs_two/transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | """ Scaled Dot-Product Attention """ 8 | 9 | def __init__(self, temperature): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | attn = torch.bmm(q, k.transpose(1, 2)) 16 | attn = attn / self.temperature 17 | 18 | if mask is not None: 19 | attn = attn.masked_fill(mask, -np.inf) 20 | 21 | attn = self.softmax(attn) 22 | output = torch.bmm(attn, v) 23 | 24 | return output, attn 25 | -------------------------------------------------------------------------------- /fs_two/transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from .Modules import ScaledDotProductAttention 6 | 7 | 8 | class MultiHeadAttention(nn.Module): 9 | """ Multi-Head Attention module """ 10 | 11 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 12 | super().__init__() 13 | 14 | self.n_head = n_head 15 | self.d_k = d_k 16 | self.d_v = d_v 17 | 18 | self.w_qs = nn.Linear(d_model, n_head * d_k) 19 | self.w_ks = nn.Linear(d_model, n_head * d_k) 20 | self.w_vs = nn.Linear(d_model, n_head * d_v) 21 | 22 | self.attention = ScaledDotProductAttention( 23 | temperature=np.power(d_k, 0.5) 24 | ) 25 | self.layer_norm = nn.LayerNorm(d_model) 26 | 27 | self.fc = nn.Linear(n_head * d_v, d_model) 28 | 29 | self.dropout = nn.Dropout(dropout) 30 | 31 | def forward(self, q, k, v, mask=None): 32 | 33 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 34 | 35 | sz_b, len_q, _ = q.size() 36 | sz_b, len_k, _ = k.size() 37 | sz_b, len_v, _ = v.size() 38 | 39 | residual 
= q 40 | 41 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 42 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 43 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 44 | q = ( 45 | q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) 46 | ) # (n*b) x lq x dk 47 | k = ( 48 | k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) 49 | ) # (n*b) x lk x dk 50 | v = ( 51 | v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) 52 | ) # (n*b) x lv x dv 53 | 54 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 55 | output, attn = self.attention(q, k, v, mask=mask) 56 | 57 | output = output.view(n_head, sz_b, len_q, d_v) 58 | output = ( 59 | output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) 60 | ) # b x lq x (n*dv) 61 | 62 | output = self.dropout(self.fc(output)) 63 | output = self.layer_norm(output + residual) 64 | 65 | return output, attn 66 | 67 | 68 | class PositionwiseFeedForward(nn.Module): 69 | """ A two-feed-forward-layer module """ 70 | 71 | def __init__(self, d_in, d_hid, kernel_size, dropout=0.1): 72 | super().__init__() 73 | 74 | # Use Conv1D 75 | # position-wise 76 | self.w_1 = nn.Conv1d( 77 | d_in, 78 | d_hid, 79 | kernel_size=kernel_size[0], 80 | padding=(kernel_size[0] - 1) // 2, 81 | ) 82 | # position-wise 83 | self.w_2 = nn.Conv1d( 84 | d_hid, 85 | d_in, 86 | kernel_size=kernel_size[1], 87 | padding=(kernel_size[1] - 1) // 2, 88 | ) 89 | 90 | self.layer_norm = nn.LayerNorm(d_in) 91 | self.dropout = nn.Dropout(dropout) 92 | 93 | def forward(self, x): 94 | residual = x 95 | output = x.transpose(1, 2) 96 | output = self.w_2(F.relu(self.w_1(output))) 97 | output = output.transpose(1, 2) 98 | output = self.dropout(output) 99 | output = self.layer_norm(output+residual) 100 | 101 | return output -------------------------------------------------------------------------------- /fs_two/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .Models import Encoder, Decoder 2 | from .Layers import PostNet -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Constants.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Layers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Layers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Models.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/Modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/Modules.cpython-38.pyc -------------------------------------------------------------------------------- 
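Note on the transformer sublayers above: the Encoder/Decoder in Models.py feed each FFTBlock a boolean padding mask of shape [batch, seq_len] together with a self-attention mask expanded to [batch, seq_len, seq_len]. Below is a minimal, hypothetical usage sketch (not part of the repository) that only illustrates those shapes; the hyperparameters are illustrative rather than taken from model.yaml, and the import path assumes the package is importable as fs_two:

    import torch
    from fs_two.transformer.Layers import FFTBlock

    # Illustrative sizes only (assumption, not the repo's config values).
    d_model, n_head = 256, 2
    block = FFTBlock(d_model, n_head, d_k=128, d_v=128,
                     d_inner=1024, kernel_size=[9, 1])

    x = torch.randn(2, 50, d_model)                          # [batch, seq_len, hidden]
    lengths = torch.tensor([50, 35])
    mask = torch.arange(50)[None, :] >= lengths[:, None]     # True marks padded positions
    slf_attn_mask = mask.unsqueeze(1).expand(-1, 50, -1)     # [batch, seq_len, seq_len]

    out, attn = block(x, mask=mask, slf_attn_mask=slf_attn_mask)
    # out: [2, 50, 256] with padded positions zeroed out
    # attn: [n_head * batch, 50, 50] self-attention weights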
/fs_two/transformer/__pycache__/SubLayers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/SubLayers.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/utils/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/fs_two/utils/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /fs_two/utils/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | # import fs_two.hifigan as hifigan 9 | from fs_two.model import FastSpeech2, ScheduledOptim 10 | 11 | 12 | def get_model(cfg, device, train=False, isModel=True, isEmbedding=True): 13 | 14 | model = FastSpeech2(cfg.preprocess_config, cfg.model_config, device=device) 15 | if cfg.tts.load_path: 16 | 17 | ckpt = torch.load(cfg.tts.load_path, map_location=torch.device("cpu")) 18 | if isModel: 19 | model.load_state_dict(ckpt["model"], strict=False) 20 | if isEmbedding: 21 | try: 22 | model.load_state_dict(ckpt["embedding"], strict=False) 23 | except: 24 | print("missing embedding") 25 | print("Loaded model from", cfg.tts.load_path) 26 | 27 | if train: 28 | # model = nn.DataParallel(model) 29 | model.to(device) 30 | model.train() 31 | scheduled_optim = ScheduledOptim( 32 | model, cfg.train_config, cfg.model_config, cfg.tts.restore_step 33 | ) 34 | return model, scheduled_optim 35 | model.to(device) 36 | model.eval() 37 | model.requires_grad_ = False 38 | return model 39 | 40 | 41 | def get_param_num(model): 42 | num_param = sum(param.numel() for param in model.parameters()) 43 | return num_param 44 | 45 | 46 | def get_vocoder(hifigan, config, device): 47 | if config["vocoder"]["use_cpu"]: 48 | device = "cpu" 49 | name = config["vocoder"]["model"] 50 | speaker = config["vocoder"]["speaker"] 51 | 52 | if name == "MelGAN": 53 | if speaker == "LJSpeech": 54 | vocoder = torch.hub.load( 55 | "descriptinc/melgan-neurips", "load_melgan", "linda_johnson" 56 | ) 57 | elif speaker == "universal": 58 | vocoder = torch.hub.load( 59 | "descriptinc/melgan-neurips", "load_melgan", "multi_speaker" 60 | ) 61 | vocoder.mel2wav.eval() 62 | vocoder.mel2wav.to(device) 63 | elif name == "HiFi-GAN": 64 | with open("./fs_two/hifigan/config.json", "r") as f: 65 | config = json.load(f) 66 | config = hifigan.AttrDict(config) 67 | vocoder = hifigan.Generator(config) 68 | if speaker == "LJSpeech": 69 | ckpt = torch.load("hifigan/generator_LJSpeech.pth.tar") 70 | elif speaker == "universal": 71 | ckpt = torch.load( 72 | "/home/dev/other/fsp/weights/trained_original/hifi/generator_v1.pth", 73 | map_location="cpu", 74 | ) 75 | vocoder.load_state_dict(ckpt["generator"]) 76 | vocoder.eval() 77 | vocoder.remove_weight_norm() 78 | # vocoder = nn.DataParallel(vocoder) 
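        # Note (added comment): at this point in the HiFi-GAN branch the
        # generator has loaded its checkpoint, switched to eval mode and
        # removed weight norm for inference; the next line moves it to the
        # target device ("cpu" when config["vocoder"]["use_cpu"] is set above).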
79 | vocoder.to(device) 80 | 81 | return vocoder 82 | 83 | 84 | def vocoder_infer(mels, vocoder, model_config, preprocess_config, lengths=None): 85 | name = model_config["vocoder"]["model"] 86 | with torch.no_grad(): 87 | if name == "MelGAN": 88 | wavs = vocoder.inverse(mels / np.log(10)) 89 | elif name == "HiFi-GAN": 90 | wavs = vocoder(mels).squeeze(1) 91 | 92 | wavs = ( 93 | wavs.cpu().numpy() 94 | * preprocess_config["preprocessing"]["audio"]["max_wav_value"] 95 | ).astype("int16") 96 | wavs = [wav for wav in wavs] 97 | 98 | for i in range(len(mels)): 99 | if lengths is not None: 100 | wavs[i] = wavs[i][: lengths[i]] 101 | 102 | return wavs 103 | -------------------------------------------------------------------------------- /fs_two/utils/tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import random 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | import numpy as np 8 | import matplotlib 9 | from scipy.io import wavfile 10 | from matplotlib import pyplot as plt 11 | 12 | matplotlib.use("Agg") 13 | 14 | 15 | def to_device(data, device="cpu"): 16 | if len(data) == 15: 17 | ( 18 | ids, 19 | raw_texts, 20 | speakers, 21 | texts, 22 | src_lens, 23 | max_src_len, 24 | mels, 25 | mel_lens, 26 | max_mel_len, 27 | energies, 28 | durations, 29 | pitches_raw, 30 | pitches_cwt, 31 | pitches_mean, 32 | pitches_std, 33 | ) = data 34 | 35 | speakers = torch.from_numpy(speakers).long().to(device) 36 | texts = torch.from_numpy(texts).long().to(device) 37 | src_lens = torch.from_numpy(src_lens).to(device) 38 | mels = torch.from_numpy(mels).float().to(device) 39 | mel_lens = torch.from_numpy(mel_lens).to(device) 40 | energies = torch.from_numpy(energies).to(device) 41 | durations = torch.from_numpy(durations).long().to(device) 42 | 43 | pitches_cwt = torch.from_numpy(pitches_cwt).float().to(device) 44 | pitches_cwt = torch.nan_to_num(pitches_cwt, nan=0.0) 45 | 46 | pitches_raw = torch.from_numpy(pitches_raw).float().to(device) 47 | pitches_mean = torch.from_numpy(pitches_mean).float().to(device) 48 | pitches_std = torch.from_numpy(pitches_std).float().to(device) 49 | 50 | return ( 51 | ids, 52 | raw_texts, 53 | speakers, 54 | texts, 55 | src_lens, 56 | max_src_len, 57 | mels, 58 | mel_lens, 59 | max_mel_len, 60 | energies, 61 | durations, 62 | pitches_raw, 63 | pitches_cwt, 64 | pitches_mean, 65 | pitches_std, 66 | ) 67 | 68 | if len(data) == 6: 69 | ( 70 | ids, 71 | raw_texts, 72 | speakers, 73 | texts, 74 | src_lens, 75 | max_src_len, 76 | # speakers_emb, 77 | ) = data 78 | 79 | speakers = torch.from_numpy(speakers).long().to(device) 80 | texts = torch.from_numpy(texts).long().to(device) 81 | src_lens = torch.from_numpy(src_lens).to(device) 82 | 83 | return (ids, raw_texts, speakers, texts, src_lens, max_src_len) 84 | 85 | 86 | def log( 87 | logger, 88 | train_val, 89 | step=None, 90 | losses=None, 91 | fig=None, 92 | audio=None, 93 | sampling_rate=22050, 94 | tag="", 95 | ): 96 | losses_names = [ 97 | "Loss/total_loss", 98 | "Loss/mel_loss", 99 | "Loss/pitch_loss", 100 | "Loss/energy_loss", 101 | "Loss/duration_loss ", 102 | "Mean pitch loss", 103 | "Std pitch loss", 104 | ] 105 | 106 | if losses is not None: 107 | log_message = { 108 | f"{losses_names[i]} {train_val.upper()}": losses[i] 109 | for i in range(len(losses)) 110 | } 111 | logger.log(log_message) 112 | 113 | if fig is not None: 114 | logger.log({f"Spec {train_val.upper()}": fig}) 115 | 116 | if audio is not None: 117 | a = [logger.Audio(audio / 
max(abs(audio)), sample_rate=sampling_rate)] 118 | logger.log({f"Audio {train_val.upper()}": a}) 119 | 120 | 121 | def get_mask_from_lengths(lengths, max_len=None, device="cpu"): 122 | batch_size = lengths.shape[0] 123 | if max_len is None: 124 | max_len = torch.max(lengths).item() 125 | 126 | ids = ( 127 | torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(device) 128 | ) 129 | mask = ids >= lengths.unsqueeze(1).float().expand(-1, max_len) 130 | 131 | return mask 132 | 133 | 134 | def expand(values, durations): 135 | out = list() 136 | for value, d in zip(values, durations): 137 | out += [value] * max(0, int(d)) 138 | return np.array(out) 139 | 140 | 141 | def synth_one_sample( 142 | targets, predictions, vocoder, model_config, preprocess_config 143 | ): 144 | b_size = len(targets[0]) 145 | rand_id = random.randint(0, b_size - 1) 146 | basename = targets[0][rand_id] 147 | src_len = predictions[7][rand_id].item() 148 | mel_len = predictions[8][rand_id].item() 149 | mel_target = targets[6][rand_id, :mel_len].detach().transpose(0, 1) 150 | mel_prediction = predictions[9][rand_id, :mel_len].detach().transpose(0, 1) 151 | duration = targets[10][rand_id, :src_len].detach().cpu().numpy() 152 | if ( 153 | preprocess_config["preprocessing"]["pitch"]["feature"] 154 | == "phoneme_level" 155 | ): 156 | pitch = targets[11][rand_id, :src_len].detach().cpu().numpy() 157 | pitch = expand(pitch, duration) 158 | else: 159 | pitch = targets[11][rand_id, :mel_len].detach().cpu().numpy() 160 | if ( 161 | preprocess_config["preprocessing"]["energy"]["feature"] 162 | == "phoneme_level" 163 | ): 164 | energy = targets[9][rand_id, :src_len].detach().cpu().numpy() 165 | energy = expand(energy, duration) 166 | else: 167 | energy = targets[9][rand_id, :mel_len].detach().cpu().numpy() 168 | 169 | with open( 170 | os.path.join( 171 | preprocess_config["path"]["preprocessed_path"], "stats.json" 172 | ) 173 | ) as f: 174 | stats = json.load(f) 175 | stats = stats["pitch"] + stats["energy"][:2] 176 | 177 | fig = plot_mel( 178 | [ 179 | (mel_prediction.cpu().numpy(), pitch, energy), 180 | (mel_target.cpu().numpy(), pitch, energy), 181 | ], 182 | stats, 183 | ["Synthetized Spectrogram", "Ground-Truth Spectrogram"], 184 | ) 185 | 186 | if vocoder is not None: 187 | from .model import vocoder_infer 188 | 189 | if model_config["vocoder"]["use_cpu"]: 190 | mel_target = mel_target.to("cpu") 191 | wav_reconstruction = vocoder_infer( 192 | mel_target.unsqueeze(0), 193 | vocoder, 194 | model_config, 195 | preprocess_config, 196 | )[0] 197 | wav_prediction = vocoder_infer( 198 | mel_prediction.unsqueeze(0), 199 | vocoder, 200 | model_config, 201 | preprocess_config, 202 | )[0] 203 | else: 204 | wav_reconstruction = wav_prediction = None 205 | 206 | return fig, wav_reconstruction, wav_prediction, basename 207 | 208 | 209 | def synth_samples( 210 | targets, predictions, vocoder, model_config, preprocess_config, path 211 | ): 212 | 213 | basenames = targets[0] 214 | for i in range(len(predictions[0])): 215 | basename = basenames[i] 216 | src_len = predictions[7][i].item() 217 | mel_len = predictions[8][i].item() 218 | mel_prediction = predictions[9][i, :mel_len].detach().transpose(0, 1) 219 | duration = predictions[4][i, :src_len].detach().cpu().numpy() 220 | if ( 221 | preprocess_config["preprocessing"]["pitch"]["feature"] 222 | == "phoneme_level" 223 | ): 224 | pitch = predictions[1][i, :src_len].detach().cpu().numpy() 225 | pitch = expand(pitch, duration) 226 | else: 227 | pitch = predictions[1][i, 
:mel_len].detach().cpu().numpy() 228 | if ( 229 | preprocess_config["preprocessing"]["energy"]["feature"] 230 | == "phoneme_level" 231 | ): 232 | energy = predictions[2][i, :src_len].detach().cpu().numpy() 233 | energy = expand(energy, duration) 234 | else: 235 | energy = predictions[2][i, :mel_len].detach().cpu().numpy() 236 | 237 | with open( 238 | os.path.join( 239 | preprocess_config["path"]["preprocessed_path"], "stats.json" 240 | ) 241 | ) as f: 242 | stats = json.load(f) 243 | stats = stats["pitch"] + stats["energy"][:2] 244 | 245 | fig = plot_mel( 246 | [ 247 | (mel_prediction.cpu().numpy(), pitch, energy), 248 | ], 249 | stats, 250 | ["Synthetized Spectrogram"], 251 | ) 252 | plt.savefig(os.path.join(path, "{}.png".format(basename))) 253 | plt.close() 254 | 255 | from .model import vocoder_infer 256 | 257 | mel_predictions = predictions[9].transpose(1, 2) 258 | lengths = ( 259 | predictions[8] 260 | * preprocess_config["preprocessing"]["stft"]["hop_length"] 261 | ) 262 | wav_predictions = vocoder_infer( 263 | mel_predictions, 264 | vocoder, 265 | model_config, 266 | preprocess_config, 267 | lengths=lengths, 268 | ) 269 | 270 | sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"] 271 | for wav, basename in zip(wav_predictions, basenames): 272 | wavfile.write( 273 | os.path.join(path, "{}.wav".format(basename)), sampling_rate, wav 274 | ) 275 | 276 | 277 | def plot_mel(data, stats, titles): 278 | fig, axes = plt.subplots(len(data), 1, squeeze=False) 279 | if titles is None: 280 | titles = [None for i in range(len(data))] 281 | pitch_min, pitch_max, pitch_mean, pitch_std, energy_min, energy_max = stats 282 | pitch_min = pitch_min * pitch_std + pitch_mean 283 | pitch_max = pitch_max * pitch_std + pitch_mean 284 | 285 | def add_axis(fig, old_ax): 286 | ax = fig.add_axes(old_ax.get_position(), anchor="W") 287 | ax.set_facecolor("None") 288 | return ax 289 | 290 | for i in range(len(data)): 291 | mel, pitch, energy = data[i] 292 | pitch = pitch * pitch_std + pitch_mean 293 | axes[i][0].imshow(mel, origin="lower") 294 | axes[i][0].set_aspect(2.5, adjustable="box") 295 | axes[i][0].set_ylim(0, mel.shape[0]) 296 | axes[i][0].set_title(titles[i], fontsize="medium") 297 | axes[i][0].tick_params(labelsize="x-small", left=False, labelleft=False) 298 | axes[i][0].set_anchor("W") 299 | 300 | ax1 = add_axis(fig, axes[i][0]) 301 | ax1.plot(pitch, color="tomato") 302 | ax1.set_xlim(0, mel.shape[1]) 303 | ax1.set_ylim(0, pitch_max) 304 | ax1.set_ylabel("F0", color="tomato") 305 | ax1.tick_params( 306 | labelsize="x-small", 307 | colors="tomato", 308 | bottom=False, 309 | labelbottom=False, 310 | ) 311 | 312 | ax2 = add_axis(fig, axes[i][0]) 313 | ax2.plot(energy, color="darkviolet") 314 | ax2.set_xlim(0, mel.shape[1]) 315 | ax2.set_ylim(energy_min, energy_max) 316 | ax2.set_ylabel("Energy", color="darkviolet") 317 | ax2.yaxis.set_label_position("right") 318 | ax2.tick_params( 319 | labelsize="x-small", 320 | colors="darkviolet", 321 | bottom=False, 322 | labelbottom=False, 323 | left=False, 324 | labelleft=False, 325 | right=True, 326 | labelright=True, 327 | ) 328 | 329 | return fig 330 | 331 | 332 | def pad_1D(inputs, PAD=0): 333 | def pad_data(x, length, PAD): 334 | x_padded = np.pad( 335 | x, (0, length - x.shape[0]), mode="constant", constant_values=PAD 336 | ) 337 | return x_padded 338 | 339 | max_len = max((len(x) for x in inputs)) 340 | padded = np.stack([pad_data(x, max_len, PAD) for x in inputs]) 341 | 342 | return padded 343 | 344 | 345 | def pad_2D(inputs, 
maxlen=None): 346 | def pad(x, max_len): 347 | PAD = 0 348 | if np.shape(x)[0] > max_len: 349 | raise ValueError("not max_len") 350 | 351 | s = np.shape(x)[1] 352 | x_padded = np.pad( 353 | x, 354 | (0, max_len - np.shape(x)[0]), 355 | mode="constant", 356 | constant_values=PAD, 357 | ) 358 | return x_padded[:, :s] 359 | 360 | if maxlen: 361 | output = np.stack([pad(x, maxlen) for x in inputs]) 362 | else: 363 | max_len = max(np.shape(x)[0] for x in inputs) 364 | output = np.stack([pad(x, max_len) for x in inputs]) 365 | 366 | return output 367 | 368 | 369 | def pad(input_ele, mel_max_length=None): 370 | if mel_max_length: 371 | max_len = mel_max_length 372 | else: 373 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 374 | 375 | out_list = list() 376 | for i, batch in enumerate(input_ele): 377 | if len(batch.shape) == 1: 378 | one_batch_padded = F.pad( 379 | batch, (0, max_len - batch.size(0)), "constant", 0.0 380 | ) 381 | elif len(batch.shape) == 2: 382 | one_batch_padded = F.pad( 383 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 384 | ) 385 | out_list.append(one_batch_padded) 386 | out_padded = torch.stack(out_list) 387 | return out_padded 388 | -------------------------------------------------------------------------------- /fsapi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import numpy as np 5 | 6 | from fs_two.model import FastSpeech2 7 | 8 | 9 | class FSTWOapi: 10 | def __init__(self, config, device=0): 11 | weights_path = config.tts.weights_path 12 | model_folder = "/".join(weights_path.split("/")[:-1]) 13 | config.preprocess_config.path.preprocessed_path = model_folder 14 | 15 | self.speakers_dict, self.speaker_names = load_speakers_json( 16 | config.preprocess_config.path.preprocessed_path 17 | ) 18 | 19 | self.model = FastSpeech2( 20 | config.preprocess_config, 21 | config.model_config, 22 | len(self.speaker_names), 23 | ).to(device) 24 | # Load checkpoint if exists 25 | self.weights_path = weights_path 26 | if weights_path is not None: 27 | checkpoint = torch.load(weights_path, map_location="cpu") 28 | state = checkpoint["model"] 29 | state['speaker_emb.weight'] = checkpoint["embedding"] 30 | self.model.load_state_dict(checkpoint["model"]) 31 | 32 | self.cfg = config 33 | self.device = device 34 | 35 | # TODO get the righ restore step 36 | self.restore_step = 0 37 | 38 | def generate( 39 | self, 40 | phonemes, 41 | duration_control=1.0, 42 | pitch_control=1.0, 43 | energy_control=1.0, 44 | speaker_name=None, 45 | ): 46 | 47 | if speaker_name is not None: 48 | if not speaker_name in self.speakers_dict: 49 | raise Exception( 50 | f"Speaker {speaker_name} was not found in speakers.json" 51 | ) 52 | speaker_id = self.speakers_dict[speaker_name] 53 | speaker = torch.tensor(speaker_id).long().unsqueeze(0) 54 | speaker = speaker.to(self.device) 55 | self.model.eval() 56 | src_len = np.array([len(phonemes[0])]) 57 | result = self.model( 58 | speaker, 59 | torch.from_numpy(phonemes).long().to(self.device), 60 | torch.from_numpy(src_len).to(self.device), 61 | max(src_len), 62 | d_control=duration_control, 63 | p_control=pitch_control, 64 | e_control=energy_control, 65 | ) 66 | 67 | ( 68 | output, 69 | p_predictions, 70 | e_predictions, 71 | log_d_predictions, 72 | d_rounded, 73 | src_masks, 74 | mel_masks, 75 | src_lens, 76 | mel_lens, 77 | postnet_output, 78 | pitch_mean, 79 | pitch_std, 80 | ) = result 81 | 82 | return postnet_output 83 | 84 | 85 | def 
load_speakers_json(dir_path): 86 | json_paht = os.path.join(dir_path, "speakers.json") 87 | if os.path.exists(json_paht): 88 | with open( 89 | json_paht, 90 | "r", 91 | ) as f: 92 | speakers = json.load(f) 93 | else: 94 | print(f'Did not find speakers.josn at {dir_path}') 95 | 96 | return speakers, list(speakers.keys()) 97 | -------------------------------------------------------------------------------- /hifi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__init__.py -------------------------------------------------------------------------------- /hifi/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) 16 | -------------------------------------------------------------------------------- /hifi/meldataset.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | import random 4 | import torch 5 | import torch.utils.data 6 | import numpy as np 7 | from librosa.util import normalize 8 | from scipy.io.wavfile import read 9 | from librosa.filters import mel as librosa_mel_fn 10 | 11 | MAX_WAV_VALUE = 32768.0 12 | 13 | 14 | def load_wav(full_path): 15 | sampling_rate, data = read(full_path) 16 | return data, sampling_rate 17 | 18 | 19 | def dynamic_range_compression(x, C=1, clip_val=1e-5): 20 | return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) 21 | 22 | 23 | def dynamic_range_decompression(x, C=1): 24 | return np.exp(x) / C 25 | 26 | 27 | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): 28 | return torch.log(torch.clamp(x, min=clip_val) * C) 29 | 30 | 31 | def dynamic_range_decompression_torch(x, C=1): 32 | return torch.exp(x) / C 33 | 34 | 35 | def spectral_normalize_torch(magnitudes): 36 | output = dynamic_range_compression_torch(magnitudes) 37 | return output 38 | 39 | 40 | def spectral_de_normalize_torch(magnitudes): 41 | output = dynamic_range_decompression_torch(magnitudes) 42 | return output 43 | 44 | 45 | mel_basis = {} 46 | hann_window = {} 47 | 48 | 49 | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): 50 | if torch.min(y) < -1.: 51 | print('min value is ', torch.min(y)) 52 | if torch.max(y) > 1.: 53 | print('max value is ', torch.max(y)) 54 | 55 | global mel_basis, hann_window 56 | if fmax not in mel_basis: 57 | mel = 
librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) 58 | mel_basis[str(fmax)+'_'+str(y.device) 59 | ] = torch.from_numpy(mel).float().to(y.device) 60 | hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device) 61 | 62 | y = torch.nn.functional.pad(y.unsqueeze( 63 | 1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') 64 | y = y.squeeze(1) 65 | 66 | spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)], 67 | center=center, pad_mode='reflect', normalized=False, onesided=True) 68 | 69 | spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9)) 70 | 71 | spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec) 72 | spec = spectral_normalize_torch(spec) 73 | 74 | return spec 75 | 76 | 77 | def get_dataset_filelist(a): 78 | with open(a.input_training_file, 'r', encoding='utf-8') as fi: 79 | training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 80 | for x in fi.read().split('\n') if len(x) > 0] 81 | 82 | with open(a.input_validation_file, 'r', encoding='utf-8') as fi: 83 | validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') 84 | for x in fi.read().split('\n') if len(x) > 0] 85 | return training_files, validation_files 86 | 87 | 88 | class MelDataset(torch.utils.data.Dataset): 89 | def __init__(self, training_files, segment_size, n_fft, num_mels, 90 | hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1, 91 | device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None): 92 | self.audio_files = training_files 93 | random.seed(1234) 94 | if shuffle: 95 | random.shuffle(self.audio_files) 96 | self.segment_size = segment_size 97 | self.sampling_rate = sampling_rate 98 | self.split = split 99 | self.n_fft = n_fft 100 | self.num_mels = num_mels 101 | self.hop_size = hop_size 102 | self.win_size = win_size 103 | self.fmin = fmin 104 | self.fmax = fmax 105 | self.fmax_loss = fmax_loss 106 | self.cached_wav = None 107 | self.n_cache_reuse = n_cache_reuse 108 | self._cache_ref_count = 0 109 | self.device = device 110 | self.fine_tuning = fine_tuning 111 | self.base_mels_path = base_mels_path 112 | 113 | def __getitem__(self, index): 114 | filename = self.audio_files[index] 115 | if self._cache_ref_count == 0: 116 | audio, sampling_rate = load_wav(filename) 117 | audio = audio / MAX_WAV_VALUE 118 | if not self.fine_tuning: 119 | audio = normalize(audio) * 0.95 120 | self.cached_wav = audio 121 | if sampling_rate != self.sampling_rate: 122 | raise ValueError("{} SR doesn't match target {} SR".format( 123 | sampling_rate, self.sampling_rate)) 124 | self._cache_ref_count = self.n_cache_reuse 125 | else: 126 | audio = self.cached_wav 127 | self._cache_ref_count -= 1 128 | 129 | audio = torch.FloatTensor(audio) 130 | audio = audio.unsqueeze(0) 131 | 132 | if not self.fine_tuning: 133 | if self.split: 134 | if audio.size(1) >= self.segment_size: 135 | max_audio_start = audio.size(1) - self.segment_size 136 | audio_start = random.randint(0, max_audio_start) 137 | audio = audio[:, audio_start:audio_start+self.segment_size] 138 | else: 139 | audio = torch.nn.functional.pad( 140 | audio, (0, self.segment_size - audio.size(1)), 'constant') 141 | 142 | mel = mel_spectrogram(audio, self.n_fft, self.num_mels, 143 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax, 144 | center=False) 145 | else: 146 | mel = np.load( 147 | os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy')) 148 | mel = 
torch.from_numpy(mel) 149 | 150 | if len(mel.shape) < 3: 151 | mel = mel.unsqueeze(0) 152 | 153 | if self.split: 154 | frames_per_seg = math.ceil(self.segment_size / self.hop_size) 155 | 156 | if audio.size(1) >= self.segment_size: 157 | mel_start = random.randint( 158 | 0, mel.size(2) - frames_per_seg - 1) 159 | mel = mel[:, :, mel_start:mel_start + frames_per_seg] 160 | audio = audio[:, mel_start * 161 | self.hop_size:(mel_start + frames_per_seg) * self.hop_size] 162 | else: 163 | mel = torch.nn.functional.pad( 164 | mel, (0, frames_per_seg - mel.size(2)), 'constant') 165 | audio = torch.nn.functional.pad( 166 | audio, (0, self.segment_size - audio.size(1)), 'constant') 167 | 168 | mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels, 169 | self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss, 170 | center=False) 171 | 172 | return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze()) 173 | 174 | def __len__(self): 175 | return len(self.audio_files) 176 | -------------------------------------------------------------------------------- /hifi/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.nn as nn 4 | from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d 5 | from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 6 | from hifi.vocoder.utils import init_weights, get_padding 7 | 8 | 9 | LRELU_SLOPE = 0.1 10 | 11 | 12 | class ResBlock1(torch.nn.Module): 13 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 14 | super(ResBlock1, self).__init__() 15 | self.h = h 16 | self.convs1 = nn.ModuleList( 17 | [ 18 | weight_norm( 19 | Conv1d( 20 | channels, 21 | channels, 22 | kernel_size, 23 | 1, 24 | dilation=dilation[0], 25 | padding=get_padding(kernel_size, dilation[0]), 26 | ) 27 | ), 28 | weight_norm( 29 | Conv1d( 30 | channels, 31 | channels, 32 | kernel_size, 33 | 1, 34 | dilation=dilation[1], 35 | padding=get_padding(kernel_size, dilation[1]), 36 | ) 37 | ), 38 | weight_norm( 39 | Conv1d( 40 | channels, 41 | channels, 42 | kernel_size, 43 | 1, 44 | dilation=dilation[2], 45 | padding=get_padding(kernel_size, dilation[2]), 46 | ) 47 | ), 48 | ] 49 | ) 50 | self.convs1.apply(init_weights) 51 | 52 | self.convs2 = nn.ModuleList( 53 | [ 54 | weight_norm( 55 | Conv1d( 56 | channels, 57 | channels, 58 | kernel_size, 59 | 1, 60 | dilation=1, 61 | padding=get_padding(kernel_size, 1), 62 | ) 63 | ), 64 | weight_norm( 65 | Conv1d( 66 | channels, 67 | channels, 68 | kernel_size, 69 | 1, 70 | dilation=1, 71 | padding=get_padding(kernel_size, 1), 72 | ) 73 | ), 74 | weight_norm( 75 | Conv1d( 76 | channels, 77 | channels, 78 | kernel_size, 79 | 1, 80 | dilation=1, 81 | padding=get_padding(kernel_size, 1), 82 | ) 83 | ), 84 | ] 85 | ) 86 | self.convs2.apply(init_weights) 87 | 88 | def forward(self, x): 89 | for c1, c2 in zip(self.convs1, self.convs2): 90 | xt = F.leaky_relu(x, LRELU_SLOPE) 91 | xt = c1(xt) 92 | xt = F.leaky_relu(xt, LRELU_SLOPE) 93 | xt = c2(xt) 94 | x = xt + x 95 | return x 96 | 97 | def remove_weight_norm(self): 98 | for l in self.convs1: 99 | remove_weight_norm(l) 100 | for l in self.convs2: 101 | remove_weight_norm(l) 102 | 103 | 104 | class ResBlock2(torch.nn.Module): 105 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): 106 | super(ResBlock2, self).__init__() 107 | self.h = h 108 | self.convs = nn.ModuleList( 109 | [ 110 | weight_norm( 111 | Conv1d( 112 | channels, 113 | 
channels, 114 | kernel_size, 115 | 1, 116 | dilation=dilation[0], 117 | padding=get_padding(kernel_size, dilation[0]), 118 | ) 119 | ), 120 | weight_norm( 121 | Conv1d( 122 | channels, 123 | channels, 124 | kernel_size, 125 | 1, 126 | dilation=dilation[1], 127 | padding=get_padding(kernel_size, dilation[1]), 128 | ) 129 | ), 130 | ] 131 | ) 132 | self.convs.apply(init_weights) 133 | 134 | def forward(self, x): 135 | for c in self.convs: 136 | xt = F.leaky_relu(x, LRELU_SLOPE) 137 | xt = c(xt) 138 | x = xt + x 139 | return x 140 | 141 | def remove_weight_norm(self): 142 | for l in self.convs: 143 | remove_weight_norm(l) 144 | 145 | 146 | class Generator(torch.nn.Module): 147 | def __init__(self, h): 148 | super(Generator, self).__init__() 149 | self.h = h 150 | self.num_kernels = len(h.resblock_kernel_sizes) 151 | self.num_upsamples = len(h.upsample_rates) 152 | self.conv_pre = weight_norm( 153 | Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) 154 | ) 155 | resblock = ResBlock1 if h.resblock == "1" else ResBlock2 156 | 157 | self.ups = nn.ModuleList() 158 | for i, (u, k) in enumerate( 159 | zip(h.upsample_rates, h.upsample_kernel_sizes) 160 | ): 161 | self.ups.append( 162 | weight_norm( 163 | ConvTranspose1d( 164 | h.upsample_initial_channel // (2 ** i), 165 | h.upsample_initial_channel // (2 ** (i + 1)), 166 | k, 167 | u, 168 | padding=(k - u) // 2, 169 | ) 170 | ) 171 | ) 172 | 173 | self.resblocks = nn.ModuleList() 174 | for i in range(len(self.ups)): 175 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 176 | for j, (k, d) in enumerate( 177 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 178 | ): 179 | self.resblocks.append(resblock(h, ch, k, d)) 180 | 181 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 182 | self.ups.apply(init_weights) 183 | self.conv_post.apply(init_weights) 184 | 185 | def forward(self, x): 186 | x = self.conv_pre(x) 187 | for i in range(self.num_upsamples): 188 | x = F.leaky_relu(x, LRELU_SLOPE) 189 | x = self.ups[i](x) 190 | xs = None 191 | for j in range(self.num_kernels): 192 | if xs is None: 193 | xs = self.resblocks[i * self.num_kernels + j](x) 194 | else: 195 | xs += self.resblocks[i * self.num_kernels + j](x) 196 | x = xs / self.num_kernels 197 | x = F.leaky_relu(x) 198 | x = self.conv_post(x) 199 | x = torch.tanh(x) 200 | 201 | return x 202 | 203 | def remove_weight_norm(self): 204 | print("Removing weight norm for inference HIFI GAN...") 205 | for l in self.ups: 206 | remove_weight_norm(l) 207 | for l in self.resblocks: 208 | l.remove_weight_norm() 209 | remove_weight_norm(self.conv_pre) 210 | remove_weight_norm(self.conv_post) 211 | 212 | 213 | class DiscriminatorP(torch.nn.Module): 214 | def __init__( 215 | self, period, kernel_size=5, stride=3, use_spectral_norm=False 216 | ): 217 | super(DiscriminatorP, self).__init__() 218 | self.period = period 219 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 220 | self.convs = nn.ModuleList( 221 | [ 222 | norm_f( 223 | Conv2d( 224 | 1, 225 | 32, 226 | (kernel_size, 1), 227 | (stride, 1), 228 | padding=(get_padding(5, 1), 0), 229 | ) 230 | ), 231 | norm_f( 232 | Conv2d( 233 | 32, 234 | 128, 235 | (kernel_size, 1), 236 | (stride, 1), 237 | padding=(get_padding(5, 1), 0), 238 | ) 239 | ), 240 | norm_f( 241 | Conv2d( 242 | 128, 243 | 512, 244 | (kernel_size, 1), 245 | (stride, 1), 246 | padding=(get_padding(5, 1), 0), 247 | ) 248 | ), 249 | norm_f( 250 | Conv2d( 251 | 512, 252 | 1024, 253 | (kernel_size, 1), 254 | (stride, 1), 255 | padding=(get_padding(5, 
1), 0), 256 | ) 257 | ), 258 | norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), 259 | ] 260 | ) 261 | self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) 262 | 263 | def forward(self, x): 264 | fmap = [] 265 | 266 | # 1d to 2d 267 | b, c, t = x.shape 268 | if t % self.period != 0: # pad first 269 | n_pad = self.period - (t % self.period) 270 | x = F.pad(x, (0, n_pad), "reflect") 271 | t = t + n_pad 272 | x = x.view(b, c, t // self.period, self.period) 273 | 274 | for l in self.convs: 275 | x = l(x) 276 | x = F.leaky_relu(x, LRELU_SLOPE) 277 | fmap.append(x) 278 | x = self.conv_post(x) 279 | fmap.append(x) 280 | x = torch.flatten(x, 1, -1) 281 | 282 | return x, fmap 283 | 284 | 285 | class MultiPeriodDiscriminator(torch.nn.Module): 286 | def __init__(self): 287 | super(MultiPeriodDiscriminator, self).__init__() 288 | self.discriminators = nn.ModuleList( 289 | [ 290 | DiscriminatorP(2), 291 | DiscriminatorP(3), 292 | DiscriminatorP(5), 293 | DiscriminatorP(7), 294 | DiscriminatorP(11), 295 | ] 296 | ) 297 | 298 | def forward(self, y, y_hat): 299 | y_d_rs = [] 300 | y_d_gs = [] 301 | fmap_rs = [] 302 | fmap_gs = [] 303 | for i, d in enumerate(self.discriminators): 304 | y_d_r, fmap_r = d(y) 305 | y_d_g, fmap_g = d(y_hat) 306 | y_d_rs.append(y_d_r) 307 | fmap_rs.append(fmap_r) 308 | y_d_gs.append(y_d_g) 309 | fmap_gs.append(fmap_g) 310 | 311 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 312 | 313 | 314 | class DiscriminatorS(torch.nn.Module): 315 | def __init__(self, use_spectral_norm=False): 316 | super(DiscriminatorS, self).__init__() 317 | norm_f = weight_norm if use_spectral_norm == False else spectral_norm 318 | self.convs = nn.ModuleList( 319 | [ 320 | norm_f(Conv1d(1, 128, 15, 1, padding=7)), 321 | norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), 322 | norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), 323 | norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), 324 | norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), 325 | norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), 326 | norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), 327 | ] 328 | ) 329 | self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) 330 | 331 | def forward(self, x): 332 | fmap = [] 333 | for l in self.convs: 334 | x = l(x) 335 | x = F.leaky_relu(x, LRELU_SLOPE) 336 | fmap.append(x) 337 | x = self.conv_post(x) 338 | fmap.append(x) 339 | x = torch.flatten(x, 1, -1) 340 | 341 | return x, fmap 342 | 343 | 344 | class MultiScaleDiscriminator(torch.nn.Module): 345 | def __init__(self): 346 | super(MultiScaleDiscriminator, self).__init__() 347 | self.discriminators = nn.ModuleList( 348 | [ 349 | DiscriminatorS(use_spectral_norm=True), 350 | DiscriminatorS(), 351 | DiscriminatorS(), 352 | ] 353 | ) 354 | self.meanpools = nn.ModuleList( 355 | [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)] 356 | ) 357 | 358 | def forward(self, y, y_hat): 359 | y_d_rs = [] 360 | y_d_gs = [] 361 | fmap_rs = [] 362 | fmap_gs = [] 363 | for i, d in enumerate(self.discriminators): 364 | if i != 0: 365 | y = self.meanpools[i - 1](y) 366 | y_hat = self.meanpools[i - 1](y_hat) 367 | y_d_r, fmap_r = d(y) 368 | y_d_g, fmap_g = d(y_hat) 369 | y_d_rs.append(y_d_r) 370 | fmap_rs.append(fmap_r) 371 | y_d_gs.append(y_d_g) 372 | fmap_gs.append(fmap_g) 373 | 374 | return y_d_rs, y_d_gs, fmap_rs, fmap_gs 375 | 376 | 377 | def feature_loss(fmap_r, fmap_g): 378 | loss = 0 379 | for dr, dg in zip(fmap_r, fmap_g): 380 | for rl, gl in zip(dr, dg): 381 | loss += torch.mean(torch.abs(rl - gl)) 382 | 383 | 
return loss * 2 384 | 385 | 386 | def discriminator_loss(disc_real_outputs, disc_generated_outputs): 387 | loss = 0 388 | r_losses = [] 389 | g_losses = [] 390 | for dr, dg in zip(disc_real_outputs, disc_generated_outputs): 391 | r_loss = torch.mean((1 - dr) ** 2) 392 | g_loss = torch.mean(dg ** 2) 393 | loss += r_loss + g_loss 394 | r_losses.append(r_loss.item()) 395 | g_losses.append(g_loss.item()) 396 | 397 | return loss, r_losses, g_losses 398 | 399 | 400 | def generator_loss(disc_outputs): 401 | loss = 0 402 | gen_losses = [] 403 | for dg in disc_outputs: 404 | l = torch.mean((1 - dg) ** 2) 405 | gen_losses.append(l) 406 | loss += l 407 | 408 | return loss, gen_losses -------------------------------------------------------------------------------- /hifi/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pylab as plt 2 | import glob 3 | import os 4 | import matplotlib 5 | import torch 6 | from torch.nn.utils import weight_norm 7 | matplotlib.use("Agg") 8 | 9 | 10 | def plot_spectrogram(spectrogram): 11 | fig, ax = plt.subplots(figsize=(10, 2)) 12 | im = ax.imshow(spectrogram, aspect="auto", origin="lower", 13 | interpolation='none') 14 | plt.colorbar(im, ax=ax) 15 | 16 | fig.canvas.draw() 17 | plt.close() 18 | 19 | return fig 20 | 21 | 22 | def init_weights(m, mean=0.0, std=0.01): 23 | classname = m.__class__.__name__ 24 | if classname.find("Conv") != -1: 25 | m.weight.data.normal_(mean, std) 26 | 27 | 28 | def apply_weight_norm(m): 29 | classname = m.__class__.__name__ 30 | if classname.find("Conv") != -1: 31 | weight_norm(m) 32 | 33 | 34 | def get_padding(kernel_size, dilation=1): 35 | return int((kernel_size*dilation - dilation)/2) 36 | 37 | 38 | def load_checkpoint(filepath, device): 39 | assert os.path.isfile(filepath) 40 | print("Loading '{}'".format(filepath)) 41 | checkpoint_dict = torch.load(filepath, map_location=device) 42 | print("Complete.") 43 | return checkpoint_dict 44 | 45 | 46 | def save_checkpoint(filepath, obj): 47 | print("Saving checkpoint to {}".format(filepath)) 48 | torch.save(obj, filepath) 49 | print("Complete.") 50 | 51 | 52 | def scan_checkpoint(cp_dir, prefix): 53 | pattern = os.path.join(cp_dir, prefix + '????????') 54 | cp_list = glob.glob(pattern) 55 | if len(cp_list) == 0: 56 | return None 57 | return sorted(cp_list)[-1] 58 | -------------------------------------------------------------------------------- /hifi/vocoder/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/diff7/tts-king/2c859161c9deb5174a7754fe78289ef58b4ae3d2/hifi/vocoder/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /hifi/vocoder/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pylab as plt 2 | import glob 3 | import os 4 | import matplotlib 5 | import torch 6 | from torch.nn.utils import weight_norm 7 | 8 | matplotlib.use("Agg") 9 | 10 | 11 | def plot_spectrogram(spectrogram): 12 | fig, ax = plt.subplots(figsize=(10, 2)) 13 | im = ax.imshow( 14 | spectrogram, aspect="auto", origin="lower", interpolation="none" 15 | ) 16 | plt.colorbar(im, ax=ax) 17 | 18 | fig.canvas.draw() 19 | plt.close() 20 | 21 | return fig 22 | 23 | 24 | def init_weights(m, mean=0.0, std=0.01): 25 | classname = m.__class__.__name__ 26 | if classname.find("Conv") != -1: 27 | m.weight.data.normal_(mean, std) 28 | 29 | 30 | def 
apply_weight_norm(m): 31 | classname = m.__class__.__name__ 32 | if classname.find("Conv") != -1: 33 | weight_norm(m) 34 | 35 | 36 | def get_padding(kernel_size, dilation=1): 37 | return int((kernel_size * dilation - dilation) / 2) 38 | 39 | 40 | def load_checkpoint(filepath, device): 41 | assert os.path.isfile(filepath) 42 | print("Loading '{}'".format(filepath)) 43 | checkpoint_dict = torch.load(filepath, map_location=device) 44 | print("Complete.") 45 | return checkpoint_dict 46 | 47 | 48 | def save_checkpoint(filepath, obj): 49 | print("Saving checkpoint to {}".format(filepath)) 50 | torch.save(obj, filepath) 51 | print("Complete.") 52 | 53 | 54 | def scan_checkpoint(cp_dir, prefix): 55 | pattern = os.path.join(cp_dir, prefix + "????????") 56 | cp_list = glob.glob(pattern) 57 | if len(cp_list) == 0: 58 | return None 59 | return sorted(cp_list)[-1] -------------------------------------------------------------------------------- /hifiapi.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from hifi.models import Generator 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | class HIFIapi: 12 | def __init__(self, config, device="gpu"): 13 | if config.model_config["vocoder"]["use_cpu"]: 14 | device = "cpu" 15 | 16 | # Load checkpoint if exists 17 | weights_path = config.hifi.weights_path 18 | 19 | self.model = Generator(config.hifi) 20 | if weights_path is not None: 21 | checkpoint = torch.load(weights_path, map_location="cpu") 22 | self.model.load_state_dict(checkpoint["generator"]) 23 | 24 | self.cfg = config 25 | self.device = device 26 | 27 | self.model.to(device) 28 | self.model.remove_weight_norm() 29 | self.model.eval() 30 | 31 | # TODO: 32 | def train(self): 33 | raise NotImplementedError("Train for HiFi has not been implemented yet") 34 | 35 | def __call__(self, x): 36 | x = x.to(self.device) 37 | # use __call__ for compatibility with other vocoders or functions 38 | return self.model(x) 39 | 40 | def generate(self, mel_specs): 41 | """ 42 | Converts a batch of mel spectrograms into audio. 43 | Returns int16 audio on the CPU. 
44 | mel_specs - a batch of mel spectrograms 45 | """ 46 | 47 | self.model.eval() 48 | with torch.no_grad(): 49 | audio = self.model(mel_specs) 50 | audio = audio * self.cfg.hifi.MAX_WAV_VALUE 51 | audio = audio.cpu().numpy().astype("int16") 52 | return audio 53 | -------------------------------------------------------------------------------- /input_process.py: -------------------------------------------------------------------------------- 1 | import re 2 | from string import punctuation 3 | 4 | import numpy as np 5 | 6 | # from g2p_en import G2p  # needed by preprocess_eng; uncomment to enable English G2P 7 | from fs_two.text import text_to_sequence 8 | from russian_g2p.Transcription import Transcription 9 | 10 | # NO CLEANERS FOR RUSSIAN DATASET 11 | CLEANERS = [] 12 | transcriptor = Transcription() 13 | 14 | def read_lexicon(lex_path): 15 | lexicon = {} 16 | with open(lex_path) as f: 17 | for line in f: 18 | temp = re.split(r"\s+", line.strip("\n")) 19 | word = temp[0] 20 | phones = temp[1:] 21 | if word.lower() not in lexicon: 22 | lexicon[word.lower()] = phones 23 | return lexicon 24 | 25 | 26 | def preprocess_eng(text, preprocess_config): 27 | text = text.rstrip(punctuation) 28 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 29 | 30 | g2p = G2p() 31 | phones = [] 32 | words = re.split(r"([,;.\-\?\!\s+])", text) 33 | for w in words: 34 | if w.lower() in lexicon: 35 | phones += lexicon[w.lower()] 36 | else: 37 | phones += list(filter(lambda p: p != " ", g2p(w))) 38 | phones = "{" + "}{".join(phones) + "}" 39 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 40 | phones = phones.replace("}{", " ") 41 | 42 | print("Raw Text Sequence: {}".format(text)) 43 | print("Phoneme Sequence: {}".format(phones)) 44 | sequence = np.array(text_to_sequence(phones, CLEANERS)) 45 | 46 | return np.array(sequence) 47 | 48 | 49 | def preprocess_lang(text, preprocess_config): 50 | text = text.rstrip(punctuation) 51 | lexicon = read_lexicon(preprocess_config["path"]["lexicon_path"]) 52 | 53 | phones = [] 54 | words = re.split(r"([,;.\-\?\!\s+])", text) 55 | for w in words: 56 | if w.lower() in lexicon: 57 | phones += lexicon[w.lower()] 58 | else: 59 | phones += "." 
60 | phones = "{" + "}{".join(phones) + "}" 61 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 62 | phones = phones.replace("}{", " ") 63 | 64 | print("Raw Text Sequence: {}".format(text)) 65 | print("Phoneme Sequence: {}".format(phones)) 66 | sequence = np.array(text_to_sequence(phones, CLEANERS)) 67 | 68 | return np.array(sequence) 69 | 70 | 71 | def preprocess_rus(text): 72 | 73 | text = text.rstrip(punctuation) 74 | phones = [] 75 | words = re.split(r"([,;.\-\?\!\s+])", text) 76 | sentences = transcriptor.transcribe([text])[0] 77 | phones = [phoneme for s in sentences for phoneme in s+['sp']] 78 | phones = "{" + "}{".join(phones) + "}" 79 | phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones) 80 | phones = phones.replace("}{", " ") 81 | 82 | print("Raw Text Sequence: {}".format(text)) 83 | print("Phoneme Sequence: {}".format(phones)) 84 | sequence = np.array(text_to_sequence(phones, [])) 85 | 86 | return np.array(sequence) 87 | -------------------------------------------------------------------------------- /prepare_data.py: -------------------------------------------------------------------------------- 1 | from omegaconf import OmegaConf 2 | from fs_two.preprocessor.preprocessor import Preprocessor 3 | 4 | 5 | if __name__ == "__main__": 6 | preprocess_config = OmegaConf.load("./config.yaml")["preprocess_config"] 7 | preprocessor = Preprocessor(preprocess_config) 8 | preprocessor.build_from_path() 9 | 10 | -------------------------------------------------------------------------------- /pretrained/speakers.json: -------------------------------------------------------------------------------- 1 | {"Schirvind_A_abooks_voxforge": 0, "user1_mozilla": 1, "nikolaev_ailab": 2, "Litvinov_I_abooks_voxforge": 3, "user4_mozilla": 4, "mar_abooks_voxforge": 5, "Arhipova_Natalja_abooks_voxforge": 6, "Medvedeva_Galcova_Olga_abooks_voxforge": 7, "june_shaman": 8, "Tarinicheva_Tatjana_abooks_voxforge": 9, "Kvasha_Igor_abooks_voxforge": 10, "morti_shaman": 11, "Trifilov_Nikolai_abooks_voxforge": 12, "user17_mozilla": 13, "user26_mozilla": 14, "user12_mozilla": 15, "Sytnik_I_abooks_voxforge": 16, "user8_mozilla": 17, "Larionova-Ludm_abooks_voxforge": 18, "Bolshakova_Ksenija_abooks_voxforge": 19, "user5_mozilla": 20, "Kuznetsov_Vsevolod_abooks_voxforge": 21, "Kovaleva_Anna_abooks_voxforge": 22, "Suetin_Pavel_abooks_voxforge": 23, "user7_mozilla": 24, "Konjahin_V_abooks_voxforge": 25, "len_shaman": 26, "Stukalov_Vladimir_abooks_voxforge": 27, "user20_mozilla": 28, "Terenkov_Alexandr_abooks_voxforge": 29, "Taratorkin_Georgiy_abooks_voxforge": 30, "Vasiljev_Y_abooks_voxforge": 31, "Martjanov_O_abooks_voxforge": 32, "Chebaturkina_Elena_abooks_voxforge": 33, "Muhametzyanov_Radik_abooks_voxforge": 34, "Rezalin_Aleksandr_abooks_voxforge": 35, "russian_single": 36, "Zozulin_Viktor_abooks_voxforge": 37, "Zhirnov_Sergey_abooks_voxforge": 38, "hajdurova_ailab": 39, "user6_mozilla": 40, "Vesnik_E_abooks_voxforge": 41, "ira_abooks_voxforge": 42, "Kotov_Alexandr_abooks_voxforge": 43, "vsh_abooks_voxforge": 44, "minaev_ailab": 45, "joh_abooks_voxforge": 46, "Goblin_abooks_voxforge": 47, "Karpov_N_abooks_voxforge": 48, "user11_mozilla": 49, "Larionov_Vsevolod_abooks_voxforge": 50, "Kaljagin_A_abooks_voxforge": 51, "Vorobjeva_Irina_abooks_voxforge": 52, "Rosljakov_Mixail_abooks_voxforge": 53, "Kononov_Mikhail_abooks_voxforge": 54, "Efremov_Oleg_abooks_voxforge": 55, "Vihrov_V_abooks_voxforge": 56, "Pokrovsky_Boris_abooks_voxforge": 57, "noname_opentts": 58, "DrLutz_abooks_voxforge": 59, "Kuznetsov_Alexei_abooks_voxforge": 
60, "Sushkov_Vladimir_abooks_voxforge": 61, "Grigorjev_Yurii_abooks_voxforge": 62, "Markin_Petr_abooks_voxforge": 63, "Popova_Alevtina_abooks_voxforge": 64, "Airapetova_Darja_abooks_voxforge": 65} -------------------------------------------------------------------------------- /pretrained/stats.json: -------------------------------------------------------------------------------- 1 | {"pitch": [-7.016496333880942, 9.535745656686476, -0.03811425926007669, 0.9034625186368779], "energy": [-1.4277896881103516, 6.057352542877197, 58.567213377773356, 41.96484938662417]} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | g2p-en == 2.1.0 2 | inflect == 4.1.0 3 | librosa == 0.7.2 4 | matplotlib == 3.2.2 5 | numba == 0.48 6 | numpy == 1.19.0 7 | pypinyin==0.39.0 8 | pyworld == 0.2.10 9 | PyYAML==5.4.1 10 | scikit-learn==0.23.2 11 | scipy == 1.5.0 12 | soundfile==0.10.3.post1 13 | tgt == 1.4.4 14 | torch == 1.7.0 15 | tqdm==4.46.1 16 | unidecode == 1.1.1 17 | pycwt -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import math as m 5 | import torch.nn.functional as F 6 | import torch.nn as nn 7 | from torch.utils.data import DataLoader 8 | 9 | # from torch.utils.tensorboard import SummaryWriter 10 | import wandb as logger 11 | 12 | from tqdm import tqdm 13 | from omegaconf import OmegaConf 14 | 15 | from hifiapi import HIFIapi 16 | 17 | from fs_two.utils.model import get_model, get_param_num 18 | from fs_two.utils.tools import to_device, log, synth_one_sample 19 | from fs_two.model import FastSpeech2Loss 20 | from fs_two.dataset import Dataset 21 | from fs_two.evaluate import evaluate 22 | 23 | 24 | def main_train_step( 25 | model, 26 | batch, 27 | step, 28 | optimizer, 29 | cfg, 30 | Loss, 31 | ): 32 | 33 | grad_acc_step = cfg.train_config["optimizer"]["grad_acc_step"] 34 | grad_clip_thresh = cfg.train_config["optimizer"]["grad_clip_thresh"] 35 | 36 | output = model(*(batch[2:])) 37 | 38 | losses = Loss(batch, output) 39 | total_loss = losses[0] 40 | 41 | # Backward 42 | 43 | total_loss = total_loss / grad_acc_step 44 | total_loss.backward() 45 | losses = [l.item() / grad_acc_step for l in losses[1:]] 46 | 47 | if step % grad_acc_step == 0: 48 | # Clipping gradients to avoid gradient explosion 49 | 50 | # Update weights 51 | # optimizer.update_lr() 52 | nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh) 53 | optimizer.step_and_update_lr() 54 | optimizer.zero_grad() 55 | 56 | return losses, output 57 | 58 | 59 | def train_logger(losses, step, total_step, outer_bar, log, logger): 60 | 61 | losses = [sum(losses)] + losses 62 | message1 = "Step {}/{}, ".format(step, total_step) 63 | message2 = """Total Loss: {:.4f}, 64 | Mel Loss: {:.4f}, 65 | Pitch Loss: {:.4f}, 66 | Energy Loss: {:.4f}, 67 | Duration Loss: {:.4f} 68 | Mean pitch: {:.4f} 69 | Std pitch: {:.4f} 70 | """.format( 71 | *losses 72 | ) 73 | 74 | outer_bar.write(message1 + message2) 75 | log(logger, "train", step, losses=losses) 76 | 77 | 78 | def main(cfg): 79 | print("Prepare training ...") 80 | 81 | device = cfg.gpu 82 | # Get dataset 83 | dataset = Dataset( 84 | "train.txt", 85 | cfg.preprocess_config, 86 | cfg.train_config, 87 | sort=True, 88 | drop_last=True, 89 | ) 90 | batch_size = cfg.train_config["optimizer"]["batch_size"] 91 | 
group_size = 4 # Set this larger than 1 to enable sorting in Dataset 92 | assert batch_size * group_size < len(dataset) 93 | loader = DataLoader( 94 | dataset, 95 | batch_size=batch_size * group_size, 96 | shuffle=True, 97 | collate_fn=dataset.collate_fn, 98 | num_workers=4, 99 | ) 100 | 101 | # Prepare model 102 | model, optimizer = get_model(cfg, device, train=True) 103 | 104 | # model = nn.DataParallel(model) 105 | num_param = get_param_num(model) 106 | Loss = FastSpeech2Loss(cfg.preprocess_config, cfg.model_config) 107 | print("Number of FastSpeech2 Parameters:", num_param) 108 | 109 | # Load vocoder 110 | vocoder = HIFIapi(cfg, cfg.gpu) 111 | 112 | # Init logger 113 | for p in cfg.train_config["path"].values(): 114 | os.makedirs(p, exist_ok=True) 115 | 116 | os.environ["WANDB_API_KEY"] = cfg.logger.wandb_key 117 | if cfg.logger.offline: 118 | os.environ["WANDB_MODE"] = "offline" 119 | 120 | logger.init(name=cfg.exp_name, project="FS2", reinit=True) 121 | 122 | # Training 123 | 124 | step = cfg.tts.restore_step + 1 125 | epoch = 1 126 | total_step = cfg.train_config["step"]["total_step"] 127 | outer_bar = tqdm(total=total_step, desc="Training", position=0) 128 | outer_bar.n = cfg.tts.restore_step 129 | outer_bar.update() 130 | 131 | if cfg.run_debug_eval: 132 | print("RUN SANITY CHECK EVAL:") 133 | message = evaluate(model, 0, cfg, logger, "val", vocoder, cfg.gpu) 134 | 135 | while True: 136 | inner_bar = tqdm( 137 | total=len(loader), desc="Epoch {}".format(epoch), position=1 138 | ) 139 | for batchs in loader: 140 | for batch in batchs: 141 | batch = to_device(batch, device) 142 | 143 | # Forward 144 | 145 | losses, output = main_train_step( 146 | model, 147 | batch, 148 | step, 149 | optimizer, 150 | cfg, 151 | Loss, 152 | ) 153 | 154 | if step % cfg.train_config.step.log_step == 0: 155 | train_logger( 156 | losses, 157 | step, 158 | total_step, 159 | outer_bar, 160 | log, 161 | logger, 162 | ) 163 | 164 | if step % cfg.train_config.step.synth_step == 0: 165 | ( 166 | fig, 167 | wav_reconstruction, 168 | wav_prediction, 169 | tag, 170 | ) = synth_one_sample( 171 | batch, 172 | output, 173 | vocoder, 174 | cfg.model_config, 175 | cfg.preprocess_config, 176 | ) 177 | log( 178 | logger, 179 | "train", 180 | fig=fig, 181 | tag="Training/step_{}_{}".format(step, tag), 182 | ) 183 | sampling_rate = cfg.preprocess_config["preprocessing"][ 184 | "audio" 185 | ]["sampling_rate"] 186 | log( 187 | logger, 188 | "train", 189 | audio=wav_reconstruction, 190 | sampling_rate=sampling_rate, 191 | tag="Training/step_{}_{}_reconstructed".format( 192 | step, tag 193 | ), 194 | ) 195 | log( 196 | logger, 197 | "train", 198 | audio=wav_prediction, 199 | sampling_rate=sampling_rate, 200 | tag="Training/step_{}_{}_synthesized".format(step, tag), 201 | ) 202 | 203 | if step % cfg.train_config.step.val_step == 0: 204 | model.eval() 205 | message = evaluate( 206 | model, step, cfg, logger, "val", vocoder, cfg.gpu 207 | ) 208 | outer_bar.write(message) 209 | 210 | model.train() 211 | 212 | if step % cfg.train_config.step.save_step == 0: 213 | model_weight = model.state_dict() 214 | embed_weight = model_weight["speaker_emb.weight"] 215 | del model_weight["speaker_emb.weight"] 216 | 217 | torch.save( 218 | { 219 | "model": model_weight, 220 | "embedding": embed_weight, 221 | "optimizer": optimizer._optimizer.state_dict(), 222 | }, 223 | os.path.join( 224 | cfg.train_config["path"]["ckpt_path"], 225 | "{}.pth.tar".format(step), 226 | ), 227 | ) 228 | 229 | if step == total_step: 230 | quit() 231 | step += 1 232 
| outer_bar.update(1) 233 | 234 | inner_bar.update(1) 235 | epoch += 1 236 | 237 | 238 | if __name__ == "__main__": 239 | 240 | configs = OmegaConf.load("./config.yaml") 241 | main(configs) 242 | -------------------------------------------------------------------------------- /tts_king.py: -------------------------------------------------------------------------------- 1 | # IMPORTS FOR PREPROCESS 2 | import os 3 | import torch 4 | import numpy as np 5 | from string import punctuation 6 | from fs_two.text import text_to_sequence 7 | 8 | # OTHER IMPORTS 9 | from omegaconf import OmegaConf 10 | from fsapi import FSTWOapi 11 | 12 | # from fs_two.preprocess import prepare_dataset_lj_speech 13 | from hifiapi import HIFIapi 14 | 15 | from input_process import preprocess_rus, preprocess_eng 16 | 17 | 18 | class TTSKing: 19 | def __init__(self, config_path="./config.yaml"): 20 | self.cfg = OmegaConf.load(config_path) 21 | self.tts = FSTWOapi(self.cfg, self.cfg.gpu) 22 | self.vocoder = HIFIapi(self.cfg, self.cfg.gpu) 23 | self.speakers = self.tts.speaker_names 24 | 25 | def generate_mel( 26 | self, 27 | text, 28 | duration_control=1.0, 29 | pitch_control=1.0, 30 | energy_control=1.0, 31 | speaker=0, 32 | ): 33 | 34 | phonemes = self.text_preprocess(text) 35 | 36 | result = self.tts.generate( 37 | phonemes, 38 | duration_control, 39 | pitch_control, 40 | energy_control, 41 | speaker_name=speaker, 42 | ) 43 | 44 | # mel, mel_postnet, log_duration_output, f0_output, energy_output 45 | return result 46 | 47 | def mel_to_wav(self, mel_spec): 48 | wav_cpu = self.vocoder.generate(mel_spec.transpose(1, 2)) 49 | return wav_cpu 50 | 51 | def speak( 52 | self, text, duration_control=1.0, pitch_control=1.0, energy_control=1.0, speaker=0 53 | ): 54 | result = self.generate_mel( 55 | text, duration_control, pitch_control, energy_control, speaker 56 | ) 57 | return self.mel_to_wav(result[1])  # result[1] is the postnet mel (see generate_mel) 58 | 59 | def text_preprocess(self, text): 60 | return np.array([preprocess_rus(text)]) 61 | 62 | def text_preprocess_eng(self, text): 63 | return np.array([preprocess_eng(text, self.cfg.preprocess_config)]) 64 | 65 | def to_torch_device(self, items): 66 | return [torch.tensor(t).to(self.cfg.gpu) for t in items] 67 | --------------------------------------------------------------------------------
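
A minimal end-to-end inference sketch for TTSKing, based on the classes above. It assumes config.yaml and the pretrained weights are set up as described in the README, that the sampling rate is read from preprocess_config (as train.py does), and that result[1] is the postnet mel, following the comment in generate_mel; the output file name is arbitrary.

    from scipy.io.wavfile import write
    from tts_king import TTSKing

    tts = TTSKing("./config.yaml")      # loads FastSpeech2 and HiFi-GAN from the config paths
    print(tts.speakers)                 # speaker names loaded from pretrained/speakers.json

    result = tts.generate_mel("Привет, мир!", speaker=0)
    wav = tts.mel_to_wav(result[1])     # HiFi-GAN wrapper returns int16 audio on the CPU
    sr = tts.cfg.preprocess_config["preprocessing"]["audio"]["sampling_rate"]
    write("output.wav", sr, wav.squeeze())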