├── models ├── __init__.py ├── common │ ├── __init__.py │ └── loss.py ├── tacotron2 │ ├── __init__.py │ └── loss.py ├── diacritizers │ ├── __init__.py │ ├── shakkala │ │ ├── LICENSE.md │ │ ├── __init__.py │ │ ├── network.py │ │ └── symbols.py │ └── shakkelha │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── network.py │ │ └── symbols.py └── fastpitch │ ├── __init__.py │ └── fastpitch │ ├── attn_loss_function.py │ ├── alignment.py │ ├── loss_function.py │ ├── transformer.py │ └── LICENSE ├── pretrained ├── README.md └── hifigan-asc-v1 │ └── config.json ├── vocoder ├── hifigan │ ├── __init__.py │ ├── env.py │ ├── LICENSE │ └── denoiser.py ├── __init__.py └── vocos │ ├── LICENSE │ ├── __init__.py │ ├── feature_extractors.py │ ├── models.py │ ├── heads.py │ ├── modules.py │ ├── spectral_ops.py │ └── pretrained.py ├── app ├── favicon.png ├── models.yaml ├── static │ └── mappings.js └── index.html ├── data └── sampler │ ├── mulip_dict │ └── sampler_weights ├── configs ├── basic.yaml ├── nawar_tc2.yaml ├── nawar_fp.yaml ├── nawar_tc2_adv.yaml └── nawar_fp_adv.yaml ├── utils ├── plotting.py ├── logging.py ├── audio.py ├── __init__.py ├── app_utils.py ├── training.py └── make_html.py ├── text ├── symbols.py └── __init__.py ├── app.py ├── scripts ├── preprocess_audio.py ├── preprocess_text.py ├── extract_f0.py ├── train_fp.py ├── train_tc2.py ├── train_fp_adv.py └── train_tc2_adv.py ├── download_files.py ├── .gitignore ├── inference.py ├── test.py ├── train.py └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pretrained/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vocoder/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/tacotron2/__init__.py: -------------------------------------------------------------------------------- 1 | from .networks import Tacotron2, Tacotron2Wave -------------------------------------------------------------------------------- /app/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipponjo/tts-arabic-pytorch/HEAD/app/favicon.png -------------------------------------------------------------------------------- /data/sampler/mulip_dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipponjo/tts-arabic-pytorch/HEAD/data/sampler/mulip_dict -------------------------------------------------------------------------------- /data/sampler/sampler_weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipponjo/tts-arabic-pytorch/HEAD/data/sampler/sampler_weights -------------------------------------------------------------------------------- /app/models.yaml: -------------------------------------------------------------------------------- 1 | fastpitch_mse: 2 | type: fastpitch 3 | path: pretrained/fastpitch_ar_mse.pth 4 | 5 | fastpitch_adv: 6 | type: fastpitch 7 | 
path: pretrained/fastpitch_ar_adv.pth 8 | 9 | tacotron2_mse: 10 | type: tacotron2 11 | path: pretrained/tacotron2_ar_mse.pth 12 | 13 | tacotron2_adv: 14 | type: tacotron2 15 | path: pretrained/tacotron2_ar_adv.pth 16 | -------------------------------------------------------------------------------- /models/diacritizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .shakkelha.network import Shakkelha 2 | from .shakkala.network import Shakkala 3 | 4 | def load_vowelizer(name: str, config): 5 | if name == 'shakkala': 6 | shakkala = Shakkala(sd_path=config.shakkala_path) 7 | return shakkala 8 | elif name == 'shakkelha': 9 | shakkelha = Shakkelha(sd_path=config.shakkelha_path) 10 | return shakkelha 11 | else: 12 | print('...') -------------------------------------------------------------------------------- /vocoder/hifigan/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | class AttrDict(dict): 6 | def __init__(self, *args, **kwargs): 7 | super(AttrDict, self).__init__(*args, **kwargs) 8 | self.__dict__ = self 9 | 10 | 11 | def build_env(config, config_name, path): 12 | t_path = os.path.join(path, config_name) 13 | if config != t_path: 14 | os.makedirs(path, exist_ok=True) 15 | shutil.copyfile(config, os.path.join(path, config_name)) 16 | -------------------------------------------------------------------------------- /configs/basic.yaml: -------------------------------------------------------------------------------- 1 | # training 2 | epochs: 500 3 | decoder_max_step: 3000 4 | 5 | random_seed: False 6 | 7 | batch_size: 8 8 | learning_rate: 1.0e-3 9 | weight_decay: 1.0e-6 10 | grad_clip_thresh: 1.0 11 | 12 | cache_dataset: True 13 | use_cuda_if_available: True 14 | 15 | balanced_sampling: False 16 | 17 | # vocoder 18 | vocoder_state_path: pretrained/hifigan-asc-v1/hifigan-asc.pth 19 | vocoder_config_path: pretrained/hifigan-asc-v1/config.json 20 | 21 | # diacritizers 22 | shakkala_path: pretrained/diacritizers/shakkala_second_model6.pth 23 | shakkelha_path: pretrained/diacritizers/shakkelha_rnn_3_big_20.pth 24 | 25 | -------------------------------------------------------------------------------- /vocoder/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def load_hifigan(state_dict_path, config_file): 4 | import json 5 | 6 | from vocoder.hifigan.env import AttrDict 7 | from vocoder.hifigan.models import Generator 8 | 9 | with open(config_file) as f: 10 | data = f.read() 11 | json_config = json.loads(data) 12 | h = AttrDict(json_config) 13 | 14 | generator = Generator(h) 15 | state_dict_g = torch.load(state_dict_path, map_location='cpu') 16 | generator.load_state_dict(state_dict_g['generator']) 17 | 18 | generator.eval() 19 | generator.remove_weight_norm() 20 | return generator 21 | -------------------------------------------------------------------------------- /models/diacritizers/shakkala/LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Shakkala Project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | -------------------------------------------------------------------------------- /configs/nawar_tc2.yaml: -------------------------------------------------------------------------------- 1 | 2 | # restore_model: '' 3 | restore_model: ./pretrained/tacotron2_ar_adv.pth 4 | # restore_model: ./checkpoints/exp_tc2/states.pth 5 | 6 | log_dir: logs/exp_tc2 7 | checkpoint_dir: checkpoints/exp_tc2 8 | 9 | # dataset 10 | train_wavs_path: G:/data/arabic-speech-corpus/wav_new 11 | train_labels: ./data/train_phon.txt 12 | 13 | test_wavs_path: G:/data/arabic-speech-corpus/test set/wav_new 14 | test_labels: ./data/test_phon.txt 15 | 16 | label_pattern: '"(?P.*)" "(?P.*)"' 17 | # label_pattern: (?P.*)\|(?P.*) 18 | 19 | # optimizers 20 | g_lr: 1.0e-3 # lr for AdamW optimizer (generator) 21 | g_beta1: 0.9 # beta1 for AdamW optimizer (generator) 22 | g_beta2: 0.999 # beta2 for AdamW optimizer (generator) 23 | 24 | n_save_states_iter: 10 25 | n_save_backup_iter: 1000 26 | -------------------------------------------------------------------------------- /pretrained/hifigan-asc-v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /configs/nawar_fp.yaml: -------------------------------------------------------------------------------- 1 | 2 | # restore_model: '' 3 | restore_model: ./pretrained/fastpitch_ar_adv.pth 4 | # restore_model: ./checkpoints/exp_fp/states.pth 5 | 6 | log_dir: logs/exp_fp 7 | checkpoint_dir: checkpoints/exp_fp 8 | 9 | # dataset 10 | train_wavs_path: G:/data/arabic-speech-corpus/wav_new 11 | train_labels: ./data/train_phon.txt 12 | 13 | test_wavs_path: G:/data/arabic-speech-corpus/test set/wav_new 14 | test_labels: ./data/test_phon.txt 15 | 16 | label_pattern: '"(?P.*)" "(?P.*)"' 17 | # label_pattern: (?P.*)\|(?P.*) 18 | 19 | # for fastpitch 20 | f0_dict_path: ./data/pitch_dict.pt 21 | 22 | f0_mean: 130.05478 23 | f0_std: 22.86267 24 | 25 | # batch sizes 26 | max_lengths: [1000, 1300, 1850, 30000] # 1 frame ≈ 11.6ms 27 | batch_sizes: [10, 8, 6, 4] 28 | 29 | # optimizers 30 | g_lr: 1.0e-4 # lr for AdamW optimizer (generator) 31 | g_beta1: 0.9 # beta1 for AdamW optimizer (generator) 32 | g_beta2: 0.999 # beta2 for AdamW optimizer (generator) 33 | 34 | n_save_states_iter: 100 35 | n_save_backup_iter: 1000 36 | -------------------------------------------------------------------------------- /configs/nawar_tc2_adv.yaml: 
-------------------------------------------------------------------------------- 1 | 2 | # restore_model: '' 3 | restore_model: ./pretrained/tacotron2_ar_adv.pth 4 | # restore_model: ./checkpoints/exp_tc2_adv/states.pth 5 | 6 | log_dir: logs/exp_tc2_adv 7 | checkpoint_dir: checkpoints/exp_tc2_adv 8 | 9 | # dataset 10 | train_wavs_path: G:/data/arabic-speech-corpus/wav_new 11 | train_labels: ./data/train_phon.txt 12 | 13 | test_wavs_path: G:/data/arabic-speech-corpus/test set/wav_new 14 | test_labels: ./data/test_phon.txt 15 | 16 | label_pattern: '"(?P.*)" "(?P.*)"' 17 | # label_pattern: (?P.*)\|(?P.*) 18 | 19 | # loss weights 20 | gan_loss_weight: 4. 21 | feat_loss_weight: 1. 22 | 23 | # optimizers 24 | g_lr: 1.0e-4 # lr for AdamW optimizer (generator) 25 | g_beta1: 0. # beta1 for AdamW optimizer (generator) 26 | g_beta2: 0.99 # beta2 for AdamW optimizer (generator) 27 | 28 | d_lr: 1.0e-4 # lr for AdamW optimizer (discriminator) 29 | d_beta1: 0. # beta1 for AdamW optimizer (discriminator) 30 | d_beta2: 0.99 # beta2 for AdamW optimizer (discriminator) 31 | 32 | n_save_states_iter: 10 33 | n_save_backup_iter: 1000 34 | -------------------------------------------------------------------------------- /vocoder/hifigan/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jungil Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /vocoder/vocos/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Charactr Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /models/tacotron2/loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class Tacotron2Loss(nn.Module): 6 | def __init__(self, 7 | mel_loss_scale=1.0): 8 | super().__init__() 9 | self.mel_loss_scale = mel_loss_scale 10 | 11 | def forward(self, 12 | mel_out, 13 | mel_out_postnet, 14 | mel_padded, 15 | gate_out, 16 | gate_padded): 17 | 18 | rnn_mel_loss = F.mse_loss(mel_out, mel_padded) 19 | postnet_mel_loss = F.mse_loss(mel_out_postnet, mel_padded) 20 | gate_loss = F.binary_cross_entropy_with_logits( 21 | gate_out, gate_padded) 22 | 23 | meta = { 24 | 'mel_loss_rnn': rnn_mel_loss.clone().detach(), 25 | 'mel_loss_postnet': postnet_mel_loss.clone().detach(), 26 | 'gate_loss': gate_loss.clone().detach(), 27 | } 28 | 29 | loss = self.mel_loss_scale * rnn_mel_loss \ 30 | + self.mel_loss_scale * postnet_mel_loss \ 31 | + gate_loss 32 | 33 | return loss, meta 34 | 35 | -------------------------------------------------------------------------------- /utils/plotting.py: -------------------------------------------------------------------------------- 1 | #import matplotlib 2 | # matplotlib.use("Agg") 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def get_alignment_figure(img): 7 | fig = plt.figure(figsize=(6, 4)) 8 | plt.imshow(img, aspect='auto', origin='lower', 9 | interpolation='none') 10 | plt.xlabel('Spectrogram frame') 11 | plt.ylabel('Input token') 12 | plt.colorbar() 13 | plt.tight_layout() 14 | return fig 15 | 16 | 17 | def get_spectrogram_figure(spec): 18 | fig = plt.figure(figsize=(12, 3)) 19 | plt.imshow(spec, aspect='auto', origin='lower', 20 | interpolation='none') 21 | plt.xlabel('Frame') 22 | plt.ylabel('Channel') 23 | plt.colorbar() 24 | plt.tight_layout() 25 | return fig 26 | 27 | 28 | def get_specs_figure(specs, xlabels): 29 | n = len(specs) 30 | fig, axes = plt.subplots(n, 1, figsize=(12, 3*n)) 31 | 32 | for i, ax in enumerate(axes): 33 | im = ax.imshow(specs[i], aspect='auto', origin='lower', 34 | interpolation='none') 35 | ax.set_xlabel(xlabels[i]) 36 | ax.set_ylabel('Channel') 37 | plt.colorbar(im, ax=ax) 38 | 39 | plt.tight_layout() 40 | return fig 41 | -------------------------------------------------------------------------------- /text/symbols.py: -------------------------------------------------------------------------------- 1 | 2 | PADDING_TOKEN = '_pad_' 3 | EOS_TOKEN = '_eos_' 4 | DOUBLING_TOKEN = '_dbl_' 5 | SEPARATOR_TOKEN = '_+_' 6 | 7 | EOS_TOKENS = [SEPARATOR_TOKEN, EOS_TOKEN] 8 | 9 | symbols = [ 10 | # special tokens 11 | PADDING_TOKEN, # padding 12 | EOS_TOKEN, # eos-token 13 | '_sil_', # silence 14 | DOUBLING_TOKEN, # doubling 15 | SEPARATOR_TOKEN, # word separator 16 | # consonants 17 | '<', # hamza 18 | 'b', # baa' 19 | 't', # taa' 20 | '^', # thaa' 21 | 'j', # jiim 22 | 'H', # Haa' 23 | 'x', # xaa' 24 | 'd', # daal 25 | '*', # dhaal 26 | 'r', # raa' 27 | 'z', # zaay 28 | 's', # siin 29 | '$', # shiin 30 | 'S', # Saad 31 | 'D', # Daad 32 | 'T', # Taa' 33 | 'Z', # Zhaa' 34 | 'E', # 3ayn 35 | 'g', # ghain 36 | 'f', # faa' 37 | 'q', # qaaf 38 | 'k', # kaaf 39 | 'l', # laam 40 | 'm', # miim 41 | 'n', # 
nuun 42 | 'h', # haa' 43 | 'w', # waaw 44 | 'y', # yaa' 45 | 'v', # /v/ for loanwords e.g. in u'fydyw': u'v i0 d y uu1', 46 | # vowels 47 | 'a', # short 48 | 'u', 49 | 'i', 50 | 'aa', # long 51 | 'uu', 52 | 'ii', 53 | ] 54 | -------------------------------------------------------------------------------- /models/diacritizers/shakkelha/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Neural Arabic Text Diacritization: Outperforming State of the Art Using FFNN and RNN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import uvicorn 4 | from fastapi import FastAPI 5 | from fastapi.responses import FileResponse, Response 6 | from fastapi.staticfiles import StaticFiles 7 | from pydantic import BaseModel 8 | 9 | from utils.app_utils import TTSManager 10 | 11 | app = FastAPI() 12 | 13 | use_cuda_if_available = True 14 | tts_manager = TTSManager('app/static', use_cuda_if_available=use_cuda_if_available) 15 | 16 | class TTSRequest(BaseModel): 17 | buckw: str 18 | rate: float 19 | denoise: float 20 | 21 | app.mount('/static', StaticFiles(directory='./app/static'), 'static') 22 | 23 | 24 | @app.get('/') 25 | async def main(): 26 | return FileResponse('./app/index.html') 27 | 28 | 29 | @app.get('/{filename}') 30 | async def get_file(filename: str): 31 | filepath = f'./app/{filename}' 32 | if os.path.exists(filepath): 33 | return FileResponse(filepath) 34 | return Response(status_code=404) 35 | 36 | 37 | @app.post('/api/tts') 38 | async def tts(req: TTSRequest): 39 | print(req) 40 | response_data = tts_manager.tts(req.buckw, req.rate, 41 | req.denoise) 42 | 43 | return response_data 44 | 45 | 46 | if __name__ == '__main__': 47 | uvicorn.run(app, host="127.0.0.1", port=8000) 48 | -------------------------------------------------------------------------------- /models/diacritizers/shakkala/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .symbols import input_vocab_to_int, output_int_to_vocab 3 | 4 | 5 | def combine_text_with_harakat(input_sent: str, output_sent: str): 6 | #fix combine differences 7 | input_length = len(input_sent) 8 | output_length = len(output_sent) # harakat_stack.size() 9 | for _ in 
range(0,(input_length-output_length)): 10 | output_sent.append("") 11 | 12 | #combine with text 13 | text = "" 14 | for character, haraka in zip(input_sent, output_sent): 15 | if haraka == '' or haraka == 'ـ': 16 | haraka = '' 17 | text += character + "" + haraka 18 | 19 | return text 20 | 21 | def encode(input_text: str, max_sentence: int=315): 22 | input_letters_ids = [input_vocab_to_int.get(ch, input_vocab_to_int['']) for ch in input_text] 23 | if max_sentence is not None: 24 | input_ids_pad = input_letters_ids + (max_sentence - len(input_letters_ids))*[0,] 25 | else: 26 | input_ids_pad = input_letters_ids 27 | return input_ids_pad, input_letters_ids 28 | 29 | def decode(probs, text_input: str, input_letters_ids): 30 | diacrits = [output_int_to_vocab[i] for i in torch.argmax(probs[0], dim=1).tolist()[:len(input_letters_ids)]] 31 | return combine_text_with_harakat(text_input, diacrits) 32 | -------------------------------------------------------------------------------- /configs/nawar_fp_adv.yaml: -------------------------------------------------------------------------------- 1 | 2 | # restore_model: '' 3 | restore_model: ./pretrained/fastpitch_ar_adv.pth 4 | # restore_model: ./checkpoints/exp_fp_adv/states.pth 5 | 6 | log_dir: logs/exp_fp_adv 7 | checkpoint_dir: checkpoints/exp_fp_adv 8 | 9 | # dataset 10 | train_wavs_path: G:/data/arabic-speech-corpus/wav_new 11 | train_labels: ./data/train_phon.txt 12 | 13 | test_wavs_path: G:/data/arabic-speech-corpus/test set/wav_new 14 | test_labels: ./data/test_phon.txt 15 | 16 | label_pattern: '"(?P.*)" "(?P.*)"' 17 | # label_pattern: (?P.*)\|(?P.*) 18 | 19 | # for fastpitch 20 | f0_dict_path: ./data/pitch_dict.pt 21 | 22 | f0_mean: 130.05478 23 | f0_std: 22.86267 24 | 25 | # loss weights 26 | gan_loss_weight: 3. 27 | feat_loss_weight: 1. 28 | 29 | # batch sizes 30 | max_lengths: [1000, 1300, 1850, 30000] # 1 frame ≈ 11.6ms 31 | batch_sizes: [10, 8, 6, 4] 32 | 33 | # optimizers 34 | g_lr: 1.0e-4 # lr for AdamW optimizer (generator) 35 | g_beta1: 0. # beta1 for AdamW optimizer (generator) 36 | g_beta2: 0.99 # beta2 for AdamW optimizer (generator) 37 | 38 | d_lr: 1.0e-4 # lr for AdamW optimizer (discriminator) 39 | d_beta1: 0. 
# beta1 for AdamW optimizer (discriminator) 40 | d_beta2: 0.99 # beta2 for AdamW optimizer (discriminator) 41 | 42 | n_save_states_iter: 100 43 | n_save_backup_iter: 1000 44 | -------------------------------------------------------------------------------- /scripts/preprocess_audio.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import torch 4 | import torchaudio 5 | import librosa 6 | import numpy as np 7 | from tqdm import tqdm 8 | 9 | # %% 10 | 11 | wav_path = 'G:/data/arabic-speech-corpus/wav' 12 | wav_new_path = 'G:/data/arabic-speech-corpus/wav_new' 13 | 14 | sr_target = 22050 15 | silence_audio_size = 256 * 3 16 | 17 | device = 'cuda' 18 | 19 | wav_fpaths = [f.path for f in os.scandir(wav_path) if f.path.endswith('.wav')] 20 | # waves = make_dataset_from_subdirs(wavs_path) 21 | 22 | if not os.path.exists(wav_new_path): 23 | os.makedirs(wav_new_path) 24 | print(f"Created folder @ {wav_new_path}") 25 | 26 | # %% 27 | 28 | for wav_fpath in tqdm(wav_fpaths): 29 | 30 | fname = os.path.basename(wav_fpath) 31 | 32 | fpath = os.path.join(wav_path, fname) 33 | wave, sr = torchaudio.load(fpath) 34 | 35 | if sr != sr_target: 36 | wave = wave.to(device) 37 | wave = torchaudio.functional.resample(wave, sr, sr_target, 38 | lowpass_filter_width=1024) 39 | 40 | wave_ = wave[0].cpu().numpy() 41 | wave_ = wave_ / np.abs(wave_).max() * 0.999 42 | wave_ = librosa.effects.trim( 43 | wave_, top_db=23, frame_length=1024, hop_length=256)[0] 44 | wave_ = np.append(wave_, [0.]*silence_audio_size) 45 | 46 | torchaudio.save(f'{wav_new_path}/{fname}', 47 | torch.Tensor(wave_).unsqueeze(0), sr_target) 48 | 49 | 50 | # %% 51 | -------------------------------------------------------------------------------- /models/diacritizers/shakkelha/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .symbols import (RNN_BIG_CHARACTERS_MAPPING, 3 | DIACRITICS_LIST, ARABIC_LETTERS_LIST, RNN_REV_CLASSES_MAPPING, RNN_SMALL_CHARACTERS_MAPPING) 4 | 5 | 6 | def remove_diacritics(data, DIACRITICS_LIST): 7 | return data.translate(str.maketrans('', '', ''.join(DIACRITICS_LIST))) 8 | 9 | CHARACTERS_MAPPING = RNN_BIG_CHARACTERS_MAPPING 10 | # CHARACTERS_MAPPING = RNN_SMALL_CHARACTERS_MAPPING 11 | REV_CLASSES_MAPPING = RNN_REV_CLASSES_MAPPING 12 | 13 | 14 | def encode(input_text:str): 15 | x = [CHARACTERS_MAPPING['']] 16 | for idx, char in enumerate(input_text): 17 | if char in DIACRITICS_LIST: 18 | continue 19 | if char not in CHARACTERS_MAPPING: 20 | x.append(CHARACTERS_MAPPING['']) 21 | else: 22 | x.append(CHARACTERS_MAPPING[char]) 23 | 24 | x.append(CHARACTERS_MAPPING['']) 25 | 26 | return x 27 | 28 | def decode(probs, input_text:str): 29 | probs = probs[0][1:] 30 | 31 | output = '' 32 | for char, prediction in zip(remove_diacritics(input_text, DIACRITICS_LIST), probs): 33 | output += char 34 | 35 | if char not in ARABIC_LETTERS_LIST: 36 | continue 37 | 38 | prediction = torch.argmax(prediction).item() 39 | 40 | if '<' in REV_CLASSES_MAPPING[prediction]: 41 | continue 42 | 43 | output += REV_CLASSES_MAPPING[prediction] 44 | 45 | return output -------------------------------------------------------------------------------- /app/static/mappings.js: -------------------------------------------------------------------------------- 1 | arabicToBuckw = { 2 | '\u0628': 'b', '\u0630': '*', '\u0637': 'T', '\u0645': 'm', 3 | '\u062a': 't', '\u0631': 'r', '\u0638': 'Z', '\u0646': 'n', 4 | '\u062b': '^', '\u0632': 
'z', '\u0639': 'E', '\u0647': 'h', 5 | '\u062c': 'j', '\u0633': 's', '\u063a': 'g', '\u062d': 'H', 6 | '\u0642': 'q', '\u0641': 'f', '\u062e': 'x', '\u0635': 'S', 7 | '\u0634': '$', '\u062f': 'd', '\u0636': 'D', '\u0643': 'k', 8 | '\u0623': '>', '\u0621': '\'', '\u0626': '}', '\u0624': '&', 9 | '\u0625': '<', '\u0622': '|', '\u0627': 'A', '\u0649': 'Y', 10 | '\u0629': 'p', '\u064a': 'y', '\u0644': 'l', '\u0648': 'w', 11 | '\u064b': 'F', '\u064c': 'N', '\u064d': 'K', '\u064e': 'a', 12 | '\u064f': 'u', '\u0650': 'i', '\u0651': '~', '\u0652': 'o' 13 | } 14 | 15 | buckwToArabic = { 16 | 'b': '\u0628', '*': '\u0630', 'T': '\u0637', 'm': '\u0645', 17 | 't': '\u062a', 'r': '\u0631', 'Z': '\u0638', 'n': '\u0646', 18 | '^': '\u062b', 'z': '\u0632', 'E': '\u0639', 'h': '\u0647', 19 | 'j': '\u062c', 's': '\u0633', 'g': '\u063a', 'H': '\u062d', 20 | 'q': '\u0642', 'f': '\u0641', 'x': '\u062e', 'S': '\u0635', 21 | '$': '\u0634', 'd': '\u062f', 'D': '\u0636', 'k': '\u0643', 22 | '>': '\u0623', '\'': '\u0621', '}': '\u0626', '&': '\u0624', 23 | '<': '\u0625', '|': '\u0622', 'A': '\u0627', 'Y': '\u0649', 24 | 'p': '\u0629', 'y': '\u064a', 'l': '\u0644', 'w': '\u0648', 25 | 'F': '\u064b', 'N': '\u064c', 'K': '\u064d', 'a': '\u064e', 26 | 'u': '\u064f', 'i': '\u0650', '~': '\u0651', 'o': '\u0652' 27 | } -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | from torch.utils.tensorboard import SummaryWriter 2 | from utils.plotting import get_alignment_figure, get_specs_figure 3 | 4 | 5 | class TBLogger(SummaryWriter): 6 | def __init__(self, log_dir): 7 | super(TBLogger, self).__init__(log_dir) 8 | 9 | def add_training_data(self, meta, grad_norm, 10 | learning_rate, tb_step: int): 11 | 12 | for k, v in meta.items(): 13 | self.add_scalar(f'train/{k}', v.item(), tb_step) 14 | self.add_scalar("train/grad_norm", grad_norm, tb_step) 15 | self.add_scalar("train/learning_rate", learning_rate, tb_step) 16 | 17 | def add_parameters(self, model, tb_step: int): 18 | 19 | for tag, value in model.named_parameters(): 20 | tag = tag.replace('.', '/') 21 | self.add_histogram(tag, value.data.cpu().numpy(), tb_step) 22 | 23 | def add_sample(self, alignment, mel_pred, 24 | mel_targ, mel_infer, len_targ, 25 | tb_step: int): 26 | 27 | self.add_figure( 28 | "alignment", 29 | get_alignment_figure(alignment.detach().cpu().numpy().T), 30 | tb_step) 31 | 32 | self.add_figure( 33 | "spectrograms", 34 | get_specs_figure([ 35 | mel_infer.detach().cpu().numpy(), 36 | mel_pred[:, :len_targ].detach().cpu().numpy(), 37 | mel_targ[:, :len_targ].detach().cpu().numpy(), 38 | ], 39 | ['Frames (inferred)', 'Frames (predicted)', 'Frames (target)'] 40 | ), tb_step) 41 | -------------------------------------------------------------------------------- /models/fastpitch/__init__.py: -------------------------------------------------------------------------------- 1 | from .networks import FastPitch, FastPitch2Wave 2 | 3 | net_config = {'n_mel_channels': 80, 4 | 'n_symbols': 148, 5 | 'padding_idx': 0, 6 | 'symbols_embedding_dim': 384, 7 | 'in_fft_n_layers': 6, 8 | 'in_fft_n_heads': 1, 9 | 'in_fft_d_head': 64, 10 | 'in_fft_conv1d_kernel_size': 3, 11 | 'in_fft_conv1d_filter_size': 1536, 12 | 'in_fft_output_size': 384, 13 | 'p_in_fft_dropout': 0.1, 14 | 'p_in_fft_dropatt': 0.1, 15 | 'p_in_fft_dropemb': 0.0, 16 | 'out_fft_n_layers': 6, 17 | 'out_fft_n_heads': 1, 18 | 'out_fft_d_head': 64, 19 | 'out_fft_conv1d_kernel_size': 3, 20 
| 'out_fft_conv1d_filter_size': 1536, 21 | 'out_fft_output_size': 384, 22 | 'p_out_fft_dropout': 0.1, 23 | 'p_out_fft_dropatt': 0.1, 24 | 'p_out_fft_dropemb': 0.0, 25 | 'dur_predictor_kernel_size': 3, 26 | 'dur_predictor_filter_size': 256, 27 | 'p_dur_predictor_dropout': 0.1, 28 | 'dur_predictor_n_layers': 2, 29 | 'pitch_predictor_kernel_size': 3, 30 | 'pitch_predictor_filter_size': 256, 31 | 'p_pitch_predictor_dropout': 0.1, 32 | 'pitch_predictor_n_layers': 2, 33 | 'pitch_embedding_kernel_size': 3, 34 | 'n_speakers': 1, 35 | 'speaker_emb_weight': 1.0, 36 | 'energy_predictor_kernel_size': 3, 37 | 'energy_predictor_filter_size': 256, 38 | 'p_energy_predictor_dropout': 0.1, 39 | 'energy_predictor_n_layers': 2, 40 | 'energy_conditioning': True, 41 | 'energy_embedding_kernel_size': 3} 42 | -------------------------------------------------------------------------------- /utils/audio.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from librosa.filters import mel as librosa_mel_fn 4 | 5 | 6 | class MelSpectrogram(nn.Module): 7 | def __init__(self, 8 | sample_rate: int = 22050, 9 | n_fft: int = 1024, 10 | win_length: int = 1024, 11 | hop_length: int = 256, 12 | n_mels: int = 80, 13 | f_min: float = 0, 14 | f_max: float = 8000.0, 15 | norm: str = 'slaney', 16 | center: bool = False 17 | ): 18 | super().__init__() 19 | self.sample_rate = sample_rate 20 | self.n_fft = n_fft 21 | self.hop_length = hop_length 22 | self.win_length = win_length 23 | self.center = center 24 | 25 | self.pad_length = int((n_fft - hop_length)/2) 26 | 27 | mel_basis = torch.Tensor(librosa_mel_fn(sr=sample_rate, 28 | n_fft=n_fft, n_mels=n_mels, 29 | fmin=f_min, fmax=f_max, 30 | norm=norm)) 31 | window_fn = torch.hann_window(win_length) 32 | self.register_buffer('mel_basis', mel_basis) 33 | self.register_buffer('window_fn', window_fn) 34 | 35 | def forward(self, x): 36 | x_pad = torch.nn.functional.pad( 37 | x, (self.pad_length, self.pad_length), mode='reflect') 38 | spec_lin = torch.stft(x_pad, self.n_fft, 39 | self.hop_length, 40 | self.win_length, 41 | self.window_fn, 42 | center=self.center, 43 | return_complex=True) # [B, F, T] 44 | spec_mag = spec_lin.abs().pow_(2).add_(1e-9).sqrt_() 45 | spec_mel = torch.matmul(self.mel_basis, spec_mag) # [B, mels, T] 46 | return spec_mel 47 | -------------------------------------------------------------------------------- /scripts/preprocess_text.py: -------------------------------------------------------------------------------- 1 | # This file needs to be run in the main folder 2 | # %% 3 | import text 4 | from utils import read_lines_from_file 5 | 6 | 7 | def write_lines_to_file(path, lines, mode='w', encoding='utf-8'): 8 | with open(path, mode, encoding=encoding) as f: 9 | for i, line in enumerate(lines): 10 | if i == len(lines)-1: 11 | f.write(line) 12 | break 13 | f.write(line + '\n') 14 | 15 | # %% 16 | 17 | 18 | lines = read_lines_from_file('./data/train-orthographic-transcript.txt') 19 | #lines = read_lines_from_file('./data/test-orthographic-transcript.txt') 20 | 21 | new_lines_arabic = [] 22 | new_lines_phonetic = [] 23 | new_lines_buckw = [] 24 | 25 | for line in lines: 26 | wav_name, utterance = line.split('" "') 27 | wav_name, utterance = wav_name[1:], utterance[:-1] 28 | utterance = utterance.replace("a~", "~a") \ 29 | .replace("i~", "~i") \ 30 | .replace("u~", "~u") \ 31 | .replace(" - ", " ") 32 | 33 | utterance_arab = text.buckwalter_to_arabic(utterance) 34 | utterance_phon = 
text.buckwalter_to_phonemes(utterance) 35 | 36 | line_new_ara = f'"{wav_name}" "{utterance_arab}"' 37 | new_lines_arabic.append(line_new_ara) 38 | 39 | line_new_pho = f'"{wav_name}" "{utterance_phon}"' 40 | new_lines_phonetic.append(line_new_pho) 41 | 42 | line_new_buckw = f'"{wav_name}" "{utterance}"' 43 | new_lines_buckw.append(line_new_buckw) 44 | 45 | 46 | # %% train 47 | 48 | write_lines_to_file('./data/train_arab.txt', new_lines_arabic) 49 | write_lines_to_file('./data/train_phon.txt', new_lines_phonetic) 50 | write_lines_to_file('./data/train_buckw.txt', new_lines_buckw) 51 | 52 | # %% test 53 | 54 | write_lines_to_file('./data/test_arab.txt', new_lines_arabic) 55 | write_lines_to_file('./data/test_phon.txt', new_lines_phonetic) 56 | write_lines_to_file('./data/test_buckw.txt', new_lines_buckw) 57 | -------------------------------------------------------------------------------- /vocoder/vocos/__init__.py: -------------------------------------------------------------------------------- 1 | # from vocos.pretrained import Vocos 2 | 3 | 4 | __version__ = "0.1.0" 5 | 6 | 7 | 8 | config_24k = {'sample_rate': 24000, 9 | 'initial_learning_rate': '5e-4', 10 | 'mel_loss_coeff': 45, 11 | 'mrd_loss_coeff': 0.1, 12 | 'num_warmup_steps': 0, 13 | 'pretrain_mel_steps': 0, 14 | 'evaluate_utmos': True, 15 | 'evaluate_pesq': True, 16 | 'evaluate_periodicty': True, 17 | 'feature_extractor': {'class_path': 'vocos.feature_extractors.MelSpectrogramFeatures', 18 | 'init_args': {'sample_rate': 24000, 19 | 'n_fft': 1024, 20 | 'hop_length': 256, 21 | 'n_mels': 100, 22 | 'padding': 'center'}}, 23 | 'backbone': {'class_path': 'vocos.models.VocosBackbone', 24 | 'init_args': {'input_channels': 100, 25 | 'dim': 512, 26 | 'intermediate_dim': 1536, 27 | 'num_layers': 8}}, 28 | 'head': {'class_path': 'vocos.heads.ISTFTHead', 29 | 'init_args': {'dim': 512, 30 | 'n_fft': 1024, 31 | 'hop_length': 256, 32 | 'padding': 'center'}}} 33 | 34 | 35 | config_22k = {'sample_rate': 22050, 36 | 'initial_learning_rate': '5e-4', 37 | 'mel_loss_coeff': 45, 38 | 'mrd_loss_coeff': 0.1, 39 | 'num_warmup_steps': 0, 40 | 'pretrain_mel_steps': 0, 41 | 'evaluate_utmos': True, 42 | 'evaluate_pesq': True, 43 | 'evaluate_periodicty': True, 44 | 'feature_extractor': {'class_path': 'vocos.feature_extractors.MelSpectrogramFeatures', 45 | 'init_args': { 46 | 'sample_rate': 24000, 47 | 'n_fft': 1024, 48 | 'hop_length': 256, 49 | 'n_mels': 80, 50 | 51 | 'padding': 'same', 52 | 'f_min': 0, 53 | 'f_max': 8000, 54 | 'norm': "slaney", 55 | 'mel_scale': "slaney", 56 | 57 | }}, 58 | 'backbone': {'class_path': 'vocos.models.VocosBackbone', 59 | 'init_args': {'input_channels': 80, 60 | 'dim': 512, 61 | 'intermediate_dim': 1536, 62 | 'num_layers': 8}}, 63 | 'head': {'class_path': 'vocos.heads.ISTFTHead', 64 | 'init_args': {'dim': 512, 65 | 'n_fft': 1024, 66 | 'hop_length': 256, 67 | 'padding': 'same'}}} 68 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import yaml 3 | try: 4 | from yaml import CLoader as Loader 5 | except ImportError: 6 | from yaml import Loader 7 | 8 | 9 | class DictConfig(object): 10 | """Creates a Config object from a dict 11 | such that object attributes correspond to dict keys. 
12 | """ 13 | 14 | def __init__(self, config_dict): 15 | self.__dict__.update(config_dict) 16 | 17 | def __str__(self): 18 | return '\n'.join(f"{key}: {val}" for key, val in self.__dict__.items()) 19 | 20 | def __repr__(self): 21 | return self.__str__() 22 | 23 | 24 | def get_custom_config(fname): 25 | with open(fname, 'r') as stream: 26 | config_dict = yaml.load(stream, Loader) 27 | config = DictConfig(config_dict) 28 | return config 29 | 30 | 31 | def get_basic_config(): 32 | return get_custom_config('configs/basic.yaml') 33 | 34 | 35 | def get_config(fname): 36 | config = get_basic_config() 37 | custom_config = get_custom_config(fname) 38 | 39 | config.__dict__.update(custom_config.__dict__) 40 | return config 41 | 42 | 43 | def read_lines_from_file(path, encoding='utf-8'): 44 | lines = [] 45 | with open(path, 'r', encoding=encoding) as f: 46 | for line in f: 47 | lines.append(line.strip()) 48 | return lines 49 | 50 | def write_lines_to_file(path, lines, mode='w', encoding='utf-8'): 51 | with open(path, mode, encoding=encoding) as f: 52 | for i, line in enumerate(lines): 53 | if i == len(lines)-1: 54 | f.write(line) 55 | break 56 | f.write(line + '\n') 57 | 58 | 59 | def progbar(iterable, length=30, symbol='='): 60 | """Wrapper generator function for an iterable. 61 | Prints a progressbar when yielding an item. \\ 62 | Args: 63 | iterable: an object supporting iteration 64 | length: length of the progressbar 65 | """ 66 | n = len(iterable) 67 | for i, item in enumerate(iterable): 68 | steps = length*(i+1) // n 69 | sys.stdout.write('\r') 70 | sys.stdout.write(f"[{symbol*steps:{length}}] {(100/n*(i+1)):.1f}%") 71 | if i == (n-1): 72 | sys.stdout.write('\n') 73 | sys.stdout.flush() 74 | yield item 75 | -------------------------------------------------------------------------------- /text/__init__.py: -------------------------------------------------------------------------------- 1 | from text.symbols import symbols, DOUBLING_TOKEN, EOS_TOKEN, SEPARATOR_TOKEN 2 | from text.phonetise_buckwalter import ( 3 | arabic_to_buckwalter, 4 | buckwalter_to_arabic, 5 | process_utterance 6 | ) 7 | 8 | vowels = ['aa', 'AA', 'uu0', 'uu1', 'UU0', 'UU1', 'ii0', 'ii1', 9 | 'II0', 'II1', 'a', 'A', 'u0', 'u1', 'U0', 'U1', 'i0', 'i1', 10 | 'I0', 'I1'] 11 | 12 | vowel_map = { 13 | 'aa': 'aa', 'AA': 'aa', 14 | 'uu0': 'uu', 'uu1': 'uu', 'UU0': 'uu', 'UU1': 'uu', 15 | 'ii0': 'ii', 'ii1': 'ii', 'II0': 'ii', 'II1': 'ii', 16 | 'a': 'a', 'A': 'a', 17 | 'u0': 'u', 'u1': 'u', 'U0': 'u', 'U1': 'u', 18 | 'i0': 'i', 'i1': 'i', 'I0': 'i', 'I1': 'i' 19 | } 20 | 21 | phon_to_id_ = {phon: i for i, phon in enumerate(symbols)} 22 | 23 | 24 | def tokens_to_ids(phonemes, phon_to_id=None): 25 | if phon_to_id is None: 26 | return [phon_to_id_[phon] for phon in phonemes] 27 | return [phon_to_id[phon] for phon in phonemes] 28 | 29 | 30 | def ids_to_tokens(ids): 31 | return [symbols[id] for id in ids] 32 | 33 | 34 | def arabic_to_phonemes(arabic): 35 | buckw = arabic_to_buckwalter(arabic) 36 | return process_utterance(buckw) 37 | 38 | 39 | def buckwalter_to_phonemes(buckw): 40 | return process_utterance(buckw) 41 | 42 | 43 | def phonemes_to_tokens(phonemes: str, append_space=True): 44 | phonemes = phonemes \ 45 | .replace("sil", "") \ 46 | .replace("+", "_+_") \ 47 | .split() 48 | for i, phon in enumerate(phonemes): 49 | if len(phon) == 2 and phon not in vowels and phon[0] == phon[1]: 50 | phonemes[i] = phon[0] 51 | phonemes.insert(i+1, DOUBLING_TOKEN) 52 | if phonemes[i] in vowels: 53 | phonemes[i] = vowel_map[phonemes[i]] 54 | 55 | 
if append_space: 56 | phonemes.append(SEPARATOR_TOKEN) 57 | 58 | phonemes.append(EOS_TOKEN) 59 | 60 | return phonemes 61 | 62 | 63 | def buckwalter_to_tokens(buckw, append_space=True): 64 | phonemes = buckwalter_to_phonemes(buckw) 65 | tokens = phonemes_to_tokens(phonemes, append_space=append_space) 66 | return tokens 67 | 68 | 69 | def arabic_to_tokens(arabic, append_space=True): 70 | buckw = arabic_to_buckwalter(arabic) 71 | tokens = buckwalter_to_tokens(buckw, append_space=append_space) 72 | return tokens 73 | 74 | 75 | def simplify_phonemes(phonemes): 76 | for k, v in vowel_map.items(): 77 | phonemes = phonemes.replace(k, v) 78 | return phonemes 79 | -------------------------------------------------------------------------------- /scripts/extract_f0.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import os 3 | import torch 4 | import librosa 5 | import numpy as np 6 | import torch.nn.functional as F 7 | 8 | from tqdm import tqdm 9 | from utils.audio import MelSpectrogram 10 | from utils import write_lines_to_file 11 | 12 | # %% CONFIG 13 | 14 | wavs_path = 'G:/data/arabic-speech-corpus/wav_new' 15 | 16 | waves = [f.path for f in os.scandir(wavs_path) if f.path.endswith('.wav')] 17 | print(f"{len(waves)} wave files found at {wavs_path}") 18 | 19 | mel_trf = MelSpectrogram() 20 | 21 | # %% extract pitch (f0) values 22 | 23 | pitch_dict = {} 24 | 25 | for i, wav_path in tqdm(enumerate(waves), total=len(waves)): 26 | wav, sr = librosa.load(wav_path, sr=mel_trf.sample_rate) 27 | 28 | wav_name = os.path.basename(wav_path) 29 | if wav_name in pitch_dict: 30 | continue 31 | mel_spec = mel_trf(torch.tensor(wav)[None])[0] # [mel_bands, T] 32 | 33 | # estimate pitch 34 | pitch_mel, voiced_flag, voiced_probs = librosa.pyin( 35 | wav, sr=mel_trf.sample_rate, 36 | fmin=librosa.note_to_hz('C2'), 37 | fmax=librosa.note_to_hz('C7'), 38 | frame_length=mel_trf.win_length, 39 | hop_length=mel_trf.hop_length) 40 | 41 | pitch_mel = np.where(np.isnan(pitch_mel), 0., pitch_mel) # set nan to zero 42 | pitch_mel = torch.from_numpy(pitch_mel) 43 | pitch_mel = F.pad(pitch_mel, (0, mel_spec.size(1) - pitch_mel.size(0))) # pad to mel length 44 | 45 | pitch_dict[wav_name] = pitch_mel 46 | 47 | if i % 10 == 0: # save intermediate dict 48 | torch.save(pitch_dict, './data/pitch_dict.pt') 49 | 50 | torch.save(pitch_dict, './data/pitch_dict.pt') 51 | 52 | 53 | # %% calculate pitch mean and std 54 | 55 | pitch_dict = torch.load('./data/pitch_dict.pt') 56 | 57 | rmean = 0 58 | rvar = 0 59 | ndata = 0 60 | 61 | for pitch_mel in pitch_dict.values(): 62 | pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel) 63 | 64 | pitch_mel_ = pitch_mel[pitch_mel > 1] 65 | p_mean = np.mean(pitch_mel_) 66 | p_var = np.var(pitch_mel_) 67 | p_len = len(pitch_mel_) 68 | 69 | rvar = ((ndata-1)*rvar + (p_len-1)*p_var) / (ndata + p_len - 1) + \ 70 | ndata*p_len*(p_mean - rmean)**2 / ((ndata + p_len)*(ndata + p_len - 1)) 71 | 72 | rmean = (p_len*p_mean + ndata*rmean) / (p_len + ndata) 73 | 74 | ndata += p_len 75 | 76 | mean, std = rmean, np.sqrt(rvar) 77 | print('mean ', mean) 78 | print('std ', std) 79 | 80 | write_lines_to_file(path='./data/mean_std.txt', 81 | lines=[f"mean: {mean}", 82 | f"std: {std}"]) 83 | -------------------------------------------------------------------------------- /download_files.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import pathlib 3 | import gdown 4 | 5 | # %% 6 | 7 | FILES_DICT = { 8 | # TACOTRON 9 | 
"tacotron2_ar_mse.pth": { 10 | "path": "pretrained/tacotron2_ar_mse.pth", 11 | "url": "https://drive.google.com/file/d/1GCu-ZAcfJuT5qfzlKItcNqtuVNa7CNy9/view?usp=sharing", 12 | "download": True, 13 | }, 14 | "tacotron2_ar_adv.pth": { 15 | "path": "pretrained/tacotron2_ar_adv.pth", 16 | "url": "https://drive.google.com/file/d/1FusCFZIXSVCQ9Q6PLb91GIkEnhn_zWRS/view?usp=sharing", 17 | "download": True, 18 | }, 19 | # FASTPITCH 20 | "fastpitch_ar_mse.pth": { 21 | "path": "pretrained/fastpitch_ar_mse.pth", 22 | "url": "https://drive.google.com/file/d/1sliRc62wjPTnPWBVQ95NDUgnCSH5E8M0/view?usp=sharing", 23 | "download": True, 24 | }, 25 | "fastpitch_ar_adv.pth": { 26 | "path": "pretrained/fastpitch_ar_adv.pth", 27 | "url": "https://drive.google.com/file/d/1-vZOhi9To_78-yRslC6sFLJBUjwgJT-D/view?usp=sharing", 28 | "download": True, 29 | }, 30 | "fastpitch_ar_ms.pth": { 31 | "path": "pretrained/fastpitch_ar_ms.pth", 32 | "url": "https://drive.google.com/file/d/18IYUSRXvLErVjaDORj_TKzUxs90l61Ja/view?usp=sharing", 33 | "download": True, 34 | }, 35 | # HIFIGAN 36 | "hifigan-asc.pth": { 37 | "path": "pretrained/hifigan-asc-v1/hifigan-asc.pth", 38 | "url": "https://drive.google.com/file/d/1zSYYnJFS-gQox-IeI71hVY-fdPysxuFK/view?usp=sharing", 39 | "download": True, 40 | }, 41 | # DIACRITIZERS 42 | "shakkelha_rnn_3_big_20.pth": { 43 | "path": "pretrained/diacritizers/shakkelha_rnn_3_big_20.pth", 44 | "url": "https://drive.google.com/file/d/1CbDjbuBr-798x88vjLGtMPSB2Y1KwD68/view?usp=sharing", 45 | "download": True, 46 | }, 47 | "shakkala_second_model6.pth": { 48 | "path": "pretrained/diacritizers/shakkala_second_model6.pth", 49 | "url": "https://drive.google.com/file/d/1hgMGqXLTc58Gq_bN7WpuBWscBxX-rXXd/view?usp=sharing", 50 | "download": True, 51 | }, 52 | 53 | } 54 | 55 | # %% 56 | 57 | root_dir = pathlib.Path(__file__).parent 58 | 59 | for file_dict in FILES_DICT.values(): 60 | file_path = root_dir.joinpath(file_dict['path']) 61 | 62 | if not file_path.parent.exists(): 63 | file_path.parent.mkdir(parents=True, exist_ok=True) 64 | 65 | if file_path.exists(): 66 | print(file_dict['path'], "already exists!") 67 | elif file_dict.get('download', True): 68 | print("Downloading ", file_dict['path'], "...") 69 | output_filepath = gdown.download(file_dict['url'], output=file_path.as_posix(), fuzzy=True) 70 | -------------------------------------------------------------------------------- /models/diacritizers/shakkelha/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from . 
import encode, decode 6 | from typing import Union, List 7 | 8 | 9 | class Shakkelha(nn.Module): 10 | def __init__(self, 11 | dim_input: int=91, 12 | dim_output: int=19, 13 | sd_path: str=None): 14 | super().__init__() 15 | self.emb0 = nn.Embedding(dim_input, 25) 16 | 17 | self.lstm0 = nn.LSTM(25, 256, batch_first=True, bidirectional=True) 18 | self.lstm1 = nn.LSTM(512, 256, batch_first=True, bidirectional=True) 19 | 20 | self.dropout = nn.Dropout(p=0.5) 21 | 22 | self.dense0 = nn.Linear(512, 512) 23 | self.dense1 = nn.Linear(512, 512) 24 | self.dense2 = nn.Linear(512, dim_output) 25 | 26 | self.eval() 27 | 28 | if sd_path is not None: 29 | self.load_state_dict(torch.load(sd_path)) 30 | 31 | def forward(self, x: torch.Tensor): 32 | x = self.emb0(x) 33 | 34 | x, _ = self.lstm0(x) 35 | x = self.dropout(x) 36 | x, _ = self.lstm1(x) 37 | x = self.dropout(x) 38 | 39 | x = F.relu(self.dense0(x)) 40 | x = F.relu(self.dense1(x)) 41 | x = F.softmax(self.dense2(x), dim=-1) 42 | 43 | return x 44 | 45 | @torch.inference_mode() 46 | def infer(self, x: torch.Tensor): 47 | return self.forward(x) 48 | 49 | def _predict_list(self, input_list: List[str], return_probs: bool=False): 50 | output_list = [] 51 | probs_list = [] 52 | for input_text in input_list: 53 | if return_probs: 54 | output_text, probs = self._predict_single(input_text, return_probs=True) 55 | output_list.append(output_text) 56 | probs_list.append(probs) 57 | else: 58 | output_list.append(self._predict_single(input_text)) 59 | 60 | if return_probs: 61 | return output_list, probs_list 62 | 63 | return output_list 64 | 65 | def _predict_single(self, input_text: str, return_probs: bool=False): 66 | ids = encode(input_text) 67 | input = torch.LongTensor(ids)[None].to(self.emb0.weight.device) 68 | probs = self.infer(input).cpu() 69 | output = decode(probs, input_text) 70 | 71 | if return_probs: 72 | return output, probs 73 | 74 | return output 75 | 76 | def predict(self, input: Union[str, List[str]], return_probs: bool=False): 77 | if isinstance(input, str): 78 | return self._predict_single(input, return_probs=return_probs) 79 | 80 | return self._predict_list(input, return_probs=return_probs) -------------------------------------------------------------------------------- /utils/app_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchaudio 4 | 5 | import text 6 | 7 | from . 
import get_custom_config, get_config 8 | from vocoder import load_hifigan 9 | from vocoder.hifigan.denoiser import Denoiser 10 | 11 | config = get_config('./configs/basic.yaml') 12 | models_config = get_custom_config('./app/models.yaml') 13 | 14 | def load_models(): 15 | models = [] 16 | for model_name, model_dict in models_config.__dict__.items(): 17 | sd_path = model_dict['path'] 18 | if not os.path.exists(sd_path): 19 | print(f"No model @ {sd_path}") 20 | continue 21 | 22 | if model_dict['type'] == 'tacotron2': 23 | from models.tacotron2 import Tacotron2 24 | model = Tacotron2(sd_path) 25 | elif model_dict['type'] == 'fastpitch': 26 | from models.fastpitch import FastPitch 27 | model = FastPitch(sd_path) 28 | else: 29 | print(f"Model type: {model_dict['type']} not supported") 30 | continue 31 | 32 | models.append((model_name, model)) 33 | 34 | return models 35 | 36 | class TTSManager: 37 | def __init__(self, out_dir, 38 | use_cuda_if_available = True, 39 | sample_rate = 22_050): 40 | 41 | if not os.path.exists(out_dir): 42 | os.makedirs(out_dir) 43 | print(f"Created folder: {out_dir}") 44 | 45 | device = torch.device( 46 | 'cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu') 47 | 48 | self.vocoder = load_hifigan(config.vocoder_state_path, 49 | config.vocoder_config_path) 50 | self.denoiser = Denoiser(self.vocoder, mode='zeros') 51 | self.vocoder.to(device) 52 | self.denoiser.to(device) 53 | 54 | self.sample_rate = sample_rate 55 | self.models = load_models() 56 | self.out_dir = out_dir 57 | self.device = device 58 | 59 | @torch.inference_mode() 60 | def tts(self, text_buckw, speed=1, denoise=0.01): 61 | 62 | response_data = [] 63 | 64 | for i, (model_name, model) in enumerate(self.models): 65 | model.to(self.device) 66 | mel_spec = model.ttmel(text_buckw, speed=speed) 67 | wave = self.vocoder(mel_spec) 68 | wave_den = self.denoiser(wave, denoise) 69 | 70 | wave_den /= wave_den.abs().max() 71 | wave_den *= 0.99 72 | 73 | torchaudio.save(f'./app/static/wave{i}.wav', 74 | wave_den.cpu(), self.sample_rate) 75 | 76 | response_data.append({ 77 | 'name': model_name, 78 | 'phon': '', 79 | 'id': i, 80 | }) 81 | model.cpu() 82 | 83 | 84 | return response_data 85 | -------------------------------------------------------------------------------- /models/fastpitch/fastpitch/attn_loss_function.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | 20 | class AttentionCTCLoss(torch.nn.Module): 21 | def __init__(self, blank_logprob=-1): 22 | super(AttentionCTCLoss, self).__init__() 23 | self.log_softmax = torch.nn.LogSoftmax(dim=-1) 24 | self.blank_logprob = blank_logprob 25 | self.CTCLoss = nn.CTCLoss(zero_infinity=True) 26 | 27 | def forward(self, attn_logprob, in_lens, out_lens): 28 | key_lens = in_lens 29 | query_lens = out_lens 30 | max_key_len = attn_logprob.size(-1) 31 | 32 | # Reorder input to [query_len, batch_size, key_len] 33 | attn_logprob = attn_logprob.squeeze(1) 34 | attn_logprob = attn_logprob.permute(1, 0, 2) 35 | 36 | # Add blank label 37 | attn_logprob = F.pad( 38 | input=attn_logprob, 39 | pad=(1, 0, 0, 0, 0, 0), 40 | value=self.blank_logprob) 41 | 42 | # Convert to log probabilities 43 | # Note: Mask out probs beyond key_len 44 | key_inds = torch.arange( 45 | max_key_len+1, 46 | device=attn_logprob.device, 47 | dtype=torch.long) 48 | attn_logprob.masked_fill_( 49 | key_inds.view(1,1,-1) > key_lens.view(1,-1,1), # key_inds >= key_lens+1 50 | -float("inf")) 51 | attn_logprob = self.log_softmax(attn_logprob) 52 | 53 | # Target sequences 54 | target_seqs = key_inds[1:].unsqueeze(0) 55 | target_seqs = target_seqs.repeat(key_lens.numel(), 1) 56 | 57 | # Evaluate CTC loss 58 | cost = self.CTCLoss( 59 | attn_logprob, target_seqs, 60 | input_lengths=query_lens, target_lengths=key_lens) 61 | return cost 62 | 63 | 64 | class AttentionBinarizationLoss(torch.nn.Module): 65 | def __init__(self): 66 | super(AttentionBinarizationLoss, self).__init__() 67 | 68 | def forward(self, hard_attention, soft_attention, eps=1e-12): 69 | log_sum = torch.log(torch.clamp(soft_attention[hard_attention == 1], 70 | min=eps)).sum() 71 | return -log_sum / hard_attention.sum() 72 | -------------------------------------------------------------------------------- /models/diacritizers/shakkala/network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .lstm_hsm import LSTMHardSigmoid 4 | from . 
import encode, decode 5 | from typing import Union, List 6 | 7 | 8 | class Shakkala(nn.Module): 9 | def __init__(self, 10 | dim_input: int=149, 11 | dim_output: int=28, 12 | sd_path: str=None): 13 | super().__init__() 14 | self.emb_input = nn.Embedding(dim_input, 288) 15 | 16 | self.lstm0 = LSTMHardSigmoid(288, hidden_size=288, bidirectional=True, batch_first=True) 17 | self.bn0 = nn.BatchNorm1d(576, momentum=0.01, eps=0.001) 18 | self.lstm1 = LSTMHardSigmoid(576, hidden_size=144, bidirectional=True, batch_first=True) 19 | self.lstm2 = LSTMHardSigmoid(288, hidden_size=96, bidirectional=True, batch_first=True) 20 | 21 | self.dense0 = nn.Linear(192, dim_output) 22 | 23 | self.eval() 24 | self.max_sentence = None 25 | 26 | if sd_path is not None: 27 | self.load_state_dict(torch.load(sd_path)) 28 | 29 | def forward(self, x: torch.Tensor): 30 | x = self.emb_input(x) 31 | 32 | x, _ = self.lstm0(x) 33 | x = self.bn0(x.transpose(1,2)).transpose(1,2) 34 | x, _ = self.lstm1(x) 35 | x, _ = self.lstm2(x) 36 | 37 | x = self.dense0(x) 38 | x = nn.Softmax(dim=-1)(x) 39 | 40 | return x 41 | 42 | @torch.inference_mode() 43 | def infer(self, x: torch.Tensor): 44 | return self.forward(x) 45 | 46 | def _predict_list(self, input_list: List[str], return_probs: bool=False): 47 | output_list = [] 48 | probs_list = [] 49 | for input_text in input_list: 50 | if return_probs: 51 | output_text, probs = self._predict_single(input_text, return_probs=True) 52 | output_list.append(output_text) 53 | probs_list.append(probs) 54 | else: 55 | output_list.append(self._predict_single(input_text)) 56 | 57 | if return_probs: 58 | return output_list, probs_list 59 | 60 | return output_list 61 | 62 | def _predict_single(self, input_text: str, return_probs: bool=False): 63 | input_ids_pad, input_letters_ids = encode(input_text, self.max_sentence) 64 | input = torch.LongTensor(input_ids_pad)[None].to(self.emb_input.weight.device) 65 | probs = self.infer(input).cpu() 66 | output = decode(probs, input_text, input_letters_ids) 67 | 68 | if return_probs: 69 | return output, probs 70 | 71 | return output 72 | 73 | def predict(self, input: Union[str, List[str]], return_probs: bool=False): 74 | if isinstance(input, str): 75 | return self._predict_single(input, return_probs=return_probs) 76 | 77 | return self._predict_list(input, return_probs=return_probs) 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # This repo 2 | *.pth 3 | *.pt 4 | *.wav 5 | tmp/ 6 | checkpoints/ 7 | logs/ 8 | tb_logs/ 9 | node_modules/ 10 | # config.json 11 | g_02500000 12 | old/ 13 | samples/ 14 | exp/ 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | cover/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | .pybuilder/ 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | # For a library or package, you might want to ignore these files since the code is 102 | # intended to run in multiple environments; otherwise, check them in: 103 | # .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | -------------------------------------------------------------------------------- /models/fastpitch/fastpitch/alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
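# Usage sketch (illustrative only; tensor shapes and lengths below are assumptions,
# not code from this repository): the attention losses in attn_loss_function.py above
# and the monotonic alignment search defined below are typically combined as follows,
# i.e. a CTC loss on the soft attention, hard alignments from MAS, and a binarization
# loss pulling the soft attention toward the hard one (cf. scripts/train_fp.py).
#
#   import torch
#   from models.fastpitch.fastpitch.attn_loss_function import (
#       AttentionCTCLoss, AttentionBinarizationLoss)
#   from models.fastpitch.fastpitch.alignment import b_mas
#
#   B, mel_len, text_len = 2, 120, 40
#   attn_soft = torch.softmax(torch.randn(B, 1, mel_len, text_len), dim=-1)
#   attn_logprob = torch.log(attn_soft)          # [B, 1, mel_len, text_len]
#   in_lens = torch.tensor([40, 35])             # text lengths
#   out_lens = torch.tensor([120, 100])          # mel lengths
#
#   ctc_loss = AttentionCTCLoss()(attn_logprob, in_lens, out_lens)
#   attn_hard = torch.from_numpy(
#       b_mas(attn_logprob.numpy(), in_lens.numpy(), out_lens.numpy(), width=1))
#   bin_loss = AttentionBinarizationLoss()(attn_hard, attn_soft)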
14 | 15 | import numpy as np 16 | from numba import jit, prange 17 | 18 | 19 | @jit(nopython=True) 20 | def mas(log_attn_map, width=1): 21 | # assumes mel x text 22 | opt = np.zeros_like(log_attn_map) 23 | log_attn_map = log_attn_map.copy() 24 | log_attn_map[0, 1:] = -np.inf 25 | log_p = np.zeros_like(log_attn_map) 26 | log_p[0, :] = log_attn_map[0, :] 27 | prev_ind = np.zeros_like(log_attn_map, dtype=np.int64) 28 | for i in range(1, log_attn_map.shape[0]): 29 | for j in range(log_attn_map.shape[1]): # for each text dim 30 | prev_j = np.arange(max(0, j-width), j+1) 31 | prev_log = np.array([log_p[i-1, prev_idx] for prev_idx in prev_j]) 32 | 33 | ind = np.argmax(prev_log) 34 | log_p[i, j] = log_attn_map[i, j] + prev_log[ind] 35 | prev_ind[i, j] = prev_j[ind] 36 | 37 | # now backtrack 38 | curr_text_idx = log_attn_map.shape[1]-1 39 | for i in range(log_attn_map.shape[0]-1, -1, -1): 40 | opt[i, curr_text_idx] = 1 41 | curr_text_idx = prev_ind[i, curr_text_idx] 42 | opt[0, curr_text_idx] = 1 43 | return opt 44 | 45 | 46 | @jit(nopython=True) 47 | def mas_width1(log_attn_map): 48 | """mas with hardcoded width=1""" 49 | # assumes mel x text 50 | neg_inf = log_attn_map.dtype.type(-np.inf) 51 | log_p = log_attn_map.copy() 52 | log_p[0, 1:] = neg_inf 53 | for i in range(1, log_p.shape[0]): 54 | prev_log1 = neg_inf 55 | for j in range(log_p.shape[1]): 56 | prev_log2 = log_p[i-1, j] 57 | log_p[i, j] += max(prev_log1, prev_log2) 58 | prev_log1 = prev_log2 59 | 60 | # now backtrack 61 | opt = np.zeros_like(log_p) 62 | one = opt.dtype.type(1) 63 | j = log_p.shape[1]-1 64 | for i in range(log_p.shape[0]-1, 0, -1): 65 | opt[i, j] = one 66 | if log_p[i-1, j-1] >= log_p[i-1, j]: 67 | j -= 1 68 | if j == 0: 69 | opt[1:i, j] = one 70 | break 71 | opt[0, j] = one 72 | return opt 73 | 74 | 75 | @jit(nopython=True, parallel=True) 76 | def b_mas(b_log_attn_map, in_lens, out_lens, width=1): 77 | assert width == 1 78 | attn_out = np.zeros_like(b_log_attn_map) 79 | 80 | for b in prange(b_log_attn_map.shape[0]): 81 | out = mas_width1(b_log_attn_map[b, 0, :out_lens[b], :in_lens[b]]) 82 | attn_out[b, 0, :out_lens[b], :in_lens[b]] = out 83 | return attn_out 84 | -------------------------------------------------------------------------------- /utils/training.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def save_states(fname, 7 | model, 8 | optimizer, 9 | n_iter, epoch, 10 | net_config, config): 11 | torch.save({'model': model.state_dict(), 12 | 'optim': optimizer.state_dict(), 13 | 'epoch': epoch, 14 | 'iter': n_iter, 15 | 'config': net_config, 16 | }, 17 | f'{config.checkpoint_dir}/{fname}') 18 | 19 | def save_states_gan(fname, 20 | model, model_d, 21 | optimizer, optimizer_d, 22 | n_iter, epoch, 23 | net_config, config): 24 | torch.save({'model': model.state_dict(), 25 | 'model_d': model_d.state_dict(), 26 | 'optim': optimizer.state_dict(), 27 | 'optim_d': optimizer_d.state_dict(), 28 | 'epoch': epoch, 'iter': n_iter, 29 | 'config': net_config, 30 | }, 31 | f'{config.checkpoint_dir}/{fname}') 32 | 33 | 34 | def batch_to_device(batch, device): 35 | text_padded, input_lengths, mel_padded, gate_padded, \ 36 | output_lengths = batch 37 | 38 | text_padded = text_padded.to(device, non_blocking=True) 39 | input_lengths = input_lengths.to(device, non_blocking=True) 40 | mel_padded = mel_padded.to(device, non_blocking=True) 41 | gate_padded = gate_padded.to(device, non_blocking=True) 42 | output_lengths = 
output_lengths.to(device, non_blocking=True) 43 | 44 | return (text_padded, input_lengths, mel_padded, gate_padded, 45 | output_lengths) 46 | 47 | 48 | @torch.inference_mode() 49 | def validate(model, test_loader, writer, device, n_iter): 50 | loss_sum = 0 51 | n_test_sum = 0 52 | 53 | model.eval() 54 | 55 | for batch in test_loader: 56 | text_padded, input_lengths, mel_padded, gate_padded, \ 57 | output_lengths = batch_to_device(batch, device) 58 | 59 | y_pred = model(text_padded, input_lengths, 60 | mel_padded, output_lengths) 61 | mel_out, mel_out_postnet, gate_pred, alignments = y_pred 62 | 63 | mel_loss = F.mse_loss(mel_out, mel_padded) + \ 64 | F.mse_loss(mel_out_postnet, mel_padded) 65 | gate_loss = F.binary_cross_entropy_with_logits(gate_pred, gate_padded) 66 | loss = mel_loss + gate_loss 67 | 68 | loss_sum += mel_padded.size(0)*loss.item() 69 | n_test_sum += mel_padded.size(0) 70 | 71 | val_loss = loss_sum / n_test_sum 72 | 73 | idx = random.randint(0, mel_padded.size(0) - 1) 74 | mel_infer, *_ = model.infer( 75 | text_padded[idx:idx+1], input_lengths[idx:idx+1]) 76 | 77 | writer.add_sample( 78 | alignments[idx, :, :input_lengths[idx].item()], 79 | mel_out[idx], mel_padded[idx], mel_infer[0], 80 | output_lengths[idx], n_iter) 81 | 82 | writer.add_scalar('loss/val_loss', val_loss, n_iter) 83 | 84 | model.train() 85 | 86 | return val_loss 87 | 88 | -------------------------------------------------------------------------------- /vocoder/hifigan/denoiser.py: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/rishikksh20/HiFi-GAN/blob/main/denoiser.py 2 | 3 | # MIT License 4 | 5 | # Copyright (c) 2020 Rishikesh 6 | 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 
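# Usage sketch (illustrative only; the checkpoint paths follow the defaults in
# configs/basic.yaml and the mel shape is an assumption): the Denoiser defined below
# subtracts the vocoder's bias spectrum, estimated from an all-zeros mel input
# (the default mode='zeros'), from generated audio.
#
#   import torch
#   from vocoder import load_hifigan
#   from vocoder.hifigan.denoiser import Denoiser
#
#   hifigan = load_hifigan('pretrained/hifigan-asc-v1/hifigan-asc.pth',
#                          'pretrained/hifigan-asc-v1/config.json')
#   denoiser = Denoiser(hifigan)
#
#   mel = torch.randn(1, 80, 200)                # [B, n_mels, frames]
#   with torch.inference_mode():
#       wave = hifigan(mel)                      # raw audio from the vocoder
#       wave = denoiser(wave, strength=0.005)    # subtract the scaled bias spectrum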
24 | 25 | import torch 26 | import torch.nn as nn 27 | import torchaudio 28 | 29 | class Denoiser(nn.Module): 30 | """ Removes model bias from audio produced with hifigan """ 31 | 32 | def __init__(self, hifigan, filter_length=1024, n_overlap=4, 33 | win_length=1024, mode='zeros', **infer_kw): 34 | super().__init__() 35 | 36 | w = next(p for name, p in hifigan.named_parameters() 37 | if name.endswith('.weight')) 38 | 39 | # self.stft = STFT(filter_length=filter_length, 40 | # hop_length=int(filter_length/n_overlap), 41 | # win_length=win_length).to(w.device) 42 | 43 | self.stft = torchaudio.transforms.Spectrogram(filter_length, 44 | hop_length=int(filter_length/n_overlap), 45 | win_length=win_length, power=None).to(w.device) 46 | self.istft = torchaudio.transforms.InverseSpectrogram(filter_length, 47 | hop_length=int(filter_length/n_overlap), 48 | win_length=win_length).to(w.device) 49 | 50 | mel_init = {'zeros': torch.zeros, 'normal': torch.randn}[mode] 51 | mel_input = mel_init((1, 80, 88), dtype=w.dtype, device=w.device) 52 | 53 | with torch.no_grad(): 54 | bias_audio = hifigan(mel_input, **infer_kw).float() 55 | 56 | if len(bias_audio.size()) > 2: 57 | bias_audio = bias_audio.squeeze(0) 58 | elif len(bias_audio.size()) < 2: 59 | bias_audio = bias_audio.unsqueeze(0) 60 | assert len(bias_audio.size()) == 2 61 | 62 | bias_spec = self.stft(bias_audio).abs() 63 | 64 | self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) 65 | 66 | def forward(self, audio, strength=0.1): 67 | audio_spec = self.stft(audio.float()) 68 | audio_spec_mag, audio_spec_phase = audio_spec.abs(), audio_spec.angle() 69 | audio_spec_denoised = audio_spec_mag - self.bias_spec * strength 70 | audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) 71 | audio_denoised = self.istft(audio_spec_denoised*torch.exp(1j*audio_spec_phase)) 72 | return audio_denoised 73 | -------------------------------------------------------------------------------- /models/common/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch import Tensor 6 | from typing import Optional, List 7 | 8 | 9 | def extract_chunks(A: Tensor, 10 | ofx: Tensor, 11 | mel_ids: Optional[Tensor] = None, 12 | chunk_len: int = 128): 13 | """ 14 | Args: 15 | A (Tensor): spectrograms [B, F, T] 16 | ofx (Tensor): offsets [num_chunks,] 17 | mel_ids (Tensor): [num_chunks,] 18 | Returns: 19 | chunks (Tensor): [num_chunks, F, chunk_len] 20 | """ 21 | ids = torch.arange(0, chunk_len, device=A.device)[None,:].repeat(len(mel_ids), 1) + ofx[:,None] 22 | 23 | if mel_ids is None: 24 | mel_ids = torch.arange(0, A.size(0), device=A.device)[:,None] * A.size(2) 25 | ids = ids + mel_ids[:,None] * A.size(2) 26 | 27 | chunks = A.transpose(0, 1).flatten(1)[:, ids.long()].transpose(0, 1) 28 | return chunks 29 | 30 | 31 | def calc_feature_match_loss(fmaps_gen: List[Tensor], 32 | fmaps_org: List[Tensor] 33 | ): 34 | 35 | loss_fmatch = 0. 
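    # Feature-matching loss: mean L1 distance between the discriminator's
    # intermediate feature maps for generated and reference inputs, averaged
    # over the number of feature maps; the reference features are detached
    # in-place so they act as fixed targets.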
36 | for (fmap_gen, fmap_org) in zip(fmaps_gen, fmaps_org): 37 | fmap_org.detach_() 38 | loss_fmatch += (fmap_gen - fmap_org).abs().mean() 39 | 40 | loss_fmatch = loss_fmatch / len(fmaps_gen) 41 | return loss_fmatch 42 | 43 | 44 | class Conv2DSpectralNorm(nn.Conv2d): 45 | """Convolution layer that applies Spectral Normalization before every call.""" 46 | 47 | def __init__(self, cnum_in: int, cnum_out: int, 48 | kernel_size: int, stride: int, padding: int = 0, 49 | n_iter: int = 1, eps: float = 1e-12, 50 | bias: bool = True): 51 | super().__init__(cnum_in, 52 | cnum_out, kernel_size=kernel_size, 53 | stride=stride, padding=padding, bias=bias) 54 | self.register_buffer("weight_u", torch.empty(self.weight.size(0), 1)) 55 | nn.init.trunc_normal_(self.weight_u) 56 | self.n_iter = n_iter 57 | self.eps = eps 58 | 59 | def l2_norm(self, x): 60 | return F.normalize(x, p=2, dim=0, eps=self.eps) 61 | 62 | def forward(self, x): 63 | 64 | weight_orig = self.weight.flatten(1).detach() 65 | 66 | for _ in range(self.n_iter): 67 | v = self.l2_norm(weight_orig.t() @ self.weight_u) 68 | self.weight_u = self.l2_norm(weight_orig @ v) 69 | 70 | sigma = self.weight_u.t() @ weight_orig @ v 71 | self.weight.data.div_(sigma) 72 | 73 | x = super().forward(x) 74 | 75 | return x 76 | 77 | 78 | class DConv(nn.Module): 79 | def __init__(self, cnum_in, 80 | cnum_out, ksize=5, stride=2, padding='auto'): 81 | super().__init__() 82 | padding = (ksize-1)//2 if padding == 'auto' else padding 83 | self.conv_sn = Conv2DSpectralNorm( 84 | cnum_in, cnum_out, ksize, stride, padding) 85 | #self.conv_sn = spectral_norm(nn.Conv2d(cnum_in, cnum_out, ksize, stride, padding)) 86 | self.leaky = nn.LeakyReLU(negative_slope=0.2) 87 | 88 | def forward(self, x): 89 | x = self.conv_sn(x) 90 | x = self.leaky(x) 91 | return x 92 | 93 | 94 | class PatchDiscriminator(nn.Module): 95 | def __init__(self, cnum_in, cnum): 96 | super().__init__() 97 | self.conv1 = DConv(cnum_in, cnum) 98 | self.conv2 = DConv(cnum, 2*cnum) 99 | self.conv3 = DConv(2*cnum, 4*cnum) 100 | self.conv4 = DConv(4*cnum, 4*cnum) 101 | self.conv5 = DConv(4*cnum, 4*cnum) 102 | 103 | def forward(self, x): 104 | x1 = self.conv1(x) 105 | x2 = self.conv2(x1) 106 | x3 = self.conv3(x2) 107 | x4 = self.conv4(x3) 108 | x5 = self.conv5(x4) 109 | x = nn.Flatten()(x5) 110 | 111 | return x, [x1, x2, x3, x4] -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import torchaudio 5 | import text 6 | import utils.make_html as html 7 | 8 | from utils import progbar, read_lines_from_file 9 | 10 | # default: 11 | # python inference.py --list data/infer_text.txt --out_dir samples/results --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --batch_size 2 --denoise 0 12 | 13 | # Examples: 14 | # python inference.py --list data/infer_text.txt --out_dir samples/res_tc2_adv0 --model tacotron2 --checkpoint pretrained/tacotron2_ar_adv.pth --batch_size 2 15 | # python inference.py --list data/infer_text.txt --out_dir samples/res_tc2_adv1 --model tacotron2 --checkpoint pretrained/tacotron2_ar_adv.pth --batch_size 2 --denoise 0.005 16 | # python inference.py --list data/infer_text.txt --out_dir samples/res_fp_adv0 --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --batch_size 2 17 | # python inference.py --list data/infer_text.txt --out_dir samples/res_fp_adv1 --model fastpitch --checkpoint 
pretrained/fastpitch_ar_adv.pth --batch_size 2 --denoise 0.005 18 | # python inference.py --list data/infer_text.txt --out_dir samples/res_fp_adv2 --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --batch_size 2 --denoise 0.005 --vocoder_sd pretrained/hifigan-asc-v1/g_02500000 --vocoder_config pretrained/hifigan-asc-v1/config.json 19 | 20 | 21 | def infer(args): 22 | use_cuda_if_available = not args.cpu 23 | device = torch.device( 24 | 'cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu') 25 | 26 | if args.model == 'fastpitch': 27 | from models.fastpitch import FastPitch2Wave 28 | model = FastPitch2Wave(args.checkpoint, 29 | vocoder_sd=args.vocoder_sd, 30 | vocoder_config=args.vocoder_config) 31 | elif args.model == 'tacotron2': 32 | from models.tacotron2 import Tacotron2Wave 33 | model = Tacotron2Wave(args.checkpoint, 34 | vocoder_sd=args.vocoder_sd, 35 | vocoder_config=args.vocoder_config) 36 | else: 37 | raise "model type not supported" 38 | 39 | model = model.to(device) 40 | model.eval() 41 | 42 | if not os.path.exists(f"{args.out_dir}/wavs"): 43 | os.makedirs(f"{args.out_dir}/wavs") 44 | 45 | static_lines = read_lines_from_file(args.list) 46 | static_batches = [static_lines[k:k+args.batch_size] 47 | for k in range(0, len(static_lines), args.batch_size)] 48 | 49 | idx = 0 50 | with open(os.path.join(args.out_dir, 'index.html'), 'w', encoding='utf-8') as f: 51 | f.write(html.make_html_start()) 52 | 53 | for batch in progbar(static_batches): 54 | # infer batch 55 | wav_list = model.tts(batch, 56 | batch_size=args.batch_size, 57 | denoise=args.denoise, 58 | speed=args.speed) 59 | 60 | # save wavs and add entries to html file 61 | for (text_line, wav) in zip(batch, wav_list): 62 | torchaudio.save(f'{args.out_dir}/wavs/static{idx}.wav', 63 | wav.unsqueeze(0), 64 | 22_050) 65 | 66 | text_buckw = text.arabic_to_buckwalter(text_line) 67 | text_arabic = text.buckwalter_to_arabic(text_buckw) 68 | t_phon = text.buckwalter_to_phonemes(text_buckw) 69 | t_phon = text.simplify_phonemes( 70 | t_phon.replace(' ', '').replace('+', ' ')) 71 | 72 | f.write(html.make_sample_entry2( 73 | f'wavs/static{idx}.wav', 74 | text_arabic, 75 | f"{idx}) {t_phon}")) 76 | 77 | idx += 1 78 | 79 | f.write(html.make_volume_script(0.5)) 80 | f.write(html.make_html_end()) 81 | 82 | print(f"Saved files to: {args.out_dir}") 83 | 84 | 85 | def main(): 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument( 88 | '--list', type=str, default='./data/infer_text.txt') 89 | parser.add_argument( 90 | '--model', type=str, default='fastpitch') 91 | parser.add_argument( 92 | '--checkpoint', type=str, default='pretrained/fastpitch_ar_adv.pth') 93 | parser.add_argument('--vocoder_sd', type=str, default=None) 94 | parser.add_argument('--vocoder_config', type=str, default=None) 95 | parser.add_argument('--out_dir', type=str, default='samples/results') 96 | parser.add_argument('--speed', type=float, default=1.0) 97 | parser.add_argument('--denoise', type=float, default=0) 98 | parser.add_argument('--batch_size', type=int, default=2) 99 | parser.add_argument('--cpu', action='store_true') 100 | args = parser.parse_args() 101 | 102 | infer(args) 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import torchaudio 5 | 6 | import text 7 | import utils.make_html as html 
8 | from utils.plotting import get_spectrogram_figure 9 | from vocoder import load_hifigan 10 | from vocoder.hifigan.denoiser import Denoiser 11 | from utils import get_basic_config 12 | 13 | #default: 14 | # python test.py --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --out_dir samples/test 15 | 16 | # Examples: 17 | # python test.py --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --out_dir samples/test_fp_adv 18 | # python test.py --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --denoise 0.01 --out_dir samples/test_fp_adv_d 19 | # python test.py --model fastpitch --checkpoint pretrained/fastpitch_ar_mse.pth --out_dir samples/test_fp_mse 20 | 21 | # python test.py --model tacotron2 --checkpoint pretrained/tacotron2_ar_adv.pth --out_dir samples/test_tc2_adv 22 | # python test.py --model tacotron2 --checkpoint pretrained/tacotron2_ar_adv.pth --denoise 0.01 --out_dir samples/test_tc2_adv_d 23 | # python test.py --model tacotron2 --checkpoint pretrained/tacotron2_ar_mse.pth --out_dir samples/test_tc2_mse 24 | 25 | 26 | def test(args, text_arabic): 27 | 28 | use_cuda_if_available = not args.cpu 29 | device = torch.device( 30 | 'cuda' if torch.cuda.is_available() and use_cuda_if_available else 'cpu') 31 | out_dir = args.out_dir 32 | sample_rate = 22_050 33 | 34 | # Load model 35 | if args.model == 'fastpitch': 36 | from models.fastpitch import FastPitch 37 | model = FastPitch(args.checkpoint) 38 | elif args.model == 'tacotron2': 39 | from models.tacotron2 import Tacotron2 40 | model = Tacotron2(args.checkpoint) 41 | else: 42 | raise "model type not supported" 43 | 44 | print(f'Loaded {args.model} from: {args.checkpoint}') 45 | model.eval() 46 | 47 | # Load vocoder model 48 | if args.vocoder_sd is None or args.vocoder_config is None: 49 | config = get_basic_config() 50 | if args.vocoder_sd is None: args.vocoder_sd = config.vocoder_state_path 51 | if args.vocoder_config is None: args.vocoder_config = config.vocoder_config_path 52 | vocoder = load_hifigan( 53 | state_dict_path=args.vocoder_sd, 54 | config_file=args.vocoder_config) 55 | print(f'Loaded vocoder from: {args.vocoder_sd}') 56 | 57 | model, vocoder = model.to(device), vocoder.to(device) 58 | denoiser = Denoiser(vocoder) 59 | 60 | # Infer spectrogram and wave 61 | with torch.inference_mode(): 62 | mel_spec = model.ttmel(text_arabic, vowelizer=args.vowelizer) 63 | wave = vocoder(mel_spec[None]) 64 | if args.denoise > 0: 65 | wave = denoiser(wave, args.denoise) 66 | 67 | # Save wave and images 68 | if not os.path.exists(out_dir): 69 | os.makedirs(out_dir) 70 | print(f"Created folder: {out_dir}") 71 | 72 | torchaudio.save(f'{out_dir}/wave.wav', wave[0].cpu(), sample_rate) 73 | 74 | get_spectrogram_figure(mel_spec.cpu()).savefig( 75 | f'{out_dir}/mel_spec.png') 76 | 77 | t_phon = text.arabic_to_phonemes(text_arabic) 78 | t_phon = text.simplify_phonemes(t_phon.replace(' ', '').replace('+', ' ')) 79 | 80 | with open(f'{out_dir}/index.html', 'w', encoding='utf-8') as f: 81 | f.write(html.make_html_start()) 82 | f.write(html.make_h_tag("Test sample", n=1)) 83 | f.write(html.make_sample_entry2(f"./wave.wav", text_arabic, t_phon)) 84 | f.write(html.make_h_tag("Spectrogram")) 85 | f.write(html.make_img_tag('./mel_spec.png')) 86 | f.write(html.make_volume_script(0.42)) 87 | f.write(html.make_html_end()) 88 | 89 | print(f"Saved test sample to: {out_dir}") 90 | 91 | if not args.do_not_play: 92 | try: 93 | import sounddevice as sd 94 | sd.play(wave[0, 0].cpu(), sample_rate, blocking=True) 95 | except: 96 | 
pass 97 | 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument('--text', type=str, 102 | default="أَلسَّلامُ عَلَيكُم يا صَديقي") 103 | parser.add_argument('--model', type=str, default='fastpitch') 104 | parser.add_argument( 105 | '--checkpoint', default='pretrained/fastpitch_ar_adv.pth') 106 | parser.add_argument('--vocoder_sd', type=str, default=None) 107 | parser.add_argument('--vocoder_config', type=str, default=None) 108 | parser.add_argument('--denoise', type=float, default=0) 109 | parser.add_argument('--out_dir', default='samples/test') 110 | parser.add_argument('--vowelizer', default=None) 111 | parser.add_argument('--cpu', action='store_true') 112 | 113 | parser.add_argument('--do_not_play', action='store_true') 114 | args = parser.parse_args() 115 | 116 | text_arabic = args.text 117 | 118 | test(args, text_arabic) 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /vocoder/vocos/feature_extractors.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torchaudio 5 | # from encodec import EncodecModel 6 | from torch import nn 7 | 8 | from .modules import safe_log 9 | 10 | 11 | class FeatureExtractor(nn.Module): 12 | """Base class for feature extractors.""" 13 | 14 | def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor: 15 | """ 16 | Extract features from the given audio. 17 | 18 | Args: 19 | audio (Tensor): Input audio waveform. 20 | 21 | Returns: 22 | Tensor: Extracted features of shape (B, C, L), where B is the batch size, 23 | C denotes output features, and L is the sequence length. 24 | """ 25 | raise NotImplementedError("Subclasses must implement the forward method.") 26 | 27 | 28 | class MelSpectrogramFeatures(FeatureExtractor): 29 | def __init__(self, sample_rate=24000, 30 | n_fft=1024, 31 | hop_length=256, 32 | n_mels=100, 33 | padding="center", 34 | f_min: float = 0, 35 | f_max: float = None, 36 | norm: str = None, 37 | mel_scale: str = "htk", 38 | ): 39 | super().__init__() 40 | if padding not in ["center", "same"]: 41 | raise ValueError("Padding must be 'center' or 'same'.") 42 | self.padding = padding 43 | self.mel_spec = torchaudio.transforms.MelSpectrogram( 44 | sample_rate=sample_rate, 45 | n_fft=n_fft, 46 | hop_length=hop_length, 47 | n_mels=n_mels, 48 | center=padding == "center", 49 | power=1, 50 | f_min=f_min, 51 | f_max=f_max, 52 | norm=norm, 53 | mel_scale=mel_scale, 54 | ) 55 | 56 | self.clip_val = 1e-5 57 | 58 | def forward(self, audio, **kwargs): 59 | if self.padding == "same": 60 | pad = self.mel_spec.win_length - self.mel_spec.hop_length 61 | audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect") 62 | mel = self.mel_spec(audio) 63 | features = safe_log(mel, clip_val=self.clip_val) 64 | return features 65 | 66 | 67 | # class EncodecFeatures(FeatureExtractor): 68 | # def __init__( 69 | # self, 70 | # encodec_model: str = "encodec_24khz", 71 | # bandwidths: List[float] = [1.5, 3.0, 6.0, 12.0], 72 | # train_codebooks: bool = False, 73 | # ): 74 | # super().__init__() 75 | # if encodec_model == "encodec_24khz": 76 | # encodec = EncodecModel.encodec_model_24khz 77 | # elif encodec_model == "encodec_48khz": 78 | # encodec = EncodecModel.encodec_model_48khz 79 | # else: 80 | # raise ValueError( 81 | # f"Unsupported encodec_model: {encodec_model}. Supported options are 'encodec_24khz' and 'encodec_48khz'." 
82 | # ) 83 | # self.encodec = encodec(pretrained=True) 84 | # for param in self.encodec.parameters(): 85 | # param.requires_grad = False 86 | # self.num_q = self.encodec.quantizer.get_num_quantizers_for_bandwidth( 87 | # self.encodec.frame_rate, bandwidth=max(bandwidths) 88 | # ) 89 | # codebook_weights = torch.cat([vq.codebook for vq in self.encodec.quantizer.vq.layers[: self.num_q]], dim=0) 90 | # self.codebook_weights = torch.nn.Parameter(codebook_weights, requires_grad=train_codebooks) 91 | # self.bandwidths = bandwidths 92 | 93 | # @torch.no_grad() 94 | # def get_encodec_codes(self, audio): 95 | # audio = audio.unsqueeze(1) 96 | # emb = self.encodec.encoder(audio) 97 | # codes = self.encodec.quantizer.encode(emb, self.encodec.frame_rate, self.encodec.bandwidth) 98 | # return codes 99 | 100 | # def forward(self, audio: torch.Tensor, **kwargs): 101 | # bandwidth_id = kwargs.get("bandwidth_id") 102 | # if bandwidth_id is None: 103 | # raise ValueError("The 'bandwidth_id' argument is required") 104 | # self.encodec.eval() # Force eval mode as Pytorch Lightning automatically sets child modules to training mode 105 | # self.encodec.set_target_bandwidth(self.bandwidths[bandwidth_id]) 106 | # codes = self.get_encodec_codes(audio) 107 | # # Instead of summing in the loop, it stores subsequent VQ dictionaries in a single `self.codebook_weights` 108 | # # with offsets given by the number of bins, and finally summed in a vectorized operation. 109 | # offsets = torch.arange( 110 | # 0, self.encodec.quantizer.bins * len(codes), self.encodec.quantizer.bins, device=audio.device 111 | # ) 112 | # embeddings_idxs = codes + offsets.view(-1, 1, 1) 113 | # features = torch.nn.functional.embedding(embeddings_idxs, self.codebook_weights).sum(dim=0) 114 | # return features.transpose(1, 2) 115 | -------------------------------------------------------------------------------- /vocoder/vocos/models.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.utils import weight_norm 6 | 7 | from .modules import ConvNeXtBlock, ResBlock1, AdaLayerNorm 8 | 9 | 10 | class Backbone(nn.Module): 11 | """Base class for the generator's backbone. It preserves the same temporal resolution across all layers.""" 12 | 13 | def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: 14 | """ 15 | Args: 16 | x (Tensor): Input tensor of shape (B, C, L), where B is the batch size, 17 | C denotes output features, and L is the sequence length. 18 | 19 | Returns: 20 | Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length, 21 | and H denotes the model dimension. 22 | """ 23 | raise NotImplementedError("Subclasses must implement the forward method.") 24 | 25 | 26 | class VocosBackbone(Backbone): 27 | """ 28 | Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization 29 | 30 | Args: 31 | input_channels (int): Number of input features channels. 32 | dim (int): Hidden dimension of the model. 33 | intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock. 34 | num_layers (int): Number of ConvNeXtBlock layers. 35 | layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`. 36 | adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. 37 | None means non-conditional model. Defaults to None. 
38 | """ 39 | 40 | def __init__( 41 | self, 42 | input_channels: int, 43 | dim: int, 44 | intermediate_dim: int, 45 | num_layers: int, 46 | layer_scale_init_value: Optional[float] = None, 47 | adanorm_num_embeddings: Optional[int] = None, 48 | ): 49 | super().__init__() 50 | self.input_channels = input_channels 51 | self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3) 52 | self.adanorm = adanorm_num_embeddings is not None 53 | if adanorm_num_embeddings: 54 | self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6) 55 | else: 56 | self.norm = nn.LayerNorm(dim, eps=1e-6) 57 | layer_scale_init_value = layer_scale_init_value or 1 / num_layers 58 | self.convnext = nn.ModuleList( 59 | [ 60 | ConvNeXtBlock( 61 | dim=dim, 62 | intermediate_dim=intermediate_dim, 63 | layer_scale_init_value=layer_scale_init_value, 64 | adanorm_num_embeddings=adanorm_num_embeddings, 65 | ) 66 | for _ in range(num_layers) 67 | ] 68 | ) 69 | self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6) 70 | self.apply(self._init_weights) 71 | 72 | def _init_weights(self, m): 73 | if isinstance(m, (nn.Conv1d, nn.Linear)): 74 | nn.init.trunc_normal_(m.weight, std=0.02) 75 | nn.init.constant_(m.bias, 0) 76 | 77 | def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: 78 | bandwidth_id = kwargs.get('bandwidth_id', None) 79 | x = self.embed(x) 80 | if self.adanorm: 81 | assert bandwidth_id is not None 82 | x = self.norm(x.transpose(1, 2), cond_embedding_id=bandwidth_id) 83 | else: 84 | x = self.norm(x.transpose(1, 2)) 85 | x = x.transpose(1, 2) 86 | for conv_block in self.convnext: 87 | x = conv_block(x, cond_embedding_id=bandwidth_id) 88 | x = self.final_layer_norm(x.transpose(1, 2)) 89 | return x 90 | 91 | 92 | class VocosResNetBackbone(Backbone): 93 | """ 94 | Vocos backbone module built with ResBlocks. 95 | 96 | Args: 97 | input_channels (int): Number of input features channels. 98 | dim (int): Hidden dimension of the model. 99 | num_blocks (int): Number of ResBlock1 blocks. 100 | layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None. 
101 | """ 102 | 103 | def __init__( 104 | self, input_channels, dim, num_blocks, layer_scale_init_value=None, 105 | ): 106 | super().__init__() 107 | self.input_channels = input_channels 108 | self.embed = weight_norm(nn.Conv1d(input_channels, dim, kernel_size=3, padding=1)) 109 | layer_scale_init_value = layer_scale_init_value or 1 / num_blocks / 3 110 | self.resnet = nn.Sequential( 111 | *[ResBlock1(dim=dim, layer_scale_init_value=layer_scale_init_value) for _ in range(num_blocks)] 112 | ) 113 | 114 | def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor: 115 | x = self.embed(x) 116 | x = self.resnet(x) 117 | x = x.transpose(1, 2) 118 | return x 119 | -------------------------------------------------------------------------------- /utils/make_html.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | 4 | def make_html_start(title: Union[str, None] = None): 5 | 6 | style = """ 7 | * { 8 | box-sizing: border-box; 9 | } 10 | 11 | body { 12 | margin-left: 2rem; 13 | } 14 | header { 15 | padding-top: 0.5rem; 16 | height: 5rem; 17 | } 18 | .sample { 19 | font-family: sans-serif; 20 | font-weight: 500; 21 | font-size: 1.2rem; 22 | width: max(60vw, 60rem); 23 | border-bottom: 2px solid #aaa; 24 | padding: 0.5rem 0 0.5rem 0; 25 | } 26 | .audio-wrapper { 27 | display: flex; 28 | align-items: center; 29 | justify-content: space-between; 30 | width: 60rem; 31 | flex-wrap: wrap; 32 | } 33 | .audio-wrapper label { 34 | display: inline-block; 35 | width: 3.5rem; 36 | } 37 | .audio-row { 38 | display: flex; 39 | align-items: center; 40 | } 41 | audio { 42 | height: 2rem; 43 | width: 22rem; 44 | margin-right: 1rem; 45 | } 46 | .text-arabic { 47 | font-size: 1.6rem; 48 | margin: 0.5rem; 49 | } 50 | .row-title { 51 | width: 6rem; 52 | } 53 | """ 54 | 55 | title = f"{title}" if title is not None else "" 56 | html = f""" 57 | 58 | 59 | 60 | 61 | 62 | Samples 63 | 66 | {title} 67 | 68 | 69 | """ 70 | return html 71 | 72 | 73 | def make_html_end(): 74 | html = """ 75 | 76 | """ 77 | return html 78 | 79 | 80 | def make_sample_entry(wav_path: str, text: str): 81 | text = text.replace('<', '<').replace('>', '>') 82 | html = f"""
83 | 84 |
85 | {text} 86 |
87 | """ 88 | return html 89 | 90 | 91 | def make_sample_entry2(wav_path: str, text0: str, 92 | text1: str, ar_dir: str = 'ltr'): 93 | text0 = text0.replace('<', '<').replace('>', '>') 94 | text1 = text1.replace('<', '<').replace('>', '>') 95 | html = f"""
96 | 97 |
98 | {text0} 99 |
100 | {text1} 101 |
102 | """ 103 | return html 104 | 105 | 106 | def make_double_entry(wav_gen: str, wav_ref: str, 107 | text0: str, text1: str, ar_dir: str = 'ltr'): 108 | text0 = text0.replace('<', '<').replace('>', '>') 109 | text1 = text1.replace('<', '<').replace('>', '>') 110 | html = f"""
111 |
112 | 113 | 114 | 115 | 116 |
117 |
118 | {text0} 119 |
120 | {text1} 121 |
122 | """ 123 | return html 124 | 125 | def make_multi_entry(wavs_list, row_titles, 126 | text0: str, text1: str, ar_dir: str='ltr'): 127 | text0 = text0.replace('<', '<').replace('>', '>') 128 | text1 = text1.replace('<', '<').replace('>', '>') 129 | 130 | rows = "" 131 | for i in range(0,len(wavs_list),2): 132 | row_title = row_titles[i//2] 133 | rows += f"""
134 | {row_title} 135 | 136 | 137 | 138 | 139 |
140 | """ 141 | 142 | html = f"""
143 |
144 | {rows} 145 |
146 |
147 | {text0} 148 |
149 | {text1} 150 |
151 | """ 152 | 153 | 154 | return html 155 | 156 | def make_h_tag(text: str, n: int = 2): 157 | html = f"""{text} 158 | """ 159 | return html 160 | 161 | 162 | def make_img_tag(src: str, alt: str = ""): 163 | html = f"""{alt} 164 | """ 165 | return html 166 | 167 | 168 | def make_volume_script(volume: float = 0.35): 169 | html = f""" 173 | """ 174 | return html 175 | -------------------------------------------------------------------------------- /models/diacritizers/shakkala/symbols.py: -------------------------------------------------------------------------------- 1 | input_vocab_to_int = {'“': 55, 2 | 'ئ': 56, 3 | '°': 5, 4 | 'و': 6, 5 | 'ε': 7, 6 | 'ﺇ': 57, 7 | 'ﺈ': 58, 8 | 'ﻧ': 102, 9 | '\t': 8, 10 | '\u200f': 60, 11 | 'ـ': 9, 12 | '۷': 106, 13 | 'ﺄ': 61, 14 | '۸': 10, 15 | '•': 62, 16 | 'ו': 64, 17 | 'ظ': 11, 18 | 'ر': 65, 19 | 'ﻠ': 66, 20 | 'ψ': 12, 21 | 'ﻛ': 13, 22 | '': 2, 23 | 'χ': 14, 24 | 'ز': 92, 25 | 'آ': 15, 26 | 'ﺁ': 16, 27 | 'ا': 68, 28 | '؛': 17, 29 | 'έ': 69, 30 | 'ي': 70, 31 | 'ث': 71, 32 | 'ك': 72, 33 | 'أ': 73, 34 | '«': 74, 35 | 'ص': 75, 36 | 'υ': 20, 37 | 'ﻹ': 21, 38 | 'ﺔ': 76, 39 | 'ت': 22, 40 | '…': 23, 41 | 'ό': 77, 42 | 'τ': 78, 43 | 'ش': 18, 44 | 'غ': 24, 45 | 'ﻻ': 79, 46 | '﴿': 25, 47 | 'ج': 59, 48 | 'σ': 27, 49 | 'ρ': 26, 50 | 'ن': 81, 51 | 'س': 82, 52 | 'ﻵ': 83, 53 | '\xa0': 84, 54 | '”': 85, 55 | '\u200d': 31, 56 | 'ﻓ': 33, 57 | 'ﻴ': 88, 58 | 'ω': 89, 59 | 'ﺌ': 90, 60 | '‘': 34, 61 | 'κ': 35, 62 | 'γ': 80, 63 | 'ل': 29, 64 | 'ط': 93, 65 | 'ﺂ': 96, 66 | 'ι': 36, 67 | 'ع': 95, 68 | 'ν': 63, 69 | 'ﻷ': 98, 70 | 'ے': 37, 71 | 'ق': 38, 72 | 'خ': 19, 73 | 'ى': 39, 74 | '\xad': 40, 75 | 'ح': 86, 76 | 'ώ': 103, 77 | ' ': 28, 78 | '\u202b': 94, 79 | '’': 41, 80 | '–': 42, 81 | '': 3, 82 | 'ﻣ': 43, 83 | '﴾': 44, 84 | 'ٰ': 45, 85 | '': 1, 86 | '»': 30, 87 | 'ذ': 97, 88 | 'ﺑ': 32, 89 | 'ﻟ': 99, 90 | 'ف': 46, 91 | 'د': 104, 92 | '۵': 109, 93 | 'ﺃ': 87, 94 | 'α': 47, 95 | 'م': 48, 96 | 'ه': 49, 97 | '\u202c': 108, 98 | 'ؤ': 50, 99 | 'θ': 51, 100 | 'ﺋ': 100, 101 | 'ی': 105, 102 | '´': 110, 103 | 'ض': 111, 104 | '': 0, 105 | '\u200b': 52, 106 | '٪': 91, 107 | 'ί': 112, 108 | 'إ': 119, 109 | '؟': 101, 110 | 'ﺒ': 113, 111 | 'ο': 114, 112 | '‰': 115, 113 | 'π': 116, 114 | '\u200e': 117, 115 | 'ﮐ': 53, 116 | 'ب': 118, 117 | 'ٱ': 67, 118 | 'μ': 54, 119 | 'ة': 107, 120 | 'ء': 120} 121 | 122 | output_int_to_vocab = {0: '', 123 | 1: '', 124 | 2: '', 125 | 3: '', 126 | 4: 'ـ', 127 | 5: 'َ', 128 | 6: 'ُّ', 129 | 7: 'َّ', 130 | 8: 'ـ', 131 | 9: 'ِّ', 132 | 10: 'ّ', 133 | 11: 'ّْ', 134 | 12: 'ٍّ', 135 | 13: 'ِّ', 136 | 14: 'ٍّ', 137 | 15: 'ٌّ', 138 | 16: 'َّ', 139 | 17: 'ُ', 140 | 18: 'ٌّ', 141 | 19: 'ًّ', 142 | 20: 'ْ', 143 | 21: 'ٍ', 144 | 22: 'ِ', 145 | 23: 'ُّ', 146 | 24: 'ًّ', 147 | 25: 'ٌ', 148 | 26: 'ً', 149 | 27: 'ّّ'} 150 | -------------------------------------------------------------------------------- /scripts/train_fp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from models.fastpitch import net_config 9 | from models.fastpitch.fastpitch.model import FastPitch 10 | from models.fastpitch.fastpitch.data_function import (TTSCollate, batch_to_gpu) 11 | from models.fastpitch.fastpitch.loss_function import FastPitchLoss 12 | from models.fastpitch.fastpitch.attn_loss_function import AttentionBinarizationLoss 13 | from utils.data import DynBatchDataset 14 | from utils import 
get_config 15 | from utils.training import save_states 16 | # %% 17 | 18 | try: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--config', type=str, 21 | default="configs/nawar_fp.yaml", help="Path to yaml config file") 22 | args = parser.parse_args() 23 | config_path = args.config 24 | except: 25 | config_path = './configs/nawar_fp.yaml' 26 | 27 | # %% 28 | 29 | config = get_config(config_path) 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | # make checkpoint folder if nonexistent 33 | if not os.path.isdir(config.checkpoint_dir): 34 | os.makedirs(os.path.abspath(config.checkpoint_dir)) 35 | print(f"Created checkpoint folder @ {config.checkpoint_dir}") 36 | 37 | 38 | train_dataset = DynBatchDataset( 39 | txtpath=config.train_labels, 40 | wavpath=config.train_wavs_path, 41 | label_pattern=config.label_pattern, 42 | f0_dict_path=config.f0_dict_path, 43 | f0_mean=config.f0_mean, f0_std=config.f0_std, 44 | max_lengths=config.max_lengths, 45 | batch_sizes=config.batch_sizes, 46 | ) 47 | 48 | # %% 49 | 50 | collate_fn = TTSCollate() 51 | 52 | config.batch_size = 1 53 | sampler, shuffle, drop_last = None, True, True 54 | train_loader = DataLoader(train_dataset, 55 | batch_size=config.batch_size, 56 | collate_fn=lambda x: collate_fn(x[0]), 57 | shuffle=shuffle, drop_last=drop_last, 58 | sampler=sampler) 59 | 60 | # %% Generator 61 | 62 | model = FastPitch(**net_config).to(device) 63 | 64 | optimizer = torch.optim.AdamW(model.parameters(), 65 | lr=config.g_lr, 66 | betas=(config.g_beta1, config.g_beta2), 67 | weight_decay=config.weight_decay) 68 | 69 | criterion = FastPitchLoss() 70 | attention_kl_loss = AttentionBinarizationLoss() 71 | 72 | # %% 73 | # resume from existing checkpoint 74 | n_epoch, n_iter = 0, 0 75 | 76 | if config.restore_model != '': 77 | state_dicts = torch.load(config.restore_model) 78 | model.load_state_dict(state_dicts['model']) 79 | if 'optim' in state_dicts: 80 | optimizer.load_state_dict(state_dicts['optim']) 81 | if 'epoch' in state_dicts: 82 | n_epoch = state_dicts['epoch'] 83 | if 'iter' in state_dicts: 84 | n_iter = state_dicts['iter'] 85 | else: 86 | # from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/fastpitch__pyt_ckpt 87 | model_sd = torch.load('G:/models/fastpitch/nvidia_fastpitch_210824+cfg.pt') 88 | model.load_state_dict( 89 | {k.removeprefix('module.'): v for k, v in model_sd['state_dict'].items()}) 90 | 91 | # %% 92 | writer = SummaryWriter(config.log_dir) 93 | 94 | # %% TRAINING LOOP 95 | 96 | model.train() 97 | 98 | for epoch in range(n_epoch, config.epochs): 99 | train_dataset.shuffle() 100 | for batch in train_loader: 101 | 102 | x, y, _ = batch_to_gpu(batch) 103 | 104 | y_pred = model(x) 105 | 106 | mel_out, *_, attn_soft, attn_hard, _, _ = y_pred 107 | _, _, mel_padded, output_lengths, *_ = x 108 | 109 | # generator step 110 | loss, meta = criterion(y_pred, y) 111 | 112 | binarization_loss = attention_kl_loss(attn_hard, attn_soft) 113 | loss += 1.0 * binarization_loss 114 | 115 | optimizer.zero_grad() 116 | loss.backward() 117 | grad_norm = torch.nn.utils.clip_grad_norm_( 118 | model.parameters(), 1000.) 
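        # Note: the clipping threshold is hard-coded to 1000. here (a loose
        # safety net that in practice mostly just reports the gradient norm),
        # unlike scripts/train_tc2.py, which uses config.grad_clip_thresh.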
119 | optimizer.step() 120 | 121 | # LOGGING 122 | meta['kl_loss'] = binarization_loss.clone().detach() 123 | 124 | print(f"loss: {meta['loss'].item()} gnorm: {grad_norm}") 125 | 126 | for k, v in meta.items(): 127 | writer.add_scalar(f'train/{k}', v.item(), n_iter) 128 | 129 | if n_iter % config.n_save_states_iter == 0: 130 | save_states(f'states.pth', model, 131 | optimizer, n_iter, 132 | epoch, net_config, config) 133 | 134 | if n_iter % config.n_save_backup_iter == 0 and n_iter > 0: 135 | save_states(f'states_{n_iter}.pth', model, 136 | optimizer, n_iter, 137 | epoch, net_config, config) 138 | 139 | n_iter += 1 140 | 141 | 142 | save_states(f'states.pth', model, 143 | optimizer, n_iter, 144 | epoch, net_config, config) 145 | 146 | 147 | # %% 148 | 149 | # (mel_out, 0 150 | # dec_mask, 1 151 | # dur_pred, 2 152 | # log_dur_pred, 3 153 | # pitch_pred, 4 154 | # pitch_tgt, 5 155 | # energy_pred, 6 156 | # energy_tgt, 7 157 | # attn_soft, 8 158 | # attn_hard, 9 159 | # attn_dur, 10 160 | # attn_logprob, 11 161 | # ) = model_out 162 | 163 | # x = [text_padded, input_lengths, mel_padded, output_lengths, 164 | # pitch_padded, energy_padded, speaker, attn_prior, audiopaths] 165 | 166 | # y = [mel_padded, input_lengths, output_lengths] 167 | -------------------------------------------------------------------------------- /scripts/train_tc2.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import argparse 3 | import os 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from models.tacotron2.tacotron2_ms import Tacotron2MS 7 | 8 | from utils import get_config 9 | from utils.data import ArabDataset, text_mel_collate_fn 10 | from utils.logging import TBLogger 11 | from utils.training import batch_to_device, save_states 12 | 13 | from models.tacotron2.loss import Tacotron2Loss 14 | 15 | # %% 16 | 17 | try: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--config', type=str, 20 | default="configs/nawar_tc2.yaml", help="Path to yaml config file") 21 | args = parser.parse_args() 22 | config_path = args.config 23 | except: 24 | config_path = './configs/nawar_tc2.yaml' 25 | 26 | # %% 27 | 28 | config = get_config(config_path) 29 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 30 | 31 | # set random seed 32 | if config.random_seed != False: 33 | torch.manual_seed(config.random_seed) 34 | torch.cuda.manual_seed_all(config.random_seed) 35 | import numpy as np 36 | np.random.seed(config.random_seed) 37 | 38 | # make checkpoint folder if nonexistent 39 | if not os.path.isdir(config.checkpoint_dir): 40 | os.makedirs(os.path.abspath(config.checkpoint_dir)) 41 | print(f"Created checkpoint_dir folder: {config.checkpoint_dir}") 42 | 43 | # datasets 44 | train_dataset = ArabDataset(txtpath=config.train_labels, 45 | wavpath=config.train_wavs_path, 46 | label_pattern=config.label_pattern) 47 | # test_dataset = ArabDataset(config.test_labels, config.test_wavs_path) 48 | 49 | # optional: balanced sampling 50 | sampler, shuffle, drop_last = None, True, True 51 | if config.balanced_sampling: 52 | weights = torch.load(config.sampler_weights_file) 53 | 54 | sampler = torch.utils.data.WeightedRandomSampler( 55 | weights, len(weights), replacement=False) 56 | shuffle, drop_last = False, False 57 | 58 | # dataloaders 59 | train_loader = DataLoader(train_dataset, 60 | batch_size=config.batch_size, 61 | collate_fn=text_mel_collate_fn, 62 | shuffle=shuffle, drop_last=drop_last, 63 | sampler=sampler) 64 | 65 | # test_loader = 
DataLoader(test_dataset, 66 | # batch_size=config.batch_size, drop_last=False, 67 | # shuffle=False, collate_fn=text_mel_collate_fn) 68 | 69 | # %% Generator 70 | model = Tacotron2MS(n_symbol=40, num_speakers=40) 71 | model = model.to(device) 72 | model.decoder.decoder_max_step = config.decoder_max_step 73 | 74 | optimizer = torch.optim.AdamW(model.parameters(), 75 | lr=config.g_lr, 76 | betas=(config.g_beta1, config.g_beta2), 77 | weight_decay=config.weight_decay) 78 | criterion = Tacotron2Loss(mel_loss_scale=1.0) 79 | 80 | # %% 81 | # resume from existing checkpoint 82 | n_epoch, n_iter = 0, 0 83 | 84 | if config.restore_model != '': 85 | state_dicts = torch.load(config.restore_model) 86 | model.load_state_dict(state_dicts['model']) 87 | if 'optim' in state_dicts: 88 | optimizer.load_state_dict(state_dicts['optim']) 89 | if 'epoch' in state_dicts: 90 | n_epoch = state_dicts['epoch'] 91 | if 'iter' in state_dicts: 92 | n_iter = state_dicts['iter'] 93 | 94 | # %% 95 | # tensorboard writer 96 | writer = TBLogger(config.log_dir) 97 | 98 | # %% 99 | 100 | def trunc_batch(batch, N): 101 | return (batch[0][:N], batch[1][:N], batch[2][:N], 102 | batch[3][:N], batch[4][:N]) 103 | 104 | # %% TRAINING LOOP 105 | 106 | model.train() 107 | 108 | for epoch in range(n_epoch, config.epochs): 109 | print(f"Epoch: {epoch}") 110 | for batch in train_loader: 111 | 112 | if batch[-1][0] > 2000: 113 | batch = trunc_batch(batch, 6) 114 | 115 | text_padded, input_lengths, mel_padded, gate_padded, \ 116 | output_lengths = batch_to_device(batch, device) 117 | 118 | y_pred = model(text_padded, input_lengths, 119 | mel_padded, output_lengths, 120 | torch.zeros_like(output_lengths)) 121 | mel_out, mel_out_postnet, gate_out, alignments = y_pred 122 | 123 | # GENERATOR 124 | loss, meta = criterion(mel_out, mel_out_postnet, mel_padded, 125 | gate_out, gate_padded) 126 | 127 | optimizer.zero_grad() 128 | loss.backward() 129 | grad_norm = torch.nn.utils.clip_grad_norm_( 130 | model.parameters(), config.grad_clip_thresh) 131 | optimizer.step() 132 | 133 | # LOGGING 134 | meta['loss'] = loss.clone().detach() 135 | 136 | print(f"loss: {loss.item()}, grad_norm: {grad_norm.item()}") 137 | 138 | writer.add_training_data(meta, grad_norm.item(), 139 | config.learning_rate, n_iter) 140 | 141 | 142 | if n_iter % config.n_save_states_iter == 0: 143 | save_states(f'states.pth', model, 144 | optimizer, n_iter, 145 | epoch, None, config) 146 | 147 | if n_iter % config.n_save_backup_iter == 0 and n_iter > 0: 148 | save_states(f'states_{n_iter}.pth', model, 149 | optimizer, n_iter, 150 | epoch, None, config) 151 | 152 | n_iter += 1 153 | 154 | # VALIDATE 155 | # val_loss = validate(model, test_loader, writer, device, n_iter) 156 | # print(f"Validation loss: {val_loss}") 157 | 158 | 159 | save_states(f'states.pth', model, 160 | optimizer, n_iter, 161 | epoch, None, config) 162 | 163 | 164 | # %% 165 | -------------------------------------------------------------------------------- /models/fastpitch/fastpitch/loss_function.py: -------------------------------------------------------------------------------- 1 | # ***************************************************************************** 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of the NVIDIA CORPORATION nor the 12 | # names of its contributors may be used to endorse or promote products 13 | # derived from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | # DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | # 26 | # ***************************************************************************** 27 | 28 | from typing import Optional 29 | 30 | import torch 31 | import torch.nn.functional as F 32 | from torch import nn 33 | 34 | # from common.utils import mask_from_lens 35 | from .attn_loss_function import AttentionCTCLoss 36 | 37 | def mask_from_lens(lens, max_len: Optional[int] = None): 38 | if max_len is None: 39 | max_len = lens.max() 40 | ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype) 41 | mask = torch.lt(ids, lens.unsqueeze(1)) 42 | return mask 43 | 44 | 45 | class FastPitchLoss(nn.Module): 46 | def __init__(self, mel_loss_scale=1.0, 47 | dur_predictor_loss_scale=1.0, 48 | pitch_predictor_loss_scale=1.0, attn_loss_scale=1.0, 49 | energy_predictor_loss_scale=0.1): 50 | super(FastPitchLoss, self).__init__() 51 | self.mel_loss_scale = mel_loss_scale 52 | self.dur_predictor_loss_scale = dur_predictor_loss_scale 53 | self.pitch_predictor_loss_scale = pitch_predictor_loss_scale 54 | self.energy_predictor_loss_scale = energy_predictor_loss_scale 55 | self.attn_loss_scale = attn_loss_scale 56 | self.attn_ctc_loss = AttentionCTCLoss() 57 | 58 | def forward(self, model_out, targets, is_training=True, meta_agg='mean'): 59 | (mel_out, dec_mask, dur_pred, log_dur_pred, pitch_pred, pitch_tgt, 60 | energy_pred, energy_tgt, attn_soft, attn_hard, attn_dur, 61 | attn_logprob) = model_out 62 | 63 | (mel_tgt, in_lens, out_lens) = targets 64 | 65 | dur_tgt = attn_dur 66 | dur_lens = in_lens 67 | 68 | mel_tgt.requires_grad = False 69 | # (B,H,T) => (B,T,H) 70 | mel_tgt = mel_tgt.transpose(1, 2) 71 | 72 | dur_mask = mask_from_lens(dur_lens, max_len=dur_tgt.size(1)) 73 | log_dur_tgt = torch.log(dur_tgt.float() + 1) 74 | loss_fn = F.mse_loss 75 | dur_pred_loss = loss_fn(log_dur_pred, log_dur_tgt, reduction='none') 76 | dur_pred_loss = (dur_pred_loss * dur_mask).sum() / dur_mask.sum() 77 | 78 | ldiff = mel_tgt.size(1) - mel_out.size(1) 79 | mel_out = F.pad(mel_out, (0, 
0, 0, ldiff, 0, 0), value=0.0) 80 | mel_mask = mel_tgt.ne(0).float() 81 | loss_fn = F.mse_loss 82 | mel_loss = loss_fn(mel_out, mel_tgt, reduction='none') 83 | mel_loss = (mel_loss * mel_mask).sum() / mel_mask.sum() 84 | 85 | ldiff = pitch_tgt.size(2) - pitch_pred.size(2) 86 | pitch_pred = F.pad(pitch_pred, (0, ldiff, 0, 0, 0, 0), value=0.0) 87 | pitch_loss = F.mse_loss(pitch_tgt, pitch_pred, reduction='none') 88 | pitch_loss = (pitch_loss * dur_mask.unsqueeze(1)).sum() / dur_mask.sum() 89 | 90 | if energy_pred is not None: 91 | energy_pred = F.pad(energy_pred, (0, ldiff, 0, 0), value=0.0) 92 | energy_loss = F.mse_loss(energy_tgt, energy_pred, reduction='none') 93 | energy_loss = (energy_loss * dur_mask).sum() / dur_mask.sum() 94 | else: 95 | energy_loss = 0 96 | 97 | # Attention loss 98 | attn_loss = self.attn_ctc_loss(attn_logprob, in_lens, out_lens) 99 | 100 | loss = (mel_loss * self.mel_loss_scale 101 | + dur_pred_loss * self.dur_predictor_loss_scale 102 | + pitch_loss * self.pitch_predictor_loss_scale 103 | + energy_loss * self.energy_predictor_loss_scale 104 | + attn_loss * self.attn_loss_scale) 105 | 106 | meta = { 107 | 'loss': loss.clone().detach(), 108 | 'mel_loss': mel_loss.clone().detach(), 109 | 'duration_predictor_loss': dur_pred_loss.clone().detach(), 110 | 'pitch_loss': pitch_loss.clone().detach(), 111 | 'attn_loss': attn_loss.clone().detach(), 112 | 'dur_error': (torch.abs(dur_pred - dur_tgt).sum() 113 | / dur_mask.sum()).detach(), 114 | } 115 | 116 | if energy_pred is not None: 117 | meta['energy_loss'] = energy_loss.clone().detach() 118 | 119 | assert meta_agg in ('sum', 'mean') 120 | if meta_agg == 'sum': 121 | bsz = mel_out.size(0) 122 | meta = {k: v * bsz for k, v in meta.items()} 123 | return loss, meta 124 | -------------------------------------------------------------------------------- /vocoder/vocos/heads.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch import nn 5 | from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz 6 | 7 | from .spectral_ops import IMDCT, ISTFT 8 | from .modules import symexp 9 | 10 | 11 | class FourierHead(nn.Module): 12 | """Base class for inverse fourier modules.""" 13 | 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | """ 16 | Args: 17 | x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, 18 | L is the sequence length, and H denotes the model dimension. 19 | 20 | Returns: 21 | Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. 22 | """ 23 | raise NotImplementedError("Subclasses must implement the forward method.") 24 | 25 | 26 | class ISTFTHead(FourierHead): 27 | """ 28 | ISTFT Head module for predicting STFT complex coefficients. 29 | 30 | Args: 31 | dim (int): Hidden dimension of the model. 32 | n_fft (int): Size of Fourier transform. 33 | hop_length (int): The distance between neighboring sliding window frames, which should align with 34 | the resolution of the input features. 35 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 
36 | """ 37 | 38 | def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"): 39 | super().__init__() 40 | out_dim = n_fft + 2 41 | self.out = torch.nn.Linear(dim, out_dim) 42 | self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding) 43 | 44 | def forward(self, x: torch.Tensor) -> torch.Tensor: 45 | """ 46 | Forward pass of the ISTFTHead module. 47 | 48 | Args: 49 | x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, 50 | L is the sequence length, and H denotes the model dimension. 51 | 52 | Returns: 53 | Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. 54 | """ 55 | x = self.out(x).transpose(1, 2) 56 | mag, p = x.chunk(2, dim=1) 57 | mag = torch.exp(mag) 58 | mag = torch.clip(mag, max=1e2) # safeguard to prevent excessively large magnitudes 59 | # wrapping happens here. These two lines produce real and imaginary value 60 | x = torch.cos(p) 61 | y = torch.sin(p) 62 | # recalculating phase here does not produce anything new 63 | # only costs time 64 | # phase = torch.atan2(y, x) 65 | # S = mag * torch.exp(phase * 1j) 66 | # better directly produce the complex value 67 | S = mag * (x + 1j * y) 68 | audio = self.istft(S) 69 | return audio 70 | 71 | 72 | class IMDCTSymExpHead(FourierHead): 73 | """ 74 | IMDCT Head module for predicting MDCT coefficients with symmetric exponential function 75 | 76 | Args: 77 | dim (int): Hidden dimension of the model. 78 | mdct_frame_len (int): Length of the MDCT frame. 79 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 80 | sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized 81 | based on perceptual scaling. Defaults to None. 82 | clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False. 83 | """ 84 | 85 | def __init__( 86 | self, 87 | dim: int, 88 | mdct_frame_len: int, 89 | padding: str = "same", 90 | sample_rate: Optional[int] = None, 91 | clip_audio: bool = False, 92 | ): 93 | super().__init__() 94 | out_dim = mdct_frame_len // 2 95 | self.out = nn.Linear(dim, out_dim) 96 | self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding) 97 | self.clip_audio = clip_audio 98 | 99 | if sample_rate is not None: 100 | # optionally init the last layer following mel-scale 101 | m_max = _hz_to_mel(sample_rate // 2) 102 | m_pts = torch.linspace(0, m_max, out_dim) 103 | f_pts = _mel_to_hz(m_pts) 104 | scale = 1 - (f_pts / f_pts.max()) 105 | 106 | with torch.no_grad(): 107 | self.out.weight.mul_(scale.view(-1, 1)) 108 | 109 | def forward(self, x: torch.Tensor) -> torch.Tensor: 110 | """ 111 | Forward pass of the IMDCTSymExpHead module. 112 | 113 | Args: 114 | x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, 115 | L is the sequence length, and H denotes the model dimension. 116 | 117 | Returns: 118 | Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. 
119 | """ 120 | x = self.out(x) 121 | x = symexp(x) 122 | x = torch.clip(x, min=-1e2, max=1e2) # safeguard to prevent excessively large magnitudes 123 | audio = self.imdct(x) 124 | if self.clip_audio: 125 | audio = torch.clip(x, min=-1.0, max=1.0) 126 | 127 | return audio 128 | 129 | 130 | class IMDCTCosHead(FourierHead): 131 | """ 132 | IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p) 133 | 134 | Args: 135 | dim (int): Hidden dimension of the model. 136 | mdct_frame_len (int): Length of the MDCT frame. 137 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 138 | clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False. 139 | """ 140 | 141 | def __init__(self, dim: int, mdct_frame_len: int, padding: str = "same", clip_audio: bool = False): 142 | super().__init__() 143 | self.clip_audio = clip_audio 144 | self.out = nn.Linear(dim, mdct_frame_len) 145 | self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding) 146 | 147 | def forward(self, x: torch.Tensor) -> torch.Tensor: 148 | """ 149 | Forward pass of the IMDCTCosHead module. 150 | 151 | Args: 152 | x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, 153 | L is the sequence length, and H denotes the model dimension. 154 | 155 | Returns: 156 | Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. 157 | """ 158 | x = self.out(x) 159 | m, p = x.chunk(2, dim=2) 160 | m = torch.exp(m).clip(max=1e2) # safeguard to prevent excessively large magnitudes 161 | audio = self.imdct(m * torch.cos(p)) 162 | if self.clip_audio: 163 | audio = torch.clip(x, min=-1.0, max=1.0) 164 | return audio 165 | -------------------------------------------------------------------------------- /app/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | TTS Arabic 8 | 9 | 10 | 126 | 127 | 128 | 129 | 130 |

TTS Tacotron2 Arabic

166 | 167 | 168 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.utils.data import DataLoader 6 | from models.tacotron2.tacotron2_ms import Tacotron2MS 7 | 8 | from utils import get_config 9 | from utils.data import ArabDataset, text_mel_collate_fn 10 | from utils.logging import TBLogger 11 | from utils.training import * 12 | 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--config', type=str, 16 | default="configs/nawar.yaml", help="Path to yaml config file") 17 | 18 | 19 | @torch.inference_mode() 20 | def validate(model, test_loader, writer, device, n_iter): 21 | loss_sum = 0 22 | n_test_sum = 0 23 | 24 | model.eval() 25 | 26 | for batch in test_loader: 27 | text_padded, input_lengths, mel_padded, gate_padded, \ 28 | output_lengths = batch_to_device(batch, device) 29 | 30 | y_pred = model(text_padded, input_lengths, 31 | mel_padded, output_lengths, 32 | torch.zeros_like(output_lengths)) 33 | mel_out, mel_out_postnet, gate_pred, alignments = y_pred 34 | 35 | mel_loss = F.mse_loss(mel_out, mel_padded) + \ 36 | F.mse_loss(mel_out_postnet, mel_padded) 37 | gate_loss = F.binary_cross_entropy_with_logits(gate_pred, gate_padded) 38 | loss = mel_loss + gate_loss 39 | 40 | loss_sum += mel_padded.size(0)*loss.item() 41 | n_test_sum += mel_padded.size(0) 42 | 43 | val_loss = loss_sum / n_test_sum 44 | 45 | idx = random.randint(0, mel_padded.size(0) - 1) 46 | mel_infer, *_ = model.infer( 47 | text_padded[idx:idx+1], input_lengths[idx:idx+1]*0, input_lengths[idx:idx+1]) 48 | 49 | writer.add_sample( 50 | alignments[idx, :, :input_lengths[idx].item()], 51 | mel_out[idx], mel_padded[idx], mel_infer[0], 52 | output_lengths[idx], n_iter) 53 | 54 | writer.add_scalar('loss/val_loss', val_loss, n_iter) 55 | 56 | model.train() 57 | 58 | return val_loss 59 | 60 | 61 | def training_loop(model, 62 | optimizer, 63 | train_loader, 64 | test_loader, 65 | writer, 66 | device, 67 | config, 68 | n_epoch, 69 | n_iter): 70 | 71 | model.train() 72 | 73 | for epoch in range(n_epoch, config.epochs): 74 | print(f"Epoch: {epoch}") 75 | for batch in train_loader: 76 | 77 | text_padded, input_lengths, mel_padded, gate_padded, \ 78 | output_lengths = batch_to_device(batch, device) 79 | 80 | y_pred = model(text_padded, input_lengths, 81 | mel_padded, output_lengths, 82 | torch.zeros_like(output_lengths)) 83 | mel_out, mel_out_postnet, gate_out, _ = y_pred 84 | 85 | optimizer.zero_grad() 86 | 87 | # LOSS 88 | mel_loss = F.mse_loss(mel_out, mel_padded) + \ 89 | F.mse_loss(mel_out_postnet, mel_padded) 90 | gate_loss = F.binary_cross_entropy_with_logits( 91 | gate_out, gate_padded) 92 | loss = mel_loss + gate_loss 93 | 94 | loss.backward() 95 | grad_norm = torch.nn.utils.clip_grad_norm_( 96 | model.parameters(), config.grad_clip_thresh) 97 | optimizer.step() 98 | 99 | # LOGGING 100 | print(f"loss: {loss.item()}, grad_norm: {grad_norm.item()}") 101 | 102 | writer.add_training_data(loss.item(), grad_norm.item(), 103 | config.learning_rate, n_iter) 104 | 105 | if n_iter % config.n_save_states_iter == 0: 106 | save_states(f'states.pth', model, optimizer, 107 | n_iter, epoch, config) 108 | 109 | if n_iter % config.n_save_backup_iter == 0 and n_iter > 0: 110 | save_states(f'states_{n_iter}.pth', model, 111 | optimizer, n_iter, epoch, config) 112 | 113 | n_iter += 1 114 | 115 
| # VALIDATE 116 | val_loss = validate(model, test_loader, writer, device, n_iter) 117 | print(f"Validation loss: {val_loss}") 118 | 119 | save_states(f'states_{n_iter}.pth', model, 120 | optimizer, n_iter, epoch, config) 121 | 122 | 123 | def main(): 124 | args = parser.parse_args() 125 | config = get_config(args.config) 126 | 127 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 128 | 129 | # set random seed 130 | if config.random_seed != False: 131 | torch.manual_seed(config.random_seed) 132 | torch.cuda.manual_seed_all(config.random_seed) 133 | import numpy as np 134 | np.random.seed(config.random_seed) 135 | 136 | # make checkpoint folder if nonexistent 137 | if not os.path.isdir(config.checkpoint_dir): 138 | os.makedirs(os.path.abspath(config.checkpoint_dir)) 139 | print(f"Created checkpoint_dir folder: {config.checkpoint_dir}") 140 | 141 | # datasets 142 | if config.cache_dataset: 143 | print('Caching datasets ...') 144 | train_dataset = ArabDataset(config.train_labels, config.train_wavs_path, 145 | cache=config.cache_dataset) 146 | test_dataset = ArabDataset(config.test_labels, config.test_wavs_path, 147 | cache=config.cache_dataset) 148 | 149 | # optional: balanced sampling 150 | sampler, shuffle, drop_last = None, True, True 151 | if config.balanced_sampling: 152 | weights = torch.load(config.sampler_weights_file) 153 | 154 | sampler = torch.utils.data.WeightedRandomSampler( 155 | weights, len(weights), replacement=False) 156 | shuffle, drop_last = False, False 157 | 158 | # dataloaders 159 | train_loader = DataLoader(train_dataset, 160 | batch_size=config.batch_size, 161 | collate_fn=text_mel_collate_fn, 162 | shuffle=shuffle, drop_last=drop_last, 163 | sampler=sampler) 164 | 165 | test_loader = DataLoader(test_dataset, 166 | batch_size=config.batch_size, drop_last=False, 167 | shuffle=False, collate_fn=text_mel_collate_fn) 168 | 169 | # construct model 170 | model = Tacotron2MS(n_symbol=40) 171 | model = model.to(device) 172 | model.decoder.decoder_max_step = config.decoder_max_step 173 | 174 | # optimizer 175 | optimizer = torch.optim.AdamW(model.parameters(), 176 | lr=config.learning_rate, 177 | weight_decay=config.weight_decay) 178 | 179 | # resume from existing checkpoint 180 | n_epoch, n_iter = 0, 0 181 | if config.restore_model != '': 182 | state_dicts = torch.load(config.restore_model) 183 | model.load_state_dict(state_dicts['model']) 184 | if 'optim' in state_dicts: 185 | optimizer.load_state_dict(state_dicts['optim']) 186 | if 'epoch' in state_dicts: 187 | n_epoch = state_dicts['epoch'] 188 | if 'iter' in state_dicts: 189 | n_iter = state_dicts['iter'] 190 | 191 | # tensorboard writer 192 | writer = TBLogger(config.log_dir) 193 | 194 | # start training 195 | training_loop(model, 196 | optimizer, 197 | train_loader, 198 | test_loader, 199 | writer, 200 | device, 201 | config, 202 | n_epoch, 203 | n_iter) 204 | 205 | 206 | if __name__ == '__main__': 207 | main() 208 | -------------------------------------------------------------------------------- /scripts/train_fp_adv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from torch.utils.tensorboard import SummaryWriter 7 | 8 | from models.fastpitch import net_config 9 | from models.fastpitch.fastpitch.model import FastPitch 10 | from models.fastpitch.fastpitch.data_function import (TTSCollate, batch_to_gpu) 11 | from models.fastpitch.fastpitch.loss_function import 
FastPitchLoss 12 | from models.fastpitch.fastpitch.attn_loss_function import AttentionBinarizationLoss 13 | from models.common.loss import (PatchDiscriminator, 14 | calc_feature_match_loss, 15 | extract_chunks) 16 | from utils.data import DynBatchDataset 17 | from utils import get_config 18 | from utils.training import save_states_gan as save_states 19 | # %% 20 | 21 | try: 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--config', type=str, 24 | default="configs/nawar_fp_adv.yaml", help="Path to yaml config file") 25 | args = parser.parse_args() 26 | config_path = args.config 27 | except: 28 | config_path = './configs/nawar_fp_adv.yaml' 29 | 30 | # %% 31 | 32 | config = get_config(config_path) 33 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 34 | 35 | # make checkpoint folder if nonexistent 36 | if not os.path.isdir(config.checkpoint_dir): 37 | os.makedirs(os.path.abspath(config.checkpoint_dir)) 38 | print(f"Created checkpoint folder @ {config.checkpoint_dir}") 39 | 40 | 41 | train_dataset = DynBatchDataset( 42 | txtpath=config.train_labels, 43 | wavpath=config.train_wavs_path, 44 | label_pattern=config.label_pattern, 45 | f0_dict_path=config.f0_dict_path, 46 | f0_mean=config.f0_mean, f0_std=config.f0_std, 47 | max_lengths=config.max_lengths, 48 | batch_sizes=config.batch_sizes, 49 | ) 50 | 51 | # %% 52 | 53 | collate_fn = TTSCollate() 54 | 55 | config.batch_size = 1 56 | sampler, shuffle, drop_last = None, True, True 57 | train_loader = DataLoader(train_dataset, 58 | batch_size=config.batch_size, 59 | collate_fn=lambda x: collate_fn(x[0]), 60 | shuffle=shuffle, drop_last=drop_last, 61 | sampler=sampler) 62 | 63 | # %% Generator 64 | 65 | model = FastPitch(**net_config).to(device) 66 | 67 | optimizer = torch.optim.AdamW(model.parameters(), 68 | lr=config.g_lr, 69 | betas=(config.g_beta1, config.g_beta2), 70 | weight_decay=config.weight_decay) 71 | 72 | criterion = FastPitchLoss() 73 | attention_kl_loss = AttentionBinarizationLoss() 74 | 75 | # %% Discriminator 76 | 77 | critic = PatchDiscriminator(1, 32).to(device) 78 | 79 | optimizer_d = torch.optim.AdamW(critic.parameters(), 80 | lr=config.d_lr, 81 | betas=(config.d_beta1, config.d_beta2), 82 | weight_decay=config.weight_decay) 83 | chunk_len = 128 84 | 85 | # %% 86 | # resume from existing checkpoint 87 | n_epoch, n_iter = 0, 0 88 | 89 | if config.restore_model != '': 90 | state_dicts = torch.load(config.restore_model) 91 | model.load_state_dict(state_dicts['model']) 92 | if 'model_d' in state_dicts: 93 | critic.load_state_dict(state_dicts['model_d'], strict=False) 94 | if 'optim' in state_dicts: 95 | optimizer.load_state_dict(state_dicts['optim']) 96 | if 'optim_d' in state_dicts: 97 | optimizer_d.load_state_dict(state_dicts['optim_d']) 98 | if 'epoch' in state_dicts: 99 | n_epoch = state_dicts['epoch'] 100 | if 'iter' in state_dicts: 101 | n_iter = state_dicts['iter'] 102 | else: 103 | # from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dle/models/fastpitch__pyt_ckpt 104 | model_sd = torch.load('G:/models/fastpitch/nvidia_fastpitch_210824+cfg.pt') 105 | model.load_state_dict( 106 | {k.removeprefix('module.'): v for k, v in model_sd['state_dict'].items()}) 107 | 108 | # %% 109 | writer = SummaryWriter(config.log_dir) 110 | 111 | # %% TRAINING LOOP 112 | 113 | model.train() 114 | 115 | for epoch in range(n_epoch, config.epochs): 116 | train_dataset.shuffle() 117 | for batch in train_loader: 118 | 119 | x, y, _ = batch_to_gpu(batch) 120 | 121 | y_pred = model(x) 122 | 123 | mel_out, *_, 
attn_soft, attn_hard, _, _ = y_pred 124 | _, _, mel_padded, output_lengths, *_ = x 125 | 126 | # extract chunks for critic 127 | Nchunks = mel_out.size(0) 128 | tar_len_ = min(output_lengths.min().item(), chunk_len) 129 | mel_ids = torch.randint(0, mel_out.size(0), (Nchunks,)).cuda(non_blocking=True) 130 | ofx_perc = torch.rand(Nchunks).cuda(non_blocking=True) 131 | out_lens = output_lengths[mel_ids] 132 | 133 | ofx = (ofx_perc * (out_lens + tar_len_) - tar_len_/2) \ 134 | .clamp(out_lens*0, out_lens - tar_len_).long() 135 | 136 | chunks_org = extract_chunks(mel_padded, ofx, mel_ids, tar_len_) # mel_padded: B F T 137 | chunks_gen = extract_chunks(mel_out.transpose(1,2), ofx, mel_ids, tar_len_) # mel_out: B T F 138 | 139 | chunks_org_ = (chunks_org.unsqueeze(1) + 4.5) / 2.5 140 | chunks_gen_ = (chunks_gen.unsqueeze(1) + 4.5) / 2.5 141 | 142 | # discriminator step 143 | d_org, fmaps_org = critic(chunks_org_.requires_grad_(True)) 144 | d_gen, _ = critic(chunks_gen_.detach()) 145 | 146 | loss_d = 0.5*(d_org - 1).square().mean() + 0.5*d_gen.square().mean() 147 | 148 | critic.zero_grad() 149 | loss_d.backward() 150 | optimizer_d.step() 151 | 152 | # generator step 153 | loss, meta = criterion(y_pred, y) 154 | 155 | d_gen2, fmaps_gen = critic(chunks_gen_) 156 | loss_score = (d_gen2 - 1).square().mean() 157 | loss_fmatch = calc_feature_match_loss(fmaps_gen, fmaps_org) 158 | 159 | loss += config.gan_loss_weight * loss_score 160 | loss += config.feat_loss_weight * loss_fmatch 161 | 162 | binarization_loss = attention_kl_loss(attn_hard, attn_soft) 163 | loss += 1.0 * binarization_loss 164 | 165 | optimizer.zero_grad() 166 | loss.backward() 167 | grad_norm = torch.nn.utils.clip_grad_norm_( 168 | model.parameters(), 1000.) 169 | optimizer.step() 170 | 171 | # LOGGING 172 | meta['loss_d'] = loss_d.clone().detach() 173 | meta['score'] = loss_score.clone().detach() 174 | meta['fmatch'] = loss_fmatch.clone().detach() 175 | meta['kl_loss'] = binarization_loss.clone().detach() 176 | 177 | print(f"loss: {meta['loss'].item()} gnorm: {grad_norm}") 178 | 179 | for k, v in meta.items(): 180 | writer.add_scalar(f'train/{k}', v.item(), n_iter) 181 | 182 | if n_iter % config.n_save_states_iter == 0: 183 | save_states(f'states.pth', model, critic, 184 | optimizer, optimizer_d, n_iter, 185 | epoch, net_config, config) 186 | 187 | if n_iter % config.n_save_backup_iter == 0 and n_iter > 0: 188 | save_states(f'states_{n_iter}.pth', model, critic, 189 | optimizer, optimizer_d, n_iter, 190 | epoch, net_config, config) 191 | 192 | n_iter += 1 193 | 194 | 195 | save_states(f'states.pth', model, critic, 196 | optimizer, optimizer_d, n_iter, 197 | epoch, net_config, config) 198 | 199 | 200 | # %% 201 | 202 | # (mel_out, 0 203 | # dec_mask, 1 204 | # dur_pred, 2 205 | # log_dur_pred, 3 206 | # pitch_pred, 4 207 | # pitch_tgt, 5 208 | # energy_pred, 6 209 | # energy_tgt, 7 210 | # attn_soft, 8 211 | # attn_hard, 9 212 | # attn_dur, 10 213 | # attn_logprob, 11 214 | # ) = model_out 215 | 216 | # x = [text_padded, input_lengths, mel_padded, output_lengths, 217 | # pitch_padded, energy_padded, speaker, attn_prior, audiopaths] 218 | 219 | # y = [mel_padded, input_lengths, output_lengths] 220 | -------------------------------------------------------------------------------- /scripts/train_tc2_adv.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import argparse 3 | import os 4 | import torch 5 | from torch.utils.data import DataLoader 6 | from models.tacotron2.tacotron2_ms import 
Tacotron2MS 7 | 8 | from utils import get_config 9 | from utils.data import ArabDataset, text_mel_collate_fn 10 | from utils.logging import TBLogger 11 | from utils.training import batch_to_device, save_states_gan as save_states 12 | 13 | from models.common.loss import PatchDiscriminator, extract_chunks, calc_feature_match_loss 14 | from models.tacotron2.loss import Tacotron2Loss 15 | 16 | # %% 17 | 18 | try: 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--config', type=str, 21 | default="configs/nawar_tc2_adv.yaml", help="Path to yaml config file") 22 | args = parser.parse_args() 23 | config_path = args.config 24 | except: 25 | config_path = './configs/nawar_tc2_adv.yaml' 26 | 27 | # %% 28 | 29 | config = get_config(config_path) 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | # set random seed 33 | if config.random_seed != False: 34 | torch.manual_seed(config.random_seed) 35 | torch.cuda.manual_seed_all(config.random_seed) 36 | import numpy as np 37 | np.random.seed(config.random_seed) 38 | 39 | # make checkpoint folder if nonexistent 40 | if not os.path.isdir(config.checkpoint_dir): 41 | os.makedirs(os.path.abspath(config.checkpoint_dir)) 42 | print(f"Created checkpoint_dir folder: {config.checkpoint_dir}") 43 | 44 | # datasets 45 | train_dataset = ArabDataset(txtpath=config.train_labels, 46 | wavpath=config.train_wavs_path, 47 | label_pattern=config.label_pattern) 48 | # test_dataset = ArabDataset(config.test_labels, config.test_wavs_path) 49 | 50 | # optional: balanced sampling 51 | sampler, shuffle, drop_last = None, True, True 52 | if config.balanced_sampling: 53 | weights = torch.load(config.sampler_weights_file) 54 | 55 | sampler = torch.utils.data.WeightedRandomSampler( 56 | weights, len(weights), replacement=False) 57 | shuffle, drop_last = False, False 58 | 59 | # dataloaders 60 | train_loader = DataLoader(train_dataset, 61 | batch_size=config.batch_size, 62 | collate_fn=text_mel_collate_fn, 63 | shuffle=shuffle, drop_last=drop_last, 64 | sampler=sampler) 65 | 66 | # test_loader = DataLoader(test_dataset, 67 | # batch_size=config.batch_size, drop_last=False, 68 | # shuffle=False, collate_fn=text_mel_collate_fn) 69 | 70 | # %% Generator 71 | model = Tacotron2MS(n_symbol=40, num_speakers=40) 72 | model = model.to(device) 73 | model.decoder.decoder_max_step = config.decoder_max_step 74 | 75 | optimizer = torch.optim.AdamW(model.parameters(), 76 | lr=config.g_lr, 77 | betas=(config.g_beta1, config.g_beta2), 78 | weight_decay=config.weight_decay) 79 | criterion = Tacotron2Loss(mel_loss_scale=1.0) 80 | 81 | # %% Discriminator 82 | critic = PatchDiscriminator(1, 32).to(device) 83 | 84 | optimizer_d = torch.optim.AdamW(critic.parameters(), 85 | lr=config.d_lr, 86 | betas=(config.d_beta1, config.d_beta2), 87 | weight_decay=config.weight_decay) 88 | tar_len = 128 89 | 90 | # %% 91 | # resume from existing checkpoint 92 | n_epoch, n_iter = 0, 0 93 | 94 | if config.restore_model != '': 95 | state_dicts = torch.load(config.restore_model) 96 | model.load_state_dict(state_dicts['model']) 97 | if 'model_d' in state_dicts: 98 | critic.load_state_dict(state_dicts['model_d'], strict=False) 99 | if 'optim' in state_dicts: 100 | optimizer.load_state_dict(state_dicts['optim']) 101 | if 'optim_d' in state_dicts: 102 | optimizer_d.load_state_dict(state_dicts['optim_d']) 103 | if 'epoch' in state_dicts: 104 | n_epoch = state_dicts['epoch'] 105 | if 'iter' in state_dicts: 106 | n_iter = state_dicts['iter'] 107 | 108 | # %% 109 | # tensorboard 
writer 110 | writer = TBLogger(config.log_dir) 111 | 112 | # %% 113 | 114 | def trunc_batch(batch, N): 115 | return (batch[0][:N], batch[1][:N], batch[2][:N], 116 | batch[3][:N], batch[4][:N]) 117 | 118 | # %% TRAINING LOOP 119 | 120 | model.train() 121 | 122 | for epoch in range(n_epoch, config.epochs): 123 | print(f"Epoch: {epoch}") 124 | for batch in train_loader: 125 | 126 | if batch[-1][0] > 2000: 127 | batch = trunc_batch(batch, 6) 128 | 129 | text_padded, input_lengths, mel_padded, gate_padded, \ 130 | output_lengths = batch_to_device(batch, device) 131 | 132 | y_pred = model(text_padded, input_lengths, 133 | mel_padded, output_lengths, 134 | torch.zeros_like(output_lengths)) 135 | mel_out, mel_out_postnet, gate_out, alignments = y_pred 136 | 137 | # extract chunks for critic 138 | Nchunks = mel_out.size(0) 139 | tar_len_ = min(output_lengths.min().item(), tar_len) 140 | mel_ids = torch.randint(0, mel_out.size(0), (Nchunks,)).cuda(non_blocking=True) 141 | ofx_perc = torch.rand(Nchunks).cuda(non_blocking=True) 142 | out_lens = output_lengths[mel_ids] 143 | 144 | ofx = (ofx_perc * (out_lens + tar_len_) - tar_len_/2) \ 145 | .clamp(out_lens*0, out_lens - tar_len_).long() 146 | 147 | chunks_org = extract_chunks( 148 | mel_padded, ofx, mel_ids, tar_len_) # mel_padded: B F T 149 | chunks_gen = extract_chunks( 150 | mel_out_postnet, ofx, mel_ids, tar_len_) # mel_out_postnet: B F T 151 | 152 | chunks_org_ = (chunks_org.unsqueeze(1) + 4.5) / 2.5 153 | chunks_gen_ = (chunks_gen.unsqueeze(1) + 4.5) / 2.5 154 | 155 | # DISCRIMINATOR 156 | d_org, fmaps_org = critic(chunks_org_.requires_grad_(True)) 157 | d_gen, _ = critic(chunks_gen_.detach()) 158 | 159 | loss_d = 0.5*(d_org - 1).square().mean() + 0.5*d_gen.square().mean() 160 | 161 | critic.zero_grad() 162 | loss_d.backward() 163 | optimizer_d.step() 164 | 165 | # GENERATOR 166 | loss, meta = criterion(mel_out, mel_out_postnet, mel_padded, 167 | gate_out, gate_padded) 168 | 169 | d_gen2, fmaps_gen = critic(chunks_gen_) 170 | loss_score = (d_gen2 - 1).square().mean() 171 | loss_fmatch = calc_feature_match_loss(fmaps_gen, fmaps_org) 172 | 173 | loss += config.gan_loss_weight * loss_score 174 | loss += config.feat_loss_weight * loss_fmatch 175 | 176 | optimizer.zero_grad() 177 | loss.backward() 178 | grad_norm = torch.nn.utils.clip_grad_norm_( 179 | model.parameters(), config.grad_clip_thresh) 180 | optimizer.step() 181 | 182 | # LOGGING 183 | meta['score'] = loss_score.clone().detach() 184 | meta['fmatch'] = loss_fmatch.clone().detach() 185 | meta['loss'] = loss.clone().detach() 186 | 187 | print(f"loss: {loss.item()}, grad_norm: {grad_norm.item()}") 188 | 189 | writer.add_training_data(meta, grad_norm.item(), 190 | config.learning_rate, n_iter) 191 | 192 | 193 | if n_iter % config.n_save_states_iter == 0: 194 | save_states(f'states.pth', model, critic, 195 | optimizer, optimizer_d, n_iter, 196 | epoch, None, config) 197 | 198 | if n_iter % config.n_save_backup_iter == 0 and n_iter > 0: 199 | save_states(f'states_{n_iter}.pth', model, critic, 200 | optimizer, optimizer_d, n_iter, 201 | epoch, None, config) 202 | 203 | n_iter += 1 204 | 205 | # VALIDATE 206 | # val_loss = validate(model, test_loader, writer, device, n_iter) 207 | # print(f"Validation loss: {val_loss}") 208 | 209 | 210 | save_states(f'states.pth', model, critic, 211 | optimizer, optimizer_d, n_iter, 212 | epoch, None, config) 213 | 214 | 215 | # %% 216 | -------------------------------------------------------------------------------- /models/diacritizers/shakkelha/symbols.py: 
-------------------------------------------------------------------------------- 1 | ARABIC_LETTERS_LIST = 'ءآأؤإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىي' 2 | 3 | DIACRITICS_LIST = ['َ', 'ً', 'ِ', 'ٍ', 'ُ', 'ٌ', 'ْ', 'ّ'] 4 | 5 | RNN_BIG_CHARACTERS_MAPPING = {'': 0, 6 | '': 1, 7 | '': 2, 8 | '': 3, 9 | '\n': 4, 10 | ' ': 5, 11 | '!': 6, 12 | '"': 7, 13 | '&': 8, 14 | "'": 9, 15 | '(': 10, 16 | ')': 11, 17 | '*': 12, 18 | '+': 13, 19 | ',': 14, 20 | '-': 15, 21 | '.': 16, 22 | '/': 17, 23 | '0': 18, 24 | '1': 19, 25 | '2': 20, 26 | '3': 21, 27 | '4': 22, 28 | '5': 23, 29 | '6': 24, 30 | '7': 25, 31 | '8': 26, 32 | '9': 27, 33 | ':': 28, 34 | ';': 29, 35 | '=': 30, 36 | '[': 31, 37 | ']': 32, 38 | '_': 33, 39 | '`': 34, 40 | '{': 35, 41 | '}': 36, 42 | '~': 37, 43 | '«': 38, 44 | '»': 39, 45 | '،': 40, 46 | '؛': 41, 47 | '؟': 42, 48 | 'ء': 43, 49 | 'آ': 44, 50 | 'أ': 45, 51 | 'ؤ': 46, 52 | 'إ': 47, 53 | 'ئ': 48, 54 | 'ا': 49, 55 | 'ب': 50, 56 | 'ة': 51, 57 | 'ت': 52, 58 | 'ث': 53, 59 | 'ج': 54, 60 | 'ح': 55, 61 | 'خ': 56, 62 | 'د': 57, 63 | 'ذ': 58, 64 | 'ر': 59, 65 | 'ز': 60, 66 | 'س': 61, 67 | 'ش': 62, 68 | 'ص': 63, 69 | 'ض': 64, 70 | 'ط': 65, 71 | 'ظ': 66, 72 | 'ع': 67, 73 | 'غ': 68, 74 | 'ف': 69, 75 | 'ق': 70, 76 | 'ك': 71, 77 | 'ل': 72, 78 | 'م': 73, 79 | 'ن': 74, 80 | 'ه': 75, 81 | 'و': 76, 82 | 'ى': 77, 83 | 'ي': 78, 84 | '٠': 79, 85 | '١': 80, 86 | '٢': 81, 87 | '٤': 82, 88 | '\u200d': 83, 89 | '\u200f': 84, 90 | '–': 85, 91 | '’': 86, 92 | '“': 87, 93 | '…': 88, 94 | '﴾': 89, 95 | '﴿': 90} 96 | 97 | RNN_SMALL_CHARACTERS_MAPPING = {'': 0, 98 | '': 1, 99 | '': 2, 100 | '': 3, 101 | '\n': 4, 102 | ' ': 5, 103 | '!': 6, 104 | '"': 7, 105 | "'": 8, 106 | '(': 9, 107 | ')': 10, 108 | '*': 11, 109 | ',': 12, 110 | '-': 13, 111 | '.': 14, 112 | '/': 15, 113 | '0': 16, 114 | '1': 17, 115 | '2': 18, 116 | '3': 19, 117 | '4': 20, 118 | '5': 21, 119 | '6': 22, 120 | '7': 23, 121 | '8': 24, 122 | '9': 25, 123 | ':': 26, 124 | ';': 27, 125 | '[': 28, 126 | ']': 29, 127 | '`': 30, 128 | '{': 31, 129 | '}': 32, 130 | '~': 33, 131 | '«': 34, 132 | '»': 35, 133 | '،': 36, 134 | '؛': 37, 135 | '؟': 38, 136 | 'ء': 39, 137 | 'آ': 40, 138 | 'أ': 41, 139 | 'ؤ': 42, 140 | 'إ': 43, 141 | 'ئ': 44, 142 | 'ا': 45, 143 | 'ب': 46, 144 | 'ة': 47, 145 | 'ت': 48, 146 | 'ث': 49, 147 | 'ج': 50, 148 | 'ح': 51, 149 | 'خ': 52, 150 | 'د': 53, 151 | 'ذ': 54, 152 | 'ر': 55, 153 | 'ز': 56, 154 | 'س': 57, 155 | 'ش': 58, 156 | 'ص': 59, 157 | 'ض': 60, 158 | 'ط': 61, 159 | 'ظ': 62, 160 | 'ع': 63, 161 | 'غ': 64, 162 | 'ف': 65, 163 | 'ق': 66, 164 | 'ك': 67, 165 | 'ل': 68, 166 | 'م': 69, 167 | 'ن': 70, 168 | 'ه': 71, 169 | 'و': 72, 170 | 'ى': 73, 171 | 'ي': 74, 172 | '\u200f': 75, 173 | '–': 76} 174 | 175 | RNN_REV_CLASSES_MAPPING = {0: '', 176 | 1: 'َ', 177 | 2: 'ً', 178 | 3: 'ُ', 179 | 4: 'ٌ', 180 | 5: 'ِ', 181 | 6: 'ٍ', 182 | 7: 'ْ', 183 | 8: 'ّ', 184 | 9: 'َّ', 185 | 10: 'ًّ', 186 | 11: 'ُّ', 187 | 12: 'ٌّ', 188 | 13: 'ِّ', 189 | 14: 'ٍّ', 190 | 15: '', 191 | 16: '', 192 | 17: '', 193 | 18: ''} 194 | -------------------------------------------------------------------------------- /vocoder/vocos/modules.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | 7 | 8 | class ConvNeXtBlock(nn.Module): 9 | """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal. 10 | 11 | Args: 12 | dim (int): Number of input channels. 
13 | intermediate_dim (int): Dimensionality of the intermediate layer. 14 | layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. 15 | Defaults to None. 16 | adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm. 17 | None means non-conditional LayerNorm. Defaults to None. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | dim: int, 23 | intermediate_dim: int, 24 | layer_scale_init_value: float, 25 | adanorm_num_embeddings: Optional[int] = None, 26 | ): 27 | super().__init__() 28 | self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv 29 | self.adanorm = adanorm_num_embeddings is not None 30 | if adanorm_num_embeddings: 31 | self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6) 32 | else: 33 | self.norm = nn.LayerNorm(dim, eps=1e-6) 34 | self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers 35 | self.act = nn.GELU() 36 | self.pwconv2 = nn.Linear(intermediate_dim, dim) 37 | self.gamma = ( 38 | nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) 39 | if layer_scale_init_value > 0 40 | else None 41 | ) 42 | 43 | def forward(self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None) -> torch.Tensor: 44 | residual = x 45 | x = self.dwconv(x) 46 | x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) 47 | if self.adanorm: 48 | assert cond_embedding_id is not None 49 | x = self.norm(x, cond_embedding_id) 50 | else: 51 | x = self.norm(x) 52 | x = self.pwconv1(x) 53 | x = self.act(x) 54 | x = self.pwconv2(x) 55 | if self.gamma is not None: 56 | x = self.gamma * x 57 | x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) 58 | 59 | x = residual + x 60 | return x 61 | 62 | 63 | class AdaLayerNorm(nn.Module): 64 | """ 65 | Adaptive Layer Normalization module with learnable embeddings per `num_embeddings` classes 66 | 67 | Args: 68 | num_embeddings (int): Number of embeddings. 69 | embedding_dim (int): Dimension of the embeddings. 70 | """ 71 | 72 | def __init__(self, num_embeddings: int, embedding_dim: int, eps: float = 1e-6): 73 | super().__init__() 74 | self.eps = eps 75 | self.dim = embedding_dim 76 | self.scale = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim) 77 | self.shift = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim) 78 | torch.nn.init.ones_(self.scale.weight) 79 | torch.nn.init.zeros_(self.shift.weight) 80 | 81 | def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor) -> torch.Tensor: 82 | scale = self.scale(cond_embedding_id) 83 | shift = self.shift(cond_embedding_id) 84 | x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps) 85 | x = x * scale + shift 86 | return x 87 | 88 | 89 | class ResBlock1(nn.Module): 90 | """ 91 | ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions, 92 | but without upsampling layers. 93 | 94 | Args: 95 | dim (int): Number of input channels. 96 | kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3. 97 | dilation (tuple[int], optional): Dilation factors for the dilated convolutions. 98 | Defaults to (1, 3, 5). 99 | lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function. 100 | Defaults to 0.1. 101 | layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling. 102 | Defaults to None. 
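A minimal shape check for the ConvNeXtBlock defined above (hypothetical channel sizes, not taken from any config in this repo):

import torch
from vocoder.vocos.modules import ConvNeXtBlock

block = ConvNeXtBlock(dim=512, intermediate_dim=1536, layer_scale_init_value=1e-6)
x = torch.randn(2, 512, 100)  # (B, C, T) frame-rate features
y = block(x)                  # depthwise conv + pointwise MLP + layer scale + residual
assert y.shape == x.shape     # the block preserves both channel and time dimensions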
103 | """ 104 | 105 | def __init__( 106 | self, 107 | dim: int, 108 | kernel_size: int = 3, 109 | dilation: Tuple[int, int, int] = (1, 3, 5), 110 | lrelu_slope: float = 0.1, 111 | layer_scale_init_value: Optional[float] = None, 112 | ): 113 | super().__init__() 114 | self.lrelu_slope = lrelu_slope 115 | self.convs1 = nn.ModuleList( 116 | [ 117 | weight_norm( 118 | nn.Conv1d( 119 | dim, 120 | dim, 121 | kernel_size, 122 | 1, 123 | dilation=dilation[0], 124 | padding=self.get_padding(kernel_size, dilation[0]), 125 | ) 126 | ), 127 | weight_norm( 128 | nn.Conv1d( 129 | dim, 130 | dim, 131 | kernel_size, 132 | 1, 133 | dilation=dilation[1], 134 | padding=self.get_padding(kernel_size, dilation[1]), 135 | ) 136 | ), 137 | weight_norm( 138 | nn.Conv1d( 139 | dim, 140 | dim, 141 | kernel_size, 142 | 1, 143 | dilation=dilation[2], 144 | padding=self.get_padding(kernel_size, dilation[2]), 145 | ) 146 | ), 147 | ] 148 | ) 149 | 150 | self.convs2 = nn.ModuleList( 151 | [ 152 | weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))), 153 | weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))), 154 | weight_norm(nn.Conv1d(dim, dim, kernel_size, 1, dilation=1, padding=self.get_padding(kernel_size, 1))), 155 | ] 156 | ) 157 | 158 | self.gamma = nn.ParameterList( 159 | [ 160 | nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True) 161 | if layer_scale_init_value is not None 162 | else None, 163 | nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True) 164 | if layer_scale_init_value is not None 165 | else None, 166 | nn.Parameter(layer_scale_init_value * torch.ones(dim, 1), requires_grad=True) 167 | if layer_scale_init_value is not None 168 | else None, 169 | ] 170 | ) 171 | 172 | def forward(self, x: torch.Tensor) -> torch.Tensor: 173 | for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma): 174 | xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope) 175 | xt = c1(xt) 176 | xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope) 177 | xt = c2(xt) 178 | if gamma is not None: 179 | xt = gamma * xt 180 | x = xt + x 181 | return x 182 | 183 | def remove_weight_norm(self): 184 | for l in self.convs1: 185 | remove_weight_norm(l) 186 | for l in self.convs2: 187 | remove_weight_norm(l) 188 | 189 | @staticmethod 190 | def get_padding(kernel_size: int, dilation: int = 1) -> int: 191 | return int((kernel_size * dilation - dilation) / 2) 192 | 193 | 194 | def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor: 195 | """ 196 | Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values. 197 | 198 | Args: 199 | x (Tensor): Input tensor. 200 | clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7. 201 | 202 | Returns: 203 | Tensor: Element-wise logarithm of the input tensor with clipping applied. 
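ResBlock1 above is likewise length-preserving, since get_padding(kernel_size, dilation) pads each dilated convolution to "same" length. A small sketch with hypothetical sizes:

import torch
from vocoder.vocos.modules import ResBlock1

block = ResBlock1(dim=256)    # defaults: kernel_size=3, dilation=(1, 3, 5), no layer scale
x = torch.randn(1, 256, 200)
y = block(x)
assert y.shape == x.shape     # each conv uses padding (k*d - d) // 2, so T stays fixed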
204 | """ 205 | return torch.log(torch.clip(x, min=clip_val)) 206 | 207 | 208 | def symlog(x: torch.Tensor) -> torch.Tensor: 209 | return torch.sign(x) * torch.log1p(x.abs()) 210 | 211 | 212 | def symexp(x: torch.Tensor) -> torch.Tensor: 213 | return torch.sign(x) * (torch.exp(x.abs()) - 1) 214 | -------------------------------------------------------------------------------- /vocoder/vocos/spectral_ops.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import torch 4 | from torch import nn, view_as_real, view_as_complex 5 | 6 | 7 | class ISTFT(nn.Module): 8 | """ 9 | Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with 10 | windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges. 11 | See issue: https://github.com/pytorch/pytorch/issues/62323 12 | Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs. 13 | The NOLA constraint is met as we trim padded samples anyway. 14 | 15 | Args: 16 | n_fft (int): Size of Fourier transform. 17 | hop_length (int): The distance between neighboring sliding window frames. 18 | win_length (int): The size of window frame and STFT filter. 19 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 20 | """ 21 | 22 | def __init__(self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"): 23 | super().__init__() 24 | if padding not in ["center", "same"]: 25 | raise ValueError("Padding must be 'center' or 'same'.") 26 | self.padding = padding 27 | self.n_fft = n_fft 28 | self.hop_length = hop_length 29 | self.win_length = win_length 30 | window = torch.hann_window(win_length) 31 | self.register_buffer("window", window) 32 | 33 | def forward(self, spec: torch.Tensor) -> torch.Tensor: 34 | """ 35 | Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram. 36 | 37 | Args: 38 | spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size, 39 | N is the number of frequency bins, and T is the number of time frames. 40 | 41 | Returns: 42 | Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal. 
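symlog and symexp at the end of modules.py are exact inverses of each other. A quick check, not part of the file:

import torch
from vocoder.vocos.modules import symlog, symexp

x = torch.linspace(-5.0, 5.0, steps=11)
assert torch.allclose(symexp(symlog(x)), x, atol=1e-6)  # sign(x) * (exp(log1p(|x|)) - 1) == x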
43 | """ 44 | if self.padding == "center": 45 | # Fallback to pytorch native implementation 46 | return torch.istft(spec, self.n_fft, self.hop_length, self.win_length, self.window, center=True) 47 | elif self.padding == "same": 48 | pad = (self.win_length - self.hop_length) // 2 49 | else: 50 | raise ValueError("Padding must be 'center' or 'same'.") 51 | 52 | assert spec.dim() == 3, "Expected a 3D tensor as input" 53 | B, N, T = spec.shape 54 | 55 | # Inverse FFT 56 | ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward") 57 | ifft = ifft * self.window[None, :, None] 58 | 59 | # Overlap and Add 60 | output_size = (T - 1) * self.hop_length + self.win_length 61 | y = torch.nn.functional.fold( 62 | ifft, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), 63 | )[:, 0, 0, pad:-pad] 64 | 65 | # Window envelope 66 | window_sq = self.window.square().expand(1, T, -1).transpose(1, 2) 67 | window_envelope = torch.nn.functional.fold( 68 | window_sq, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), 69 | ).squeeze()[pad:-pad] 70 | 71 | # Normalize 72 | assert (window_envelope > 1e-11).all() 73 | y = y / window_envelope 74 | 75 | return y 76 | 77 | 78 | class MDCT(nn.Module): 79 | """ 80 | Modified Discrete Cosine Transform (MDCT) module. 81 | 82 | Args: 83 | frame_len (int): Length of the MDCT frame. 84 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 85 | """ 86 | 87 | def __init__(self, frame_len: int, padding: str = "same"): 88 | super().__init__() 89 | if padding not in ["center", "same"]: 90 | raise ValueError("Padding must be 'center' or 'same'.") 91 | self.padding = padding 92 | self.frame_len = frame_len 93 | N = frame_len // 2 94 | n0 = (N + 1) / 2 95 | window = torch.from_numpy(scipy.signal.cosine(frame_len)).float() 96 | self.register_buffer("window", window) 97 | 98 | pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len) 99 | post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N) 100 | # view_as_real: NCCL Backend does not support ComplexFloat data type 101 | # https://github.com/pytorch/pytorch/issues/71613 102 | self.register_buffer("pre_twiddle", view_as_real(pre_twiddle)) 103 | self.register_buffer("post_twiddle", view_as_real(post_twiddle)) 104 | 105 | def forward(self, audio: torch.Tensor) -> torch.Tensor: 106 | """ 107 | Apply the Modified Discrete Cosine Transform (MDCT) to the input audio. 108 | 109 | Args: 110 | audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size 111 | and T is the length of the audio. 112 | 113 | Returns: 114 | Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames 115 | and N is the number of frequency bins. 
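With "same" padding, MDCT produces one frame per hop (hop = frame_len // 2), and the IMDCT defined further below restores the original number of samples. A shape-only sketch with hypothetical sizes (no claim about sample-exact reconstruction at the padded edges):

import torch
from vocoder.vocos.spectral_ops import MDCT, IMDCT

mdct, imdct = MDCT(frame_len=512), IMDCT(frame_len=512)
audio = torch.randn(1, 256 * 40)  # length chosen as a multiple of the hop (256)
coeffs = mdct(audio)              # (B, L, N) = (1, 40, 256)
recon = imdct(coeffs)             # back to (1, 256 * 40) after trimming the "same" padding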
116 | """ 117 | if self.padding == "center": 118 | audio = torch.nn.functional.pad(audio, (self.frame_len // 2, self.frame_len // 2)) 119 | elif self.padding == "same": 120 | # hop_length is 1/2 frame_len 121 | audio = torch.nn.functional.pad(audio, (self.frame_len // 4, self.frame_len // 4)) 122 | else: 123 | raise ValueError("Padding must be 'center' or 'same'.") 124 | 125 | x = audio.unfold(-1, self.frame_len, self.frame_len // 2) 126 | N = self.frame_len // 2 127 | x = x * self.window.expand(x.shape) 128 | X = torch.fft.fft(x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1)[..., :N] 129 | res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N) 130 | return torch.real(res) * np.sqrt(2) 131 | 132 | 133 | class IMDCT(nn.Module): 134 | """ 135 | Inverse Modified Discrete Cosine Transform (IMDCT) module. 136 | 137 | Args: 138 | frame_len (int): Length of the MDCT frame. 139 | padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 140 | """ 141 | 142 | def __init__(self, frame_len: int, padding: str = "same"): 143 | super().__init__() 144 | if padding not in ["center", "same"]: 145 | raise ValueError("Padding must be 'center' or 'same'.") 146 | self.padding = padding 147 | self.frame_len = frame_len 148 | N = frame_len // 2 149 | n0 = (N + 1) / 2 150 | window = torch.from_numpy(scipy.signal.cosine(frame_len)).float() 151 | self.register_buffer("window", window) 152 | 153 | pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N) 154 | post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2)) 155 | self.register_buffer("pre_twiddle", view_as_real(pre_twiddle)) 156 | self.register_buffer("post_twiddle", view_as_real(post_twiddle)) 157 | 158 | def forward(self, X: torch.Tensor) -> torch.Tensor: 159 | """ 160 | Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients. 161 | 162 | Args: 163 | X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size, 164 | L is the number of frames, and N is the number of frequency bins. 165 | 166 | Returns: 167 | Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio. 168 | """ 169 | B, L, N = X.shape 170 | Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device) 171 | Y[..., :N] = X 172 | Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,))) 173 | y = torch.fft.ifft(Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1) 174 | y = torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2) 175 | result = y * self.window.expand(y.shape) 176 | output_size = (1, (L + 1) * N) 177 | audio = torch.nn.functional.fold( 178 | result.transpose(1, 2), 179 | output_size=output_size, 180 | kernel_size=(1, self.frame_len), 181 | stride=(1, self.frame_len // 2), 182 | )[:, 0, 0, :] 183 | 184 | if self.padding == "center": 185 | pad = self.frame_len // 2 186 | elif self.padding == "same": 187 | pad = self.frame_len // 4 188 | else: 189 | raise ValueError("Padding must be 'center' or 'same'.") 190 | 191 | audio = audio[:, pad:-pad] 192 | return audio 193 | -------------------------------------------------------------------------------- /models/fastpitch/fastpitch/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Optional 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | 21 | # modifications: 22 | # moved mask_from_lens to this file 23 | 24 | # from common.utils import mask_from_lens 25 | 26 | def mask_from_lens(lens, max_len: Optional[int] = None): 27 | if max_len is None: 28 | max_len = lens.max() 29 | ids = torch.arange(0, max_len, device=lens.device, dtype=lens.dtype) 30 | mask = torch.lt(ids, lens.unsqueeze(1)) 31 | return mask 32 | 33 | 34 | class PositionalEmbedding(nn.Module): 35 | def __init__(self, demb): 36 | super(PositionalEmbedding, self).__init__() 37 | self.demb = demb 38 | inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) 39 | self.register_buffer('inv_freq', inv_freq) 40 | 41 | def forward(self, pos_seq, bsz=None): 42 | sinusoid_inp = torch.matmul(torch.unsqueeze(pos_seq, -1), 43 | torch.unsqueeze(self.inv_freq, 0)) 44 | pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1) 45 | if bsz is not None: 46 | return pos_emb[None, :, :].expand(bsz, -1, -1) 47 | else: 48 | return pos_emb[None, :, :] 49 | 50 | 51 | class PositionwiseConvFF(nn.Module): 52 | def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False): 53 | super(PositionwiseConvFF, self).__init__() 54 | 55 | self.d_model = d_model 56 | self.d_inner = d_inner 57 | self.dropout = dropout 58 | 59 | self.CoreNet = nn.Sequential( 60 | nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)), 61 | nn.ReLU(), 62 | # nn.Dropout(dropout), # worse convergence 63 | nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)), 64 | nn.Dropout(dropout), 65 | ) 66 | self.layer_norm = nn.LayerNorm(d_model) 67 | self.pre_lnorm = pre_lnorm 68 | 69 | def forward(self, inp): 70 | return self._forward(inp) 71 | 72 | def _forward(self, inp): 73 | if self.pre_lnorm: 74 | # layer normalization + positionwise feed-forward 75 | core_out = inp.transpose(1, 2) 76 | core_out = self.CoreNet(self.layer_norm(core_out).to(inp.dtype)) 77 | core_out = core_out.transpose(1, 2) 78 | 79 | # residual connection 80 | output = core_out + inp 81 | else: 82 | # positionwise feed-forward 83 | core_out = inp.transpose(1, 2) 84 | core_out = self.CoreNet(core_out) 85 | core_out = core_out.transpose(1, 2) 86 | 87 | # residual connection + layer normalization 88 | output = self.layer_norm(inp + core_out).to(inp.dtype) 89 | 90 | return output 91 | 92 | 93 | class MultiHeadAttn(nn.Module): 94 | def __init__(self, n_head, d_model, d_head, dropout, dropatt=0.1, 95 | pre_lnorm=False): 96 | super(MultiHeadAttn, self).__init__() 97 | 98 | self.n_head = n_head 99 | self.d_model = d_model 100 | self.d_head = d_head 101 | self.scale = 1 / (d_head ** 0.5) 102 | self.pre_lnorm = pre_lnorm 103 | 104 | self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head) 105 | self.drop = nn.Dropout(dropout) 106 | self.dropatt = nn.Dropout(dropatt) 107 | self.o_net = nn.Linear(n_head * d_head, 
d_model, bias=False) 108 | self.layer_norm = nn.LayerNorm(d_model) 109 | 110 | def forward(self, inp, attn_mask=None): 111 | return self._forward(inp, attn_mask) 112 | 113 | def _forward(self, inp, attn_mask=None): 114 | residual = inp 115 | 116 | if self.pre_lnorm: 117 | # layer normalization 118 | inp = self.layer_norm(inp) 119 | 120 | n_head, d_head = self.n_head, self.d_head 121 | 122 | head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=2) 123 | head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head) 124 | head_k = head_k.view(inp.size(0), inp.size(1), n_head, d_head) 125 | head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head) 126 | 127 | q = head_q.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) 128 | k = head_k.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) 129 | v = head_v.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head) 130 | 131 | attn_score = torch.bmm(q, k.transpose(1, 2)) 132 | attn_score.mul_(self.scale) 133 | 134 | if attn_mask is not None: 135 | attn_mask = attn_mask.unsqueeze(1).to(attn_score.dtype) 136 | attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1) 137 | attn_score.masked_fill_(attn_mask.to(torch.bool), -float('inf')) 138 | 139 | attn_prob = F.softmax(attn_score, dim=2) 140 | attn_prob = self.dropatt(attn_prob) 141 | attn_vec = torch.bmm(attn_prob, v) 142 | 143 | attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head) 144 | attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view( 145 | inp.size(0), inp.size(1), n_head * d_head) 146 | 147 | # linear projection 148 | attn_out = self.o_net(attn_vec) 149 | attn_out = self.drop(attn_out) 150 | 151 | if self.pre_lnorm: 152 | # residual connection 153 | output = residual + attn_out 154 | else: 155 | # residual connection + layer normalization 156 | output = self.layer_norm(residual + attn_out) 157 | 158 | output = output.to(attn_out.dtype) 159 | 160 | return output 161 | 162 | 163 | class TransformerLayer(nn.Module): 164 | def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, 165 | **kwargs): 166 | super(TransformerLayer, self).__init__() 167 | 168 | self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) 169 | self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, 170 | pre_lnorm=kwargs.get('pre_lnorm')) 171 | 172 | def forward(self, dec_inp, mask=None): 173 | output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2)) 174 | output *= mask 175 | output = self.pos_ff(output) 176 | output *= mask 177 | return output 178 | 179 | 180 | class FFTransformer(nn.Module): 181 | def __init__(self, n_layer, n_head, d_model, d_head, d_inner, kernel_size, 182 | dropout, dropatt, dropemb=0.0, embed_input=True, 183 | n_embed=None, d_embed=None, padding_idx=0, pre_lnorm=False): 184 | super(FFTransformer, self).__init__() 185 | self.d_model = d_model 186 | self.n_head = n_head 187 | self.d_head = d_head 188 | self.padding_idx = padding_idx 189 | 190 | if embed_input: 191 | self.word_emb = nn.Embedding(n_embed, d_embed or d_model, 192 | padding_idx=self.padding_idx) 193 | else: 194 | self.word_emb = None 195 | 196 | self.pos_emb = PositionalEmbedding(self.d_model) 197 | self.drop = nn.Dropout(dropemb) 198 | self.layers = nn.ModuleList() 199 | 200 | for _ in range(n_layer): 201 | self.layers.append( 202 | TransformerLayer( 203 | n_head, d_model, d_head, d_inner, kernel_size, dropout, 204 | dropatt=dropatt, pre_lnorm=pre_lnorm) 205 | ) 206 | 207 | def forward(self, dec_inp, seq_lens=None, conditioning=0): 208 | if 
self.word_emb is None: 209 | inp = dec_inp 210 | mask = mask_from_lens(seq_lens).unsqueeze(2) 211 | else: 212 | inp = self.word_emb(dec_inp) 213 | # [bsz x L x 1] 214 | mask = (dec_inp != self.padding_idx).unsqueeze(2) 215 | 216 | pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype) 217 | pos_emb = self.pos_emb(pos_seq) * mask 218 | 219 | out = self.drop(inp + pos_emb + conditioning) 220 | 221 | for layer in self.layers: 222 | out = layer(out, mask=mask) 223 | 224 | # out = self.drop(out) 225 | return out, mask 226 | -------------------------------------------------------------------------------- /models/fastpitch/fastpitch/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 10 | 11 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 12 | 13 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 14 | 15 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 16 | 17 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 18 | 19 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 20 | 21 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 22 | 23 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 24 | 25 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 26 | 27 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 28 | 29 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 30 | 31 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 32 | 33 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 34 | 35 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 36 | You must cause any modified files to carry prominent notices stating that You changed the files; and 37 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 38 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 39 | 40 | You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 41 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 42 | 43 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 44 | 45 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 46 | 47 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 48 | 49 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
50 | 51 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /vocoder/vocos/pretrained.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Dict, Tuple, Union, Optional 4 | 5 | import torch 6 | import yaml 7 | from huggingface_hub import hf_hub_download 8 | from torch import nn 9 | from . import config_22k, config_24k 10 | from .feature_extractors import FeatureExtractor, MelSpectrogramFeatures 11 | from .heads import FourierHead, ISTFTHead 12 | from .models import Backbone, VocosBackbone 13 | 14 | 15 | def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any: 16 | """Instantiates a class with the given args and init. 17 | 18 | Args: 19 | args: Positional arguments required for instantiation. 20 | init: Dict of the form {"class_path":...,"init_args":...}. 21 | 22 | Returns: 23 | The instantiated class object. 24 | """ 25 | kwargs = init.get("init_args", {}) 26 | if not isinstance(args, tuple): 27 | args = (args,) 28 | class_module, class_name = init["class_path"].rsplit(".", 1) 29 | module = __import__(class_module, fromlist=[class_name]) 30 | args_class = getattr(module, class_name) 31 | return args_class(*args, **kwargs) 32 | 33 | 34 | class MelVocos(nn.Module): 35 | def __init__(self, config_name='24k'): 36 | super().__init__() 37 | 38 | config = {'22k': config_22k, '24k': config_24k,}[config_name] 39 | 40 | self.feature_extractor = MelSpectrogramFeatures( 41 | **config['feature_extractor']['init_args']) 42 | self.backbone = VocosBackbone(**config['backbone']['init_args']) 43 | self.head = ISTFTHead(**config['head']['init_args']) 44 | 45 | self.n_mels = config['feature_extractor']['init_args']['n_mels'] 46 | 47 | 48 | self.bias_vec = self.make_denoising_vector() 49 | def new_denoising_vector(m, y): 50 | m.bias_vec = m.make_denoising_vector() 51 | self.register_load_state_dict_post_hook(new_denoising_vector) 52 | 53 | # self.register_buffer('bias_vec', self.make_denoising_vector()) 54 | 55 | @property 56 | def device(self): 57 | return next(self.parameters()).device 58 | 59 | @torch.inference_mode() 60 | def make_denoising_vector(self): 61 | mel_rand = torch.zeros((1, self.n_mels, 88), device=self.device) 62 | bias_feats = self.backbone(mel_rand) 63 | 64 | x_bias = self.head.out(bias_feats).transpose(1, 2) 65 | mag_bias, _ = x_bias.chunk(2, dim=1) 66 | mag_bias = torch.exp(mag_bias) 67 | mag_bias = torch.clip(mag_bias, max=1e2) # safeguard to prevent excessively large magnitudes 68 | 69 | mag_bias_vec = mag_bias[:,:,0:1] # [1, 513, 1] 70 | 71 | return mag_bias_vec 72 | 73 | def forward(self, mel_spec, denoise=0.): # [B, bands=100, frames] 74 | 75 | bb_feats = self.backbone(mel_spec) 76 | x = self.head.out(bb_feats).transpose(1, 2) 77 | 78 | mag, p = x.chunk(2, dim=1) 79 | mag = torch.exp(mag) 80 | # mag = torch.clip(mag, max=1e2) # safeguard to prevent excessively large magnitudes 81 | # wrapping happens here. 
These two lines produce real and imaginary value 82 | x, y = torch.cos(p), torch.sin(p) 83 | # phase = torch.atan2(y, x) 84 | 85 | mag = mag - denoise*self.bias_vec #.to(mel_spec.device) 86 | 87 | mag = torch.clamp(mag, min=0., max=1e2) 88 | 89 | S = mag * (x + 1j * y) 90 | 91 | wave = self.head.istft(S) 92 | 93 | return wave # [B, samples] 94 | 95 | def reconstruct(self, wave, denoise=0.): 96 | mel_spec = self.feature_extractor(wave) 97 | return self.forward(mel_spec, denoise=denoise) 98 | 99 | 100 | class Vocos(nn.Module): 101 | """ 102 | The Vocos class represents a Fourier-based neural vocoder for audio synthesis. 103 | This class is primarily designed for inference, with support for loading from pretrained 104 | model checkpoints. It consists of three main components: a feature extractor, 105 | a backbone, and a head. 106 | """ 107 | 108 | def __init__( 109 | self, feature_extractor: FeatureExtractor, backbone: Backbone, head: FourierHead, 110 | ): 111 | super().__init__() 112 | self.feature_extractor = feature_extractor 113 | self.backbone = backbone 114 | self.head = head 115 | 116 | @classmethod 117 | def from_hparams(cls, config_path: str) -> Vocos: 118 | """ 119 | Class method to create a new Vocos model instance from hyperparameters stored in a yaml configuration file. 120 | """ 121 | with open(config_path, "r") as f: 122 | config = yaml.safe_load(f) 123 | feature_extractor = instantiate_class(args=(), init=config["feature_extractor"]) 124 | backbone = instantiate_class(args=(), init=config["backbone"]) 125 | head = instantiate_class(args=(), init=config["head"]) 126 | model = cls(feature_extractor=feature_extractor, backbone=backbone, head=head) 127 | return model 128 | 129 | @classmethod 130 | def from_pretrained(cls, repo_id: str, revision: Optional[str] = None) -> Vocos: 131 | """ 132 | Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub. 133 | """ 134 | config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml", revision=revision) 135 | model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", revision=revision) 136 | model = cls.from_hparams(config_path) 137 | state_dict = torch.load(model_path, map_location="cpu") 138 | # if isinstance(model.feature_extractor, EncodecFeatures): 139 | # encodec_parameters = { 140 | # "feature_extractor.encodec." + key: value 141 | # for key, value in model.feature_extractor.encodec.state_dict().items() 142 | # } 143 | # state_dict.update(encodec_parameters) 144 | model.load_state_dict(state_dict) 145 | model.eval() 146 | return model 147 | 148 | @torch.inference_mode() 149 | def forward(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor: 150 | """ 151 | Method to run a copy-synthesis from audio waveform. The feature extractor first processes the audio input, 152 | which is then passed through the backbone and the head to reconstruct the audio output. 153 | 154 | Args: 155 | audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T), 156 | where B is the batch size and L is the waveform length. 157 | 158 | 159 | Returns: 160 | Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T). 
161 | """ 162 | features = self.feature_extractor(audio_input, **kwargs) 163 | audio_output = self.decode(features, **kwargs) 164 | return audio_output 165 | 166 | @torch.inference_mode() 167 | def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor: 168 | """ 169 | Method to decode audio waveform from already calculated features. The features input is passed through 170 | the backbone and the head to reconstruct the audio output. 171 | 172 | Args: 173 | features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size, 174 | C denotes the feature dimension, and L is the sequence length. 175 | 176 | Returns: 177 | Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T). 178 | """ 179 | x = self.backbone(features_input, **kwargs) 180 | audio_output = self.head(x) 181 | return audio_output 182 | 183 | # @torch.inference_mode() 184 | # def codes_to_features(self, codes: torch.Tensor) -> torch.Tensor: 185 | # """ 186 | # Transforms an input sequence of discrete tokens (codes) into feature embeddings using the feature extractor's 187 | # codebook weights. 188 | 189 | # Args: 190 | # codes (Tensor): The input tensor. Expected shape is (K, L) or (K, B, L), 191 | # where K is the number of codebooks, B is the batch size and L is the sequence length. 192 | 193 | # Returns: 194 | # Tensor: Features of shape (B, C, L), where B is the batch size, C denotes the feature dimension, 195 | # and L is the sequence length. 196 | # """ 197 | # assert isinstance( 198 | # self.feature_extractor, EncodecFeatures 199 | # ), "Feature extractor should be an instance of EncodecFeatures" 200 | 201 | # if codes.dim() == 2: 202 | # codes = codes.unsqueeze(1) 203 | 204 | # n_bins = self.feature_extractor.encodec.quantizer.bins 205 | # offsets = torch.arange(0, n_bins * len(codes), n_bins, device=codes.device) 206 | # embeddings_idxs = codes + offsets.view(-1, 1, 1) 207 | # features = torch.nn.functional.embedding(embeddings_idxs, self.feature_extractor.codebook_weights).sum(dim=0) 208 | # features = features.transpose(1, 2) 209 | 210 | # return features 211 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tts-arabic-pytorch 2 | [[Samples 1]](https://nipponjo.github.io/tts-arabic-samples) 3 | [[Samples 2]](https://nipponjo.github.io/tts-arabic-speakers) 4 | [[Package / ONNX models]](https://github.com/nipponjo/tts_arabic) 5 | [[Flutter app]](https://github.com/nipponjo/tts-arabic-flutter) 6 | [[arXiv]](https://arxiv.org/abs/2512.00937) 7 | 8 | TTS models (Tacotron2, FastPitch), trained on [Nawar Halabi](https://github.com/nawarhalabi)'s [Arabic Speech Corpus](http://en.arabicspeechcorpus.com/), including the [HiFi-GAN vocoder](https://github.com/jik876/hifi-gan) for direct TTS inference. 9 | 10 |
11 | 12 |
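A minimal end-to-end sketch, assuming the pretrained FastPitch weights from the *Quick Setup* section below are already in place; writing the result with `torchaudio.save`, the added channel dimension, and the 22050 Hz rate are assumptions here -- adjust them to the tensor returned by `tts()` and to your vocoder config:

```python
import torchaudio
from models.fastpitch import FastPitch2Wave

model = FastPitch2Wave('pretrained/fastpitch_ar_adv.pth')
model = model.cuda()

# Buckwalter transliteration of an Arabic sentence; Arabic script input works as well
wave = model.tts(">als~alAmu Ealaykum yA Sadiyqiy.")

# save to disk -- the channel dimension and the 22050 Hz rate are assumptions,
# check the shape of `wave` and your vocoder's sampling rate
torchaudio.save('sample.wav', wave.unsqueeze(0).cpu(), 22050)
```

The same pattern works with `Tacotron2Wave`; the sections below cover the full set of inference options.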
13 | 14 | **Manuscript** 15 | 16 | More information about this repository can also be found in the manuscript *Arabic TTS with FastPitch: Reproducible Baselines, Adversarial Training, and Oversmoothing Analysis* ([arXiv](https://arxiv.org/abs/2512.00937) | [ResearchGate](https://www.researchgate.net/publication/398225096_Arabic_TTS_with_FastPitch_Reproducible_Baselines_Adversarial_Training_and_Oversmoothing_Analysis)). 17 | 18 | **Related Papers** 19 | 20 | Tacotron2 | *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions* ([arXiv](https://arxiv.org/abs/1712.05884)) 21 | 22 | FastPitch | *FastPitch: Parallel Text-to-speech with Pitch Prediction* ([arXiv](https://arxiv.org/abs/2006.06873)) 23 | 24 | HiFi-GAN | *HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis* ([arXiv](https://arxiv.org/abs/2010.05646)) 25 | 26 | 27 | 28 | ## Audio Samples 29 | 30 | You can listen to some audio samples [here](https://nipponjo.github.io/tts-arabic-samples). 31 | 32 | ## Multispeaker model (in progress) 33 | 34 | Multispeaker weights are available for the FastPitch model. 35 | Currently, another male voice and two female voices have been added. 36 | Audio samples can be found [here](https://nipponjo.github.io/tts-arabic-speakers). Download weights [here](https://drive.google.com/u/0/uc?id=18IYUSRXvLErVjaDORj_TKzUxs90l61Ja&export=download). There also exists an [ONNX version](https://github.com/nipponjo/tts_arabic) for this model. 37 | 38 | The multispeaker dataset was created by synthesizing data with [Coqui](https://github.com/coqui-ai)'s [XTTS-v2](https://huggingface.co/coqui/XTTS-v2) model and a mix of voices from the [Tunisian_MSA](https://www.openslr.org/46/) dataset. 39 | 40 | ## Quick Setup 41 | 42 | The models were trained with the mse loss as described in the papers. I also trained the models using an additional adversarial loss (adv). The difference is not large, but I think that the (adv) version often sounds a bit clearer. You can compare them yourself. 43 | 44 | Running `python download_files.py` will download all pretrained weights, alternatively: 45 | 46 | Download the pretrained weights for the Tacotron2 model ([mse](https://drive.google.com/u/0/uc?id=1GCu-ZAcfJuT5qfzlKItcNqtuVNa7CNy9&export=download) | [adv](https://drive.google.com/u/0/uc?id=1FusCFZIXSVCQ9Q6PLb91GIkEnhn_zWRS&export=download)). 47 | 48 | Download the pretrained weights for the FastPitch model ([mse](https://drive.google.com/u/0/uc?id=1sliRc62wjPTnPWBVQ95NDUgnCSH5E8M0&export=download) | [adv](https://drive.google.com/u/0/uc?id=1-vZOhi9To_78-yRslC6sFLJBUjwgJT-D&export=download)). 49 | 50 | Download the [HiFi-GAN vocoder](https://github.com/jik876/hifi-gan) weights ([link](https://drive.google.com/u/0/uc?id=1zSYYnJFS-gQox-IeI71hVY-fdPysxuFK&export=download)). Either put them into `pretrained/hifigan-asc-v1` or edit the following lines in `configs/basic.yaml`. 51 | 52 | ```yaml 53 | # vocoder 54 | vocoder_state_path: pretrained/hifigan-asc-v1/hifigan-asc.pth 55 | vocoder_config_path: pretrained/hifigan-asc-v1/config.json 56 | ``` 57 | 58 | This repo includes the diacritization models [Shakkala](https://github.com/Barqawiz/Shakkala) and [Shakkelha](https://github.com/AliOsm/shakkelha). 59 | 60 | The weights can be downloaded [here](https://drive.google.com/u/1/uc?id=1MIZ_t7pqAQP-R3vwWWQTJMER8yPm1uB1&export=download). There also exists a [separate repo](https://github.com/nipponjo/arabic-vocalization) and [package](https://github.com/nipponjo/arabic_vocalizer). 
61 | 62 | -> Alternatively, [download all models](https://drive.google.com/u/1/uc?id=1FD2J-xUk48JPF9TeS8ZKHzDC_ZNBfLd8&export=download) and put the content of the zip file into the `pretrained` folder. 63 | 64 | ## Required packages: 65 | 66 | `torch torchaudio pyyaml` 67 | 68 | ~ for training: `librosa matplotlib tensorboard` 69 | 70 | ~ for the demo app: `fastapi "uvicorn[standard]"` 71 | 72 | ## Using the models 73 | 74 | The `Tacotron2`/`FastPitch` from `models.tacotron2`/`models.fastpitch` are wrappers that simplify text-to-mel inference. The `Tacotron2Wave`/`FastPitch2Wave` models includes the [HiFi-GAN vocoder](https://github.com/jik876/hifi-gan) for direct text-to-speech inference. 75 | 76 | ## Inference options 77 | 78 | ```python 79 | text = "اَلسَّلامُ عَلَيكُم يَا صَدِيقِي." 80 | 81 | wave = model.tts( 82 | text_input = text, # input text 83 | speed = 1, # speaking speed 84 | denoise = 0.005, # HifiGAN denoiser strength 85 | speaker_id = 0, # speaker id 86 | batch_size = 2, # batch size for batched inference 87 | vowelizer = None, # vowelizer model 88 | pitch_mul = 1, # pitch multiplier (for FastPitch) 89 | pitch_add = 0, # pitch offset (for FastPitch) 90 | return_mel = False # return mel spectrogram? 91 | ) 92 | ``` 93 | 94 | ## Inferring the Mel spectrogram 95 | 96 | ```python 97 | from models.tacotron2 import Tacotron2 98 | model = Tacotron2('pretrained/tacotron2_ar_adv.pth') 99 | model = model.cuda() 100 | mel_spec = model.ttmel("اَلسَّلامُ عَلَيكُم يَا صَدِيقِي.") 101 | ``` 102 | 103 | ```python 104 | from models.fastpitch import FastPitch 105 | model = FastPitch('pretrained/fastpitch_ar_adv.pth') 106 | model = model.cuda() 107 | mel_spec = model.ttmel("اَلسَّلامُ عَلَيكُم يَا صَدِيقِي.") 108 | ``` 109 | 110 | ## End-to-end Text-to-Speech 111 | 112 | ```python 113 | from models.tacotron2 import Tacotron2Wave 114 | model = Tacotron2Wave('pretrained/tacotron2_ar_adv.pth') 115 | model = model.cuda() 116 | wave = model.tts("اَلسَّلامُ عَلَيكُم يَا صَدِيقِي.") 117 | 118 | wave_list = model.tts(["صِفر" ,"واحِد" ,"إِثنان", "ثَلاثَة" ,"أَربَعَة" ,"خَمسَة", "سِتَّة" ,"سَبعَة" ,"ثَمانِيَة", "تِسعَة" ,"عَشَرَة"]) 119 | ``` 120 | 121 | ```python 122 | from models.fastpitch import FastPitch2Wave 123 | model = FastPitch2Wave('pretrained/fastpitch_ar_adv.pth') 124 | model = model.cuda() 125 | wave = model.tts("اَلسَّلامُ عَلَيكُم يَا صَدِيقِي.") 126 | 127 | wave_list = model.tts(["صِفر" ,"واحِد" ,"إِثنان", "ثَلاثَة" ,"أَربَعَة" ,"خَمسَة", "سِتَّة" ,"سَبعَة" ,"ثَمانِيَة", "تِسعَة" ,"عَشَرَة"]) 128 | ``` 129 | 130 | By default, Arabic letters are converted using the [Buckwalter transliteration](https://en.wikipedia.org/wiki/Buckwalter_transliteration), which can also be used directly. 
131 | 132 | ```python 133 | wave = model.tts(">als~alAmu Ealaykum yA Sadiyqiy.") 134 | wave_list = model.tts(["Sifr", "wAHid", "arbaEap", "xamsap", "sit~ap", "sabEap", "^amAniyap", "tisEap", "Ea$arap"]) 135 | ``` 136 | 137 | ## Unvocalized text 138 | 139 | ```python 140 | text_unvoc = "اللغة العربية هي أكثر اللغات السامية تحدثا، وإحدى أكثر اللغات انتشارا في العالم" 141 | wave_shakkala = model.tts(text_unvoc, vowelizer='shakkala') 142 | wave_shakkelha = model.tts(text_unvoc, vowelizer='shakkelha') 143 | ``` 144 | 145 | 146 | ### Inference from text file 147 | 148 | ```bash 149 | python inference.py 150 | # default parameters: 151 | python inference.py --list data/infer_text.txt --out_dir samples/results --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --batch_size 2 --denoise 0 152 | ``` 153 | 154 | ## Testing the model 155 | 156 | To test the model run: 157 | ```bash 158 | python test.py 159 | # default parameters: 160 | python test.py --model fastpitch --checkpoint pretrained/fastpitch_ar_adv.pth --out_dir samples/test 161 | ``` 162 | 163 | ## Processing details 164 | 165 | This repo uses Nawar Halabi's [Arabic-Phonetiser](https://github.com/nawarhalabi/Arabic-Phonetiser) but simplifies the result such that different contexts are ignored (see `text/symbols.py`). Further, a doubled consonant is represented as consonant + doubling-token. 166 | 167 | The Tacotron2 model can sometimes struggle to pronounce the last phoneme of a sentence when it ends in an unvocalized consonant. The pronunciation is more reliable if one appends a word-separator token at the end and cuts it off using the alignments weights (details in `models.networks`). This option is implemented as a default postprocessing step that can be disabled by setting `postprocess_mel=False`. 168 | 169 | 170 | ## Training the model 171 | 172 | Before training, the audio files must be resampled. The model was trained after preprocessing the files using `scripts/preprocess_audio.py`. 173 | 174 | To train the model with options specified in the config file run: 175 | ```bash 176 | python train.py 177 | # default parameters: 178 | python train.py --config configs/nawar.yaml 179 | ``` 180 | 181 | 182 | ## Web app 183 | 184 | The web app uses the FastAPI library. To run the app you need the following packages: 185 | 186 | fastapi: for the backend api | uvicorn: for serving the app 187 | 188 | Install with: `pip install fastapi "uvicorn[standard]"` 189 | 190 | Run with: `python app.py` 191 | 192 | Preview: 193 | 194 |
195 | 196 |
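If you prefer to serve the app with uvicorn directly rather than `python app.py`, something along these lines should work -- a sketch that assumes the FastAPI instance in `app.py` is exposed as a module-level variable named `app`; check `app.py` for the actual name and default port:

```bash
# serve the FastAPI app on port 8000 (the app:app module:variable name is an assumption)
uvicorn app:app --host 0.0.0.0 --port 8000
```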
197 | 198 | 199 | 200 | ## Acknowledgements 201 | 202 | I referred to NVIDIA's [Tacotron2 implementation](https://github.com/NVIDIA/tacotron2) for details on model training. 203 | 204 | The FastPitch files stem from NVIDIA's [DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/) repository. 205 | --------------------------------------------------------------------------------