├── .gitignore ├── LICENSE ├── README.md ├── compare_infer.py ├── datahelper ├── __init__.py ├── libritts_3.py ├── ljspeech_3.py ├── timit_3.py ├── whisper_val.py └── wtimit_3.py ├── datapreper ├── generate.py ├── pseudo_whisper.py └── silero_vad.py ├── experiments ├── quickvc │ └── quickvc.pth.txt └── s2uu2s │ └── epoch=440-step=409942.ckpt.txt ├── infer.py ├── libs ├── FastSpeech2 │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-310.pyc │ ├── model │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── fastspeech2.cpython-310.pyc │ │ │ ├── fastspeech2.cpython-38.pyc │ │ │ ├── loss.cpython-310.pyc │ │ │ ├── loss.cpython-38.pyc │ │ │ ├── modules.cpython-310.pyc │ │ │ ├── modules.cpython-38.pyc │ │ │ ├── optimizer.cpython-310.pyc │ │ │ └── optimizer.cpython-38.pyc │ │ ├── fastspeech2.py │ │ ├── loss.py │ │ ├── modules.py │ │ └── optimizer.py │ ├── transformer │ │ ├── Constants.py │ │ ├── Layers.py │ │ ├── Models.py │ │ ├── Modules.py │ │ ├── SubLayers.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── Constants.cpython-310.pyc │ │ │ ├── Constants.cpython-38.pyc │ │ │ ├── Layers.cpython-310.pyc │ │ │ ├── Layers.cpython-38.pyc │ │ │ ├── Models.cpython-310.pyc │ │ │ ├── Models.cpython-38.pyc │ │ │ ├── Modules.cpython-310.pyc │ │ │ ├── Modules.cpython-38.pyc │ │ │ ├── SubLayers.cpython-310.pyc │ │ │ ├── SubLayers.cpython-38.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ └── __init__.cpython-38.pyc │ └── utils │ │ ├── __pycache__ │ │ ├── model.cpython-310.pyc │ │ ├── model.cpython-38.pyc │ │ ├── tools.cpython-310.pyc │ │ └── tools.cpython-38.pyc │ │ └── tools.py ├── JDC │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ └── model.cpython-310.pyc │ ├── bst.t7 │ └── model.py ├── __init__.py ├── hifigan │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model.cpython-310.pyc │ │ ├── models.cpython-310.pyc │ │ └── models.cpython-38.pyc │ ├── config.json │ ├── model.py │ ├── models.py │ └── my_config_v1_16000.json ├── hubert │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model.cpython-310.pyc │ │ └── model.cpython-38.pyc │ ├── model.py │ └── utils.py └── wavlm │ ├── WavLM-Large.pt.txt │ ├── WavLM.py │ ├── __pycache__ │ ├── WavLM.cpython-310.pyc │ ├── WavLM.cpython-38.pyc │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-38.pyc │ ├── modules.cpython-310.pyc │ └── modules.cpython-38.pyc │ └── modules.py ├── minimal_quickvc ├── commons.py ├── models.py ├── modules.py └── utils.py ├── minimal_wesper ├── config │ ├── LJ_hubert_layer12 │ │ └── stats.json │ ├── my_model16000.yaml │ └── my_preprocess16k_LJ.yaml └── whisper_normal.py ├── models ├── __init__.py ├── discriminators.py ├── loss.py ├── s2u.py └── u2s.py ├── raw ├── data_in_the_wild │ ├── W2S_403_headset │ │ ├── 403_headset.wav │ │ ├── DisocoGAN_403_headset.wav │ │ ├── MSpeC_403_headset.wav │ │ ├── WES_403_headset.wav │ │ ├── s000_403_headset.wav │ │ ├── s000_QuickVC_403_headset.wav │ │ ├── s000_W2S_403_headset.wav │ │ ├── s001_403_headset.wav │ │ ├── s001_QuickVC_403_headset.wav │ │ ├── s001_W2S_403_headset.wav │ │ ├── s002_403_headset.wav │ │ ├── s002_QuickVC_403_headset.wav │ │ ├── s002_W2S_403_headset.wav │ │ ├── s003_403_headset.wav │ │ ├── s003_QuickVC_403_headset.wav │ │ └── s003_W2S_403_headset.wav │ ├── W2S_416_headset │ │ ├── 416_headset.wav │ │ ├── DisocoGAN_416_headset.wav │ │ ├── MSpeC_416_headset.wav │ │ ├── 
WES_416_headset.wav │ │ ├── s000_416_headset.wav │ │ ├── s000_QuickVC_416_headset.wav │ │ ├── s001_416_headset.wav │ │ ├── s001_QuickVC_416_headset.wav │ │ ├── s002_416_headset.wav │ │ ├── s002_QuickVC_416_headset.wav │ │ ├── s003_416_headset.wav │ │ └── s003_QuickVC_416_headset.wav │ ├── fw001 │ │ ├── AGAN-W2SC_fn001.wav │ │ ├── BLSTM_fn001.wav │ │ ├── CycleGAN-VC_fn001.wav │ │ ├── GMM_fn001.wav │ │ ├── WES_fw001.wav │ │ ├── fw001.wav │ │ ├── s000_QuickVC_fw001.wav │ │ ├── s000_fw001.wav │ │ ├── s001_QuickVC_fw001.wav │ │ ├── s001_fw001.wav │ │ ├── s002_QuickVC_fw001.wav │ │ ├── s002_fw001.wav │ │ ├── s003_QuickVC_fw001.wav │ │ └── s003_fw001.wav │ ├── fw002 │ │ ├── AGAN-W2SC_fn002.wav │ │ ├── BLSTM_fn002.wav │ │ ├── CycleGAN-VC_fn002.wav │ │ ├── GMM_fn002.wav │ │ ├── WES_fw002.wav │ │ ├── fw002.wav │ │ ├── s000_QuickVC_fw002.wav │ │ ├── s000_fw002.wav │ │ ├── s001_QuickVC_fw002.wav │ │ ├── s001_fw002.wav │ │ ├── s002_QuickVC_fw002.wav │ │ ├── s002_fw002.wav │ │ ├── s003_QuickVC_fw002.wav │ │ └── s003_fw002.wav │ └── sample_whisper │ │ ├── WES_sample_whisper.wav │ │ ├── s000_QuickVC_sample_whisper.wav │ │ ├── s000_sample_whisper.wav │ │ ├── s001_QuickVC_sample_whisper.wav │ │ ├── s001_sample_whisper.wav │ │ ├── s002_QuickVC_sample_whisper.wav │ │ ├── s002_sample_whisper.wav │ │ ├── s003_QuickVC_sample_whisper.wav │ │ ├── s003_sample_whisper.wav │ │ └── sample_whisper.wav ├── freevc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── gt │ ├── s000u003n.wav │ ├── s000u003w.wav │ ├── s001u003n.wav │ ├── s001u003w.wav │ ├── s002u003n.wav │ ├── s002u003w.wav │ ├── s003u003n.wav │ └── s003u003w.wav ├── pseudo │ ├── s000u003n.wav │ ├── s001u003n.wav │ ├── s002u003n.wav │ └── s003u003n.wav ├── quickvc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_fs2_hifigan │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_ms_istft_vits │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_u2s │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── softvc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── test │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav └── wesper │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── requirements.txt ├── resources └── system_diagram.png ├── u2ss2u.py └── utils ├── __init_.py ├── audioprep.py ├── config.py ├── s2f0.py └── s2fhubert.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | *.pyc 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 tan90xx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DistillW2N 2 | 3 | PyTorch implementation of [DistillW2N: A Lightweight One-Shot Whisper to Normal Voice Conversion Model Using Distillation of Self-Supervised Features](https://ieeexplore.ieee.org/abstract/document/10888480) 4 | 5 | ## Quick Start 6 | ### Setup 7 | 1. Create a Python environment, e.g. with conda: `conda create --name distillw2n python=3.10.12 --yes` 8 | 2. Activate the new environment: `conda activate distillw2n` 9 | 3. Install torch and torchaudio: `pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121` 10 | 4. Install the system packages: `sudo apt-get update && sudo apt-get install -y libsndfile1 ffmpeg` 11 | 5. Install the requirements with `pip install -r requirements.txt` 12 | 6. Download the models via the links given in the `.txt` files under [experiments](https://github.com/tan90xx/distillw2n/blob/master/experiments/) 13 | 14 | ### Inference 15 | - For QuickVC and WESPER, run: `python compare_infer.py` 16 | - For our models, run: `python infer.py` 17 | 18 | ### Training 19 | - Run: `python u2ss2u.py` 20 | 21 | ## Datasets 22 | You only need to download the datasets under `YOURPATH`. 23 | - Dataset Download 24 | - For the LibriTTS, LJSpeech, and TIMIT datasets, [datahelper](https://github.com/tan90xx/distillw2n/tree/master/datahelper) will download them automatically if they are not found at `YOURPATH`. 25 | - For the wTIMIT dataset, you will need to request it via email. Follow the appropriate procedures to obtain access and download the dataset to `YOURPATH`. 26 | - Dataset Preparation (Optional) 27 | - [datapreper](https://github.com/tan90xx/distillw2n/tree/master/datapreper) offers ppw (pseudo-whisper) and vad (voice activity detection) processing. You can apply either step according to your project's requirements (see the sketch at the end of this README). 28 | 29 | ## Credits 30 | This implementation builds on 31 | - [SoundStream](https://github.com/kaiidams/soundstream-pytorch) for the training pipeline.
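
## Dataset Preparation Sketch

A minimal usage sketch for the optional preparation step above, not part of the repo's documented workflow: the directory names and the `--mode`-to-suffix pairing here are assumptions read off `datapreper/generate.py` and the default `process_type_1="pseudo"` / `process_type_2="se-vad"` arguments in `datahelper/ljspeech_3.py`. The processed copies would be generated with something like `python datapreper/generate.py YOURPATH/LJSpeech-1.1/wavs YOURPATH/LJSpeech-1.1-pseudo/wavs --mode ppw` and `python datapreper/generate.py YOURPATH/LJSpeech-1.1/wavs YOURPATH/LJSpeech-1.1-se-vad/wavs --mode vad`, since the loaders locate them by string-replacing the corpus folder name in each audio path.

```python
# Minimal sketch: assumes LJSpeech-1.1 plus the two processed copies above already
# exist under YOURPATH; this loader does not download or generate them for you.
from datahelper import LJSPEECH

ds = LJSPEECH("YOURPATH")  # defaults: process_type_1="pseudo", process_type_2="se-vad"
waveform, waveform_pseudo, waveform_vad, sr, transcript, normalized = ds[0]
print(waveform.shape, waveform_pseudo.shape, waveform_vad.shape, sr)
```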
32 | -------------------------------------------------------------------------------- /compare_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import soundfile as sf 4 | import librosa 5 | 6 | def init_model(model_type): 7 | 8 | if model_type == 'quickvc': 9 | from minimal_quickvc.models import SynthesizerTrn 10 | from minimal_quickvc.utils import load_checkpoint 11 | model = SynthesizerTrn().eval().to('cuda') 12 | model_path = './experiments/quickvc/quickvc.pth' 13 | _ = load_checkpoint(model_path, model, None) 14 | embedder_model = torch.hub.load( 15 | "bshall/hubert:main", "hubert_soft").eval().to('cuda') 16 | 17 | elif model_type == 'wesper': 18 | from minimal_wesper.whisper_normal import SynthesizerTrn, load_hubert 19 | model = SynthesizerTrn().eval().to('cuda') 20 | embedder_model = load_hubert(device='cuda') 21 | 22 | return embedder_model, model 23 | 24 | 25 | class Inferer: 26 | def __init__(self, model_type): 27 | self.model_type = model_type 28 | self.hubert, self.model = init_model(model_type) 29 | self.conv_sr = 16000 30 | 31 | def vc_fn(self, audio): 32 | with torch.no_grad(): 33 | wav_src = torch.from_numpy(audio).unsqueeze(0).unsqueeze(0).to('cuda') 34 | c = self.hubert.units(wav_src) 35 | c = c.transpose(2, 1) 36 | 37 | if self.model_type == 'quickvc': 38 | mel_tgt = torch.zeros(1, 80, 64).to('cuda') 39 | audio = self.model.infer(c, mel=mel_tgt) 40 | 41 | elif self.model_type == 'wesper': 42 | audio = self.model.infer(c) 43 | 44 | audio = audio.squeeze(0).squeeze(0).cpu().numpy() 45 | audio = (audio * 32767).astype(np.int16) 46 | return audio 47 | 48 | def file_infer(self, fname, oname): 49 | audio, _ = librosa.load(fname, sr=self.conv_sr) 50 | audio_out = self.vc_fn(audio) 51 | sf.write(oname, audio_out, self.conv_sr) 52 | return audio_out 53 | 54 | inferer = Inferer('quickvc') 55 | audio_out = inferer.file_infer('./raw/gt/s000u003w.wav', 's000u003w_quickvc.wav') 56 | inferer = Inferer('wesper') 57 | audio_out = inferer.file_infer('./raw/gt/s000u003w.wav', 's000u003w_wesper.wav') -------------------------------------------------------------------------------- /datahelper/__init__.py: -------------------------------------------------------------------------------- 1 | from .timit_3 import TIMIT 2 | from .wtimit_3 import WTIMIT 3 | from .ljspeech_3 import LJSPEECH 4 | from .libritts_3 import LIBRITTS 5 | from .whisper_val import WHISPER 6 | 7 | __all__ = ['TIMIT', 'WTIMIT', 'LJSPEECH', 'LIBRITTS', 'WHISPER'] -------------------------------------------------------------------------------- /datahelper/libritts_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Tuple, Union 4 | 5 | import torchaudio 6 | from torch import Tensor 7 | from torch.utils.data import Dataset 8 | from torchaudio._internal import download_url_to_file 9 | from torchaudio.datasets.utils import _extract_tar 10 | 11 | URL = "train-clean-100" 12 | FOLDER_IN_ARCHIVE = "LibriTTS" 13 | _CHECKSUMS = { 14 | "http://www.openslr.org/resources/60/dev-clean.tar.gz": "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a", # noqa: E501 15 | "http://www.openslr.org/resources/60/dev-other.tar.gz": "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c", # noqa: E501 16 | "http://www.openslr.org/resources/60/test-clean.tar.gz": "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5", # noqa: E501 17 | 
"http://www.openslr.org/resources/60/test-other.tar.gz": "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d", # noqa: E501 18 | "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b", # noqa: E501 19 | "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886", # noqa: E501 20 | "http://www.openslr.org/resources/60/train-other-500.tar.gz": "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df", # noqa: E501 21 | } 22 | 23 | 24 | def load_libritts_item( 25 | fileid: str, 26 | path: str, 27 | ext_audio: str, 28 | ext_original_txt: str, 29 | ext_normalized_txt: str, 30 | ) -> Tuple[Tensor, int, str, str, int, int, str]: 31 | speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_") 32 | utterance_id = fileid 33 | 34 | normalized_text = utterance_id + ext_normalized_txt 35 | normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text) 36 | 37 | original_text = utterance_id + ext_original_txt 38 | original_text = os.path.join(path, speaker_id, chapter_id, original_text) 39 | 40 | file_audio = utterance_id + ext_audio 41 | file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) 42 | 43 | # Load audio 44 | waveform, sample_rate = torchaudio.load(file_audio) 45 | fileid_audio = Path(str(file_audio).replace('LibriTTS', "LibriTTS-{}".format("vad-ppw"))) 46 | waveform_ppw, sample_rate = torchaudio.load(fileid_audio) 47 | fileid_audio = Path(str(file_audio).replace('LibriTTS', "LibriTTS-{}".format("vad"))) 48 | waveform_vad, sample_rate = torchaudio.load(fileid_audio) 49 | 50 | # Load original text 51 | with open(original_text) as ft: 52 | original_text = ft.readline() 53 | 54 | # Load normalized text 55 | with open(normalized_text, "r") as ft: 56 | normalized_text = ft.readline() 57 | 58 | return ( 59 | waveform, 60 | waveform_ppw, 61 | waveform_vad, 62 | sample_rate, 63 | original_text, 64 | normalized_text, 65 | int(speaker_id), 66 | int(chapter_id), 67 | utterance_id, 68 | ) 69 | 70 | 71 | class LIBRITTS(Dataset): 72 | """*LibriTTS* :cite:`Zen2019LibriTTSAC` dataset. 73 | 74 | Args: 75 | root (str or Path): Path to the directory where the dataset is found or downloaded. 76 | url (str, optional): The URL to download the dataset from, 77 | or the type of the dataset to dowload. 78 | Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, 79 | ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and 80 | ``"train-other-500"``. (default: ``"train-clean-100"``) 81 | folder_in_archive (str, optional): 82 | The top-level directory of the dataset. (default: ``"LibriTTS"``) 83 | download (bool, optional): 84 | Whether to download the dataset if it is not found at root path. (default: ``False``). 
85 | """ 86 | 87 | _ext_original_txt = ".original.txt" 88 | _ext_normalized_txt = ".normalized.txt" 89 | _ext_audio = ".wav" 90 | 91 | def __init__( 92 | self, 93 | root: Union[str, Path], 94 | url: str = URL, 95 | folder_in_archive: str = FOLDER_IN_ARCHIVE, 96 | download: bool = False, 97 | ) -> None: 98 | 99 | if url in [ 100 | "dev-clean", 101 | "dev-other", 102 | "test-clean", 103 | "test-other", 104 | "train-clean-100", 105 | "train-clean-360", 106 | "train-other-500", 107 | ]: 108 | 109 | ext_archive = ".tar.gz" 110 | base_url = "http://www.openslr.org/resources/60/" 111 | 112 | url = os.path.join(base_url, url + ext_archive) 113 | 114 | # Get string representation of 'root' in case Path object is passed 115 | root = os.fspath(root) 116 | 117 | basename = os.path.basename(url) 118 | archive = os.path.join(root, basename) 119 | 120 | basename = basename.split(".")[0] 121 | folder_in_archive = os.path.join(folder_in_archive, basename) 122 | 123 | self._path = os.path.join(root, folder_in_archive) 124 | 125 | if download: 126 | if not os.path.isdir(self._path): 127 | if not os.path.isfile(archive): 128 | checksum = _CHECKSUMS.get(url, None) 129 | download_url_to_file(url, archive, hash_prefix=checksum) 130 | _extract_tar(archive) 131 | else: 132 | if not os.path.exists(self._path): 133 | raise RuntimeError( 134 | f"The path {self._path} doesn't exist. " 135 | "Please check the ``root`` path or set `download=True` to download it" 136 | ) 137 | 138 | self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) 139 | 140 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]: 141 | """Load the n-th sample from the dataset. 142 | 143 | Args: 144 | n (int): The index of the sample to be loaded 145 | 146 | Returns: 147 | Tuple of the following items; 148 | 149 | Tensor: 150 | Waveform 151 | int: 152 | Sample rate 153 | str: 154 | Original text 155 | str: 156 | Normalized text 157 | int: 158 | Speaker ID 159 | int: 160 | Chapter ID 161 | str: 162 | Utterance ID 163 | """ 164 | fileid = self._walker[n] 165 | return load_libritts_item( 166 | fileid, 167 | self._path, 168 | self._ext_audio, 169 | self._ext_original_txt, 170 | self._ext_normalized_txt, 171 | ) 172 | 173 | def __len__(self) -> int: 174 | return len(self._walker) 175 | -------------------------------------------------------------------------------- /datahelper/ljspeech_3.py: -------------------------------------------------------------------------------- 1 | # Adopted from torchaudio.datasets.LJSPEECH 2 | import csv 3 | import os 4 | from pathlib import Path 5 | from typing import Tuple, Union 6 | 7 | import torchaudio 8 | from torch import Tensor 9 | from torch.utils.data import Dataset 10 | from torchaudio._internal import download_url_to_file 11 | # from torchaudio.datasets.utils import _extract_tar 12 | import torch 13 | 14 | _RELEASE_CONFIGS = { 15 | "release1": { 16 | "folder_in_archive": "wavs", 17 | "url": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", 18 | "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5", 19 | } 20 | } 21 | 22 | 23 | class LJSPEECH(Dataset): 24 | """*LJSpeech-1.1* :cite:`ljspeech17` dataset. 25 | 26 | Args: 27 | root (str or Path): Path to the directory where the dataset is found or downloaded. 28 | url (str, optional): The URL to download the dataset from. 
29 | (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``) 30 | folder_in_archive (str, optional): 31 | The top-level directory of the dataset. (default: ``"wavs"``) 32 | download (bool, optional): 33 | Whether to download the dataset if it is not found at root path. (default: ``False``). 34 | """ 35 | 36 | def __init__( 37 | self, 38 | root: Union[str, Path], 39 | url: str = _RELEASE_CONFIGS["release1"]["url"], 40 | folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], 41 | download: bool = False, 42 | process_type_1: str="pseudo", 43 | process_type_2: str="se-vad", 44 | ) -> None: 45 | 46 | self._parse_filesystem(root, url, folder_in_archive, download, process_type_1, process_type_2) 47 | 48 | def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, 49 | download: bool, process_type_1: str, process_type_2: str) -> None: 50 | root = Path(root) 51 | 52 | basename = os.path.basename(url) 53 | archive = root / basename 54 | 55 | basename = Path(basename.split(".tar.bz2")[0]) 56 | folder_in_archive = basename / folder_in_archive 57 | 58 | self._path = root / folder_in_archive 59 | self._metadata_path = root / basename / "metadata.csv" 60 | 61 | self._process_type_1 = process_type_1 62 | self._process_type_2 = process_type_2 63 | ''' 64 | if download: 65 | if not os.path.isdir(self._path): 66 | if not os.path.isfile(archive): 67 | checksum = _RELEASE_CONFIGS["release1"]["checksum"] 68 | download_url_to_file(url, archive, hash_prefix=checksum) 69 | _extract_tar(archive) 70 | else: 71 | if not os.path.exists(self._path): 72 | raise RuntimeError( 73 | f"The path {self._path} doesn't exist. " 74 | "Please check the ``root`` path or set `download=True` to download it" 75 | ) 76 | ''' 77 | with open(self._metadata_path, "r", newline="", encoding='utf-8') as metadata: 78 | flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE) 79 | self._flist = list(flist) 80 | 81 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: 82 | """Load the n-th sample from the dataset. 
83 | 84 | Args: 85 | n (int): The index of the sample to be loaded 86 | 87 | Returns: 88 | Tuple of the following items; 89 | 90 | Tensor: 91 | Waveform 92 | int: 93 | Sample rate 94 | str: 95 | Transcript 96 | str: 97 | Normalized Transcript 98 | """ 99 | line = self._flist[n] 100 | fileid, transcript, normalized_transcript = line 101 | fileid_audio_o = self._path / (fileid + ".wav") 102 | # fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format("ppw"))) 103 | fileid_audio = fileid_audio_o 104 | waveform, sample_rate = torchaudio.load(fileid_audio) 105 | fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format(self._process_type_1))) 106 | waveform_pseudo, sample_rate = torchaudio.load(fileid_audio) 107 | fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format(self._process_type_2))) 108 | waveform_vad, sample_rate = torchaudio.load(fileid_audio) 109 | return ( 110 | waveform, 111 | waveform_pseudo, 112 | waveform_vad, 113 | sample_rate, 114 | transcript, 115 | normalized_transcript, 116 | ) 117 | 118 | def __len__(self) -> int: 119 | return len(self._flist) 120 | -------------------------------------------------------------------------------- /datahelper/timit_3.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import os 8 | from pathlib import Path 9 | from typing import Tuple, Union 10 | import torchaudio 11 | from torch import Tensor 12 | from torch.utils.data import Dataset 13 | 14 | 15 | _TIMIT_TRAIN_DATASETS = [ 16 | ["timit/TIMIT/data/TRAIN", (".WAV", ".WRD")], 17 | ] 18 | _TIMIT_TEST_DATASETS = [ 19 | ["timit/TIMIT/data/TEST", (".WAV", ".WRD")], 20 | ] 21 | 22 | 23 | def _collect_data(directory, input_ext, target_ext): 24 | """Traverses directory collecting input and target files.""" 25 | # Directory from string to tuple pair of strings 26 | # key: the filepath to a datafile including the datafile's basename. 
Example, 27 | # if the datafile was "/path/to/datafile.wav" then the key would be 28 | # "/path/to/datafile" 29 | # value: a pair of strings (input_filepath, target_filepath) 30 | data_files = dict() 31 | for root, _, filenames in os.walk(directory): 32 | input_files = [filename for filename in filenames if input_ext in filename] 33 | for input_filename in input_files: 34 | basename = input_filename.strip(input_ext) 35 | input_file = os.path.join(root, input_filename) 36 | target_file = os.path.join(root, basename + target_ext) 37 | key = os.path.join(root, basename) 38 | assert os.path.exists(target_file) 39 | assert key not in data_files 40 | data_files[key] = (input_file, target_file) 41 | return data_files 42 | 43 | class TIMIT(Dataset): 44 | def __init__( 45 | self, 46 | root: Union[str, Path], 47 | training: bool, 48 | ) -> None: 49 | self._parse_filesystem(root, training) 50 | 51 | def _parse_filesystem(self, root: str, training: bool) -> None: 52 | datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS) 53 | for data_dir, (audio_ext, transcription_ext) in datasets: 54 | data_dir = os.path.join(root, data_dir) 55 | data_files = _collect_data(data_dir, audio_ext, transcription_ext) 56 | data_pairs = data_files.values() 57 | self._flist = [] 58 | for input_file, _ in sorted(data_pairs): 59 | self._flist.append(input_file) 60 | 61 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 62 | input_file = self._flist[n] 63 | # out_filepath = input_file.strip(".WAV") + ".wav" 64 | waveform, sample_rate = torchaudio.load(input_file) 65 | fileid_audio = Path(str(input_file).replace('TIMIT', "TIMIT-{}".format("vad"))) 66 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 67 | fileid_audio = Path(str(input_file).replace('TIMIT', "TIMIT-{}".format("vad"))) 68 | waveform_v, sample_rate = torchaudio.load(fileid_audio) 69 | return ( 70 | waveform, 71 | waveform_p, 72 | waveform_v, 73 | sample_rate, 74 | input_file) 75 | 76 | def __len__(self) -> int: 77 | return len(self._flist) 78 | 79 | if __name__ == "__main__": 80 | ds = TIMIT("/data/ssd0/tianyi.tan", training=True) 81 | print(len(ds)) -------------------------------------------------------------------------------- /datahelper/whisper_val.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | from pathlib import Path 8 | from typing import Tuple, Union 9 | import torchaudio 10 | from torch import Tensor 11 | from torch.utils.data import Dataset 12 | 13 | class WHISPER(Dataset): 14 | def __init__( 15 | self, 16 | root: Union[str, Path], 17 | ) -> None: 18 | self._parse_filesystem(root) 19 | 20 | def _parse_filesystem(self, root: str) -> None: 21 | data_dir = "_1_normal_trim" 22 | data_dir = Path(os.path.join(root, data_dir)) 23 | self._flist = [] 24 | for in_path in data_dir.rglob("*.wav"): 25 | self._flist.append(in_path) 26 | 27 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 28 | input_file = self._flist[n] 29 | waveform, sample_rate = torchaudio.load(input_file) 30 | fileid_audio = Path(str(input_file).replace('_1_normal_trim', "_1_ppw_trim")) 31 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 32 | return ( 33 | waveform, 34 | waveform_p, 35 | sample_rate) 36 | 37 | def __len__(self) -> int: 38 | return len(self._flist) 39 | 40 | if __name__ == "__main__": 41 | ds = WHISPER("YOURPATH") 42 
| print(len(ds)) -------------------------------------------------------------------------------- /datahelper/wtimit_3.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import re 8 | from pathlib import Path 9 | from typing import Tuple, Union 10 | import torchaudio 11 | from torch import Tensor 12 | from torch.utils.data import Dataset 13 | import numpy as np 14 | from scipy.ndimage import distance_transform_edt 15 | import torch 16 | 17 | def fill_nans(data_matrix): 18 | """Fills NaN's with nearest neighbours. 19 | 20 | This method is adapted from the method `fill`, which you can find here: 21 | https://stackoverflow.com/posts/9262129/revisions 22 | 23 | :param data_matrix: numpy array of real-valued data. 24 | :return: data_matrix: Same but without NaN's. 25 | """ 26 | 27 | indices = distance_transform_edt( 28 | np.isnan(data_matrix), return_distances=False, return_indices=True 29 | ) 30 | return data_matrix[tuple(indices)] 31 | 32 | class WTIMIT(Dataset): 33 | def __init__( 34 | self, 35 | root: Union[str, Path], 36 | ) -> None: 37 | self._parse_filesystem(root) 38 | 39 | def _parse_filesystem(self, root: str) -> None: 40 | data_dir = "wtimit/normal" 41 | data_dir = Path(os.path.join(root, data_dir)) 42 | self._flist = [] 43 | for in_path in data_dir.rglob("*.wav"): 44 | # if in_path.name.startswith("s10"): 45 | # s[0-1] 46 | # if re.match(r"s[0-1]\d{2}u0(0[3-9]|1[0-2])n\.wav$", in_path.name): 47 | self._flist.append(in_path) 48 | 49 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 50 | input_file = self._flist[n] 51 | waveform, sample_rate = torchaudio.load(input_file) 52 | fileid_audio = Path(str(input_file).replace('normal', "vad-ppw")) 53 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 54 | fileid_audio = Path(str(input_file).replace('normal', "vad")) 55 | waveform_v, sample_rate = torchaudio.load(fileid_audio) 56 | return ( 57 | waveform, 58 | waveform_p, 59 | waveform_v, 60 | sample_rate, 61 | input_file) 62 | 63 | def __len__(self) -> int: 64 | return len(self._flist) 65 | 66 | if __name__ == "__main__": 67 | ds = WTIMIT("YOURPATH") 68 | print(len(ds)) -------------------------------------------------------------------------------- /datapreper/generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from multiprocessing import cpu_count 4 | from concurrent.futures import ProcessPoolExecutor 5 | from tqdm import tqdm 6 | 7 | def preprocess_dataset(args): 8 | if args.mode == "vad": 9 | from silero_vad import process_wav 10 | elif args.mode == "ppw": 11 | from pseudo_whisper import process_wav 12 | args.out_dir.mkdir(parents=True, exist_ok=True) 13 | 14 | futures = [] 15 | executor = ProcessPoolExecutor(max_workers=cpu_count()) 16 | print(f"VAD for audio in {args.in_dir}") 17 | list_list = list(args.in_dir.rglob("*.wav")) 18 | for i, in_path in enumerate(list_list): 19 | # if i % 2 == 1: 20 | relative_path = in_path.relative_to(args.in_dir) 21 | out_path = args.out_dir / relative_path 22 | out_path.parent.mkdir(parents=True, exist_ok=True) 23 | futures.append( 24 | executor.submit(process_wav, in_path, out_path, args.sample_rate) 25 | ) 26 | 27 | results = [future.result() for future in tqdm(futures)] 28 | 29 | lengths = {path.stem: length for path, length in 
results} 30 | seconds = sum(lengths.values()) 31 | hours = seconds / 3600 32 | print(f"Wrote {len(lengths)} utterances ({hours:.2f} hours)") 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser(description="Apply VAD or pseudo-whisper processing to an audio dataset.") 37 | parser.add_argument( 38 | "in_dir", metavar="in-dir", help="path to the dataset directory.", type=Path 39 | ) 40 | parser.add_argument( 41 | "out_dir", metavar="out-dir", help="path to the output directory.", type=Path 42 | ) 43 | parser.add_argument( 44 | '--mode', choices=['vad', 'ppw'], required=True, help="Select processing mode: 'vad' (voice activity detection) or 'ppw' (pseudo-whisper)" 45 | ) 46 | parser.add_argument( 47 | "--sample-rate", 48 | help="target sample rate (default 16kHz)", 49 | type=int, 50 | default=16000, 51 | ) 52 | args = parser.parse_args() 53 | preprocess_dataset(args) -------------------------------------------------------------------------------- /datapreper/pseudo_whisper.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/chaufanglin/Normal2Whisper/blob/main/utils.py 2 | import numpy as np 3 | from scipy.signal import lfilter 4 | import soundfile as sf 5 | import librosa 6 | from librosa import lpc 7 | import pyworld as pw 8 | 9 | def wav2world(x, fs, fft_size=None): 10 | """Convenience function to do all WORLD analysis steps in a single call. 11 | In this case only `frame_period` can be configured and other parameters 12 | are fixed to their defaults. Likewise, F0 estimation is fixed to 13 | Harvest. 14 | Parameters 15 | ---------- 16 | x : ndarray 17 | Input waveform signal. 18 | fs : int 19 | Sample rate of input signal in Hz. 20 | fft_size : int 21 | Length of Fast Fourier Transform (in number of samples) 22 | The resulting dimension of `ap` and `sp` will be `fft_size` // 2 + 1 23 | Returns 24 | ------- 25 | f0 : ndarray 26 | F0 contour. 27 | sp : ndarray 28 | Spectral envelope. 29 | ap : ndarray 30 | Aperiodicity. 31 | t : ndarray 32 | Temporal position of each frame. 33 | """ 34 | f0, t = pw.harvest(x, fs) 35 | sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size) 36 | ap = pw.d4c(x, f0, t, fs, fft_size=fft_size) 37 | return f0, sp, ap, t 38 | 39 | 40 | def moving_average(data, length): 41 | output = np.empty(data.shape) 42 | maf = np.bartlett(length)/length # Bartlett window is a triangular window 43 | for i in range(data.shape[0]): 44 | output[i,:] = np.convolve(data[i,:], maf,'same') 45 | return output 46 | 47 | 48 | def gfm_iaif_glottal_remove(s_gvl, nv=48, ng=3, d=0.99, win=None): 49 | """ 50 | Glottal removal function based on GFM-IAIF. 51 | 52 | Note: 53 | Function originally coded by Olivier Perrotin (https://github.com/operrotin/GFM-IAIF). 54 | This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie) 55 | Parameters: 56 | ---------- 57 | s_gvl: Speech signal frame 58 | nv: Order of LP analysis for vocal tract (def. 48) 59 | ng: Order of LP analysis for glottal source (def. 3) 60 | d: Leaky integration coefficient (def. 0.99) 61 | win: Window used before LPC (def.
Hanning) 62 | 63 | Returns: 64 | ------- 65 | s_v: Speech signal with glottis contribution cancelled 66 | """ 67 | 68 | # ----- Set default parameters ------------------------------------------- 69 | if win is None: 70 | # Window for LPC estimation 71 | win = np.hanning(len(s_gvl)) 72 | 73 | # ----- Addition of pre-frame -------------------------------------------- 74 | # For the successive removals of the estimated LPC envelopes, a 75 | # mean-normalized pre-frame ramp is added at the beginning of the frame 76 | # in order to diminish ripple. The ramp is removed after each filtering. 77 | Lpf = nv + 1 # Pre-frame length 78 | x_gvl = np.concatenate([np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl]) # Prepend 79 | idx_pf = np.arange(Lpf, len(x_gvl)) # Indexes that exclude the pre-frame 80 | 81 | # ----- Cancel lip radiation contribution -------------------------------- 82 | # Define lip radiation filter 83 | al = [1, -d] 84 | 85 | # Integration of signal using filter 1/[1 -d z^(-1)] 86 | # - Input signal (for LPC estimation) 87 | s_gv = lfilter([1], al, s_gvl) 88 | # - Pre-framed input signal (for LPC envelope removal) 89 | x_gv = lfilter([1], al, x_gvl) 90 | 91 | # ----- Gross glottis estimation ----------------------------------------- 92 | # Iterative estimation of glottis with ng first order filters 93 | ag1 = lpc(s_gv*win, order=1) # First 1st order LPC estimation 94 | 95 | for i in range(ng-2): 96 | # Cancel current estimate of glottis contribution from speech signal 97 | x_v1x = lfilter(ag1,1,x_gv) # Inverse filtering 98 | s_v1x = x_v1x[idx_pf] # Remove pre-ramp 99 | 100 | # Next 1st order LPC estimation 101 | ag1x = lpc(s_v1x*win, order=1) # 1st order LPC 102 | 103 | # Update gross estimate of glottis contribution 104 | ag1 = np.convolve(ag1,ag1x) # Combine 1st order estimation with previous 105 | 106 | 107 | # ----- Gross vocal tract estimation ------------------------------------- 108 | # Cancel gross estimate of glottis contribution from speech signal 109 | x_v1 = lfilter(ag1,1,x_gv) # Inverse filtering 110 | s_v1 = x_v1[idx_pf] # Remove pre-ramp 111 | 112 | # Gross estimate of the vocal tract filter 113 | av1 = lpc(s_v1*win, order=nv) # nv order LPC estimation 114 | 115 | # ----- Fine glottis estimation ------------------------------------------ 116 | # Cancel gross estimate of vocal tract contribution from speech signal 117 | x_g1 = lfilter(av1,1,x_gv) # Inverse filtering 118 | s_g1 = x_g1[idx_pf] # Remove pre-ramp 119 | 120 | # Fine estimate of the glottis filter 121 | ag = lpc(s_g1*win, order=ng) # ng order LPC estimation 122 | 123 | # ----- Fine vocal tract estimation -------------------------------------- 124 | # Cancel fine estimate of glottis contribution from speech signal 125 | x_v = lfilter(ag,1,x_gv) # Inverse filtering 126 | s_v = x_v[idx_pf] # Remove pre-ramp 127 | 128 | return s_v 129 | 130 | 131 | def pesudo_whisper_gen(s_n, fs, Lv=16): 132 | """ 133 | Pesudo whispered speech generating function, using GFM-IAIF and moving averge filtering. 
134 | 135 | Note: 136 | This code is written by Zhaofeng Lin (linzh@tcd.ie) 137 | 138 | Parameters: 139 | ---------- 140 | s_n: Normal speech wavform 141 | fs: Sample rate 142 | Lv: order of LP analysis for vocal tract (default: 16) 143 | 144 | Returns: 145 | ------- 146 | y_pw: Pesudo whispered speech wavform 147 | """ 148 | 149 | EPSILON = 1e-8 150 | 151 | # Overlapp-add (OLA) method 152 | nfft = pw.get_cheaptrick_fft_size(fs) 153 | win_length = int(30*fs/1000) # 30ms * fs / 1000 154 | nhop = round(win_length / 2) 155 | window = np.hamming(win_length) 156 | nframes = int(np.ceil(s_n.size / nhop)) 157 | 158 | s_gfm = np.zeros(s_n.shape) # allocate output speech without glottal source 159 | 160 | for n in range(nframes): 161 | startPoint = n * nhop # starting point of windowing 162 | if startPoint + win_length > s_n.size: 163 | s_gfm[startPoint - nhop + win_length: ] = EPSILON 164 | continue 165 | else: 166 | sn_frame = s_n[startPoint : startPoint+win_length] * window 167 | 168 | s_gfm_frame = gfm_iaif_glottal_remove(sn_frame, Lv) 169 | 170 | s_gfm[startPoint: startPoint + win_length] = s_gfm[startPoint: startPoint + win_length] + s_gfm_frame 171 | 172 | # Extract GFM 173 | f0_gfm, sp_gfm, ap_gfm, _ = wav2world(s_gfm, fs) 174 | 175 | # Moving Averge Filtering 176 | maf_freq = 400 # 400 Hz 177 | maf_w_len = round(maf_freq/fs * nfft) # 400 Hz 178 | sp_maf = moving_average(sp_gfm, maf_w_len) 179 | 180 | # Zero F0 and unit Ap 181 | f0_zero = np.zeros(f0_gfm.shape) + EPSILON 182 | ap_unit = np.ones(ap_gfm.shape) - EPSILON 183 | 184 | y_pw = pw.synthesize(f0_zero, sp_maf, ap_unit, fs, pw.default_frame_period) 185 | 186 | return y_pw 187 | 188 | 189 | def process_wav(in_path, out_path, sample_rate): 190 | normal, fs_ = sf.read(in_path) 191 | if sample_rate != fs_: 192 | normal = librosa.resample(normal, fs_, sample_rate) 193 | pesudo_whisper = pesudo_whisper_gen(normal, sample_rate) 194 | sf.write(out_path, pesudo_whisper, sample_rate) 195 | return out_path, len(pesudo_whisper) / sample_rate -------------------------------------------------------------------------------- /datapreper/silero_vad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | torch.set_num_threads(1) 3 | from typing import List 4 | USE_ONNX = True # change this to True if you want to test onnx model 5 | 6 | model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 7 | model='silero_vad', 8 | force_reload=False, 9 | onnx=USE_ONNX) 10 | 11 | (get_speech_timestamps, 12 | save_audio, 13 | read_audio, 14 | VADIterator, 15 | collect_chunks) = utils 16 | 17 | def replace_chunks(tss: List[dict], 18 | wav: torch.Tensor): 19 | chunks = [] 20 | cur_start = 0 21 | for i in tss: 22 | silence_part = torch.zeros_like(wav[cur_start: i['start']]) 23 | chunks.append(silence_part) 24 | voiced_part = wav[i['start']: i['end']] 25 | chunks.append(voiced_part) 26 | cur_start = i['end'] 27 | silence_part = torch.zeros_like(wav[cur_start:]) 28 | chunks.append(silence_part) 29 | result = torch.cat(chunks) 30 | if torch.all(result == 0): 31 | return wav 32 | return result 33 | 34 | 35 | def process_wav(in_path, out_path, sample_rate): 36 | wav = read_audio(in_path, sampling_rate=sample_rate) 37 | # get speech timestamps from full audio file 38 | speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sample_rate) 39 | # merge all speech chunks to one audio 40 | save_audio(out_path, 41 | replace_chunks(speech_timestamps, wav), sampling_rate=sample_rate) 42 | return out_path, 
wav.size(-1) / sample_rate -------------------------------------------------------------------------------- /experiments/quickvc/quickvc.pth.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/1DF6RgIHHkn2aoyyUMt4_hPitKSc2YR9d 2 | # mv G_1200000.pth quickvc.pth -------------------------------------------------------------------------------- /experiments/s2uu2s/epoch=440-step=409942.ckpt.txt: -------------------------------------------------------------------------------- 1 | # https://box.nju.edu.cn/f/be9207d41cba4c8a98db/?dl=1 2 | https://github.com/tan90xx/distillw2n/releases/download/v1.0/epoch.440-step.409942.ckpt 3 | -------------------------------------------------------------------------------- /infer.py: -------------------------------------------------------------------------------- 1 | from u2ss2u import StreamableModel 2 | import torch 3 | import torchaudio 4 | import nemo.collections.asr as nemo_asr 5 | DEVICE="cuda:0" 6 | 7 | model = StreamableModel( 8 | batch_size=42, 9 | sample_rate=16_000, 10 | segment_length=32270, 11 | padding='same', 12 | dataset='timit') 13 | 14 | checkpoint_path = './experiments/s2uu2s/epoch=440-step=409942.ckpt' 15 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage, weights_only=True) 16 | model.load_state_dict(checkpoint['state_dict'], strict=False) 17 | model = model.to(DEVICE) 18 | model.eval() 19 | 20 | hubert_soft = torch.hub.load("bshall/hubert:main", f"hubert_soft").to(DEVICE) 21 | 22 | speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large") 23 | speaker_model = speaker_model.to(DEVICE) 24 | speaker_model.eval() 25 | 26 | x_trg, sr = torchaudio.load(f'./raw/gt/s000u003n.wav') 27 | spkemb = speaker_model.infer_segment(x_trg.squeeze(0))[0] 28 | 29 | x, sr = torchaudio.load('./raw/gt/s000u003w.wav') 30 | x = torchaudio.functional.resample(x, sr, 16000) 31 | # z, hubert = model(x.to(DEVICE), spkemb.to(DEVICE)) 32 | # torchaudio.save('test0.wav', z.squeeze(1).detach().cpu(), 16000) 33 | 34 | # spec = model.spec.to(DEVICE) 35 | # encoder = model.spec.to(DEVICE) 36 | reencoder = model.reencoder.to(DEVICE) 37 | decoder = model.decoder.to(DEVICE) 38 | 39 | hubert_soft = torch.hub.load("bshall/hubert:main", f"hubert_soft").to(DEVICE) 40 | hubert = hubert_soft.units(x.unsqueeze(0).to(DEVICE)) 41 | hubert = hubert.clone().to(DEVICE) 42 | hubert = torch.transpose(hubert, -1, -2) 43 | z = reencoder(hubert.to(DEVICE), spkemb.to(DEVICE)) 44 | z = decoder(z.to(DEVICE)) 45 | torchaudio.save('test1.wav', z.squeeze(1).detach().cpu(), 16000) -------------------------------------------------------------------------------- /libs/FastSpeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FastSpeech2 2 | 3 | __all__ = ['FastSpeech2'] -------------------------------------------------------------------------------- /libs/FastSpeech2/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import FastSpeech2 2 
| from .loss import FastSpeech2Loss 3 | from .optimizer import ScheduledOptim -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/loss.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/loss.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/optimizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/optimizer.cpython-310.pyc -------------------------------------------------------------------------------- 
/libs/FastSpeech2/model/__pycache__/optimizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/optimizer.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/fastspeech2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from libs.FastSpeech2.transformer import Encoder, Decoder, PostNet 9 | from .modules import VarianceAdaptor 10 | from libs.FastSpeech2.utils.tools import get_mask_from_lengths 11 | 12 | 13 | class FastSpeech2(nn.Module): 14 | """ FastSpeech2 """ 15 | 16 | def __init__(self, preprocess_config, model_config): 17 | super(FastSpeech2, self).__init__() 18 | self.model_config = model_config 19 | 20 | self.encoder = Encoder(model_config) 21 | self.variance_adaptor = VarianceAdaptor(preprocess_config, model_config) 22 | self.decoder = Decoder(model_config) 23 | self.mel_linear = nn.Linear( 24 | model_config["transformer"]["decoder_hidden"], 25 | preprocess_config["preprocessing"]["mel"]["n_mel_channels"], 26 | ) 27 | self.postnet = PostNet() 28 | 29 | self.speaker_emb = None 30 | if model_config["multi_speaker"]: 31 | with open( 32 | os.path.join( 33 | preprocess_config["path"]["preprocessed_path"], "speakers.json" 34 | ), 35 | "r", 36 | ) as f: 37 | n_speaker = len(json.load(f)) 38 | self.speaker_emb = nn.Embedding( 39 | n_speaker, 40 | model_config["transformer"]["encoder_hidden"], 41 | ) 42 | 43 | def forward( 44 | self, 45 | speakers, 46 | texts, 47 | src_lens, 48 | max_src_len, 49 | mels=None, 50 | mel_lens=None, 51 | max_mel_len=None, 52 | p_targets=None, 53 | e_targets=None, 54 | d_targets=None, 55 | p_control=1.0, 56 | e_control=1.0, 57 | d_control=1.0, 58 | ): 59 | src_masks = get_mask_from_lengths(src_lens, max_src_len) 60 | mel_masks = ( 61 | get_mask_from_lengths(mel_lens, max_mel_len) 62 | if mel_lens is not None 63 | else None 64 | ) 65 | 66 | 67 | assert texts.shape[2] == 256 or texts.shape[2] == 768, print(f"####transformer texts {texts.shape}, src_masks {src_masks.shape}") ### rkmt 2022.5.19 68 | 69 | output = self.encoder(texts, src_masks) 70 | 71 | if self.speaker_emb is not None: 72 | output = output + self.speaker_emb(speakers).unsqueeze(1).expand( 73 | -1, max_src_len, -1 74 | ) 75 | 76 | ( 77 | output, 78 | p_predictions, 79 | e_predictions, 80 | log_d_predictions, 81 | d_rounded, 82 | mel_lens, 83 | mel_masks, 84 | ) = self.variance_adaptor( 85 | output, 86 | src_masks, 87 | mel_masks, 88 | max_mel_len, 89 | p_targets, 90 | e_targets, 91 | d_targets, 92 | p_control, 93 | e_control, 94 | d_control, 95 | ) 96 | 97 | output, mel_masks = self.decoder(output, mel_masks) 98 | output = self.mel_linear(output) 99 | 100 | postnet_output = self.postnet(output) + output 101 | 102 | return ( 103 | output, 104 | postnet_output, 105 | p_predictions, 106 | e_predictions, 107 | log_d_predictions, 108 | d_rounded, 109 | src_masks, 110 | mel_masks, 111 | src_lens, 112 | mel_lens, 113 | ) -------------------------------------------------------------------------------- /libs/FastSpeech2/model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FastSpeech2Loss(nn.Module): 6 | """ 
FastSpeech2 Loss """ 7 | 8 | def __init__(self, preprocess_config, model_config): 9 | super(FastSpeech2Loss, self).__init__() 10 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 11 | "feature" 12 | ] 13 | self.energy_feature_level = preprocess_config["preprocessing"]["energy"][ 14 | "feature" 15 | ] 16 | self.mse_loss = nn.MSELoss() 17 | self.mae_loss = nn.L1Loss() 18 | 19 | def forward(self, inputs, predictions): 20 | ( 21 | mel_targets, 22 | _, 23 | _, 24 | pitch_targets, 25 | energy_targets, 26 | duration_targets, 27 | ) = inputs[6:] 28 | ( 29 | mel_predictions, 30 | postnet_mel_predictions, 31 | pitch_predictions, 32 | energy_predictions, 33 | log_duration_predictions, 34 | _, 35 | src_masks, 36 | mel_masks, 37 | _, 38 | _, 39 | ) = predictions 40 | src_masks = ~src_masks 41 | mel_masks = ~mel_masks 42 | log_duration_targets = torch.log(duration_targets.float() + 1) 43 | mel_targets = mel_targets[:, : mel_masks.shape[1], :] 44 | mel_masks = mel_masks[:, :mel_masks.shape[1]] 45 | 46 | log_duration_targets.requires_grad = False 47 | pitch_targets.requires_grad = False 48 | energy_targets.requires_grad = False 49 | mel_targets.requires_grad = False 50 | 51 | if self.pitch_feature_level == "phoneme_level": 52 | pitch_predictions = pitch_predictions.masked_select(src_masks) 53 | pitch_targets = pitch_targets.masked_select(src_masks) 54 | elif self.pitch_feature_level == "frame_level": 55 | pitch_predictions = pitch_predictions.masked_select(mel_masks) 56 | pitch_targets = pitch_targets.masked_select(mel_masks) 57 | 58 | if self.energy_feature_level == "phoneme_level": 59 | energy_predictions = energy_predictions.masked_select(src_masks) 60 | energy_targets = energy_targets.masked_select(src_masks) 61 | if self.energy_feature_level == "frame_level": 62 | energy_predictions = energy_predictions.masked_select(mel_masks) 63 | energy_targets = energy_targets.masked_select(mel_masks) 64 | 65 | log_duration_predictions = log_duration_predictions.masked_select(src_masks) 66 | log_duration_targets = log_duration_targets.masked_select(src_masks) 67 | 68 | mel_predictions = mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 69 | postnet_mel_predictions = postnet_mel_predictions.masked_select( 70 | mel_masks.unsqueeze(-1) 71 | ) 72 | mel_targets = mel_targets.masked_select(mel_masks.unsqueeze(-1)) 73 | 74 | mel_loss = self.mae_loss(mel_predictions, mel_targets) 75 | postnet_mel_loss = self.mae_loss(postnet_mel_predictions, mel_targets) 76 | 77 | pitch_loss = self.mse_loss(pitch_predictions, pitch_targets) 78 | energy_loss = self.mse_loss(energy_predictions, energy_targets) 79 | duration_loss = self.mse_loss(log_duration_predictions, log_duration_targets) 80 | 81 | total_loss = ( 82 | mel_loss + postnet_mel_loss + duration_loss + pitch_loss + energy_loss 83 | ) 84 | 85 | return ( 86 | total_loss, 87 | mel_loss, 88 | postnet_mel_loss, 89 | pitch_loss, 90 | energy_loss, 91 | duration_loss, 92 | ) 93 | -------------------------------------------------------------------------------- /libs/FastSpeech2/model/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import copy 4 | import math 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | import torch.nn.functional as F 11 | 12 | from libs.FastSpeech2.utils.tools import get_mask_from_lengths, pad 13 | 14 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | 17 | class 
VarianceAdaptor(nn.Module): 18 | """Variance Adaptor""" 19 | 20 | def __init__(self, preprocess_config, model_config): 21 | super(VarianceAdaptor, self).__init__() 22 | 23 | self.duration_predictor = VariancePredictor(model_config) 24 | self.length_regulator = LengthRegulator() 25 | 26 | self.pitch_predictor = VariancePredictor(model_config) 27 | self.energy_predictor = VariancePredictor(model_config) 28 | 29 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 30 | "feature" 31 | ] 32 | self.energy_feature_level = preprocess_config["preprocessing"]["energy"][ 33 | "feature" 34 | ] 35 | assert self.pitch_feature_level in ["phoneme_level", "frame_level"] 36 | assert self.energy_feature_level in ["phoneme_level", "frame_level"] 37 | 38 | pitch_quantization = model_config["variance_embedding"]["pitch_quantization"] 39 | energy_quantization = model_config["variance_embedding"]["energy_quantization"] 40 | n_bins = model_config["variance_embedding"]["n_bins"] 41 | assert pitch_quantization in ["linear", "log"] 42 | assert energy_quantization in ["linear", "log"] 43 | with open( 44 | os.path.join(preprocess_config["path"]["preprocessed_path"], "stats.json") 45 | ) as f: 46 | assert f is not None # rkmt 2022.10.24 47 | stats = json.load(f) 48 | pitch_min, pitch_max = stats["pitch"][:2] 49 | energy_min, energy_max = stats["energy"][:2] 50 | 51 | if pitch_quantization == "log": 52 | self.pitch_bins = nn.Parameter( 53 | torch.exp( 54 | torch.linspace(np.log(pitch_min), np.log(pitch_max), n_bins - 1) 55 | ), 56 | requires_grad=False, 57 | ) 58 | else: 59 | self.pitch_bins = nn.Parameter( 60 | torch.linspace(pitch_min, pitch_max, n_bins - 1), 61 | requires_grad=False, 62 | ) 63 | if energy_quantization == "log": 64 | self.energy_bins = nn.Parameter( 65 | torch.exp( 66 | torch.linspace(np.log(energy_min), np.log(energy_max), n_bins - 1) 67 | ), 68 | requires_grad=False, 69 | ) 70 | else: 71 | self.energy_bins = nn.Parameter( 72 | torch.linspace(energy_min, energy_max, n_bins - 1), 73 | requires_grad=False, 74 | ) 75 | 76 | self.pitch_embedding = nn.Embedding( 77 | n_bins, model_config["transformer"]["encoder_hidden"] 78 | ) 79 | self.energy_embedding = nn.Embedding( 80 | n_bins, model_config["transformer"]["encoder_hidden"] 81 | ) 82 | 83 | def get_pitch_embedding(self, x, target, mask, control): 84 | prediction = self.pitch_predictor(x, mask) 85 | if target is not None: 86 | embedding = self.pitch_embedding(torch.bucketize(target, self.pitch_bins)) 87 | else: 88 | prediction = prediction * control 89 | embedding = self.pitch_embedding( 90 | torch.bucketize(prediction, self.pitch_bins) 91 | ) 92 | return prediction, embedding 93 | 94 | def get_energy_embedding(self, x, target, mask, control): 95 | prediction = self.energy_predictor(x, mask) 96 | if target is not None: 97 | embedding = self.energy_embedding(torch.bucketize(target, self.energy_bins)) 98 | else: 99 | prediction = prediction * control 100 | embedding = self.energy_embedding( 101 | torch.bucketize(prediction, self.energy_bins) 102 | ) 103 | return prediction, embedding 104 | 105 | def forward( 106 | self, 107 | x, 108 | src_mask, 109 | mel_mask=None, 110 | max_len=None, 111 | pitch_target=None, 112 | energy_target=None, 113 | duration_target=None, 114 | p_control=1.0, 115 | e_control=1.0, 116 | d_control=1.0, 117 | ): 118 | # assert duration_target is not None # rkmt 2022.8.3 119 | #log_duration_prediction = self.duration_predictor(x, src_mask) 120 | log_duration_prediction = None 121 | if self.pitch_feature_level == 
"phoneme_level": 122 | pitch_prediction, pitch_embedding = self.get_pitch_embedding( 123 | x, pitch_target, src_mask, p_control 124 | ) 125 | x = x + pitch_embedding 126 | if self.energy_feature_level == "phoneme_level": 127 | energy_prediction, energy_embedding = self.get_energy_embedding( 128 | x, energy_target, src_mask, p_control 129 | ) 130 | x = x + energy_embedding 131 | 132 | if duration_target is not None: 133 | #print("### x", x, duration_target, max_len) ### rkmt 2022.7.3 134 | 135 | ## bypasss length_regulator (rkmt 2022.8.3) 136 | #x, mel_len = self.length_regulator(x, duration_target, max_len) 137 | mel_len = torch.tensor([x.shape[1]], device=x.device) 138 | 139 | #print("#### x", x, x.shape, type(x), x.device, "\nmel_len", mel_len, type(mel_len), mel_len.device, "\max_len", max_len) 140 | duration_rounded = duration_target 141 | 142 | #### rkmt 2022.6.7 143 | if mel_mask is None: 144 | mel_mask = get_mask_from_lengths(mel_len) 145 | else: 146 | log_duration_prediction = self.duration_predictor(x, src_mask) 147 | duration_rounded = torch.clamp( 148 | (torch.round(torch.exp(log_duration_prediction) - 1) * d_control), 149 | min=0, 150 | ) 151 | ### rkmt 2022.6.7 152 | #print("### duration", duration_rounded.shape, duration_rounded, "max_len", max_len) #### rkmt 153 | x, mel_len = self.length_regulator(x, duration_rounded, max_len) 154 | mel_mask = get_mask_from_lengths(mel_len) 155 | #print("### mel_mask", mel_len, mel_mask.shape, mel_mask) #### rkmt 2022.6.7 156 | 157 | if self.pitch_feature_level == "frame_level": 158 | pitch_prediction, pitch_embedding = self.get_pitch_embedding( 159 | x, pitch_target, mel_mask, p_control 160 | ) 161 | x = x + pitch_embedding 162 | if self.energy_feature_level == "frame_level": 163 | energy_prediction, energy_embedding = self.get_energy_embedding( 164 | x, energy_target, mel_mask, p_control 165 | ) 166 | x = x + energy_embedding 167 | 168 | return ( 169 | x, 170 | pitch_prediction, 171 | energy_prediction, 172 | log_duration_prediction, 173 | duration_rounded, 174 | mel_len, 175 | mel_mask, 176 | ) 177 | 178 | 179 | class LengthRegulator(nn.Module): 180 | """Length Regulator""" 181 | 182 | def __init__(self): 183 | super(LengthRegulator, self).__init__() 184 | 185 | def LR(self, x, duration, max_len): 186 | output = list() 187 | mel_len = list() 188 | for batch, expand_target in zip(x, duration): 189 | expanded = self.expand(batch, expand_target) 190 | output.append(expanded) 191 | mel_len.append(expanded.shape[0]) 192 | 193 | if max_len is not None: 194 | output = pad(output, max_len) 195 | else: 196 | output = pad(output) 197 | 198 | return output, torch.LongTensor(mel_len).to(device) 199 | 200 | def expand(self, batch, predicted): 201 | out = list() 202 | 203 | for i, vec in enumerate(batch): 204 | expand_size = predicted[i].item() 205 | out.append(vec.expand(max(int(expand_size), 0), -1)) 206 | out = torch.cat(out, 0) 207 | 208 | return out 209 | 210 | def forward(self, x, duration, max_len): 211 | output, mel_len = self.LR(x, duration, max_len) 212 | return output, mel_len 213 | 214 | 215 | class VariancePredictor(nn.Module): 216 | """Duration, Pitch and Energy Predictor""" 217 | 218 | def __init__(self, model_config): 219 | super(VariancePredictor, self).__init__() 220 | 221 | self.input_size = model_config["transformer"]["encoder_hidden"] 222 | self.filter_size = model_config["variance_predictor"]["filter_size"] 223 | self.kernel = model_config["variance_predictor"]["kernel_size"] 224 | self.conv_output_size = 
model_config["variance_predictor"]["filter_size"] 225 | self.dropout = model_config["variance_predictor"]["dropout"] 226 | 227 | self.conv_layer = nn.Sequential( 228 | OrderedDict( 229 | [ 230 | ( 231 | "conv1d_1", 232 | Conv( 233 | self.input_size, 234 | self.filter_size, 235 | kernel_size=self.kernel, 236 | padding=(self.kernel - 1) // 2, 237 | ), 238 | ), 239 | ("relu_1", nn.ReLU()), 240 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 241 | ("dropout_1", nn.Dropout(self.dropout)), 242 | ( 243 | "conv1d_2", 244 | Conv( 245 | self.filter_size, 246 | self.filter_size, 247 | kernel_size=self.kernel, 248 | padding=1, 249 | ), 250 | ), 251 | ("relu_2", nn.ReLU()), 252 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 253 | ("dropout_2", nn.Dropout(self.dropout)), 254 | ] 255 | ) 256 | ) 257 | 258 | self.linear_layer = nn.Linear(self.conv_output_size, 1) 259 | 260 | def forward(self, encoder_output, mask): 261 | out = self.conv_layer(encoder_output) 262 | out = self.linear_layer(out) 263 | out = out.squeeze(-1) 264 | 265 | if mask is not None: 266 | out = out.masked_fill(mask, 0.0) 267 | 268 | return out 269 | 270 | 271 | class Conv(nn.Module): 272 | """ 273 | Convolution Module 274 | """ 275 | 276 | def __init__( 277 | self, 278 | in_channels, 279 | out_channels, 280 | kernel_size=1, 281 | stride=1, 282 | padding=0, 283 | dilation=1, 284 | bias=True, 285 | w_init="linear", 286 | ): 287 | """ 288 | :param in_channels: dimension of input 289 | :param out_channels: dimension of output 290 | :param kernel_size: size of kernel 291 | :param stride: size of stride 292 | :param padding: size of padding 293 | :param dilation: dilation rate 294 | :param bias: boolean. if True, bias is included. 295 | :param w_init: str. weight inits with xavier initialization. 
296 | """ 297 | super(Conv, self).__init__() 298 | 299 | self.conv = nn.Conv1d( 300 | in_channels, 301 | out_channels, 302 | kernel_size=kernel_size, 303 | stride=stride, 304 | padding=padding, 305 | dilation=dilation, 306 | bias=bias, 307 | ) 308 | 309 | def forward(self, x): 310 | x = x.contiguous().transpose(1, 2) 311 | x = self.conv(x) 312 | x = x.contiguous().transpose(1, 2) 313 | 314 | return x 315 | -------------------------------------------------------------------------------- /libs/FastSpeech2/model/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ScheduledOptim: 6 | """ A simple wrapper class for learning rate scheduling """ 7 | 8 | def __init__(self, model, train_config, model_config, current_step): 9 | 10 | self._optimizer = torch.optim.Adam( 11 | model.parameters(), 12 | betas=train_config["optimizer"]["betas"], 13 | eps=train_config["optimizer"]["eps"], 14 | weight_decay=train_config["optimizer"]["weight_decay"], 15 | ) 16 | self.n_warmup_steps = train_config["optimizer"]["warm_up_step"] 17 | self.anneal_steps = train_config["optimizer"]["anneal_steps"] 18 | self.anneal_rate = train_config["optimizer"]["anneal_rate"] 19 | self.current_step = current_step 20 | self.init_lr = np.power(model_config["transformer"]["encoder_hidden"], -0.5) 21 | 22 | def step_and_update_lr(self): 23 | self._update_learning_rate() 24 | self._optimizer.step() 25 | 26 | def zero_grad(self): 27 | # print(self.init_lr) 28 | self._optimizer.zero_grad() 29 | 30 | def load_state_dict(self, path): 31 | self._optimizer.load_state_dict(path) 32 | 33 | def _get_lr_scale(self): 34 | lr = np.min( 35 | [ 36 | np.power(self.current_step, -0.5), 37 | np.power(self.n_warmup_steps, -1.5) * self.current_step, 38 | ] 39 | ) 40 | for s in self.anneal_steps: 41 | if self.current_step > s: 42 | lr = lr * self.anneal_rate 43 | return lr 44 | 45 | def _update_learning_rate(self): 46 | """ Learning rate scheduling per step """ 47 | self.current_step += 1 48 | lr = self.init_lr * self._get_lr_scale() 49 | 50 | for param_group in self._optimizer.param_groups: 51 | param_group["lr"] = lr 52 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = "" 7 | UNK_WORD = "" 8 | BOS_WORD = "" 9 | EOS_WORD = "" 10 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Layers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from torch.nn import functional as F 7 | 8 | from .SubLayers import MultiHeadAttention, PositionwiseFeedForward 9 | 10 | 11 | class FFTBlock(torch.nn.Module): 12 | """FFT Block""" 13 | 14 | def __init__(self, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1): 15 | super(FFTBlock, self).__init__() 16 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 17 | self.pos_ffn = PositionwiseFeedForward( 18 | d_model, d_inner, kernel_size, dropout=dropout 19 | ) 20 | 21 | def forward(self, enc_input, mask=None, slf_attn_mask=None): 22 | enc_output, enc_slf_attn = self.slf_attn( 23 | enc_input, enc_input, enc_input, mask=slf_attn_mask 24 | ) 25 | enc_output = 
enc_output.masked_fill(mask.unsqueeze(-1), 0) 26 | 27 | enc_output = self.pos_ffn(enc_output) 28 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 29 | 30 | return enc_output, enc_slf_attn 31 | 32 | 33 | class ConvNorm(torch.nn.Module): 34 | def __init__( 35 | self, 36 | in_channels, 37 | out_channels, 38 | kernel_size=1, 39 | stride=1, 40 | padding=None, 41 | dilation=1, 42 | bias=True, 43 | w_init_gain="linear", 44 | ): 45 | super(ConvNorm, self).__init__() 46 | 47 | if padding is None: 48 | assert kernel_size % 2 == 1 49 | padding = int(dilation * (kernel_size - 1) / 2) 50 | 51 | self.conv = torch.nn.Conv1d( 52 | in_channels, 53 | out_channels, 54 | kernel_size=kernel_size, 55 | stride=stride, 56 | padding=padding, 57 | dilation=dilation, 58 | bias=bias, 59 | ) 60 | 61 | def forward(self, signal): 62 | conv_signal = self.conv(signal) 63 | 64 | return conv_signal 65 | 66 | 67 | class PostNet(nn.Module): 68 | """ 69 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 70 | """ 71 | 72 | def __init__( 73 | self, 74 | n_mel_channels=80, 75 | postnet_embedding_dim=512, 76 | postnet_kernel_size=5, 77 | postnet_n_convolutions=5, 78 | ): 79 | 80 | super(PostNet, self).__init__() 81 | self.convolutions = nn.ModuleList() 82 | 83 | self.convolutions.append( 84 | nn.Sequential( 85 | ConvNorm( 86 | n_mel_channels, 87 | postnet_embedding_dim, 88 | kernel_size=postnet_kernel_size, 89 | stride=1, 90 | padding=int((postnet_kernel_size - 1) / 2), 91 | dilation=1, 92 | w_init_gain="tanh", 93 | ), 94 | nn.BatchNorm1d(postnet_embedding_dim), 95 | ) 96 | ) 97 | 98 | for i in range(1, postnet_n_convolutions - 1): 99 | self.convolutions.append( 100 | nn.Sequential( 101 | ConvNorm( 102 | postnet_embedding_dim, 103 | postnet_embedding_dim, 104 | kernel_size=postnet_kernel_size, 105 | stride=1, 106 | padding=int((postnet_kernel_size - 1) / 2), 107 | dilation=1, 108 | w_init_gain="tanh", 109 | ), 110 | nn.BatchNorm1d(postnet_embedding_dim), 111 | ) 112 | ) 113 | 114 | self.convolutions.append( 115 | nn.Sequential( 116 | ConvNorm( 117 | postnet_embedding_dim, 118 | n_mel_channels, 119 | kernel_size=postnet_kernel_size, 120 | stride=1, 121 | padding=int((postnet_kernel_size - 1) / 2), 122 | dilation=1, 123 | w_init_gain="linear", 124 | ), 125 | nn.BatchNorm1d(n_mel_channels), 126 | ) 127 | ) 128 | 129 | def forward(self, x): 130 | x = x.contiguous().transpose(1, 2) 131 | 132 | for i in range(len(self.convolutions) - 1): 133 | x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) 134 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 135 | 136 | x = x.contiguous().transpose(1, 2) 137 | return x 138 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | import libs.FastSpeech2.transformer.Constants as Constants 6 | from .Layers import FFTBlock 7 | 8 | 9 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 10 | """ Sinusoid position encoding table """ 11 | 12 | def cal_angle(position, hid_idx): 13 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 14 | 15 | def get_posi_angle_vec(position): 16 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 17 | 18 | sinusoid_table = np.array( 19 | [get_posi_angle_vec(pos_i) for pos_i in range(n_position)] 20 | ) 21 | 22 | sinusoid_table[:, 0::2] = 
np.sin(sinusoid_table[:, 0::2]) # dim 2i 23 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 24 | 25 | if padding_idx is not None: 26 | # zero vector for padding dimension 27 | sinusoid_table[padding_idx] = 0.0 28 | 29 | return torch.FloatTensor(sinusoid_table) 30 | 31 | 32 | class Encoder(nn.Module): 33 | """ Encoder """ 34 | 35 | def __init__(self, config): 36 | super(Encoder, self).__init__() 37 | 38 | n_position = config["max_seq_len"] + 1 39 | # n_src_vocab = len(symbols) + 1 40 | n_src_vocab = 360 + 1 41 | d_word_vec = config["transformer"]["encoder_hidden"] 42 | n_layers = config["transformer"]["encoder_layer"] 43 | n_head = config["transformer"]["encoder_head"] 44 | d_k = d_v = ( 45 | config["transformer"]["encoder_hidden"] 46 | // config["transformer"]["encoder_head"] 47 | ) 48 | d_model = config["transformer"]["encoder_hidden"] 49 | d_inner = config["transformer"]["conv_filter_size"] 50 | kernel_size = config["transformer"]["conv_kernel_size"] 51 | dropout = config["transformer"]["encoder_dropout"] 52 | 53 | self.max_seq_len = config["max_seq_len"] 54 | self.d_model = d_model 55 | 56 | self.src_word_emb = nn.Embedding( 57 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD 58 | ) 59 | if config["soft_unit"]: # rkmt 2022.7.3 60 | self.soft_unit_dim = 256 if not ('soft_unit_dim' in config) else int(config['soft_unit_dim']) 61 | 62 | if self.soft_unit_dim == 256: 63 | self.src_word_emb = nn.Identity() 64 | else: 65 | self.src_word_emb = nn.Linear(self.soft_unit_dim, 256) 66 | 67 | self.position_enc = nn.Parameter( 68 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 69 | requires_grad=False, 70 | ) 71 | 72 | self.layer_stack = nn.ModuleList( 73 | [ 74 | FFTBlock( 75 | d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout 76 | ) 77 | for _ in range(n_layers) 78 | ] 79 | ) 80 | 81 | def forward(self, src_seq, mask, return_attns=False): 82 | 83 | enc_slf_attn_list = [] 84 | batch_size, max_len = src_seq.shape[0], src_seq.shape[1] 85 | 86 | # -- Prepare masks 87 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 88 | 89 | ''' 90 | print("### src_seq", src_seq.shape, type(src_seq), src_seq.dtype) 91 | print("### src_sec[..]", src_seq[0,0,:10]) 92 | print("###", self.src_word_emb(src_seq).shape) 93 | ''' 94 | 95 | # -- Forward 96 | if not self.training and src_seq.shape[1] > self.max_seq_len: 97 | enc_output = self.src_word_emb(src_seq) + get_sinusoid_encoding_table( 98 | src_seq.shape[1], self.d_model 99 | )[: src_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 100 | src_seq.device 101 | ) 102 | else: 103 | enc_output = self.src_word_emb(src_seq) + self.position_enc[ 104 | :, :max_len, : 105 | ].expand(batch_size, -1, -1) 106 | 107 | for enc_layer in self.layer_stack: 108 | enc_output, enc_slf_attn = enc_layer( 109 | enc_output, mask=mask, slf_attn_mask=slf_attn_mask 110 | ) 111 | if return_attns: 112 | enc_slf_attn_list += [enc_slf_attn] 113 | 114 | return enc_output 115 | 116 | 117 | class Decoder(nn.Module): 118 | """ Decoder """ 119 | 120 | def __init__(self, config): 121 | super(Decoder, self).__init__() 122 | 123 | n_position = config["max_seq_len"] + 1 124 | d_word_vec = config["transformer"]["decoder_hidden"] 125 | n_layers = config["transformer"]["decoder_layer"] 126 | n_head = config["transformer"]["decoder_head"] 127 | d_k = d_v = ( 128 | config["transformer"]["decoder_hidden"] 129 | // config["transformer"]["decoder_head"] 130 | ) 131 | d_model = config["transformer"]["decoder_hidden"] 132 | d_inner = 
config["transformer"]["conv_filter_size"] 133 | kernel_size = config["transformer"]["conv_kernel_size"] 134 | dropout = config["transformer"]["decoder_dropout"] 135 | 136 | self.max_seq_len = config["max_seq_len"] 137 | self.d_model = d_model 138 | 139 | self.position_enc = nn.Parameter( 140 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 141 | requires_grad=False, 142 | ) 143 | 144 | self.layer_stack = nn.ModuleList( 145 | [ 146 | FFTBlock( 147 | d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout 148 | ) 149 | for _ in range(n_layers) 150 | ] 151 | ) 152 | 153 | def forward(self, enc_seq, mask, return_attns=False): 154 | 155 | dec_slf_attn_list = [] 156 | batch_size, max_len = enc_seq.shape[0], enc_seq.shape[1] 157 | 158 | # -- Forward 159 | if not self.training and enc_seq.shape[1] > self.max_seq_len: 160 | # -- Prepare masks 161 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 162 | dec_output = enc_seq + get_sinusoid_encoding_table( 163 | enc_seq.shape[1], self.d_model 164 | )[: enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 165 | enc_seq.device 166 | ) 167 | else: 168 | max_len = min(max_len, self.max_seq_len) 169 | 170 | # -- Prepare masks 171 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 172 | dec_output = enc_seq[:, :max_len, :] + self.position_enc[ 173 | :, :max_len, : 174 | ].expand(batch_size, -1, -1) 175 | mask = mask[:, :max_len] 176 | slf_attn_mask = slf_attn_mask[:, :, :max_len] 177 | 178 | for dec_layer in self.layer_stack: 179 | dec_output, dec_slf_attn = dec_layer( 180 | dec_output, mask=mask, slf_attn_mask=slf_attn_mask 181 | ) 182 | if return_attns: 183 | dec_slf_attn_list += [dec_slf_attn] 184 | 185 | return dec_output, mask 186 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | """ Scaled Dot-Product Attention """ 8 | 9 | def __init__(self, temperature): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | 16 | attn = torch.bmm(q, k.transpose(1, 2)) 17 | attn = attn / self.temperature 18 | 19 | if mask is not None: 20 | attn = attn.masked_fill(mask, -np.inf) 21 | 22 | attn = self.softmax(attn) 23 | output = torch.bmm(attn, v) 24 | 25 | return output, attn 26 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from .Modules import ScaledDotProductAttention 6 | 7 | 8 | class MultiHeadAttention(nn.Module): 9 | """ Multi-Head Attention module """ 10 | 11 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 12 | super().__init__() 13 | 14 | self.n_head = n_head 15 | self.d_k = d_k 16 | self.d_v = d_v 17 | 18 | self.w_qs = nn.Linear(d_model, n_head * d_k) 19 | self.w_ks = nn.Linear(d_model, n_head * d_k) 20 | self.w_vs = nn.Linear(d_model, n_head * d_v) 21 | 22 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 23 | self.layer_norm = nn.LayerNorm(d_model) 24 | 25 | self.fc = nn.Linear(n_head * d_v, d_model) 26 | 27 | self.dropout = 
nn.Dropout(dropout) 28 | 29 | def forward(self, q, k, v, mask=None): 30 | 31 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 32 | 33 | sz_b, len_q, _ = q.size() 34 | sz_b, len_k, _ = k.size() 35 | sz_b, len_v, _ = v.size() 36 | 37 | residual = q 38 | 39 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 40 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 41 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 42 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 43 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 44 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 45 | 46 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 47 | output, attn = self.attention(q, k, v, mask=mask) 48 | 49 | output = output.view(n_head, sz_b, len_q, d_v) 50 | output = ( 51 | output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) 52 | ) # b x lq x (n*dv) 53 | 54 | output = self.dropout(self.fc(output)) 55 | output = self.layer_norm(output + residual) 56 | 57 | return output, attn 58 | 59 | 60 | class PositionwiseFeedForward(nn.Module): 61 | """ A two-feed-forward-layer module """ 62 | 63 | def __init__(self, d_in, d_hid, kernel_size, dropout=0.1): 64 | super().__init__() 65 | 66 | # Use Conv1D 67 | # position-wise 68 | self.w_1 = nn.Conv1d( 69 | d_in, 70 | d_hid, 71 | kernel_size=kernel_size[0], 72 | padding=(kernel_size[0] - 1) // 2, 73 | ) 74 | # position-wise 75 | self.w_2 = nn.Conv1d( 76 | d_hid, 77 | d_in, 78 | kernel_size=kernel_size[1], 79 | padding=(kernel_size[1] - 1) // 2, 80 | ) 81 | 82 | self.layer_norm = nn.LayerNorm(d_in) 83 | self.dropout = nn.Dropout(dropout) 84 | 85 | def forward(self, x): 86 | residual = x 87 | output = x.transpose(1, 2) 88 | output = self.w_2(F.relu(self.w_1(output))) 89 | output = output.transpose(1, 2) 90 | output = self.dropout(output) 91 | output = self.layer_norm(output + residual) 92 | 93 | return output 94 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .Models import Encoder, Decoder 2 | from .Layers import PostNet -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Constants.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Constants.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Layers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Layers.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Layers.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Layers.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Models.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Models.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Modules.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/model.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/tools.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | def get_mask_from_lengths(lengths, max_len=None): 8 | batch_size = lengths.shape[0] 9 | if max_len is None: 10 | max_len = torch.max(lengths).item() 11 | 12 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(device) 13 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 14 | 15 | return mask 16 | 17 | def pad_2D(inputs, maxlen=None): 18 | def pad(x, max_len): 19 | PAD = 0 20 | if np.shape(x)[0] > max_len: 21 | raise ValueError("not max_len") 22 | 23 | s = np.shape(x)[1] 24 | x_padded = np.pad( 25 | x, (0, max_len - np.shape(x)[0]), mode="constant", constant_values=PAD 26 | ) 27 | return x_padded[:, :s] 28 | 29 | if maxlen: 30 | output = np.stack([pad(x, maxlen) for x in inputs]) 31 | else: 32 | max_len = max(np.shape(x)[0] for x in inputs) 33 | output = np.stack([pad(x, max_len) for x in inputs]) 34 | 35 | return output 36 | 37 | 38 | def pad(input_ele, mel_max_length=None): 39 | if mel_max_length: 40 | max_len = mel_max_length 41 | else: 42 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 43 | 44 | out_list = list() 45 | for i, batch in enumerate(input_ele): 46 | if len(batch.shape) == 1: 47 | one_batch_padded = F.pad( 48 | batch, (0, max_len - batch.size(0)), "constant", 0.0 49 | ) 50 | elif len(batch.shape) == 2: 51 | one_batch_padded = F.pad( 52 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 53 | ) 54 | out_list.append(one_batch_padded) 55 | out_padded = torch.stack(out_list) 56 | return out_padded 57 | -------------------------------------------------------------------------------- /libs/JDC/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import JDCNet -------------------------------------------------------------------------------- /libs/JDC/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/JDC/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/JDC/bst.t7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/bst.t7 -------------------------------------------------------------------------------- /libs/JDC/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of model from: 3 | Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using 4 | Convolutional Recurrent Neural Networks" (2019) 5 | Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d 6 | """ 7 | import torch 8 | from torch import nn 9 | 10 | class JDCNet(nn.Module): 11 | """ 12 | Joint Detection and Classification Network model for singing voice melody. 13 | """ 14 | def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01): 15 | super().__init__() 16 | self.num_class = num_class 17 | 18 | # input = (b, 1, 31, 513), b = batch size 19 | self.conv_block = nn.Sequential( 20 | nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), # out: (b, 64, 31, 513) 21 | nn.BatchNorm2d(num_features=64), 22 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 23 | nn.Conv2d(64, 64, 3, padding=1, bias=False), # (b, 64, 31, 513) 24 | ) 25 | 26 | # res blocks 27 | self.res_block1 = ResBlock(in_channels=64, out_channels=128) # (b, 128, 31, 128) 28 | self.res_block2 = ResBlock(in_channels=128, out_channels=192) # (b, 192, 31, 32) 29 | self.res_block3 = ResBlock(in_channels=192, out_channels=256) # (b, 256, 31, 8) 30 | 31 | # pool block 32 | self.pool_block = nn.Sequential( 33 | nn.BatchNorm2d(num_features=256), 34 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 35 | nn.MaxPool2d(kernel_size=(1, 4)), # (b, 256, 31, 2) 36 | nn.Dropout(p=0.2), 37 | ) 38 | 39 | # maxpool layers (for auxiliary network inputs) 40 | # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2) 41 | self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40)) 42 | # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2) 43 | self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20)) 44 | # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2) 45 | self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10)) 46 | 47 | # in = (b, 640, 31, 2), out = (b, 256, 31, 2) 48 | self.detector_conv = nn.Sequential( 49 | nn.Conv2d(640, 256, 1, bias=False), 50 | nn.BatchNorm2d(256), 51 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 52 | nn.Dropout(p=0.2), 53 | ) 54 | 55 | # input: (b, 31, 512) - resized from (b, 256, 31, 2) 56 | self.bilstm_classifier = nn.LSTM( 57 | input_size=512, hidden_size=256, 58 | batch_first=True, bidirectional=True) # (b, 31, 512) 59 | 60 | # input: (b, 31, 512) - resized from (b, 256, 31, 2) 61 | self.bilstm_detector = nn.LSTM( 62 | input_size=512, 
hidden_size=256, 63 | batch_first=True, bidirectional=True) # (b, 31, 512) 64 | 65 | # input: (b * 31, 512) 66 | self.classifier = nn.Linear(in_features=512, out_features=self.num_class) # (b * 31, num_class) 67 | 68 | # input: (b * 31, 512) 69 | self.detector = nn.Linear(in_features=512, out_features=2) # (b * 31, 2) - binary classifier 70 | 71 | # initialize weights 72 | self.apply(self.init_weights) 73 | 74 | def get_feature_GAN(self, x): 75 | seq_len = x.shape[-2] 76 | x = x.float().transpose(-1, -2) 77 | 78 | convblock_out = self.conv_block(x) 79 | 80 | resblock1_out = self.res_block1(convblock_out) 81 | resblock2_out = self.res_block2(resblock1_out) 82 | resblock3_out = self.res_block3(resblock2_out) 83 | poolblock_out = self.pool_block[0](resblock3_out) 84 | poolblock_out = self.pool_block[1](poolblock_out) 85 | 86 | return poolblock_out.transpose(-1, -2) 87 | 88 | def get_feature(self, x): 89 | seq_len = x.shape[-2] 90 | x = x.float().transpose(-1, -2) 91 | 92 | convblock_out = self.conv_block(x) 93 | 94 | resblock1_out = self.res_block1(convblock_out) 95 | resblock2_out = self.res_block2(resblock1_out) 96 | resblock3_out = self.res_block3(resblock2_out) 97 | poolblock_out = self.pool_block[0](resblock3_out) 98 | poolblock_out = self.pool_block[1](poolblock_out) 99 | 100 | return self.pool_block[2](poolblock_out) 101 | 102 | def forward(self, x): 103 | """ 104 | Returns: 105 | classification_prediction, detection_prediction 106 | sizes: (b, 31, 722), (b, 31, 2) 107 | """ 108 | ############################### 109 | # forward pass for classifier # 110 | ############################### 111 | seq_len = x.shape[-1] 112 | x = x.float().transpose(-1, -2) 113 | 114 | convblock_out = self.conv_block(x) 115 | 116 | resblock1_out = self.res_block1(convblock_out) 117 | resblock2_out = self.res_block2(resblock1_out) 118 | resblock3_out = self.res_block3(resblock2_out) 119 | 120 | 121 | poolblock_out = self.pool_block[0](resblock3_out) 122 | poolblock_out = self.pool_block[1](poolblock_out) 123 | GAN_feature = poolblock_out.transpose(-1, -2) 124 | poolblock_out = self.pool_block[2](poolblock_out) 125 | 126 | # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512) 127 | classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512)) 128 | classifier_out, _ = self.bilstm_classifier(classifier_out) # ignore the hidden states 129 | 130 | classifier_out = classifier_out.contiguous().view((-1, 512)) # (b * 31, 512) 131 | classifier_out = self.classifier(classifier_out) 132 | classifier_out = classifier_out.view((-1, seq_len, self.num_class)) # (b, 31, num_class) 133 | 134 | # sizes: (b, 31, 722), (b, 31, 2) 135 | # classifier output consists of predicted pitch classes per frame 136 | # detector output consists of: (isvoice, notvoice) estimates per frame 137 | return torch.abs(classifier_out.squeeze(-1)), GAN_feature, poolblock_out 138 | 139 | @staticmethod 140 | def init_weights(m): 141 | if isinstance(m, nn.Linear): 142 | nn.init.kaiming_uniform_(m.weight) 143 | if m.bias is not None: 144 | nn.init.constant_(m.bias, 0) 145 | elif isinstance(m, nn.Conv2d): 146 | nn.init.xavier_normal_(m.weight) 147 | elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell): 148 | for p in m.parameters(): 149 | if p.data is None: 150 | continue 151 | 152 | if len(p.shape) >= 2: 153 | nn.init.orthogonal_(p.data) 154 | else: 155 | nn.init.normal_(p.data) 156 | 157 | 158 | class ResBlock(nn.Module): 159 | def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01): 160 | 
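        # Residual unit used by JDCNet: forward() computes conv(pre_conv(x)) plus a shortcut,
        # where the shortcut is the 1x1 conv defined below whenever in/out channel counts differ.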
super().__init__() 161 | self.downsample = in_channels != out_channels 162 | 163 | # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper 164 | self.pre_conv = nn.Sequential( 165 | nn.BatchNorm2d(num_features=in_channels), 166 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 167 | nn.MaxPool2d(kernel_size=(1, 2)), # apply downsampling on the y axis only 168 | ) 169 | 170 | # conv layers 171 | self.conv = nn.Sequential( 172 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, 173 | kernel_size=3, padding=1, bias=False), 174 | nn.BatchNorm2d(out_channels), 175 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 176 | nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False), 177 | ) 178 | 179 | # 1 x 1 convolution layer to match the feature dimensions 180 | self.conv1by1 = None 181 | if self.downsample: 182 | self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False) 183 | 184 | def forward(self, x): 185 | x = self.pre_conv(x) 186 | if self.downsample: 187 | x = self.conv(x) + self.conv1by1(x) 188 | else: 189 | x = self.conv(x) + x 190 | return x -------------------------------------------------------------------------------- /libs/__init__.py: -------------------------------------------------------------------------------- 1 | from .hubert.model import HubertSoft 2 | from .JDC.model import JDCNet 3 | 4 | __all__ = ['HubertSoft', 'JDCNet'] -------------------------------------------------------------------------------- /libs/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self 8 | 9 | from .model import hifigan 10 | 11 | __all__ = ['hifigan'] -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /libs/hifigan/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import libs.hifigan as hifigan 4 | 5 | 6 | def get_vocoder(config, device): 7 | 8 | with open("./libs/hifigan/my_config_v1_16000.json", "r") as f: 9 | config = json.load(f) 10 | config = hifigan.AttrDict(config) 11 | vocoder = hifigan.Generator(config) 12 | ckpt = torch.load("./libs/hifigan/g_00180000.zip") 13 | vocoder.load_state_dict(ckpt["generator"]) 14 | vocoder.eval() 15 | vocoder.remove_weight_norm() 16 | vocoder.to(device) 17 | 18 | return vocoder 19 | 20 | 21 | def vocoder_infer(mels, vocoder): 22 | 23 | with torch.no_grad(): 24 | wavs = vocoder(mels).squeeze(1) # rkmt 2022.6.1 25 | 26 | # wavs = (wavs.cpu().numpy() * 32768.0).astype("int16") 27 | # wavs = [wav for wav in wavs] 28 | 29 | return wavs 30 | -------------------------------------------------------------------------------- /libs/hifigan/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Conv1d, ConvTranspose1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | 7 | LRELU_SLOPE = 0.1 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | class ResBlock(torch.nn.Module): 21 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 22 | super(ResBlock, self).__init__() 23 | self.h = h 24 | self.convs1 = nn.ModuleList( 25 | [ 26 | weight_norm( 27 | Conv1d( 28 | channels, 29 | channels, 30 | kernel_size, 31 | 1, 32 | dilation=dilation[0], 33 | padding=get_padding(kernel_size, dilation[0]), 34 | ) 35 | ), 36 | weight_norm( 37 | Conv1d( 38 | channels, 39 | channels, 40 | kernel_size, 41 | 1, 42 | dilation=dilation[1], 43 | padding=get_padding(kernel_size, dilation[1]), 44 | ) 45 | ), 46 | weight_norm( 47 | Conv1d( 48 | channels, 49 | channels, 50 | kernel_size, 51 | 1, 52 | dilation=dilation[2], 53 | padding=get_padding(kernel_size, dilation[2]), 54 | ) 55 | ), 56 | ] 57 | ) 58 | 
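        # init_weights (defined above) draws Conv1d weights from N(0.0, 0.01); convs2 below mirrors convs1
        # with dilation 1, and forward() pairs them as residual units (x = c2(lrelu(c1(lrelu(x)))) + x).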
self.convs1.apply(init_weights) 59 | 60 | self.convs2 = nn.ModuleList( 61 | [ 62 | weight_norm( 63 | Conv1d( 64 | channels, 65 | channels, 66 | kernel_size, 67 | 1, 68 | dilation=1, 69 | padding=get_padding(kernel_size, 1), 70 | ) 71 | ), 72 | weight_norm( 73 | Conv1d( 74 | channels, 75 | channels, 76 | kernel_size, 77 | 1, 78 | dilation=1, 79 | padding=get_padding(kernel_size, 1), 80 | ) 81 | ), 82 | weight_norm( 83 | Conv1d( 84 | channels, 85 | channels, 86 | kernel_size, 87 | 1, 88 | dilation=1, 89 | padding=get_padding(kernel_size, 1), 90 | ) 91 | ), 92 | ] 93 | ) 94 | self.convs2.apply(init_weights) 95 | 96 | def forward(self, x): 97 | for c1, c2 in zip(self.convs1, self.convs2): 98 | xt = F.leaky_relu(x, LRELU_SLOPE) 99 | xt = c1(xt) 100 | xt = F.leaky_relu(xt, LRELU_SLOPE) 101 | xt = c2(xt) 102 | x = xt + x 103 | return x 104 | 105 | def remove_weight_norm(self): 106 | for l in self.convs1: 107 | remove_weight_norm(l) 108 | for l in self.convs2: 109 | remove_weight_norm(l) 110 | 111 | 112 | class Generator(torch.nn.Module): 113 | def __init__(self, h): 114 | super(Generator, self).__init__() 115 | self.h = h 116 | self.num_kernels = len(h.resblock_kernel_sizes) 117 | self.num_upsamples = len(h.upsample_rates) 118 | self.conv_pre = weight_norm( 119 | Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) 120 | ) 121 | resblock = ResBlock 122 | 123 | self.ups = nn.ModuleList() 124 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 125 | self.ups.append( 126 | weight_norm( 127 | ConvTranspose1d( 128 | h.upsample_initial_channel // (2 ** i), 129 | h.upsample_initial_channel // (2 ** (i + 1)), 130 | k, 131 | u, 132 | padding=(k - u) // 2, 133 | ) 134 | ) 135 | ) 136 | 137 | self.resblocks = nn.ModuleList() 138 | for i in range(len(self.ups)): 139 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 140 | for j, (k, d) in enumerate( 141 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 142 | ): 143 | self.resblocks.append(resblock(h, ch, k, d)) 144 | 145 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 146 | self.ups.apply(init_weights) 147 | self.conv_post.apply(init_weights) 148 | 149 | def forward(self, x): 150 | x = self.conv_pre(x) 151 | for i in range(self.num_upsamples): 152 | x = F.leaky_relu(x, LRELU_SLOPE) 153 | x = self.ups[i](x) 154 | xs = None 155 | for j in range(self.num_kernels): 156 | if xs is None: 157 | xs = self.resblocks[i * self.num_kernels + j](x) 158 | else: 159 | xs += self.resblocks[i * self.num_kernels + j](x) 160 | x = xs / self.num_kernels 161 | x = F.leaky_relu(x) 162 | x = self.conv_post(x) 163 | x = torch.tanh(x) 164 | 165 | return x 166 | 167 | def remove_weight_norm(self): 168 | for l in self.ups: 169 | remove_weight_norm(l) 170 | for l in self.resblocks: 171 | l.remove_weight_norm() 172 | remove_weight_norm(self.conv_pre) 173 | remove_weight_norm(self.conv_post) -------------------------------------------------------------------------------- /libs/hifigan/my_config_v1_16000.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,5,4,2], 12 | "upsample_kernel_sizes": [16,10,8,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8000, 18 | "num_mels": 80, 19 
| "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 320, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 16000, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /libs/hubert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import ( 2 | Hubert, 3 | HubertDiscrete, 4 | HubertSoft, 5 | hubert_discrete, 6 | hubert_soft, 7 | kmeans100, 8 | ) 9 | -------------------------------------------------------------------------------- /libs/hubert/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hubert/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | from sklearn.cluster import KMeans 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 11 | 12 | URLS = { 13 | "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt", 14 | "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt", 15 | "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt", 16 | } 17 | 18 | 19 | class Hubert(nn.Module): 20 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 21 | super().__init__() 22 | self._mask = mask 23 | self.feature_extractor = FeatureExtractor() 24 | self.feature_projection = FeatureProjection() 25 | self.positional_embedding = PositionalConvEmbedding() 26 | self.norm = nn.LayerNorm(768) 27 | self.dropout = nn.Dropout(0.1) 28 | self.encoder = TransformerEncoder( 29 | nn.TransformerEncoderLayer( 30 | 768, 12, 3072, activation="gelu", batch_first=True 31 | ), 32 | 12, 33 | ) 34 | self.proj = nn.Linear(768, 256) 35 | 36 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 
37 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 38 | 39 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 40 | mask = None 41 | if self.training and self._mask: 42 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 43 | x[mask] = self.masked_spec_embed.to(x.dtype) 44 | return x, mask 45 | 46 | def encode( 47 | self, x: torch.Tensor, layer: Optional[int] = None 48 | ) -> Tuple[torch.Tensor, torch.Tensor]: 49 | x = self.feature_extractor(x) 50 | x = self.feature_projection(x.transpose(1, 2)) 51 | x, mask = self.mask(x) 52 | x = x + self.positional_embedding(x) 53 | x = self.dropout(self.norm(x)) 54 | x = self.encoder(x, output_layer=layer) 55 | return x, mask 56 | 57 | def logits(self, x: torch.Tensor) -> torch.Tensor: 58 | logits = torch.cosine_similarity( 59 | x.unsqueeze(2), 60 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 61 | dim=-1, 62 | ) 63 | return logits / 0.1 64 | 65 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 66 | x, mask = self.encode(x) 67 | x = self.proj(x) 68 | logits = self.logits(x) 69 | return logits, mask 70 | 71 | 72 | class HubertSoft(Hubert): 73 | def __init__(self): 74 | super().__init__() 75 | 76 | @torch.inference_mode() 77 | def units(self, wav: torch.Tensor) -> torch.Tensor: 78 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 79 | x, _ = self.encode(wav) 80 | x = self.proj(x) 81 | x = x.transpose(2, 1) 82 | return x 83 | 84 | 85 | class HubertDiscrete(Hubert): 86 | def __init__(self, kmeans): 87 | super().__init__(504) 88 | self.kmeans = kmeans 89 | 90 | @torch.inference_mode() 91 | def units(self, wav: torch.Tensor) -> torch.LongTensor: 92 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 93 | x, _ = self.encode(wav, layer=7) 94 | x = self.kmeans.predict(x.squeeze().cpu().numpy()) 95 | return torch.tensor(x, dtype=torch.long, device=wav.device) 96 | 97 | 98 | class FeatureExtractor(nn.Module): 99 | def __init__(self): 100 | super().__init__() 101 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 102 | self.norm0 = nn.GroupNorm(512, 512) 103 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 104 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 105 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 106 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 107 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 108 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 109 | 110 | def forward(self, x: torch.Tensor) -> torch.Tensor: 111 | x = F.gelu(self.norm0(self.conv0(x))) 112 | x = F.gelu(self.conv1(x)) 113 | x = F.gelu(self.conv2(x)) 114 | x = F.gelu(self.conv3(x)) 115 | x = F.gelu(self.conv4(x)) 116 | x = F.gelu(self.conv5(x)) 117 | x = F.gelu(self.conv6(x)) 118 | return x 119 | 120 | 121 | class FeatureProjection(nn.Module): 122 | def __init__(self): 123 | super().__init__() 124 | self.norm = nn.LayerNorm(512) 125 | self.projection = nn.Linear(512, 768) 126 | self.dropout = nn.Dropout(0.1) 127 | 128 | def forward(self, x: torch.Tensor) -> torch.Tensor: 129 | x = self.norm(x) 130 | x = self.projection(x) 131 | x = self.dropout(x) 132 | return x 133 | 134 | 135 | class PositionalConvEmbedding(nn.Module): 136 | def __init__(self): 137 | super().__init__() 138 | self.conv = nn.Conv1d( 139 | 768, 140 | 768, 141 | kernel_size=128, 142 | padding=128 // 2, 143 | groups=16, 144 | ) 145 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 146 | 147 | def forward(self, x: torch.Tensor) -> torch.Tensor: 148 | x = 
self.conv(x.transpose(1, 2)) 149 | x = F.gelu(x[:, :, :-1]) 150 | return x.transpose(1, 2) 151 | 152 | 153 | class TransformerEncoder(nn.Module): 154 | def __init__( 155 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 156 | ) -> None: 157 | super(TransformerEncoder, self).__init__() 158 | self.layers = nn.ModuleList( 159 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 160 | ) 161 | self.num_layers = num_layers 162 | 163 | def forward( 164 | self, 165 | src: torch.Tensor, 166 | mask: torch.Tensor = None, 167 | src_key_padding_mask: torch.Tensor = None, 168 | output_layer: Optional[int] = None, 169 | ) -> torch.Tensor: 170 | output = src 171 | for layer in self.layers[:output_layer]: 172 | output = layer( 173 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 174 | ) 175 | return output 176 | 177 | 178 | def _compute_mask( 179 | shape: Tuple[int, int], 180 | mask_prob: float, 181 | mask_length: int, 182 | device: torch.device, 183 | min_masks: int = 0, 184 | ) -> torch.Tensor: 185 | batch_size, sequence_length = shape 186 | 187 | if mask_length < 1: 188 | raise ValueError("`mask_length` has to be bigger than 0.") 189 | 190 | if mask_length > sequence_length: 191 | raise ValueError( 192 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 193 | ) 194 | 195 | # compute number of masked spans in batch 196 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 197 | num_masked_spans = max(num_masked_spans, min_masks) 198 | 199 | # make sure num masked indices <= sequence_length 200 | if num_masked_spans * mask_length > sequence_length: 201 | num_masked_spans = sequence_length // mask_length 202 | 203 | # SpecAugment mask to fill 204 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 205 | 206 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 207 | uniform_dist = torch.ones( 208 | (batch_size, sequence_length - (mask_length - 1)), device=device 209 | ) 210 | 211 | # get random indices to mask 212 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 213 | 214 | # expand masked indices to masked spans 215 | mask_indices = ( 216 | mask_indices.unsqueeze(dim=-1) 217 | .expand((batch_size, num_masked_spans, mask_length)) 218 | .reshape(batch_size, num_masked_spans * mask_length) 219 | ) 220 | offsets = ( 221 | torch.arange(mask_length, device=device)[None, None, :] 222 | .expand((batch_size, num_masked_spans, mask_length)) 223 | .reshape(batch_size, num_masked_spans * mask_length) 224 | ) 225 | mask_idxs = mask_indices + offsets 226 | 227 | # scatter indices to mask 228 | mask = mask.scatter(1, mask_idxs, True) 229 | 230 | return mask 231 | 232 | 233 | def hubert_discrete( 234 | pretrained: bool = True, 235 | progress: bool = True, 236 | ) -> HubertDiscrete: 237 | r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 
238 | Args: 239 | pretrained (bool): load pretrained weights into the model 240 | progress (bool): show progress bar when downloading model 241 | """ 242 | kmeans = kmeans100(pretrained=pretrained, progress=progress) 243 | hubert = HubertDiscrete(kmeans) 244 | if pretrained: 245 | checkpoint = torch.hub.load_state_dict_from_url( 246 | URLS["hubert-discrete"], progress=progress 247 | ) 248 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 249 | hubert.load_state_dict(checkpoint) 250 | hubert.eval() 251 | return hubert 252 | 253 | 254 | def hubert_soft( 255 | pretrained: bool = True, 256 | progress: bool = True, 257 | ) -> HubertSoft: 258 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 259 | Args: 260 | pretrained (bool): load pretrained weights into the model 261 | progress (bool): show progress bar when downloading model 262 | """ 263 | hubert = HubertSoft() 264 | if pretrained: 265 | checkpoint = torch.hub.load_state_dict_from_url( 266 | URLS["hubert-soft"], progress=progress 267 | ) 268 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 269 | hubert.load_state_dict(checkpoint) 270 | hubert.eval() 271 | return hubert 272 | 273 | 274 | def _kmeans( 275 | num_clusters: int, pretrained: bool = True, progress: bool = True 276 | ) -> KMeans: 277 | kmeans = KMeans(num_clusters) 278 | if pretrained: 279 | checkpoint = torch.hub.load_state_dict_from_url( 280 | URLS[f"kmeans{num_clusters}"], progress=progress 281 | ) 282 | kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] 283 | kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] 284 | kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() 285 | return kmeans 286 | 287 | 288 | def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: 289 | r""" 290 | k-means checkpoint for HuBERT-Discrete with 100 clusters. 
291 | Args: 292 | pretrained (bool): load pretrained weights into the model 293 | progress (bool): show progress bar when downloading model 294 | """ 295 | return _kmeans(100, pretrained, progress) 296 | -------------------------------------------------------------------------------- /libs/hubert/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Metric: 5 | def __init__(self): 6 | self.steps = 0 7 | self.value = 0 8 | 9 | def update(self, value): 10 | self.steps += 1 11 | self.value += (value - self.value) / self.steps 12 | return self.value 13 | 14 | def reset(self): 15 | self.steps = 0 16 | self.value = 0 17 | 18 | 19 | def save_checkpoint( 20 | checkpoint_dir, 21 | hubert, 22 | optimizer, 23 | scaler, 24 | step, 25 | loss, 26 | best, 27 | logger, 28 | ): 29 | state = { 30 | "hubert": hubert.state_dict(), 31 | "optimizer": optimizer.state_dict(), 32 | "scaler": scaler.state_dict(), 33 | "step": step, 34 | "loss": loss, 35 | } 36 | checkpoint_dir.mkdir(exist_ok=True, parents=True) 37 | checkpoint_path = checkpoint_dir / f"model-{step}.pt" 38 | torch.save(state, checkpoint_path) 39 | if best: 40 | best_path = checkpoint_dir / "model-best.pt" 41 | torch.save(state, best_path) 42 | logger.info(f"Saved checkpoint: {checkpoint_path.stem}") 43 | 44 | 45 | def load_checkpoint( 46 | load_path, 47 | hubert, 48 | optimizer, 49 | scaler, 50 | rank, 51 | logger, 52 | ): 53 | logger.info(f"Loading checkpoint from {load_path}") 54 | checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) 55 | hubert.load_state_dict(checkpoint["hubert"]) 56 | scaler.load_state_dict(checkpoint["scaler"]) 57 | optimizer.load_state_dict(checkpoint["optimizer"]) 58 | return checkpoint["step"], checkpoint["loss"] 59 | -------------------------------------------------------------------------------- /libs/wavlm/WavLM-Large.pt.txt: -------------------------------------------------------------------------------- 1 | https://github.com/microsoft/unilm/tree/master/wavlm -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/WavLM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/WavLM.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/WavLM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/WavLM.cpython-38.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
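Usage sketch (added for illustration, not a file in this repository): the factory functions in libs/hubert/model.py above can be used to pull per-frame HuBERT-Soft units from a 16 kHz mono waveform, which is the sample rate the configs in this repository assume. The audio path below is a placeholder, and torchaudio is used here only for loading and resampling.

import torch
import torchaudio

from libs.hubert.model import hubert_soft

hubert = hubert_soft(pretrained=True)                  # downloads the "hubert-soft" checkpoint listed in URLS
wav, sr = torchaudio.load("example.wav")               # placeholder path; expects a mono file
wav = torchaudio.functional.resample(wav, sr, 16000)   # the repository's models operate on 16 kHz audio
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))             # input shaped (batch, 1, samples)
print(units.shape)                                     # per-frame 256-dimensional soft units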
/libs/wavlm/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /minimal_quickvc/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_weights(m, mean=0.0, std=0.01): 5 | classname = m.__class__.__name__ 6 | if classname.find("Conv") != -1: 7 | m.weight.data.normal_(mean, std) 8 | 9 | 10 | def get_padding(kernel_size, dilation=1): 11 | return int((kernel_size*dilation - dilation)/2) 12 | 13 | 14 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 15 | b, d, t = x.size() 16 | if x_lengths is None: 17 | x_lengths = t 18 | ids_str_max = x_lengths - segment_size + 1 19 | ids_str = (torch.rand([b]).to(device=x.device) 20 | * ids_str_max).to(dtype=torch.long) 21 | ret = slice_segments(x, ids_str, segment_size) 22 | return ret, ids_str 23 | 24 | 25 | @torch.jit.script 26 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 27 | n_channels_int = n_channels[0] 28 | in_act = input_a + input_b 29 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 30 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 31 | acts = t_act * s_act 32 | return acts 33 | 34 | 35 | def sequence_mask(length, max_length=None): 36 | if max_length is None: 37 | max_length = length.max() 38 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 39 | return x.unsqueeze(0) < length.unsqueeze(1) 40 | 41 | 42 | -------------------------------------------------------------------------------- /minimal_quickvc/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from torch.nn import Conv1d 6 | from torch.nn.utils import weight_norm, remove_weight_norm 7 | 8 | from . import commons 9 | 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class LayerNorm(nn.Module): 15 | def __init__(self, channels, eps=1e-5): 16 | super().__init__() 17 | self.channels = channels 18 | self.eps = eps 19 | 20 | self.gamma = nn.Parameter(torch.ones(channels)) 21 | self.beta = nn.Parameter(torch.zeros(channels)) 22 | 23 | def forward(self, x): 24 | x = x.transpose(1, -1) 25 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 26 | return x.transpose(1, -1) 27 | 28 | 29 | class ConvReluNorm(nn.Module): 30 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 31 | super().__init__() 32 | self.in_channels = in_channels 33 | self.hidden_channels = hidden_channels 34 | self.out_channels = out_channels 35 | self.kernel_size = kernel_size 36 | self.n_layers = n_layers 37 | self.p_dropout = p_dropout 38 | assert n_layers > 1, "Number of layers should be larger than 0." 
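        # The layers assembled below apply Conv1d -> LayerNorm -> ReLU -> Dropout n_layers times,
        # then add a zero-initialized 1x1 projection of the result back onto the input as a residual.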
39 | 40 | self.conv_layers = nn.ModuleList() 41 | self.norm_layers = nn.ModuleList() 42 | self.conv_layers.append( 43 | nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 44 | self.norm_layers.append(LayerNorm(hidden_channels)) 45 | self.relu_drop = nn.Sequential( 46 | nn.ReLU(), 47 | nn.Dropout(p_dropout)) 48 | for _ in range(n_layers-1): 49 | self.conv_layers.append(nn.Conv1d( 50 | hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 51 | self.norm_layers.append(LayerNorm(hidden_channels)) 52 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 53 | self.proj.weight.data.zero_() 54 | self.proj.bias.data.zero_() 55 | 56 | def forward(self, x, x_mask): 57 | x_org = x 58 | for i in range(self.n_layers): 59 | x = self.conv_layers[i](x * x_mask) 60 | x = self.norm_layers[i](x) 61 | x = self.relu_drop(x) 62 | x = x_org + self.proj(x) 63 | return x * x_mask 64 | 65 | 66 | class DDSConv(nn.Module): 67 | """ 68 | Dialted and Depth-Separable Convolution 69 | """ 70 | 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert (kernel_size % 2 == 1) 112 | self.hidden_channels = hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d( 125 | gin_channels, 2*hidden_channels*n_layers, 1) 126 | self.cond_layer = torch.nn.utils.weight_norm( 127 | cond_layer, name='weight') 128 | 129 | for i in range(n_layers): 130 | dilation = dilation_rate ** i 131 | padding = int((kernel_size * dilation - dilation) / 2) 132 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 133 | dilation=dilation, padding=padding) 134 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 135 | self.in_layers.append(in_layer) 136 | 137 | # last one is not necessary 138 | if i < n_layers - 1: 139 | res_skip_channels = 2 * hidden_channels 140 | else: 141 | res_skip_channels = hidden_channels 142 | 143 | 
res_skip_layer = torch.nn.Conv1d( 144 | hidden_channels, res_skip_channels, 1) 145 | res_skip_layer = torch.nn.utils.weight_norm( 146 | res_skip_layer, name='weight') 147 | self.res_skip_layers.append(res_skip_layer) 148 | 149 | def forward(self, x, x_mask, g=None, **kwargs): 150 | output = torch.zeros_like(x) 151 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 152 | 153 | if g is not None: 154 | g = self.cond_layer(g) 155 | 156 | for i in range(self.n_layers): 157 | x_in = self.in_layers[i](x) 158 | if g is not None: 159 | cond_offset = i * 2 * self.hidden_channels 160 | g_l = g[:, cond_offset:cond_offset+2*self.hidden_channels, :] 161 | else: 162 | g_l = torch.zeros_like(x_in) 163 | 164 | acts = commons.fused_add_tanh_sigmoid_multiply( 165 | x_in, 166 | g_l, 167 | n_channels_tensor) 168 | acts = self.drop(acts) 169 | 170 | res_skip_acts = self.res_skip_layers[i](acts) 171 | if i < self.n_layers - 1: 172 | res_acts = res_skip_acts[:, :self.hidden_channels, :] 173 | x = (x + res_acts) * x_mask 174 | output = output + res_skip_acts[:, self.hidden_channels:, :] 175 | else: 176 | output = output + res_skip_acts 177 | return output * x_mask 178 | 179 | def remove_weight_norm(self): 180 | if self.gin_channels != 0: 181 | torch.nn.utils.remove_weight_norm(self.cond_layer) 182 | for l in self.in_layers: 183 | torch.nn.utils.remove_weight_norm(l) 184 | for l in self.res_skip_layers: 185 | torch.nn.utils.remove_weight_norm(l) 186 | 187 | 188 | class ResBlock1(torch.nn.Module): 189 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 190 | super(ResBlock1, self).__init__() 191 | self.convs1 = nn.ModuleList([ 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 193 | padding=commons.get_padding(kernel_size, dilation[0]))), 194 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 195 | padding=commons.get_padding(kernel_size, dilation[1]))), 196 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 197 | padding=commons.get_padding(kernel_size, dilation[2]))) 198 | ]) 199 | self.convs1.apply(commons.init_weights) 200 | 201 | self.convs2 = nn.ModuleList([ 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=commons.get_padding(kernel_size, 1))), 204 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 205 | padding=commons.get_padding(kernel_size, 1))), 206 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 207 | padding=commons.get_padding(kernel_size, 1))) 208 | ]) 209 | self.convs2.apply(commons.init_weights) 210 | 211 | def forward(self, x, x_mask=None): 212 | for c1, c2 in zip(self.convs1, self.convs2): 213 | xt = F.leaky_relu(x, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c1(xt) 217 | xt = F.leaky_relu(xt, LRELU_SLOPE) 218 | # print(xt.size()) 219 | if x_mask is not None: 220 | xt = xt * x_mask 221 | xt = c2(xt) 222 | # print(xt.size()) 223 | x = xt + x 224 | if x_mask is not None: 225 | x = x * x_mask 226 | return x 227 | 228 | def remove_weight_norm(self): 229 | for l in self.convs1: 230 | remove_weight_norm(l) 231 | for l in self.convs2: 232 | remove_weight_norm(l) 233 | 234 | 235 | class ResBlock2(torch.nn.Module): 236 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 237 | super(ResBlock2, self).__init__() 238 | self.convs = nn.ModuleList([ 239 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 240 | padding=commons.get_padding(kernel_size, 
dilation[0]))), 241 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 242 | padding=commons.get_padding(kernel_size, dilation[1]))) 243 | ]) 244 | self.convs.apply(commons.init_weights) 245 | 246 | def forward(self, x, x_mask=None): 247 | for c in self.convs: 248 | xt = F.leaky_relu(x, LRELU_SLOPE) 249 | if x_mask is not None: 250 | xt = xt * x_mask 251 | xt = c(xt) 252 | x = xt + x 253 | if x_mask is not None: 254 | x = x * x_mask 255 | return x 256 | 257 | def remove_weight_norm(self): 258 | for l in self.convs: 259 | remove_weight_norm(l) 260 | 261 | 262 | class Log(nn.Module): 263 | def forward(self, x, x_mask, reverse=False, **kwargs): 264 | if not reverse: 265 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 266 | logdet = torch.sum(-y, [1, 2]) 267 | return y, logdet 268 | else: 269 | x = torch.exp(x) * x_mask 270 | return x 271 | 272 | 273 | class Flip(nn.Module): 274 | def forward(self, x, *args, reverse=False, **kwargs): 275 | x = torch.flip(x, [1]) 276 | if not reverse: 277 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 278 | return x, logdet 279 | else: 280 | return x 281 | 282 | 283 | class ResidualCouplingLayer(nn.Module): 284 | def __init__(self, 285 | channels, 286 | hidden_channels, 287 | kernel_size, 288 | dilation_rate, 289 | n_layers, 290 | p_dropout=0, 291 | gin_channels=0, 292 | mean_only=False): 293 | assert channels % 2 == 0, "channels should be divisible by 2" 294 | super().__init__() 295 | self.channels = channels 296 | self.hidden_channels = hidden_channels 297 | self.kernel_size = kernel_size 298 | self.dilation_rate = dilation_rate 299 | self.n_layers = n_layers 300 | self.half_channels = channels // 2 301 | self.mean_only = mean_only 302 | 303 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 304 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, 305 | n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 306 | self.post = nn.Conv1d( 307 | hidden_channels, self.half_channels * (2 - mean_only), 1) 308 | self.post.weight.data.zero_() 309 | self.post.bias.data.zero_() 310 | 311 | def forward(self, x, x_mask, g=None, reverse=False): 312 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 313 | h = self.pre(x0) * x_mask 314 | h = self.enc(h, x_mask, g=g) 315 | stats = self.post(h) * x_mask 316 | if not self.mean_only: 317 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 318 | else: 319 | m = stats 320 | logs = torch.zeros_like(m) 321 | 322 | if not reverse: 323 | x1 = m + x1 * torch.exp(logs) * x_mask 324 | x = torch.cat([x0, x1], 1) 325 | logdet = torch.sum(logs, [1, 2]) 326 | return x, logdet 327 | else: 328 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 329 | x = torch.cat([x0, x1], 1) 330 | return x 331 | -------------------------------------------------------------------------------- /minimal_quickvc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import logging as logger 4 | 5 | 6 | def load_checkpoint(checkpoint_path, model, optimizer=None): 7 | assert os.path.isfile(checkpoint_path) 8 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 9 | iteration = checkpoint_dict['iteration'] 10 | learning_rate = checkpoint_dict['learning_rate'] 11 | if optimizer is not None: 12 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 13 | saved_state_dict = checkpoint_dict['model'] 14 | if hasattr(model, 'module'): 15 | state_dict = model.module.state_dict() 16 | else: 17 | state_dict = 
model.state_dict() 18 | new_state_dict = {} 19 | for k, v in state_dict.items(): 20 | try: 21 | new_state_dict[k] = saved_state_dict[k] 22 | except: 23 | logger.info("%s is not in the checkpoint" % k) 24 | new_state_dict[k] = v 25 | if hasattr(model, 'module'): 26 | model.module.load_state_dict(new_state_dict) 27 | else: 28 | model.load_state_dict(new_state_dict) 29 | logger.info("Loaded checkpoint '{}' (iteration {})" .format( 30 | checkpoint_path, iteration)) 31 | return model, optimizer, learning_rate, iteration 32 | -------------------------------------------------------------------------------- /minimal_wesper/config/LJ_hubert_layer12/stats.json: -------------------------------------------------------------------------------- 1 | {"pitch": [-3.017691628597761, 14.210434825858718, 206.99761689758864, 49.12105044064982], "energy": [-1.1875702142715454, 16.95836639404297, 20.390984369035806, 17.155741255242276]} -------------------------------------------------------------------------------- /minimal_wesper/config/my_model16000.yaml: -------------------------------------------------------------------------------- 1 | transformer: 2 | encoder_layer: 4 3 | encoder_head: 2 4 | encoder_hidden: 256 5 | decoder_layer: 6 6 | decoder_head: 2 7 | decoder_hidden: 256 8 | conv_filter_size: 1024 9 | conv_kernel_size: [9, 1] 10 | encoder_dropout: 0.2 11 | decoder_dropout: 0.2 12 | 13 | variance_predictor: 14 | filter_size: 256 15 | kernel_size: 3 16 | dropout: 0.5 17 | 18 | variance_embedding: 19 | pitch_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing 20 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 21 | n_bins: 256 22 | 23 | # gst: 24 | # use_gst: False 25 | # conv_filters: [32, 32, 64, 64, 128, 128] 26 | # gru_hidden: 128 27 | # token_size: 128 28 | # n_style_token: 10 29 | # attn_head: 4 30 | 31 | multi_speaker: False 32 | 33 | max_seq_len: 1000 34 | 35 | soft_unit: True # rkmt 2022.7.3 for HuBERT soft unit 36 | #soft_unit_dim: 768 # rkmt 2022.7.9 for HuBERT 37 | 38 | vocoder: 39 | model: "HiFi-GAN16k" # support 'HiFi-GAN', 'MelGAN' Hifi-GAN16k 40 | speaker: "universal" # was LJSpeech : support 'LJSpeech', 'universal' 41 | -------------------------------------------------------------------------------- /minimal_wesper/config/my_preprocess16k_LJ.yaml: -------------------------------------------------------------------------------- 1 | dataset: "rkmt" 2 | 3 | path: 4 | raw_path: "/home/rekimoto/Dropbox/my/data/LJSpeech1.1/wavs" 5 | preprocessed_path: "./minimal_wesper/config/LJ_hubert_layer12" 6 | hubert_checkpoint_path: "./HuBERT/model-layer12-450000.pt" 7 | 8 | preprocessing: 9 | val_size: 512 # validation size 10 | text: 11 | text_cleaners: ["unit"] # was ["english_cleaners"] ["tkn"] 12 | # layer: 7 # layer number. 
0 if use soft unit 13 | language: "en" 14 | audio: 15 | sampling_rate: 16000 # was 22050 16 | max_wav_value: 32768.0 17 | stft: 18 | filter_length: 1024 19 | hop_length: 320 # was 256 20 | win_length: 1024 21 | mel: 22 | n_mel_channels: 80 23 | mel_fmin: 0 24 | mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder 25 | pitch: 26 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 27 | normalization: True 28 | energy: 29 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 30 | normalization: True 31 | -------------------------------------------------------------------------------- /minimal_wesper/whisper_normal.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import yaml 4 | 5 | from torch import nn 6 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 7 | 8 | # FastSpeech2 9 | from libs.FastSpeech2 import FastSpeech2 10 | from libs.FastSpeech2.utils.tools import pad_2D 11 | # HuBERT 12 | from libs.hubert.model import HubertSoft 13 | # Hifigan 14 | from libs import hifigan 15 | from libs.hifigan.model import vocoder_infer 16 | 17 | def load_fastspeech2(device='cuda'): 18 | checkpoint_path = 'https://github.com/rkmt/wesper-demo/releases/download/v0.1/googletts_neutral_best.tar' 19 | preprocess_config = './minimal_wesper/config/my_preprocess16k_LJ.yaml' 20 | model_config = './minimal_wesper/config/my_model16000.yaml' 21 | preprocess_config = yaml.load(open(preprocess_config, "r"), Loader=yaml.FullLoader) 22 | model_config = yaml.load(open(model_config, "r"), Loader=yaml.FullLoader) 23 | 24 | model = FastSpeech2(preprocess_config, model_config).to(device) 25 | if checkpoint_path.startswith("http"): 26 | ckpt = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 27 | else: 28 | ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 29 | model.load_state_dict(ckpt["model"], strict=True) 30 | 31 | model = model.to(device) 32 | model.eval() 33 | model.requires_grad_ = False 34 | return model 35 | 36 | 37 | def load_hubert(device='cuda'): 38 | checkpoint_path = "https://github.com/rkmt/wesper-demo/releases/download/v0.1/model-layer12-450000.pt" 39 | if checkpoint_path.startswith("http"): 40 | checkpoint = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 41 | else: 42 | checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 43 | hubert = HubertSoft().to(device) 44 | 45 | checkpoint = checkpoint['hubert'] if checkpoint['hubert'] is not None else checkpoint 46 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 47 | 48 | hubert.load_state_dict(checkpoint, strict=True) 49 | hubert.eval().to(device) 50 | return hubert 51 | 52 | 53 | def load_hifigan(device='cuda'): 54 | checkpoint_path='https://github.com/rkmt/wesper-demo/releases/download/v0.1/g_00205000' 55 | with open("./libs/hifigan/my_config_v1_16000.json", "r") as f: 56 | config = json.load(f) 57 | config = hifigan.AttrDict(config) 58 | vocoder = hifigan.Generator(config) 59 | if checkpoint_path.startswith("http"): 60 | ckpt = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if 
device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 61 | else: 62 | ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 63 | 64 | vocoder.load_state_dict(ckpt['generator']) 65 | vocoder.eval() 66 | vocoder.remove_weight_norm() 67 | vocoder.to(device) 68 | 69 | return vocoder 70 | 71 | 72 | class SynthesizerTrn(nn.Module): 73 | def __init__(self): 74 | super().__init__() 75 | self.device = 'cuda' 76 | self.fs2model = load_fastspeech2(device=self.device) 77 | self.vocoder = load_hifigan(device=self.device) 78 | 79 | def infer(self, c): 80 | c = c.squeeze(0).detach().cpu().numpy() 81 | c = pad_2D([c]) 82 | c = torch.from_numpy(c).to(self.device) 83 | speakers = torch.tensor([0], device=self.device) 84 | max_src_len = c.shape[1] 85 | src_lens = torch.tensor([max_src_len], device=self.device) 86 | 87 | with torch.no_grad(): 88 | output = self.fs2model(speakers, c, src_lens, max_src_len) 89 | mel_len = output[9][0].item() 90 | mel_prediction = output[1][0, :mel_len].detach().transpose(0, 1) 91 | 92 | with torch.no_grad(): 93 | o = vocoder_infer(mel_prediction.unsqueeze(0),self.vocoder,)[0] 94 | return o 95 | 96 | 97 | class MyWhisper2Normal(object): 98 | def __init__(self, args): 99 | self.device = args.device 100 | 101 | self.encoder = load_hubert(device=self.device) 102 | self.syn = SynthesizerTrn() 103 | 104 | def convert(self, wav_from): 105 | wav_t = torch.from_numpy(wav_from).unsqueeze(0).unsqueeze(0).to(self.device) 106 | with torch.no_grad(): 107 | units = self.encoder.units(wav_t) 108 | wav_prediction = self.syn.infer(units) 109 | wav_prediction = (wav_prediction.cpu().numpy() * 32768.0).astype("int16") 110 | return wav_prediction 111 | 112 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .loss import MultiScaleMelSpectrogramLoss, t_axis_distill_loss 2 | from .discriminators import WaveDiscriminator, ReconstructionLoss, STFTDiscriminator 3 | from .s2u import call_feature_by_name, DVAEDecoder 4 | from .u2s import Reencoder, Decoder -------------------------------------------------------------------------------- /models/discriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List, Tuple 4 | 5 | class ReconstructionLoss(nn.Module): 6 | """Reconstruction loss from https://arxiv.org/pdf/2107.03312.pdf 7 | but uses STFT instead of mel-spectrogram 8 | """ 9 | def __init__(self, eps=1e-5): 10 | super().__init__() 11 | self.eps = eps 12 | 13 | def forward(self, input, target): 14 | loss = 0 15 | input = input.to(torch.float32) 16 | target = target.to(torch.float32) 17 | for i in range(6, 12): 18 | s = 2 ** i 19 | alpha = (s / 2) ** 0.5 20 | # We use STFT instead of 64-bin mel-spectrogram as n_fft=64 is too small 21 | # for 64 bins. 
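        # Each scale s = 2**i (i = 6..11) compares STFT magnitudes of input and target,
        # trimmed to a common frame count, with an L1 term plus an alpha = sqrt(s / 2)
        # weighted squared log-magnitude term; the total is averaged over the six scales.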
22 | x = torch.stft(input, n_fft=s, hop_length=s // 4, win_length=s, normalized=True, onesided=True, return_complex=True) 23 | x = torch.abs(x) 24 | y = torch.stft(target, n_fft=s, hop_length=s // 4, win_length=s, normalized=True, onesided=True, return_complex=True) 25 | y = torch.abs(y) 26 | if x.shape[-1] > y.shape[-1]: 27 | x = x[:, :, :y.shape[-1]] 28 | elif x.shape[-1] < y.shape[-1]: 29 | y = y[:, :, :x.shape[-1]] 30 | loss += torch.mean(torch.abs(x - y)) 31 | loss += alpha * torch.mean(torch.square(torch.log(x + self.eps) - torch.log(y + self.eps))) 32 | return loss / (12 - 6) 33 | 34 | 35 | class ResNet2d(nn.Module): 36 | def __init__( 37 | self, 38 | n_channels: int, 39 | factor: int, 40 | stride: Tuple[int, int] 41 | ) -> None: 42 | # https://arxiv.org/pdf/2005.00341.pdf 43 | # The original paper uses layer normalization, but here 44 | # we use batch normalization. 45 | super().__init__() 46 | self.conv0 = nn.Conv2d( 47 | n_channels, 48 | n_channels, 49 | kernel_size=(3, 3), 50 | padding='same') 51 | self.bn0 = nn.BatchNorm2d( 52 | n_channels 53 | ) 54 | self.conv1 = nn.Conv2d( 55 | n_channels, 56 | factor * n_channels, 57 | kernel_size=(stride[0] + 2, stride[1] + 2), 58 | stride=stride) 59 | self.bn1 = nn.BatchNorm2d( 60 | factor * n_channels 61 | ) 62 | self.conv2 = nn.Conv2d( 63 | n_channels, 64 | factor * n_channels, 65 | kernel_size=1, 66 | stride=stride) 67 | self.bn2 = nn.BatchNorm2d( 68 | factor * n_channels 69 | ) 70 | self.pad = nn.ReflectionPad2d([ 71 | (stride[1] + 1) // 2, 72 | (stride[1] + 2) // 2, 73 | (stride[0] + 1) // 2, 74 | (stride[0] + 2) // 2, 75 | ]) 76 | self.activation = nn.LeakyReLU(0.3) 77 | 78 | def forward(self, input): 79 | x = self.conv0(input) 80 | x = self.bn0(x) 81 | x = self.activation(x) 82 | x = self.pad(x) 83 | x = self.conv1(x) 84 | x = self.bn1(x) 85 | 86 | # shortcut 87 | y = self.conv2(input) 88 | y = self.bn2(y) 89 | 90 | x += y 91 | x = self.activation(x) 92 | return x 93 | 94 | 95 | class WaveDiscriminator(nn.Module): 96 | r"""MelGAN discriminator from https://arxiv.org/pdf/1910.06711.pdf 97 | """ 98 | def __init__(self, resolution: int = 1, n_channels: int = 4) -> None: 99 | super().__init__() 100 | assert resolution >= 1 101 | if resolution == 1: 102 | self.avg_pool = nn.Identity() 103 | else: 104 | self.avg_pool = nn.AvgPool1d(resolution * 2, stride=resolution) 105 | self.activation = nn.LeakyReLU(0.2, inplace=True) 106 | self.layers = nn.ModuleList([ 107 | nn.utils.weight_norm(nn.Conv1d(1, n_channels, kernel_size=15, padding=7)), 108 | nn.utils.weight_norm(nn.Conv1d(n_channels, 4 * n_channels, kernel_size=41, stride=4, padding=20, groups=4)), 109 | nn.utils.weight_norm(nn.Conv1d(4 * n_channels, 16 * n_channels, kernel_size=41, stride=4, padding=20, groups=16)), 110 | nn.utils.weight_norm(nn.Conv1d(16 * n_channels, 64 * n_channels, kernel_size=41, stride=4, padding=20, groups=64)), 111 | nn.utils.weight_norm(nn.Conv1d(64 * n_channels, 256 * n_channels, kernel_size=41, stride=4, padding=20, groups=256)), 112 | nn.utils.weight_norm(nn.Conv1d(256 * n_channels, 256 * n_channels, kernel_size=5, padding=2)), 113 | nn.utils.weight_norm(nn.Conv1d(256 * n_channels, 1, kernel_size=3, padding=1)), 114 | ]) 115 | 116 | def forward(self, x: torch.Tensor) -> List[torch.Tensor]: 117 | x = self.avg_pool(x) 118 | feats = [] 119 | for layer in self.layers[:-1]: 120 | x = layer(x) 121 | feats.append(x) 122 | x = self.activation(x) 123 | feats.append(self.layers[-1](x)) 124 | return feats 125 | 126 | 127 | class STFTDiscriminator(nn.Module): 128 | 
r"""STFT-based discriminator from https://arxiv.org/pdf/2107.03312.pdf 129 | """ 130 | def __init__( 131 | self, n_fft: int = 1024, hop_length: int = 256, 132 | n_channels: int = 32 133 | ) -> None: 134 | super().__init__() 135 | self.n_fft = n_fft 136 | self.hop_length = hop_length 137 | n = n_fft // 2 + 1 138 | for _ in range(6): 139 | n = (n - 1) // 2 + 1 140 | self.layers = nn.Sequential( 141 | nn.Conv2d(1, n_channels, kernel_size=7, padding='same'), 142 | nn.LeakyReLU(0.3, inplace=True), 143 | ResNet2d(n_channels, 2, stride=(2, 1)), 144 | ResNet2d(2 * n_channels, 2, stride=(2, 2)), 145 | ResNet2d(4 * n_channels, 1, stride=(2, 1)), 146 | ResNet2d(4 * n_channels, 2, stride=(2, 2)), 147 | ResNet2d(8 * n_channels, 1, stride=(2, 1)), 148 | ResNet2d(8 * n_channels, 2, stride=(2, 2)), 149 | nn.Conv2d(16 * n_channels, 1, kernel_size=(n, 1)) 150 | ) 151 | 152 | def forward(self, input: torch.Tensor) -> torch.Tensor: 153 | assert input.shape[1] == 1 154 | # input: [batch, channel, sequence] 155 | x = torch.squeeze(input, 1).to(torch.float32) # torch.stft() doesn't accept float16 156 | x = torch.stft(x, self.n_fft, self.hop_length, normalized=True, onesided=True, return_complex=True) 157 | x = torch.abs(x) 158 | x = torch.unsqueeze(x, dim=1) 159 | x = self.layers(x) 160 | return x -------------------------------------------------------------------------------- /models/loss.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import torch 3 | import torch.nn as nn 4 | import typing 5 | from typing import List 6 | from collections import namedtuple 7 | from scipy import signal 8 | from librosa.filters import mel as librosa_mel_fn 9 | import math 10 | 11 | # Adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py under the MIT license. 12 | # LICENSE is in incl_licenses directory. 
13 | class MultiScaleMelSpectrogramLoss(nn.Module): 14 | 15 | def __init__( 16 | self, 17 | sampling_rate: int, 18 | n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320], 19 | window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048], 20 | loss_fn: typing.Callable = nn.L1Loss(), 21 | clamp_eps: float = 1e-5, 22 | mag_weight: float = 0.0, 23 | log_weight: float = 1.0, 24 | pow: float = 1.0, 25 | weight: float = 1.0, 26 | match_stride: bool = False, 27 | mel_fmin: List[float] = [0, 0, 0, 0, 0, 0, 0], 28 | mel_fmax: List[float] = [None, None, None, None, None, None, None], 29 | window_type: str = "hann", 30 | ): 31 | super().__init__() 32 | self.sampling_rate = sampling_rate 33 | 34 | STFTParams = namedtuple( 35 | "STFTParams", 36 | ["window_length", "hop_length", "window_type", "match_stride"], 37 | ) 38 | 39 | self.stft_params = [ 40 | STFTParams( 41 | window_length=w, 42 | hop_length=w // 4, 43 | match_stride=match_stride, 44 | window_type=window_type, 45 | ) 46 | for w in window_lengths 47 | ] 48 | self.n_mels = n_mels 49 | self.loss_fn = loss_fn 50 | self.clamp_eps = clamp_eps 51 | self.log_weight = log_weight 52 | self.mag_weight = mag_weight 53 | self.weight = weight 54 | self.mel_fmin = mel_fmin 55 | self.mel_fmax = mel_fmax 56 | self.pow = pow 57 | 58 | @staticmethod 59 | @functools.lru_cache(None) 60 | def get_window( 61 | window_type, 62 | window_length, 63 | ): 64 | return signal.get_window(window_type, window_length) 65 | 66 | @staticmethod 67 | @functools.lru_cache(None) 68 | def get_mel_filters(sr, n_fft, n_mels, fmin, fmax): 69 | return librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 70 | 71 | def mel_spectrogram( 72 | self, 73 | wav, 74 | n_mels, 75 | fmin, 76 | fmax, 77 | window_length, 78 | hop_length, 79 | match_stride, 80 | window_type, 81 | ): 82 | 83 | B, C, T = wav.shape 84 | 85 | if match_stride: 86 | assert ( 87 | hop_length == window_length // 4 88 | ), "For match_stride, hop must equal n_fft // 4" 89 | right_pad = math.ceil(T / hop_length) * hop_length - T 90 | pad = (window_length - hop_length) // 2 91 | else: 92 | right_pad = 0 93 | pad = 0 94 | 95 | wav = torch.nn.functional.pad(wav, (pad, pad + right_pad), mode="reflect") 96 | 97 | window = self.get_window(window_type, window_length) 98 | window = torch.from_numpy(window).to(wav.device).float() 99 | 100 | stft = torch.stft( 101 | wav.reshape(-1, T), 102 | n_fft=window_length, 103 | hop_length=hop_length, 104 | window=window, 105 | return_complex=True, 106 | center=True, 107 | ) 108 | _, nf, nt = stft.shape 109 | stft = stft.reshape(B, C, nf, nt) 110 | if match_stride: 111 | stft = stft[..., 2:-2] 112 | magnitude = torch.abs(stft) 113 | 114 | nf = magnitude.shape[2] 115 | mel_basis = self.get_mel_filters( 116 | self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax 117 | ) 118 | mel_basis = torch.from_numpy(mel_basis).to(wav.device) 119 | mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T 120 | mel_spectrogram = mel_spectrogram.transpose(-1, 2) 121 | 122 | return mel_spectrogram 123 | 124 | def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: 125 | 126 | loss = 0.0 127 | for n_mels, fmin, fmax, s in zip( 128 | self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params 129 | ): 130 | kwargs = { 131 | "n_mels": n_mels, 132 | "fmin": fmin, 133 | "fmax": fmax, 134 | "window_length": s.window_length, 135 | "hop_length": s.hop_length, 136 | "match_stride": s.match_stride, 137 | "window_type": s.window_type, 138 | } 139 | 140 | x_mels = self.mel_spectrogram(x, **kwargs) 
141 | y_mels = self.mel_spectrogram(y, **kwargs) 142 | x_logmels = torch.log( 143 | x_mels.clamp(min=self.clamp_eps).pow(self.pow) 144 | ) / torch.log(torch.tensor(10.0)) 145 | y_logmels = torch.log( 146 | y_mels.clamp(min=self.clamp_eps).pow(self.pow) 147 | ) / torch.log(torch.tensor(10.0)) 148 | 149 | loss += self.log_weight * self.loss_fn(x_logmels, y_logmels) 150 | loss += self.mag_weight * self.loss_fn(x_logmels, y_logmels) 151 | 152 | return loss 153 | 154 | # t_axis_distill_loss copied from https://github.com/ZhangXInFD/SpeechTokenizer 155 | class t_axis_distill_loss(nn.Module): 156 | def __init__(self, **params): 157 | super().__init__() 158 | 159 | def forward(self, feature, target_feature, lambda_sim=1): 160 | n = min(feature.size(1), target_feature.size(1)) 161 | l1_loss = torch.nn.functional.mse_loss(feature[:, :n], target_feature[:, :n], reduction='mean') 162 | sim_loss = - torch.log(torch.sigmoid(torch.nn.functional.cosine_similarity(feature[:, :n], target_feature[:, :n], axis=-1))).mean() 163 | distill_loss = l1_loss + lambda_sim * sim_loss 164 | return distill_loss -------------------------------------------------------------------------------- /models/s2u.py: -------------------------------------------------------------------------------- 1 | # Torch and related libraries 2 | import torch 3 | import torch.nn as nn 4 | from nnAudio import features 5 | from utils.config import Config 6 | 7 | def call_feature_by_name(name, *args, **kwargs): 8 | func = globals().get(name) 9 | if func and callable(func): 10 | return func(*args, **kwargs) 11 | else: 12 | print("Function not found or not callable.") 13 | 14 | # Learnable MFCCs Extractor 15 | class mfcc(nn.Module): 16 | def __init__(self, trainable=False, **params): 17 | super().__init__() 18 | config = Config({}) 19 | self.spec = features.MFCC( 20 | sr=config.sample_rate, 21 | n_fft=config.n_fft, 22 | win_length=config.win_length, 23 | hop_length=config.hop_length, 24 | n_mfcc=config.n_mels, 25 | trainable_mel=trainable, 26 | trainable_STFT=trainable, 27 | ) 28 | # self.conv = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=1) 29 | self.linear = nn.Linear(config.n_mels, config.n_embed_dim) 30 | 31 | def forward(self, input): 32 | x = self.spec(input) 33 | # y = torch.repeat_interleave(x, 2, dim=1) 34 | # y = self.conv(x) 35 | x = x.permute(0, 2, 1) 36 | y = self.linear(x) 37 | y = y.permute(0, 2, 1) 38 | return y 39 | 40 | class melspec(nn.Module): 41 | def __init__(self, **params): 42 | super().__init__() 43 | # self.spec = features.MelSpectrogram( 44 | # sr=16000, 45 | # n_fft=1024, 46 | # win_length=1024, 47 | # hop_length=320, 48 | # n_mels=256, 49 | # fmin=0.0, 50 | # fmax=None, 51 | # trainable_mel=True, 52 | # trainable_STFT=True 53 | # ) 54 | self.spec = features.gammatone.Gammatonegram( 55 | sr=16000, 56 | n_fft=1024, 57 | hop_length=320, 58 | n_bins=256, 59 | fmin=0.0, 60 | fmax=None, 61 | trainable_bins=True, 62 | trainable_STFT=True 63 | ) 64 | 65 | def forward(self, input): 66 | # logmel = F.interpolate(logmel, scale_factor=2) 67 | x = self.spec(input) 68 | return x[..., :-1] 69 | 70 | class stftspec(nn.Module): 71 | def __init__(self, **params): 72 | super().__init__() 73 | self.spec = features.STFT( 74 | n_fft=1024, 75 | win_length=1024, 76 | freq_bins=256, 77 | hop_length=320, 78 | output_format="Magnitude", 79 | ) # trainable=True, 80 | 81 | def forward(self, input): 82 | return self.spec(input) 83 | 84 | 85 | # Encoder 86 | class ConvNeXtBlock(nn.Module): 87 | def __init__( 88 | self, 89 | dim: int, 90 
| intermediate_dim: int, 91 | kernel, dilation, 92 | layer_scale_init_value: float = 1e-6, 93 | ): 94 | # ConvNeXt Block copied from Vocos. 95 | super().__init__() 96 | self.dwconv = nn.Conv1d(dim, dim, 97 | kernel_size=kernel, padding=dilation*(kernel//2), 98 | dilation=dilation, groups=dim 99 | ) # depthwise conv 100 | 101 | self.norm = nn.LayerNorm(dim, eps=1e-6) 102 | self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers 103 | self.act = nn.GELU() 104 | self.pwconv2 = nn.Linear(intermediate_dim, dim) 105 | self.gamma = ( 106 | nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) 107 | if layer_scale_init_value > 0 108 | else None 109 | ) 110 | 111 | def forward(self, x: torch.Tensor, cond = None) -> torch.Tensor: 112 | residual = x 113 | x = self.dwconv(x) 114 | x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) 115 | x = self.norm(x) 116 | x = self.pwconv1(x) 117 | x = self.act(x) 118 | x = self.pwconv2(x) 119 | if self.gamma is not None: 120 | x = self.gamma * x 121 | x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) 122 | 123 | x = residual + x 124 | return x 125 | 126 | class DVAEDecoder(nn.Module): 127 | def __init__(self, idim, odim, 128 | n_layer = 12, bn_dim = 64, hidden = 256, 129 | kernel = 7, dilation = 2, up = False 130 | ): 131 | super().__init__() 132 | self.up = up 133 | self.conv_in = nn.Sequential( 134 | nn.Conv1d(idim, bn_dim, 3, 1, 1), nn.GELU(), 135 | nn.Conv1d(bn_dim, hidden, 3, 1, 1) 136 | ) 137 | self.decoder_block = nn.ModuleList([ 138 | ConvNeXtBlock(hidden, hidden* 4, kernel, dilation,) 139 | for _ in range(n_layer)]) 140 | self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False) 141 | # self.layernorm1 = nn.LayerNorm(256) 142 | # self.layernorm2 = nn.LayerNorm(256, bias=False) 143 | 144 | def forward(self, input, conditioning=None): 145 | # B, T, C 146 | # x = self.layernorm1(input) 147 | x = input.transpose(1, 2) 148 | x = self.conv_in(x) 149 | for f in self.decoder_block: 150 | x = f(x, conditioning) 151 | x = self.conv_out(x) 152 | x = x.transpose(1, 2) 153 | # x = self.layernorm2(x) 154 | return x 155 | -------------------------------------------------------------------------------- /models/u2s.py: -------------------------------------------------------------------------------- 1 | # Torch and related libraries 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | # Reencoder 7 | class FiLMLayer(nn.Module): 8 | """ 9 | Feature-wise Linear Modulation (FiLM) layer 10 | Reference: https://arxiv.org/abs/1709.07871 11 | """ 12 | def __init__(self, in_channels, out_channels, cond_channels): 13 | super(FiLMLayer, self).__init__() 14 | self.in_channels = in_channels 15 | self.film = nn.Conv1d(cond_channels, (in_channels + out_channels), 1) 16 | 17 | def forward(self, x, c): 18 | gamma, beta = torch.chunk(self.film(c.unsqueeze(2)), chunks=2, dim=1) 19 | return gamma * x + beta 20 | 21 | class StyleAdaptiveLayerNorm(nn.Module): 22 | def __init__(self, in_channels, cond_channels): 23 | """ 24 | Style Adaptive Layer Normalization (SALN) module. 25 | 26 | Parameters: 27 | in_channels: The number of channels in the input feature maps. 28 | cond_channels: The number of channels in the conditioning input. 
29 | """ 30 | super(StyleAdaptiveLayerNorm, self).__init__() 31 | self.in_channels = in_channels 32 | 33 | self.saln = nn.Linear(cond_channels, in_channels * 2, 1) 34 | self.norm = nn.LayerNorm(in_channels, elementwise_affine=False) 35 | 36 | self.reset_parameters() 37 | 38 | def reset_parameters(self): 39 | nn.init.constant_(self.saln.bias.data[:self.in_channels], 1) 40 | nn.init.constant_(self.saln.bias.data[self.in_channels:], 0) 41 | 42 | def forward(self, x, c): 43 | c = self.saln(c.unsqueeze(1)) 44 | gamma, beta = torch.chunk(c, chunks=2, dim=-1) 45 | return gamma * self.norm(x) + beta 46 | 47 | class ConvNeXtBlock_Adapt(nn.Module): 48 | def __init__(self, gin_channels, layer_scale_init_value: float = 1e-6,): 49 | super().__init__() 50 | self.dwconv = nn.Conv1d(256, 256, kernel_size=7, padding=3, groups=256) 51 | self.norm = StyleAdaptiveLayerNorm(256, gin_channels) 52 | self.pwconv_2 = nn.Sequential(nn.Linear(256, 256*4), 53 | nn.GELU(), 54 | nn.Linear(256*4, 256)) 55 | self.gamma = ( 56 | nn.Parameter(layer_scale_init_value * torch.ones(256), requires_grad=True) 57 | if layer_scale_init_value > 0 58 | else None 59 | ) 60 | 61 | def forward(self, x, c) -> torch.Tensor: 62 | residual = x # 24,256,102 63 | x = self.dwconv(x) # 24,512,102 64 | x = self.norm(x.transpose(1, 2), c) # 24,512,102 65 | x = self.pwconv_2(x) 66 | if self.gamma is not None: 67 | x = self.gamma * x 68 | x = x.transpose(1, 2) 69 | x = residual + x 70 | return x 71 | 72 | class Reencoder(torch.nn.Module): 73 | def __init__(self, n_layers: int, wavenet_embed_dim: int, 74 | decoder_causal: bool = False, nn_type='conv'): 75 | super(Reencoder, self).__init__() 76 | self.nn_type = nn_type 77 | if nn_type == 'film': 78 | self.film = FiLMLayer(in_channels=256, out_channels=256, cond_channels=192) 79 | elif nn_type == 'adapt': 80 | self.adapt = ConvNeXtBlock_Adapt(gin_channels=192) 81 | elif nn_type == 'norm': 82 | self.norm = StyleAdaptiveLayerNorm(256, 192) 83 | # self.conv_out = torch.nn.Conv1d(256, 512, 1) 84 | 85 | 86 | def forward(self, c_code, spk_emb): # c_code.shape [B, 256, 100] 87 | if self.nn_type == 'conv': 88 | spk_emb = self.spk_proj(spk_emb.unsqueeze(2)) # [B, 256] 89 | c_code = c_code + spk_emb 90 | # z = self.conv_out(c_code) 91 | elif self.nn_type == 'film': 92 | x = self.film(c_code, spk_emb) 93 | c_code = self.adapt(c_code, spk_emb) 94 | # z = self.conv_out(c_code) 95 | elif self.nn_type == 'adapt': 96 | c_code = self.adapt(c_code, spk_emb) 97 | # z = self.conv_out(c_code) 98 | elif self.nn_type == 'norm': 99 | x = self.norm(c_code.transpose(1, 2), spk_emb) 100 | c_code = x.transpose(1, 2) 101 | # z = self.conv_out(c_code) 102 | # elif self.nn_type == 'wo': 103 | # # z = self.conv_out(c_code) 104 | return c_code 105 | 106 | # Decoder copied from https://github.com/kaiidams/soundstream-pytorch 107 | class ResNet1d(nn.Module): 108 | def __init__( 109 | self, 110 | n_channels, 111 | kernel_size: int = 7, 112 | padding: str = 'valid', 113 | dilation: int = 1 114 | ) -> None: 115 | super().__init__() 116 | assert padding in ['valid', 'same'] 117 | self.kernel_size = kernel_size 118 | self.padding = padding 119 | self.dilation = dilation 120 | self._padding_size = (kernel_size // 2) * dilation 121 | self.conv0 = nn.Conv1d( 122 | n_channels, 123 | n_channels, 124 | kernel_size=kernel_size, 125 | padding=padding, 126 | dilation=dilation) 127 | self.conv1 = nn.Conv1d( 128 | n_channels, 129 | n_channels, 130 | kernel_size=1) 131 | 132 | def forward(self, input): 133 | y = input 134 | x = self.conv0(input) 135 | 
x = F.elu(x) 136 | x = self.conv1(x) 137 | if self.padding == 'valid': 138 | y = y[:, :, self._padding_size:-self._padding_size] 139 | x += y 140 | x = F.elu(x) 141 | return x 142 | 143 | class DecoderBlock(nn.Module): 144 | def __init__( 145 | self, 146 | n_channels: int, 147 | padding: str, 148 | stride: int 149 | ) -> None: 150 | super().__init__() 151 | assert padding in ['valid', 'same'] 152 | self.layers = nn.Sequential( 153 | nn.ConvTranspose1d( 154 | n_channels, n_channels // 2, 155 | kernel_size=2 * stride, 156 | padding=(2 * stride) // 2 if padding == 'same' else 0, 157 | stride=stride), 158 | nn.ELU(), 159 | ResNet1d(n_channels // 2, padding=padding, dilation=1), 160 | ResNet1d(n_channels // 2, padding=padding, dilation=3), 161 | ResNet1d(n_channels // 2, padding=padding, dilation=9), 162 | ) 163 | 164 | def forward(self, input: torch.Tensor) -> torch.Tensor: 165 | return self.layers(input) 166 | 167 | class Decoder(nn.Module): 168 | def __init__(self, n_channels: int, padding): 169 | super().__init__() 170 | assert padding in ['valid', 'same'] 171 | self.layers = nn.Sequential( 172 | nn.Conv1d(16 * n_channels, 16 * n_channels, kernel_size=7, padding=padding), 173 | nn.ELU(), 174 | DecoderBlock(16 * n_channels, padding=padding, stride=8), 175 | DecoderBlock(8 * n_channels, padding=padding, stride=5), 176 | DecoderBlock(4 * n_channels, padding=padding, stride=4), 177 | DecoderBlock(2 * n_channels, padding=padding, stride=2), 178 | nn.Conv1d(n_channels, 1, kernel_size=7, padding=padding), 179 | nn.Tanh(), 180 | ) 181 | 182 | def forward(self, input: torch.Tensor) -> torch.Tensor: 183 | return self.layers(input) -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/DisocoGAN_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/DisocoGAN_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/MSpeC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/MSpeC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/WES_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/WES_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_403_headset.wav 
-------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s003_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_403_headset.wav -------------------------------------------------------------------------------- 
/raw/data_in_the_wild/W2S_403_headset/s003_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s003_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/DisocoGAN_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/DisocoGAN_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/MSpeC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/MSpeC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/WES_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/WES_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s000_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s000_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s000_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s000_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s001_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s001_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s001_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s001_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s002_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s002_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s002_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s002_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s003_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s003_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s003_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s003_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/AGAN-W2SC_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/AGAN-W2SC_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/BLSTM_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/BLSTM_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/CycleGAN-VC_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/CycleGAN-VC_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/GMM_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/GMM_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/WES_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/WES_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/fw001.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s000_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s000_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s000_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s000_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s001_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s001_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s001_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s001_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s002_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s002_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s002_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s002_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s003_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s003_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s003_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s003_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/AGAN-W2SC_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/AGAN-W2SC_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/BLSTM_fn002.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/BLSTM_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/CycleGAN-VC_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/CycleGAN-VC_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/GMM_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/GMM_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/WES_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/WES_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s000_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s000_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s000_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s000_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s001_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s001_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s001_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s001_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s002_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s002_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s002_fw002.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s002_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s003_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s003_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s003_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s003_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/WES_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/WES_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s000_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s000_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s000_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s000_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s001_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s001_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s001_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s001_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s002_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s002_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s002_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s002_sample_whisper.wav 
-------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s003_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s003_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s003_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s003_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/sample_whisper.wav -------------------------------------------------------------------------------- /raw/freevc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s000u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s001u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s002u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s003u003w.wav -------------------------------------------------------------------------------- /raw/gt/s000u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s000u003n.wav -------------------------------------------------------------------------------- /raw/gt/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s000u003w.wav -------------------------------------------------------------------------------- /raw/gt/s001u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s001u003n.wav -------------------------------------------------------------------------------- /raw/gt/s001u003w.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s001u003w.wav -------------------------------------------------------------------------------- /raw/gt/s002u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s002u003n.wav -------------------------------------------------------------------------------- /raw/gt/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s002u003w.wav -------------------------------------------------------------------------------- /raw/gt/s003u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s003u003n.wav -------------------------------------------------------------------------------- /raw/gt/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s003u003w.wav -------------------------------------------------------------------------------- /raw/pseudo/s000u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s000u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s001u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s001u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s002u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s002u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s003u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s003u003n.wav -------------------------------------------------------------------------------- /raw/quickvc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s000u003w.wav -------------------------------------------------------------------------------- /raw/quickvc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s001u003w.wav -------------------------------------------------------------------------------- /raw/quickvc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s002u003w.wav 
-------------------------------------------------------------------------------- /raw/quickvc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s001u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s001u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s001u003w.wav 
-------------------------------------------------------------------------------- /raw/s2u_u2s/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s003u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s000u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s001u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s002u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s003u003w.wav -------------------------------------------------------------------------------- /raw/test/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s000u003w.wav -------------------------------------------------------------------------------- /raw/test/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s001u003w.wav -------------------------------------------------------------------------------- /raw/test/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s002u003w.wav -------------------------------------------------------------------------------- /raw/test/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s003u003w.wav -------------------------------------------------------------------------------- /raw/wesper/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s000u003w.wav -------------------------------------------------------------------------------- /raw/wesper/s001u003w.wav: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s001u003w.wav
--------------------------------------------------------------------------------
/raw/wesper/s002u003w.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s002u003w.wav
--------------------------------------------------------------------------------
/raw/wesper/s003u003w.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s003u003w.wav
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nnAudio
pesq
Cython
packaging
nemo_toolkit[asr]
--------------------------------------------------------------------------------
/resources/system_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/resources/system_diagram.png
--------------------------------------------------------------------------------
/utils/__init_.py:
--------------------------------------------------------------------------------
from .s2f0 import load_F0_models, wav2F0
from .s2fhubert import load_hubert, wav2units
from .audioprep import resample_if_needed, squeeze_and_normalize, pad_if_needed
--------------------------------------------------------------------------------
/utils/audioprep.py:
--------------------------------------------------------------------------------
import torch
import torchaudio
import torch.nn.functional as F

# Resampling if necessary
def resample_if_needed(signal, orig_sr, target_sr):
    if orig_sr != target_sr:
        return torchaudio.functional.resample(signal, orig_sr, target_sr)
    return signal

# Squeeze and normalize
def squeeze_and_normalize(signal):
    signal = torch.squeeze(signal)
    return signal * (0.95 / torch.max(signal))

# Pad if necessary
def pad_if_needed(signal, length):
    if signal.shape[0] < length:
        return F.pad(signal, [0, length - signal.shape[0]], "constant")
    return signal

def process_signal(signal, orig_sr, target_sr, target_len, segment_len):
    signal = resample_if_needed(signal, orig_sr, target_sr)
    signal = squeeze_and_normalize(signal)
    signal = signal[:target_len]
    signal = pad_if_needed(signal, segment_len)
    return signal
--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
import os
from omegaconf import OmegaConf

DEFAULT_DICT = {
    # Configuration ID
    'id': "null",
    # Training configuration
    'seed': 1234,
    'lr': 1e-6,
    'b1': 0.5,
    'b2': 0.9,
    'segment_length': 32270,
    # Model configuration
    'n_channels': 16,
    'n_embed_dim': 256,
    'n_reencoder_layer': 1,
    'n_encoder_layer': 12,
    'sample_rate': 16000,
    'n_mels': 128,
    'n_fft': 1024,
    'win_length': 1024,
    'hop_length': 320,
    'trainable': True,
    'padding': 'same',
    # ROOT
    'pseudo_rate': 0.4,
    'datasets_root': '/data/ssd1/tianyi.tan/soundstream',
    'F0_model_path': './libs/JDC/bst.t7',
}

class ConfigItem(dict):
    __slots__ = ()

    def __init__(self, config_dict=None):
        if config_dict is None:
            config_dict = dict()
        if isinstance(config_dict, ConfigItem):
            config_dict = config_dict.to_dict()
        assert isinstance(config_dict, dict)

        # Set attributes (not dict in ConfigItem)
        for key, value in config_dict.items():
            if isinstance(value, (list, tuple)):
                value = [ConfigItem(x) if isinstance(x, dict) else x for x in value]
            elif isinstance(value, dict):
                value = ConfigItem(value)
            elif isinstance(value, ConfigItem):
                value = ConfigItem(value.to_dict())
            elif isinstance(value, str) and value.lower() == 'none':
                value = None
            self[key] = value

    def __getattr__(self, item):
        try:
            return self[item]
        except KeyError:
            raise AttributeError(item)

    def __setattr__(self, name, value):
        self[name] = value

    def to_dict(self, recursive=True):
        conf_dict = {}
        for k, v in self.items():
            if isinstance(v, ConfigItem) and recursive:
                v = v.to_dict(recursive)
            conf_dict[k] = v
        return conf_dict

    def update(self, obj):
        assert isinstance(obj, (ConfigItem, dict))

        for k, v in obj.items():
            if k not in self or not isinstance(v, (ConfigItem, dict)):
                self[k] = v
            else:
                self[k].update(v)


class Config(ConfigItem):
    def __init__(self, yaml_object, dot_list=None):
        super().__init__(DEFAULT_DICT)

        # Check yaml_object
        if isinstance(yaml_object, str):
            assert os.path.isfile(yaml_object), yaml_object
            cfg = OmegaConf.load(yaml_object)
            if dot_list is not None:
                cfg_extra = OmegaConf.from_dotlist(dot_list)
                cfg = OmegaConf.merge(cfg, cfg_extra)
            yaml_object = OmegaConf.to_container(cfg, resolve=True)

        if isinstance(yaml_object, dict):
            yaml_object = ConfigItem(yaml_object)

        assert isinstance(yaml_object, ConfigItem)

        self.update(yaml_object)
--------------------------------------------------------------------------------
/utils/s2f0.py:
--------------------------------------------------------------------------------
import torch
from libs import JDCNet

def load_F0_models(path, device):
    F0_model = JDCNet(num_class=1, seq_len=192)
    params = torch.load(path, map_location=device, weights_only=True)['net']
    F0_model.load_state_dict(params)
    _ = F0_model.train()
    return F0_model

def wav2F0(mels, F0_model, device, norm=True):
    mels = mels.to(device)
    mels = mels.squeeze()
    F0_model = F0_model.to(device)
    with torch.no_grad():
        F0_real, _, _ = F0_model(mels.unsqueeze(1))

    # normalize f0
    # Remove unvoiced frames (replace with -1)
    gt_glob_f0s = []
    f0_targets = []
    norm_f0 = True
    if not norm_f0:
        f0_targets.append(F0_real)
    else:
        for bib in range(len(F0_real)):
            voiced_indices = F0_real[bib] > 5.0
            f0_voiced = F0_real[bib][voiced_indices]

            if len(f0_voiced) != 0:
                # Convert to log scale
                log_f0 = f0_voiced.log2()

                # Calculate mean and standard deviation
                mean_f0 = log_f0.mean()
                std_f0 = log_f0.std()
                if norm:
                    # Normalize the F0 sequence
                    normalized_f0 = (log_f0 - mean_f0) / std_f0
                else:
                    normalized_f0 = log_f0

                # Create the normalized F0 sequence with unvoiced frames
                normalized_sequence = torch.zeros_like(F0_real[bib])
                normalized_sequence[voiced_indices] = normalized_f0.to(normalized_sequence.dtype)
                normalized_sequence[~voiced_indices] = -10  # Assign -10 to unvoiced frames

                gt_glob_f0s.append(mean_f0)
            else:
                normalized_sequence = torch.zeros_like(F0_real[bib]) - 10.0
                gt_glob_f0s.append(torch.tensor(0.0).to(device))

            # f0_targets.append(normalized_sequence[single_side_context // 200:-single_side_context // 200])
            f0_targets.append(normalized_sequence)

    f0_targets = torch.stack(f0_targets).to(device)
    # fill nan with -10
    f0_targets[torch.isnan(f0_targets)] = -10.0
    # fill inf with -10
    f0_targets[torch.isinf(f0_targets)] = -10.0

    return f0_targets
--------------------------------------------------------------------------------
/utils/s2fhubert.py:
--------------------------------------------------------------------------------
import torch
import torch.nn.functional as F
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
import numpy as np
from libs import HubertSoft

# Fine-tuned Soft-Hubert Block copied from https://github.com/rkmt/wesper-demo.
def load_hubert(checkpoint_path=None, device='cuda:0'):
    print("### load_hubert", checkpoint_path, device)
    assert checkpoint_path is not None
    print("### loading checkpoint from: ", checkpoint_path)
    checkpoint = torch.load(checkpoint_path)
    hubert = HubertSoft().to(device)

    checkpoint = checkpoint['hubert'] if checkpoint['hubert'] is not None else checkpoint
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")

    hubert.load_state_dict(checkpoint, strict=True)
    hubert = hubert.eval().to(device)
    return hubert

def wav2units(wav, encoder, layer=None, device='cuda:0'):
    '''
    encoder: HuBERT
    '''
    if type(wav) == np.ndarray:
        wav = torch.tensor([wav], dtype=torch.float32, device=device)
    else:
        wav = wav.to(device)
    assert type(wav) == torch.Tensor
    if len(wav.shape) == 2:
        wav = wav.unsqueeze(0)
    with torch.inference_mode():  # wav -> HuBERT soft units
        if layer is None or layer < 0:
            units = encoder.units(wav)
        else:
            wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
            units, _ = encoder.encode(wav, layer=layer)
    return units
--------------------------------------------------------------------------------
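Usage sketch: the snippet below shows one way the helpers in utils/ compose for inference. It is a minimal sketch, assuming it is run from the repository root with the project dependencies installed and a CUDA device available; the input WAV filename and the HuBERT-Soft checkpoint path are placeholders, not files shipped with the repository.

import torch
import torchaudio

from utils.audioprep import resample_if_needed, squeeze_and_normalize
from utils.config import Config
from utils.s2fhubert import load_hubert, wav2units

device = 'cuda:0'

# Start from the built-in defaults; a dict (or a YAML path) overrides DEFAULT_DICT.
cfg = Config({'id': 'demo'})

# Load one utterance and bring it to the model sample rate (16 kHz by default).
wav, sr = torchaudio.load('sample_whisper.wav')     # placeholder path, assumed mono
wav = resample_if_needed(wav, sr, cfg.sample_rate)
wav = squeeze_and_normalize(wav)                    # 1-D tensor, peak-normalized to 0.95

# Extract HuBERT-Soft units; the checkpoint path is a placeholder.
hubert = load_hubert(checkpoint_path='checkpoints/hubert_soft.pt', device=device)
units = wav2units(wav.unsqueeze(0), hubert, device=device)
print(units.shape)                                  # (1, n_frames, unit_dim)

Note that wav2F0 in utils/s2f0.py takes mel spectrograms rather than raw audio, so pitch extraction additionally needs a mel front end; the spectrogram settings in DEFAULT_DICT (n_fft, hop_length, n_mels) are a plausible starting point but should be checked against the JDC checkpoint's training setup.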