├── .gitignore ├── LICENSE ├── README.md ├── compare_infer.py ├── datahelper ├── __init__.py ├── libritts_3.py ├── ljspeech_3.py ├── timit_3.py ├── whisper_val.py └── wtimit_3.py ├── datapreper ├── generate.py ├── pseudo_whisper.py └── silero_vad.py ├── experiments ├── quickvc │ └── quickvc.pth.txt └── s2uu2s │ └── epoch=440-step=409942.ckpt.txt ├── infer.py ├── libs ├── FastSpeech2 │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-310.pyc │ ├── model │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-310.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── fastspeech2.cpython-310.pyc │ │ │ ├── fastspeech2.cpython-38.pyc │ │ │ ├── loss.cpython-310.pyc │ │ │ ├── loss.cpython-38.pyc │ │ │ ├── modules.cpython-310.pyc │ │ │ ├── modules.cpython-38.pyc │ │ │ ├── optimizer.cpython-310.pyc │ │ │ └── optimizer.cpython-38.pyc │ │ ├── fastspeech2.py │ │ ├── loss.py │ │ ├── modules.py │ │ └── optimizer.py │ ├── transformer │ │ ├── Constants.py │ │ ├── Layers.py │ │ ├── Models.py │ │ ├── Modules.py │ │ ├── SubLayers.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── Constants.cpython-310.pyc │ │ │ ├── Constants.cpython-38.pyc │ │ │ ├── Layers.cpython-310.pyc │ │ │ ├── Layers.cpython-38.pyc │ │ │ ├── Models.cpython-310.pyc │ │ │ ├── Models.cpython-38.pyc │ │ │ ├── Modules.cpython-310.pyc │ │ │ ├── Modules.cpython-38.pyc │ │ │ ├── SubLayers.cpython-310.pyc │ │ │ ├── SubLayers.cpython-38.pyc │ │ │ ├── __init__.cpython-310.pyc │ │ │ └── __init__.cpython-38.pyc │ └── utils │ │ ├── __pycache__ │ │ ├── model.cpython-310.pyc │ │ ├── model.cpython-38.pyc │ │ ├── tools.cpython-310.pyc │ │ └── tools.cpython-38.pyc │ │ └── tools.py ├── JDC │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ └── model.cpython-310.pyc │ ├── bst.t7 │ └── model.py ├── __init__.py ├── hifigan │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model.cpython-310.pyc │ │ ├── models.cpython-310.pyc │ │ └── models.cpython-38.pyc │ ├── config.json │ ├── model.py │ ├── models.py │ └── my_config_v1_16000.json ├── hubert │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── model.cpython-310.pyc │ │ └── model.cpython-38.pyc │ ├── model.py │ └── utils.py └── wavlm │ ├── WavLM-Large.pt.txt │ ├── WavLM.py │ ├── __pycache__ │ ├── WavLM.cpython-310.pyc │ ├── WavLM.cpython-38.pyc │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-38.pyc │ ├── modules.cpython-310.pyc │ └── modules.cpython-38.pyc │ └── modules.py ├── minimal_quickvc ├── commons.py ├── models.py ├── modules.py └── utils.py ├── minimal_wesper ├── config │ ├── LJ_hubert_layer12 │ │ └── stats.json │ ├── my_model16000.yaml │ └── my_preprocess16k_LJ.yaml └── whisper_normal.py ├── models ├── __init__.py ├── discriminators.py ├── loss.py ├── s2u.py └── u2s.py ├── raw ├── data_in_the_wild │ ├── W2S_403_headset │ │ ├── 403_headset.wav │ │ ├── DisocoGAN_403_headset.wav │ │ ├── MSpeC_403_headset.wav │ │ ├── WES_403_headset.wav │ │ ├── s000_403_headset.wav │ │ ├── s000_QuickVC_403_headset.wav │ │ ├── s000_W2S_403_headset.wav │ │ ├── s001_403_headset.wav │ │ ├── s001_QuickVC_403_headset.wav │ │ ├── s001_W2S_403_headset.wav │ │ ├── s002_403_headset.wav │ │ ├── s002_QuickVC_403_headset.wav │ │ ├── s002_W2S_403_headset.wav │ │ ├── s003_403_headset.wav │ │ ├── s003_QuickVC_403_headset.wav │ │ └── s003_W2S_403_headset.wav │ ├── W2S_416_headset │ │ ├── 416_headset.wav │ │ ├── DisocoGAN_416_headset.wav │ │ ├── MSpeC_416_headset.wav │ │ ├── 
WES_416_headset.wav │ │ ├── s000_416_headset.wav │ │ ├── s000_QuickVC_416_headset.wav │ │ ├── s001_416_headset.wav │ │ ├── s001_QuickVC_416_headset.wav │ │ ├── s002_416_headset.wav │ │ ├── s002_QuickVC_416_headset.wav │ │ ├── s003_416_headset.wav │ │ └── s003_QuickVC_416_headset.wav │ ├── fw001 │ │ ├── AGAN-W2SC_fn001.wav │ │ ├── BLSTM_fn001.wav │ │ ├── CycleGAN-VC_fn001.wav │ │ ├── GMM_fn001.wav │ │ ├── WES_fw001.wav │ │ ├── fw001.wav │ │ ├── s000_QuickVC_fw001.wav │ │ ├── s000_fw001.wav │ │ ├── s001_QuickVC_fw001.wav │ │ ├── s001_fw001.wav │ │ ├── s002_QuickVC_fw001.wav │ │ ├── s002_fw001.wav │ │ ├── s003_QuickVC_fw001.wav │ │ └── s003_fw001.wav │ ├── fw002 │ │ ├── AGAN-W2SC_fn002.wav │ │ ├── BLSTM_fn002.wav │ │ ├── CycleGAN-VC_fn002.wav │ │ ├── GMM_fn002.wav │ │ ├── WES_fw002.wav │ │ ├── fw002.wav │ │ ├── s000_QuickVC_fw002.wav │ │ ├── s000_fw002.wav │ │ ├── s001_QuickVC_fw002.wav │ │ ├── s001_fw002.wav │ │ ├── s002_QuickVC_fw002.wav │ │ ├── s002_fw002.wav │ │ ├── s003_QuickVC_fw002.wav │ │ └── s003_fw002.wav │ └── sample_whisper │ │ ├── WES_sample_whisper.wav │ │ ├── s000_QuickVC_sample_whisper.wav │ │ ├── s000_sample_whisper.wav │ │ ├── s001_QuickVC_sample_whisper.wav │ │ ├── s001_sample_whisper.wav │ │ ├── s002_QuickVC_sample_whisper.wav │ │ ├── s002_sample_whisper.wav │ │ ├── s003_QuickVC_sample_whisper.wav │ │ ├── s003_sample_whisper.wav │ │ └── sample_whisper.wav ├── freevc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── gt │ ├── s000u003n.wav │ ├── s000u003w.wav │ ├── s001u003n.wav │ ├── s001u003w.wav │ ├── s002u003n.wav │ ├── s002u003w.wav │ ├── s003u003n.wav │ └── s003u003w.wav ├── pseudo │ ├── s000u003n.wav │ ├── s001u003n.wav │ ├── s002u003n.wav │ └── s003u003n.wav ├── quickvc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_fs2_hifigan │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_ms_istft_vits │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── s2u_u2s │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── softvc │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── test │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav └── wesper │ ├── s000u003w.wav │ ├── s001u003w.wav │ ├── s002u003w.wav │ └── s003u003w.wav ├── requirements.txt ├── resources └── system_diagram.png ├── u2ss2u.py └── utils ├── __init_.py ├── audioprep.py ├── config.py ├── s2f0.py └── s2fhubert.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .ipynb_checkpoints/ 3 | *.pyc 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 tan90xx 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DistillW2N 2 | 3 | PyTorch implementation of [DistillW2N: A Lightweight One-Shot Whisper to Normal Voice Conversion Model Using Distillation of Self-Supervised Features](https://ieeexplore.ieee.org/abstract/document/10888480) 4 | 5 | ## Quick Start 6 | ### Setup 7 | 1. Create a Python environment, e.g. with conda: `conda create --name distillw2n python=3.10.12 --yes` 8 | 2. Activate the new environment: `conda activate distillw2n` 9 | 3. Install torch and torchaudio: `pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121` 10 | 4. Install the system packages: `sudo apt-get update && sudo apt-get install -y libsndfile1 ffmpeg` 11 | 5. Install the requirements with `pip install -r requirements.txt` 12 | 6. Download the models via the links given in the `.txt` files under [experiments](https://github.com/tan90xx/distillw2n/blob/master/experiments/) 13 | 14 | ### Inference 15 | - For QuickVC and WESPER, run: `python compare_infer.py` 16 | - For our models, run: `python infer.py` 17 | 18 | ### Training 19 | - Run: `python u2ss2u.py` 20 | 21 | ## Datasets 22 | You only need to download the datasets under `YOURPATH`. 23 | - Dataset Download 24 | - For the LibriTTS, LJSpeech, and TIMIT datasets, [datahelper](https://github.com/tan90xx/distillw2n/tree/master/datahelper) will download them automatically if they are not found at `YOURPATH`. 25 | - For the wTIMIT dataset, you will need to request it via email. Follow the appropriate procedures to obtain access and download the dataset to `YOURPATH`. 26 | - Dataset Preparation (Optional) 27 | - [datapreper](https://github.com/tan90xx/distillw2n/tree/master/datapreper) offers ppw (pseudo-whisper) and vad (voice activity detection) processing. You can apply either step according to your project's requirements (see the sketch at the end of this README). 28 | 29 | ## Credits 30 | This implementation builds on 31 | - [SoundStream](https://github.com/kaiidams/soundstream-pytorch) for the training pipeline.
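
## Dataset Preparation Sketch

A minimal usage sketch for the optional preparation step above, not part of the repo's documented workflow: the directory names and the `--mode`-to-suffix pairing here are assumptions read off `datapreper/generate.py` and the default `process_type_1="pseudo"` / `process_type_2="se-vad"` arguments in `datahelper/ljspeech_3.py`. The processed copies would be generated with something like `python datapreper/generate.py YOURPATH/LJSpeech-1.1/wavs YOURPATH/LJSpeech-1.1-pseudo/wavs --mode ppw` and `python datapreper/generate.py YOURPATH/LJSpeech-1.1/wavs YOURPATH/LJSpeech-1.1-se-vad/wavs --mode vad`, since the loaders locate them by string-replacing the corpus folder name in each audio path.

```python
# Minimal sketch: assumes LJSpeech-1.1 plus the two processed copies above already
# exist under YOURPATH; this loader does not download or generate them for you.
from datahelper import LJSPEECH

ds = LJSPEECH("YOURPATH")  # defaults: process_type_1="pseudo", process_type_2="se-vad"
waveform, waveform_pseudo, waveform_vad, sr, transcript, normalized = ds[0]
print(waveform.shape, waveform_pseudo.shape, waveform_vad.shape, sr)
```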
32 | -------------------------------------------------------------------------------- /compare_infer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import soundfile as sf 4 | import librosa 5 | 6 | def init_model(model_type): 7 | 8 | if model_type == 'quickvc': 9 | from minimal_quickvc.models import SynthesizerTrn 10 | from minimal_quickvc.utils import load_checkpoint 11 | model = SynthesizerTrn().eval().to('cuda') 12 | model_path = './experiments/quickvc/quickvc.pth' 13 | _ = load_checkpoint(model_path, model, None) 14 | embedder_model = torch.hub.load( 15 | "bshall/hubert:main", "hubert_soft").eval().to('cuda') 16 | 17 | elif model_type == 'wesper': 18 | from minimal_wesper.whisper_normal import SynthesizerTrn, load_hubert 19 | model = SynthesizerTrn().eval().to('cuda') 20 | embedder_model = load_hubert(device='cuda') 21 | 22 | return embedder_model, model 23 | 24 | 25 | class Inferer: 26 | def __init__(self, model_type): 27 | self.model_type = model_type 28 | self.hubert, self.model = init_model(model_type) 29 | self.conv_sr = 16000 30 | 31 | def vc_fn(self, audio): 32 | with torch.no_grad(): 33 | wav_src = torch.from_numpy(audio).unsqueeze(0).unsqueeze(0).to('cuda') 34 | c = self.hubert.units(wav_src) 35 | c = c.transpose(2, 1) 36 | 37 | if self.model_type == 'quickvc': 38 | mel_tgt = torch.zeros(1, 80, 64).to('cuda') 39 | audio = self.model.infer(c, mel=mel_tgt) 40 | 41 | elif self.model_type == 'wesper': 42 | audio = self.model.infer(c) 43 | 44 | audio = audio.squeeze(0).squeeze(0).cpu().numpy() 45 | audio = (audio * 32767).astype(np.int16) 46 | return audio 47 | 48 | def file_infer(self, fname, oname): 49 | audio, _ = librosa.load(fname, sr=self.conv_sr) 50 | audio_out = self.vc_fn(audio) 51 | sf.write(oname, audio_out, self.conv_sr) 52 | return audio_out 53 | 54 | inferer = Inferer('quickvc') 55 | audio_out = inferer.file_infer('./raw/gt/s000u003w.wav', 's000u003w_quickvc.wav') 56 | inferer = Inferer('wesper') 57 | audio_out = inferer.file_infer('./raw/gt/s000u003w.wav', 's000u003w_wesper.wav') -------------------------------------------------------------------------------- /datahelper/__init__.py: -------------------------------------------------------------------------------- 1 | from .timit_3 import TIMIT 2 | from .wtimit_3 import WTIMIT 3 | from .ljspeech_3 import LJSPEECH 4 | from .libritts_3 import LIBRITTS 5 | from .whisper_val import WHISPER 6 | 7 | __all__ = ['TIMIT', 'WTIMIT', 'LJSPEECH', 'LIBRITTS', 'WHISPER'] -------------------------------------------------------------------------------- /datahelper/libritts_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from typing import Tuple, Union 4 | 5 | import torchaudio 6 | from torch import Tensor 7 | from torch.utils.data import Dataset 8 | from torchaudio._internal import download_url_to_file 9 | from torchaudio.datasets.utils import _extract_tar 10 | 11 | URL = "train-clean-100" 12 | FOLDER_IN_ARCHIVE = "LibriTTS" 13 | _CHECKSUMS = { 14 | "http://www.openslr.org/resources/60/dev-clean.tar.gz": "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a", # noqa: E501 15 | "http://www.openslr.org/resources/60/dev-other.tar.gz": "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c", # noqa: E501 16 | "http://www.openslr.org/resources/60/test-clean.tar.gz": "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5", # noqa: E501 17 | 
"http://www.openslr.org/resources/60/test-other.tar.gz": "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d", # noqa: E501 18 | "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b", # noqa: E501 19 | "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886", # noqa: E501 20 | "http://www.openslr.org/resources/60/train-other-500.tar.gz": "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df", # noqa: E501 21 | } 22 | 23 | 24 | def load_libritts_item( 25 | fileid: str, 26 | path: str, 27 | ext_audio: str, 28 | ext_original_txt: str, 29 | ext_normalized_txt: str, 30 | ) -> Tuple[Tensor, int, str, str, int, int, str]: 31 | speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_") 32 | utterance_id = fileid 33 | 34 | normalized_text = utterance_id + ext_normalized_txt 35 | normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text) 36 | 37 | original_text = utterance_id + ext_original_txt 38 | original_text = os.path.join(path, speaker_id, chapter_id, original_text) 39 | 40 | file_audio = utterance_id + ext_audio 41 | file_audio = os.path.join(path, speaker_id, chapter_id, file_audio) 42 | 43 | # Load audio 44 | waveform, sample_rate = torchaudio.load(file_audio) 45 | fileid_audio = Path(str(file_audio).replace('LibriTTS', "LibriTTS-{}".format("vad-ppw"))) 46 | waveform_ppw, sample_rate = torchaudio.load(fileid_audio) 47 | fileid_audio = Path(str(file_audio).replace('LibriTTS', "LibriTTS-{}".format("vad"))) 48 | waveform_vad, sample_rate = torchaudio.load(fileid_audio) 49 | 50 | # Load original text 51 | with open(original_text) as ft: 52 | original_text = ft.readline() 53 | 54 | # Load normalized text 55 | with open(normalized_text, "r") as ft: 56 | normalized_text = ft.readline() 57 | 58 | return ( 59 | waveform, 60 | waveform_ppw, 61 | waveform_vad, 62 | sample_rate, 63 | original_text, 64 | normalized_text, 65 | int(speaker_id), 66 | int(chapter_id), 67 | utterance_id, 68 | ) 69 | 70 | 71 | class LIBRITTS(Dataset): 72 | """*LibriTTS* :cite:`Zen2019LibriTTSAC` dataset. 73 | 74 | Args: 75 | root (str or Path): Path to the directory where the dataset is found or downloaded. 76 | url (str, optional): The URL to download the dataset from, 77 | or the type of the dataset to dowload. 78 | Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, 79 | ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and 80 | ``"train-other-500"``. (default: ``"train-clean-100"``) 81 | folder_in_archive (str, optional): 82 | The top-level directory of the dataset. (default: ``"LibriTTS"``) 83 | download (bool, optional): 84 | Whether to download the dataset if it is not found at root path. (default: ``False``). 
85 | """ 86 | 87 | _ext_original_txt = ".original.txt" 88 | _ext_normalized_txt = ".normalized.txt" 89 | _ext_audio = ".wav" 90 | 91 | def __init__( 92 | self, 93 | root: Union[str, Path], 94 | url: str = URL, 95 | folder_in_archive: str = FOLDER_IN_ARCHIVE, 96 | download: bool = False, 97 | ) -> None: 98 | 99 | if url in [ 100 | "dev-clean", 101 | "dev-other", 102 | "test-clean", 103 | "test-other", 104 | "train-clean-100", 105 | "train-clean-360", 106 | "train-other-500", 107 | ]: 108 | 109 | ext_archive = ".tar.gz" 110 | base_url = "http://www.openslr.org/resources/60/" 111 | 112 | url = os.path.join(base_url, url + ext_archive) 113 | 114 | # Get string representation of 'root' in case Path object is passed 115 | root = os.fspath(root) 116 | 117 | basename = os.path.basename(url) 118 | archive = os.path.join(root, basename) 119 | 120 | basename = basename.split(".")[0] 121 | folder_in_archive = os.path.join(folder_in_archive, basename) 122 | 123 | self._path = os.path.join(root, folder_in_archive) 124 | 125 | if download: 126 | if not os.path.isdir(self._path): 127 | if not os.path.isfile(archive): 128 | checksum = _CHECKSUMS.get(url, None) 129 | download_url_to_file(url, archive, hash_prefix=checksum) 130 | _extract_tar(archive) 131 | else: 132 | if not os.path.exists(self._path): 133 | raise RuntimeError( 134 | f"The path {self._path} doesn't exist. " 135 | "Please check the ``root`` path or set `download=True` to download it" 136 | ) 137 | 138 | self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio)) 139 | 140 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]: 141 | """Load the n-th sample from the dataset. 142 | 143 | Args: 144 | n (int): The index of the sample to be loaded 145 | 146 | Returns: 147 | Tuple of the following items; 148 | 149 | Tensor: 150 | Waveform 151 | int: 152 | Sample rate 153 | str: 154 | Original text 155 | str: 156 | Normalized text 157 | int: 158 | Speaker ID 159 | int: 160 | Chapter ID 161 | str: 162 | Utterance ID 163 | """ 164 | fileid = self._walker[n] 165 | return load_libritts_item( 166 | fileid, 167 | self._path, 168 | self._ext_audio, 169 | self._ext_original_txt, 170 | self._ext_normalized_txt, 171 | ) 172 | 173 | def __len__(self) -> int: 174 | return len(self._walker) 175 | -------------------------------------------------------------------------------- /datahelper/ljspeech_3.py: -------------------------------------------------------------------------------- 1 | # Adopted from torchaudio.datasets.LJSPEECH 2 | import csv 3 | import os 4 | from pathlib import Path 5 | from typing import Tuple, Union 6 | 7 | import torchaudio 8 | from torch import Tensor 9 | from torch.utils.data import Dataset 10 | from torchaudio._internal import download_url_to_file 11 | # from torchaudio.datasets.utils import _extract_tar 12 | import torch 13 | 14 | _RELEASE_CONFIGS = { 15 | "release1": { 16 | "folder_in_archive": "wavs", 17 | "url": "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", 18 | "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5", 19 | } 20 | } 21 | 22 | 23 | class LJSPEECH(Dataset): 24 | """*LJSpeech-1.1* :cite:`ljspeech17` dataset. 25 | 26 | Args: 27 | root (str or Path): Path to the directory where the dataset is found or downloaded. 28 | url (str, optional): The URL to download the dataset from. 
29 | (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``) 30 | folder_in_archive (str, optional): 31 | The top-level directory of the dataset. (default: ``"wavs"``) 32 | download (bool, optional): 33 | Whether to download the dataset if it is not found at root path. (default: ``False``). 34 | """ 35 | 36 | def __init__( 37 | self, 38 | root: Union[str, Path], 39 | url: str = _RELEASE_CONFIGS["release1"]["url"], 40 | folder_in_archive: str = _RELEASE_CONFIGS["release1"]["folder_in_archive"], 41 | download: bool = False, 42 | process_type_1: str="pseudo", 43 | process_type_2: str="se-vad", 44 | ) -> None: 45 | 46 | self._parse_filesystem(root, url, folder_in_archive, download, process_type_1, process_type_2) 47 | 48 | def _parse_filesystem(self, root: str, url: str, folder_in_archive: str, 49 | download: bool, process_type_1: str, process_type_2: str) -> None: 50 | root = Path(root) 51 | 52 | basename = os.path.basename(url) 53 | archive = root / basename 54 | 55 | basename = Path(basename.split(".tar.bz2")[0]) 56 | folder_in_archive = basename / folder_in_archive 57 | 58 | self._path = root / folder_in_archive 59 | self._metadata_path = root / basename / "metadata.csv" 60 | 61 | self._process_type_1 = process_type_1 62 | self._process_type_2 = process_type_2 63 | ''' 64 | if download: 65 | if not os.path.isdir(self._path): 66 | if not os.path.isfile(archive): 67 | checksum = _RELEASE_CONFIGS["release1"]["checksum"] 68 | download_url_to_file(url, archive, hash_prefix=checksum) 69 | _extract_tar(archive) 70 | else: 71 | if not os.path.exists(self._path): 72 | raise RuntimeError( 73 | f"The path {self._path} doesn't exist. " 74 | "Please check the ``root`` path or set `download=True` to download it" 75 | ) 76 | ''' 77 | with open(self._metadata_path, "r", newline="", encoding='utf-8') as metadata: 78 | flist = csv.reader(metadata, delimiter="|", quoting=csv.QUOTE_NONE) 79 | self._flist = list(flist) 80 | 81 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: 82 | """Load the n-th sample from the dataset. 
83 | 84 | Args: 85 | n (int): The index of the sample to be loaded 86 | 87 | Returns: 88 | Tuple of the following items; 89 | 90 | Tensor: 91 | Waveform 92 | int: 93 | Sample rate 94 | str: 95 | Transcript 96 | str: 97 | Normalized Transcript 98 | """ 99 | line = self._flist[n] 100 | fileid, transcript, normalized_transcript = line 101 | fileid_audio_o = self._path / (fileid + ".wav") 102 | # fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format("ppw"))) 103 | fileid_audio = fileid_audio_o 104 | waveform, sample_rate = torchaudio.load(fileid_audio) 105 | fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format(self._process_type_1))) 106 | waveform_pseudo, sample_rate = torchaudio.load(fileid_audio) 107 | fileid_audio = Path(str(fileid_audio_o).replace('LJSpeech-1.1', "LJSpeech-1.1-{}".format(self._process_type_2))) 108 | waveform_vad, sample_rate = torchaudio.load(fileid_audio) 109 | return ( 110 | waveform, 111 | waveform_pseudo, 112 | waveform_vad, 113 | sample_rate, 114 | transcript, 115 | normalized_transcript, 116 | ) 117 | 118 | def __len__(self) -> int: 119 | return len(self._flist) 120 | -------------------------------------------------------------------------------- /datahelper/timit_3.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import os 8 | from pathlib import Path 9 | from typing import Tuple, Union 10 | import torchaudio 11 | from torch import Tensor 12 | from torch.utils.data import Dataset 13 | 14 | 15 | _TIMIT_TRAIN_DATASETS = [ 16 | ["timit/TIMIT/data/TRAIN", (".WAV", ".WRD")], 17 | ] 18 | _TIMIT_TEST_DATASETS = [ 19 | ["timit/TIMIT/data/TEST", (".WAV", ".WRD")], 20 | ] 21 | 22 | 23 | def _collect_data(directory, input_ext, target_ext): 24 | """Traverses directory collecting input and target files.""" 25 | # Directory from string to tuple pair of strings 26 | # key: the filepath to a datafile including the datafile's basename. 
Example, 27 | # if the datafile was "/path/to/datafile.wav" then the key would be 28 | # "/path/to/datafile" 29 | # value: a pair of strings (input_filepath, target_filepath) 30 | data_files = dict() 31 | for root, _, filenames in os.walk(directory): 32 | input_files = [filename for filename in filenames if input_ext in filename] 33 | for input_filename in input_files: 34 | basename = input_filename.strip(input_ext) 35 | input_file = os.path.join(root, input_filename) 36 | target_file = os.path.join(root, basename + target_ext) 37 | key = os.path.join(root, basename) 38 | assert os.path.exists(target_file) 39 | assert key not in data_files 40 | data_files[key] = (input_file, target_file) 41 | return data_files 42 | 43 | class TIMIT(Dataset): 44 | def __init__( 45 | self, 46 | root: Union[str, Path], 47 | training: bool, 48 | ) -> None: 49 | self._parse_filesystem(root, training) 50 | 51 | def _parse_filesystem(self, root: str, training: bool) -> None: 52 | datasets = (_TIMIT_TRAIN_DATASETS if training else _TIMIT_TEST_DATASETS) 53 | for data_dir, (audio_ext, transcription_ext) in datasets: 54 | data_dir = os.path.join(root, data_dir) 55 | data_files = _collect_data(data_dir, audio_ext, transcription_ext) 56 | data_pairs = data_files.values() 57 | self._flist = [] 58 | for input_file, _ in sorted(data_pairs): 59 | self._flist.append(input_file) 60 | 61 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 62 | input_file = self._flist[n] 63 | # out_filepath = input_file.strip(".WAV") + ".wav" 64 | waveform, sample_rate = torchaudio.load(input_file) 65 | fileid_audio = Path(str(input_file).replace('TIMIT', "TIMIT-{}".format("vad"))) 66 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 67 | fileid_audio = Path(str(input_file).replace('TIMIT', "TIMIT-{}".format("vad"))) 68 | waveform_v, sample_rate = torchaudio.load(fileid_audio) 69 | return ( 70 | waveform, 71 | waveform_p, 72 | waveform_v, 73 | sample_rate, 74 | input_file) 75 | 76 | def __len__(self) -> int: 77 | return len(self._flist) 78 | 79 | if __name__ == "__main__": 80 | ds = TIMIT("/data/ssd0/tianyi.tan", training=True) 81 | print(len(ds)) -------------------------------------------------------------------------------- /datahelper/whisper_val.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | from pathlib import Path 8 | from typing import Tuple, Union 9 | import torchaudio 10 | from torch import Tensor 11 | from torch.utils.data import Dataset 12 | 13 | class WHISPER(Dataset): 14 | def __init__( 15 | self, 16 | root: Union[str, Path], 17 | ) -> None: 18 | self._parse_filesystem(root) 19 | 20 | def _parse_filesystem(self, root: str) -> None: 21 | data_dir = "_1_normal_trim" 22 | data_dir = Path(os.path.join(root, data_dir)) 23 | self._flist = [] 24 | for in_path in data_dir.rglob("*.wav"): 25 | self._flist.append(in_path) 26 | 27 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 28 | input_file = self._flist[n] 29 | waveform, sample_rate = torchaudio.load(input_file) 30 | fileid_audio = Path(str(input_file).replace('_1_normal_trim', "_1_ppw_trim")) 31 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 32 | return ( 33 | waveform, 34 | waveform_p, 35 | sample_rate) 36 | 37 | def __len__(self) -> int: 38 | return len(self._flist) 39 | 40 | if __name__ == "__main__": 41 | ds = WHISPER("YOURPATH") 42 
| print(len(ds)) -------------------------------------------------------------------------------- /datahelper/wtimit_3.py: -------------------------------------------------------------------------------- 1 | """TIMIT data generator.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import re 8 | from pathlib import Path 9 | from typing import Tuple, Union 10 | import torchaudio 11 | from torch import Tensor 12 | from torch.utils.data import Dataset 13 | import numpy as np 14 | from scipy.ndimage import distance_transform_edt 15 | import torch 16 | 17 | def fill_nans(data_matrix): 18 | """Fills NaN's with nearest neighbours. 19 | 20 | This method is adapted from the method `fill`, which you can find here: 21 | https://stackoverflow.com/posts/9262129/revisions 22 | 23 | :param data_matrix: numpy array of real-valued data. 24 | :return: data_matrix: Same but without NaN's. 25 | """ 26 | 27 | indices = distance_transform_edt( 28 | np.isnan(data_matrix), return_distances=False, return_indices=True 29 | ) 30 | return data_matrix[tuple(indices)] 31 | 32 | class WTIMIT(Dataset): 33 | def __init__( 34 | self, 35 | root: Union[str, Path], 36 | ) -> None: 37 | self._parse_filesystem(root) 38 | 39 | def _parse_filesystem(self, root: str) -> None: 40 | data_dir = "wtimit/normal" 41 | data_dir = Path(os.path.join(root, data_dir)) 42 | self._flist = [] 43 | for in_path in data_dir.rglob("*.wav"): 44 | # if in_path.name.startswith("s10"): 45 | # s[0-1] 46 | # if re.match(r"s[0-1]\d{2}u0(0[3-9]|1[0-2])n\.wav$", in_path.name): 47 | self._flist.append(in_path) 48 | 49 | def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: 50 | input_file = self._flist[n] 51 | waveform, sample_rate = torchaudio.load(input_file) 52 | fileid_audio = Path(str(input_file).replace('normal', "vad-ppw")) 53 | waveform_p, sample_rate = torchaudio.load(fileid_audio) 54 | fileid_audio = Path(str(input_file).replace('normal', "vad")) 55 | waveform_v, sample_rate = torchaudio.load(fileid_audio) 56 | return ( 57 | waveform, 58 | waveform_p, 59 | waveform_v, 60 | sample_rate, 61 | input_file) 62 | 63 | def __len__(self) -> int: 64 | return len(self._flist) 65 | 66 | if __name__ == "__main__": 67 | ds = WTIMIT("YOURPATH") 68 | print(len(ds)) -------------------------------------------------------------------------------- /datapreper/generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from multiprocessing import cpu_count 4 | from concurrent.futures import ProcessPoolExecutor 5 | from tqdm import tqdm 6 | 7 | def preprocess_dataset(args): 8 | if args.mode == "vad": 9 | from silero_vad import process_wav 10 | elif args.mode == "ppw": 11 | from pseudo_whisper import process_wav 12 | args.out_dir.mkdir(parents=True, exist_ok=True) 13 | 14 | futures = [] 15 | executor = ProcessPoolExecutor(max_workers=cpu_count()) 16 | print(f"VAD for audio in {args.in_dir}") 17 | list_list = list(args.in_dir.rglob("*.wav")) 18 | for i, in_path in enumerate(list_list): 19 | # if i % 2 == 1: 20 | relative_path = in_path.relative_to(args.in_dir) 21 | out_path = args.out_dir / relative_path 22 | out_path.parent.mkdir(parents=True, exist_ok=True) 23 | futures.append( 24 | executor.submit(process_wav, in_path, out_path, args.sample_rate) 25 | ) 26 | 27 | results = [future.result() for future in tqdm(futures)] 28 | 29 | lengths = {path.stem: length for path, length in 
results} 30 | seconds = sum(lengths.values()) 31 | hours = seconds / 3600 32 | print(f"Wrote {len(lengths)} utterances ({hours:.2f} hours)") 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser(description="Apply VAD or pseudo-whisper processing to an audio dataset.") 37 | parser.add_argument( 38 | "in_dir", metavar="in-dir", help="path to the dataset directory.", type=Path 39 | ) 40 | parser.add_argument( 41 | "out_dir", metavar="out-dir", help="path to the output directory.", type=Path 42 | ) 43 | parser.add_argument( 44 | '--mode', choices=['vad', 'ppw'], required=True, help="Select processing mode: 'vad' (voice activity detection) or 'ppw' (pseudo-whisper)" 45 | ) 46 | parser.add_argument( 47 | "--sample-rate", 48 | help="target sample rate (default 16kHz)", 49 | type=int, 50 | default=16000, 51 | ) 52 | args = parser.parse_args() 53 | preprocess_dataset(args) -------------------------------------------------------------------------------- /datapreper/pseudo_whisper.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/chaufanglin/Normal2Whisper/blob/main/utils.py 2 | import numpy as np 3 | from scipy.signal import lfilter 4 | import soundfile as sf 5 | import librosa 6 | from librosa import lpc 7 | import pyworld as pw 8 | 9 | def wav2world(x, fs, fft_size=None): 10 | """Convenience function to do all WORLD analysis steps in a single call. 11 | In this case only `frame_period` can be configured and other parameters 12 | are fixed to their defaults. Likewise, F0 estimation is fixed to 13 | Harvest. 14 | Parameters 15 | ---------- 16 | x : ndarray 17 | Input waveform signal. 18 | fs : int 19 | Sample rate of input signal in Hz. 20 | fft_size : int 21 | Length of Fast Fourier Transform (in number of samples) 22 | The resulting dimension of `ap` and `sp` will be `fft_size` // 2 + 1 23 | Returns 24 | ------- 25 | f0 : ndarray 26 | F0 contour. 27 | sp : ndarray 28 | Spectral envelope. 29 | ap : ndarray 30 | Aperiodicity. 31 | t : ndarray 32 | Temporal position of each frame. 33 | """ 34 | f0, t = pw.harvest(x, fs) 35 | sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size) 36 | ap = pw.d4c(x, f0, t, fs, fft_size=fft_size) 37 | return f0, sp, ap, t 38 | 39 | 40 | def moving_average(data, length): 41 | output = np.empty(data.shape) 42 | maf = np.bartlett(length)/length # Bartlett window is a triangular window 43 | for i in range(data.shape[0]): 44 | output[i,:] = np.convolve(data[i,:], maf,'same') 45 | return output 46 | 47 | 48 | def gfm_iaif_glottal_remove(s_gvl, nv=48, ng=3, d=0.99, win=None): 49 | """ 50 | Glottal removal function based on GFM-IAIF. 51 | 52 | Note: 53 | Function originally coded by Olivier Perrotin (https://github.com/operrotin/GFM-IAIF). 54 | This code is translated to Python and adapted by Zhaofeng Lin (linzh@tcd.ie) 55 | Parameters: 56 | ---------- 57 | s_gvl: Speech signal frame 58 | nv: Order of LP analysis for vocal tract (def. 48) 59 | ng: Order of LP analysis for glottal source (def. 3) 60 | d: Leaky integration coefficient (def. 0.99) 61 | win: Window used before LPC (def.
Hanning) 62 | 63 | Returns: 64 | ------- 65 | s_v: Speech signal with glottis contribution cancelled 66 | """ 67 | 68 | # ----- Set default parameters ------------------------------------------- 69 | if win is None: 70 | # Window for LPC estimation 71 | win = np.hanning(len(s_gvl)) 72 | 73 | # ----- Addition of pre-frame -------------------------------------------- 74 | # For the successive removals of the estimated LPC envelopes, a 75 | # mean-normalized pre-frame ramp is added at the beginning of the frame 76 | # in order to diminish ripple. The ramp is removed after each filtering. 77 | Lpf = nv + 1 # Pre-frame length 78 | x_gvl = np.concatenate([np.linspace(-s_gvl[0], s_gvl[0], Lpf), s_gvl]) # Prepend 79 | idx_pf = np.arange(Lpf, len(x_gvl)) # Indexes that exclude the pre-frame 80 | 81 | # ----- Cancel lip radiation contribution -------------------------------- 82 | # Define lip radiation filter 83 | al = [1, -d] 84 | 85 | # Integration of signal using filter 1/[1 -d z^(-1)] 86 | # - Input signal (for LPC estimation) 87 | s_gv = lfilter([1], al, s_gvl) 88 | # - Pre-framed input signal (for LPC envelope removal) 89 | x_gv = lfilter([1], al, x_gvl) 90 | 91 | # ----- Gross glottis estimation ----------------------------------------- 92 | # Iterative estimation of glottis with ng first order filters 93 | ag1 = lpc(s_gv*win, order=1) # First 1st order LPC estimation 94 | 95 | for i in range(ng-2): 96 | # Cancel current estimate of glottis contribution from speech signal 97 | x_v1x = lfilter(ag1,1,x_gv) # Inverse filtering 98 | s_v1x = x_v1x[idx_pf] # Remove pre-ramp 99 | 100 | # Next 1st order LPC estimation 101 | ag1x = lpc(s_v1x*win, order=1) # 1st order LPC 102 | 103 | # Update gross estimate of glottis contribution 104 | ag1 = np.convolve(ag1,ag1x) # Combine 1st order estimation with previous 105 | 106 | 107 | # ----- Gross vocal tract estimation ------------------------------------- 108 | # Cancel gross estimate of glottis contribution from speech signal 109 | x_v1 = lfilter(ag1,1,x_gv) # Inverse filtering 110 | s_v1 = x_v1[idx_pf] # Remove pre-ramp 111 | 112 | # Gross estimate of the vocal tract filter 113 | av1 = lpc(s_v1*win, order=nv) # nv order LPC estimation 114 | 115 | # ----- Fine glottis estimation ------------------------------------------ 116 | # Cancel gross estimate of vocal tract contribution from speech signal 117 | x_g1 = lfilter(av1,1,x_gv) # Inverse filtering 118 | s_g1 = x_g1[idx_pf] # Remove pre-ramp 119 | 120 | # Fine estimate of the glottis filter 121 | ag = lpc(s_g1*win, order=ng) # ng order LPC estimation 122 | 123 | # ----- Fine vocal tract estimation -------------------------------------- 124 | # Cancel fine estimate of glottis contribution from speech signal 125 | x_v = lfilter(ag,1,x_gv) # Inverse filtering 126 | s_v = x_v[idx_pf] # Remove pre-ramp 127 | 128 | return s_v 129 | 130 | 131 | def pesudo_whisper_gen(s_n, fs, Lv=16): 132 | """ 133 | Pesudo whispered speech generating function, using GFM-IAIF and moving averge filtering. 
134 | 135 | Note: 136 | This code is written by Zhaofeng Lin (linzh@tcd.ie) 137 | 138 | Parameters: 139 | ---------- 140 | s_n: Normal speech wavform 141 | fs: Sample rate 142 | Lv: order of LP analysis for vocal tract (default: 16) 143 | 144 | Returns: 145 | ------- 146 | y_pw: Pesudo whispered speech wavform 147 | """ 148 | 149 | EPSILON = 1e-8 150 | 151 | # Overlapp-add (OLA) method 152 | nfft = pw.get_cheaptrick_fft_size(fs) 153 | win_length = int(30*fs/1000) # 30ms * fs / 1000 154 | nhop = round(win_length / 2) 155 | window = np.hamming(win_length) 156 | nframes = int(np.ceil(s_n.size / nhop)) 157 | 158 | s_gfm = np.zeros(s_n.shape) # allocate output speech without glottal source 159 | 160 | for n in range(nframes): 161 | startPoint = n * nhop # starting point of windowing 162 | if startPoint + win_length > s_n.size: 163 | s_gfm[startPoint - nhop + win_length: ] = EPSILON 164 | continue 165 | else: 166 | sn_frame = s_n[startPoint : startPoint+win_length] * window 167 | 168 | s_gfm_frame = gfm_iaif_glottal_remove(sn_frame, Lv) 169 | 170 | s_gfm[startPoint: startPoint + win_length] = s_gfm[startPoint: startPoint + win_length] + s_gfm_frame 171 | 172 | # Extract GFM 173 | f0_gfm, sp_gfm, ap_gfm, _ = wav2world(s_gfm, fs) 174 | 175 | # Moving Averge Filtering 176 | maf_freq = 400 # 400 Hz 177 | maf_w_len = round(maf_freq/fs * nfft) # 400 Hz 178 | sp_maf = moving_average(sp_gfm, maf_w_len) 179 | 180 | # Zero F0 and unit Ap 181 | f0_zero = np.zeros(f0_gfm.shape) + EPSILON 182 | ap_unit = np.ones(ap_gfm.shape) - EPSILON 183 | 184 | y_pw = pw.synthesize(f0_zero, sp_maf, ap_unit, fs, pw.default_frame_period) 185 | 186 | return y_pw 187 | 188 | 189 | def process_wav(in_path, out_path, sample_rate): 190 | normal, fs_ = sf.read(in_path) 191 | if sample_rate != fs_: 192 | normal = librosa.resample(normal, fs_, sample_rate) 193 | pesudo_whisper = pesudo_whisper_gen(normal, sample_rate) 194 | sf.write(out_path, pesudo_whisper, sample_rate) 195 | return out_path, len(pesudo_whisper) / sample_rate -------------------------------------------------------------------------------- /datapreper/silero_vad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | torch.set_num_threads(1) 3 | from typing import List 4 | USE_ONNX = True # change this to True if you want to test onnx model 5 | 6 | model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 7 | model='silero_vad', 8 | force_reload=False, 9 | onnx=USE_ONNX) 10 | 11 | (get_speech_timestamps, 12 | save_audio, 13 | read_audio, 14 | VADIterator, 15 | collect_chunks) = utils 16 | 17 | def replace_chunks(tss: List[dict], 18 | wav: torch.Tensor): 19 | chunks = [] 20 | cur_start = 0 21 | for i in tss: 22 | silence_part = torch.zeros_like(wav[cur_start: i['start']]) 23 | chunks.append(silence_part) 24 | voiced_part = wav[i['start']: i['end']] 25 | chunks.append(voiced_part) 26 | cur_start = i['end'] 27 | silence_part = torch.zeros_like(wav[cur_start:]) 28 | chunks.append(silence_part) 29 | result = torch.cat(chunks) 30 | if torch.all(result == 0): 31 | return wav 32 | return result 33 | 34 | 35 | def process_wav(in_path, out_path, sample_rate): 36 | wav = read_audio(in_path, sampling_rate=sample_rate) 37 | # get speech timestamps from full audio file 38 | speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sample_rate) 39 | # merge all speech chunks to one audio 40 | save_audio(out_path, 41 | replace_chunks(speech_timestamps, wav), sampling_rate=sample_rate) 42 | return out_path, 
wav.size(-1) / sample_rate -------------------------------------------------------------------------------- /experiments/quickvc/quickvc.pth.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/1DF6RgIHHkn2aoyyUMt4_hPitKSc2YR9d 2 | # mv G_1200000.pth quickvc.pth -------------------------------------------------------------------------------- /experiments/s2uu2s/epoch=440-step=409942.ckpt.txt: -------------------------------------------------------------------------------- 1 | # https://box.nju.edu.cn/f/be9207d41cba4c8a98db/?dl=1 2 | https://github.com/tan90xx/distillw2n/releases/download/v1.0/epoch.440-step.409942.ckpt 3 | -------------------------------------------------------------------------------- /infer.py: -------------------------------------------------------------------------------- 1 | from u2ss2u import StreamableModel 2 | import torch 3 | import torchaudio 4 | import nemo.collections.asr as nemo_asr 5 | DEVICE="cuda:0" 6 | 7 | model = StreamableModel( 8 | batch_size=42, 9 | sample_rate=16_000, 10 | segment_length=32270, 11 | padding='same', 12 | dataset='timit') 13 | 14 | checkpoint_path = './experiments/s2uu2s/epoch=440-step=409942.ckpt' 15 | checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage, weights_only=True) 16 | model.load_state_dict(checkpoint['state_dict'], strict=False) 17 | model = model.to(DEVICE) 18 | model.eval() 19 | 20 | hubert_soft = torch.hub.load("bshall/hubert:main", f"hubert_soft").to(DEVICE) 21 | 22 | speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large") 23 | speaker_model = speaker_model.to(DEVICE) 24 | speaker_model.eval() 25 | 26 | x_trg, sr = torchaudio.load(f'./raw/gt/s000u003n.wav') 27 | spkemb = speaker_model.infer_segment(x_trg.squeeze(0))[0] 28 | 29 | x, sr = torchaudio.load('./raw/gt/s000u003w.wav') 30 | x = torchaudio.functional.resample(x, sr, 16000) 31 | # z, hubert = model(x.to(DEVICE), spkemb.to(DEVICE)) 32 | # torchaudio.save('test0.wav', z.squeeze(1).detach().cpu(), 16000) 33 | 34 | # spec = model.spec.to(DEVICE) 35 | # encoder = model.spec.to(DEVICE) 36 | reencoder = model.reencoder.to(DEVICE) 37 | decoder = model.decoder.to(DEVICE) 38 | 39 | hubert_soft = torch.hub.load("bshall/hubert:main", f"hubert_soft").to(DEVICE) 40 | hubert = hubert_soft.units(x.unsqueeze(0).to(DEVICE)) 41 | hubert = hubert.clone().to(DEVICE) 42 | hubert = torch.transpose(hubert, -1, -2) 43 | z = reencoder(hubert.to(DEVICE), spkemb.to(DEVICE)) 44 | z = decoder(z.to(DEVICE)) 45 | torchaudio.save('test1.wav', z.squeeze(1).detach().cpu(), 16000) -------------------------------------------------------------------------------- /libs/FastSpeech2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import FastSpeech2 2 | 3 | __all__ = ['FastSpeech2'] -------------------------------------------------------------------------------- /libs/FastSpeech2/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .fastspeech2 import FastSpeech2 2 
| from .loss import FastSpeech2Loss 3 | from .optimizer import ScheduledOptim -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/fastspeech2.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/loss.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/loss.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/__pycache__/optimizer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/optimizer.cpython-310.pyc -------------------------------------------------------------------------------- 
/libs/FastSpeech2/model/__pycache__/optimizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/model/__pycache__/optimizer.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/model/fastspeech2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from libs.FastSpeech2.transformer import Encoder, Decoder, PostNet 9 | from .modules import VarianceAdaptor 10 | from libs.FastSpeech2.utils.tools import get_mask_from_lengths 11 | 12 | 13 | class FastSpeech2(nn.Module): 14 | """ FastSpeech2 """ 15 | 16 | def __init__(self, preprocess_config, model_config): 17 | super(FastSpeech2, self).__init__() 18 | self.model_config = model_config 19 | 20 | self.encoder = Encoder(model_config) 21 | self.variance_adaptor = VarianceAdaptor(preprocess_config, model_config) 22 | self.decoder = Decoder(model_config) 23 | self.mel_linear = nn.Linear( 24 | model_config["transformer"]["decoder_hidden"], 25 | preprocess_config["preprocessing"]["mel"]["n_mel_channels"], 26 | ) 27 | self.postnet = PostNet() 28 | 29 | self.speaker_emb = None 30 | if model_config["multi_speaker"]: 31 | with open( 32 | os.path.join( 33 | preprocess_config["path"]["preprocessed_path"], "speakers.json" 34 | ), 35 | "r", 36 | ) as f: 37 | n_speaker = len(json.load(f)) 38 | self.speaker_emb = nn.Embedding( 39 | n_speaker, 40 | model_config["transformer"]["encoder_hidden"], 41 | ) 42 | 43 | def forward( 44 | self, 45 | speakers, 46 | texts, 47 | src_lens, 48 | max_src_len, 49 | mels=None, 50 | mel_lens=None, 51 | max_mel_len=None, 52 | p_targets=None, 53 | e_targets=None, 54 | d_targets=None, 55 | p_control=1.0, 56 | e_control=1.0, 57 | d_control=1.0, 58 | ): 59 | src_masks = get_mask_from_lengths(src_lens, max_src_len) 60 | mel_masks = ( 61 | get_mask_from_lengths(mel_lens, max_mel_len) 62 | if mel_lens is not None 63 | else None 64 | ) 65 | 66 | 67 | assert texts.shape[2] == 256 or texts.shape[2] == 768, print(f"####transformer texts {texts.shape}, src_masks {src_masks.shape}") ### rkmt 2022.5.19 68 | 69 | output = self.encoder(texts, src_masks) 70 | 71 | if self.speaker_emb is not None: 72 | output = output + self.speaker_emb(speakers).unsqueeze(1).expand( 73 | -1, max_src_len, -1 74 | ) 75 | 76 | ( 77 | output, 78 | p_predictions, 79 | e_predictions, 80 | log_d_predictions, 81 | d_rounded, 82 | mel_lens, 83 | mel_masks, 84 | ) = self.variance_adaptor( 85 | output, 86 | src_masks, 87 | mel_masks, 88 | max_mel_len, 89 | p_targets, 90 | e_targets, 91 | d_targets, 92 | p_control, 93 | e_control, 94 | d_control, 95 | ) 96 | 97 | output, mel_masks = self.decoder(output, mel_masks) 98 | output = self.mel_linear(output) 99 | 100 | postnet_output = self.postnet(output) + output 101 | 102 | return ( 103 | output, 104 | postnet_output, 105 | p_predictions, 106 | e_predictions, 107 | log_d_predictions, 108 | d_rounded, 109 | src_masks, 110 | mel_masks, 111 | src_lens, 112 | mel_lens, 113 | ) -------------------------------------------------------------------------------- /libs/FastSpeech2/model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FastSpeech2Loss(nn.Module): 6 | """ 
FastSpeech2 Loss """ 7 | 8 | def __init__(self, preprocess_config, model_config): 9 | super(FastSpeech2Loss, self).__init__() 10 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 11 | "feature" 12 | ] 13 | self.energy_feature_level = preprocess_config["preprocessing"]["energy"][ 14 | "feature" 15 | ] 16 | self.mse_loss = nn.MSELoss() 17 | self.mae_loss = nn.L1Loss() 18 | 19 | def forward(self, inputs, predictions): 20 | ( 21 | mel_targets, 22 | _, 23 | _, 24 | pitch_targets, 25 | energy_targets, 26 | duration_targets, 27 | ) = inputs[6:] 28 | ( 29 | mel_predictions, 30 | postnet_mel_predictions, 31 | pitch_predictions, 32 | energy_predictions, 33 | log_duration_predictions, 34 | _, 35 | src_masks, 36 | mel_masks, 37 | _, 38 | _, 39 | ) = predictions 40 | src_masks = ~src_masks 41 | mel_masks = ~mel_masks 42 | log_duration_targets = torch.log(duration_targets.float() + 1) 43 | mel_targets = mel_targets[:, : mel_masks.shape[1], :] 44 | mel_masks = mel_masks[:, :mel_masks.shape[1]] 45 | 46 | log_duration_targets.requires_grad = False 47 | pitch_targets.requires_grad = False 48 | energy_targets.requires_grad = False 49 | mel_targets.requires_grad = False 50 | 51 | if self.pitch_feature_level == "phoneme_level": 52 | pitch_predictions = pitch_predictions.masked_select(src_masks) 53 | pitch_targets = pitch_targets.masked_select(src_masks) 54 | elif self.pitch_feature_level == "frame_level": 55 | pitch_predictions = pitch_predictions.masked_select(mel_masks) 56 | pitch_targets = pitch_targets.masked_select(mel_masks) 57 | 58 | if self.energy_feature_level == "phoneme_level": 59 | energy_predictions = energy_predictions.masked_select(src_masks) 60 | energy_targets = energy_targets.masked_select(src_masks) 61 | if self.energy_feature_level == "frame_level": 62 | energy_predictions = energy_predictions.masked_select(mel_masks) 63 | energy_targets = energy_targets.masked_select(mel_masks) 64 | 65 | log_duration_predictions = log_duration_predictions.masked_select(src_masks) 66 | log_duration_targets = log_duration_targets.masked_select(src_masks) 67 | 68 | mel_predictions = mel_predictions.masked_select(mel_masks.unsqueeze(-1)) 69 | postnet_mel_predictions = postnet_mel_predictions.masked_select( 70 | mel_masks.unsqueeze(-1) 71 | ) 72 | mel_targets = mel_targets.masked_select(mel_masks.unsqueeze(-1)) 73 | 74 | mel_loss = self.mae_loss(mel_predictions, mel_targets) 75 | postnet_mel_loss = self.mae_loss(postnet_mel_predictions, mel_targets) 76 | 77 | pitch_loss = self.mse_loss(pitch_predictions, pitch_targets) 78 | energy_loss = self.mse_loss(energy_predictions, energy_targets) 79 | duration_loss = self.mse_loss(log_duration_predictions, log_duration_targets) 80 | 81 | total_loss = ( 82 | mel_loss + postnet_mel_loss + duration_loss + pitch_loss + energy_loss 83 | ) 84 | 85 | return ( 86 | total_loss, 87 | mel_loss, 88 | postnet_mel_loss, 89 | pitch_loss, 90 | energy_loss, 91 | duration_loss, 92 | ) 93 | -------------------------------------------------------------------------------- /libs/FastSpeech2/model/modules.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import copy 4 | import math 5 | from collections import OrderedDict 6 | 7 | import torch 8 | import torch.nn as nn 9 | import numpy as np 10 | import torch.nn.functional as F 11 | 12 | from libs.FastSpeech2.utils.tools import get_mask_from_lengths, pad 13 | 14 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | 17 | class 
VarianceAdaptor(nn.Module): 18 | """Variance Adaptor""" 19 | 20 | def __init__(self, preprocess_config, model_config): 21 | super(VarianceAdaptor, self).__init__() 22 | 23 | self.duration_predictor = VariancePredictor(model_config) 24 | self.length_regulator = LengthRegulator() 25 | 26 | self.pitch_predictor = VariancePredictor(model_config) 27 | self.energy_predictor = VariancePredictor(model_config) 28 | 29 | self.pitch_feature_level = preprocess_config["preprocessing"]["pitch"][ 30 | "feature" 31 | ] 32 | self.energy_feature_level = preprocess_config["preprocessing"]["energy"][ 33 | "feature" 34 | ] 35 | assert self.pitch_feature_level in ["phoneme_level", "frame_level"] 36 | assert self.energy_feature_level in ["phoneme_level", "frame_level"] 37 | 38 | pitch_quantization = model_config["variance_embedding"]["pitch_quantization"] 39 | energy_quantization = model_config["variance_embedding"]["energy_quantization"] 40 | n_bins = model_config["variance_embedding"]["n_bins"] 41 | assert pitch_quantization in ["linear", "log"] 42 | assert energy_quantization in ["linear", "log"] 43 | with open( 44 | os.path.join(preprocess_config["path"]["preprocessed_path"], "stats.json") 45 | ) as f: 46 | assert f is not None # rkmt 2022.10.24 47 | stats = json.load(f) 48 | pitch_min, pitch_max = stats["pitch"][:2] 49 | energy_min, energy_max = stats["energy"][:2] 50 | 51 | if pitch_quantization == "log": 52 | self.pitch_bins = nn.Parameter( 53 | torch.exp( 54 | torch.linspace(np.log(pitch_min), np.log(pitch_max), n_bins - 1) 55 | ), 56 | requires_grad=False, 57 | ) 58 | else: 59 | self.pitch_bins = nn.Parameter( 60 | torch.linspace(pitch_min, pitch_max, n_bins - 1), 61 | requires_grad=False, 62 | ) 63 | if energy_quantization == "log": 64 | self.energy_bins = nn.Parameter( 65 | torch.exp( 66 | torch.linspace(np.log(energy_min), np.log(energy_max), n_bins - 1) 67 | ), 68 | requires_grad=False, 69 | ) 70 | else: 71 | self.energy_bins = nn.Parameter( 72 | torch.linspace(energy_min, energy_max, n_bins - 1), 73 | requires_grad=False, 74 | ) 75 | 76 | self.pitch_embedding = nn.Embedding( 77 | n_bins, model_config["transformer"]["encoder_hidden"] 78 | ) 79 | self.energy_embedding = nn.Embedding( 80 | n_bins, model_config["transformer"]["encoder_hidden"] 81 | ) 82 | 83 | def get_pitch_embedding(self, x, target, mask, control): 84 | prediction = self.pitch_predictor(x, mask) 85 | if target is not None: 86 | embedding = self.pitch_embedding(torch.bucketize(target, self.pitch_bins)) 87 | else: 88 | prediction = prediction * control 89 | embedding = self.pitch_embedding( 90 | torch.bucketize(prediction, self.pitch_bins) 91 | ) 92 | return prediction, embedding 93 | 94 | def get_energy_embedding(self, x, target, mask, control): 95 | prediction = self.energy_predictor(x, mask) 96 | if target is not None: 97 | embedding = self.energy_embedding(torch.bucketize(target, self.energy_bins)) 98 | else: 99 | prediction = prediction * control 100 | embedding = self.energy_embedding( 101 | torch.bucketize(prediction, self.energy_bins) 102 | ) 103 | return prediction, embedding 104 | 105 | def forward( 106 | self, 107 | x, 108 | src_mask, 109 | mel_mask=None, 110 | max_len=None, 111 | pitch_target=None, 112 | energy_target=None, 113 | duration_target=None, 114 | p_control=1.0, 115 | e_control=1.0, 116 | d_control=1.0, 117 | ): 118 | # assert duration_target is not None # rkmt 2022.8.3 119 | #log_duration_prediction = self.duration_predictor(x, src_mask) 120 | log_duration_prediction = None 121 | if self.pitch_feature_level == 
"phoneme_level": 122 | pitch_prediction, pitch_embedding = self.get_pitch_embedding( 123 | x, pitch_target, src_mask, p_control 124 | ) 125 | x = x + pitch_embedding 126 | if self.energy_feature_level == "phoneme_level": 127 | energy_prediction, energy_embedding = self.get_energy_embedding( 128 | x, energy_target, src_mask, p_control 129 | ) 130 | x = x + energy_embedding 131 | 132 | if duration_target is not None: 133 | #print("### x", x, duration_target, max_len) ### rkmt 2022.7.3 134 | 135 | ## bypasss length_regulator (rkmt 2022.8.3) 136 | #x, mel_len = self.length_regulator(x, duration_target, max_len) 137 | mel_len = torch.tensor([x.shape[1]], device=x.device) 138 | 139 | #print("#### x", x, x.shape, type(x), x.device, "\nmel_len", mel_len, type(mel_len), mel_len.device, "\max_len", max_len) 140 | duration_rounded = duration_target 141 | 142 | #### rkmt 2022.6.7 143 | if mel_mask is None: 144 | mel_mask = get_mask_from_lengths(mel_len) 145 | else: 146 | log_duration_prediction = self.duration_predictor(x, src_mask) 147 | duration_rounded = torch.clamp( 148 | (torch.round(torch.exp(log_duration_prediction) - 1) * d_control), 149 | min=0, 150 | ) 151 | ### rkmt 2022.6.7 152 | #print("### duration", duration_rounded.shape, duration_rounded, "max_len", max_len) #### rkmt 153 | x, mel_len = self.length_regulator(x, duration_rounded, max_len) 154 | mel_mask = get_mask_from_lengths(mel_len) 155 | #print("### mel_mask", mel_len, mel_mask.shape, mel_mask) #### rkmt 2022.6.7 156 | 157 | if self.pitch_feature_level == "frame_level": 158 | pitch_prediction, pitch_embedding = self.get_pitch_embedding( 159 | x, pitch_target, mel_mask, p_control 160 | ) 161 | x = x + pitch_embedding 162 | if self.energy_feature_level == "frame_level": 163 | energy_prediction, energy_embedding = self.get_energy_embedding( 164 | x, energy_target, mel_mask, p_control 165 | ) 166 | x = x + energy_embedding 167 | 168 | return ( 169 | x, 170 | pitch_prediction, 171 | energy_prediction, 172 | log_duration_prediction, 173 | duration_rounded, 174 | mel_len, 175 | mel_mask, 176 | ) 177 | 178 | 179 | class LengthRegulator(nn.Module): 180 | """Length Regulator""" 181 | 182 | def __init__(self): 183 | super(LengthRegulator, self).__init__() 184 | 185 | def LR(self, x, duration, max_len): 186 | output = list() 187 | mel_len = list() 188 | for batch, expand_target in zip(x, duration): 189 | expanded = self.expand(batch, expand_target) 190 | output.append(expanded) 191 | mel_len.append(expanded.shape[0]) 192 | 193 | if max_len is not None: 194 | output = pad(output, max_len) 195 | else: 196 | output = pad(output) 197 | 198 | return output, torch.LongTensor(mel_len).to(device) 199 | 200 | def expand(self, batch, predicted): 201 | out = list() 202 | 203 | for i, vec in enumerate(batch): 204 | expand_size = predicted[i].item() 205 | out.append(vec.expand(max(int(expand_size), 0), -1)) 206 | out = torch.cat(out, 0) 207 | 208 | return out 209 | 210 | def forward(self, x, duration, max_len): 211 | output, mel_len = self.LR(x, duration, max_len) 212 | return output, mel_len 213 | 214 | 215 | class VariancePredictor(nn.Module): 216 | """Duration, Pitch and Energy Predictor""" 217 | 218 | def __init__(self, model_config): 219 | super(VariancePredictor, self).__init__() 220 | 221 | self.input_size = model_config["transformer"]["encoder_hidden"] 222 | self.filter_size = model_config["variance_predictor"]["filter_size"] 223 | self.kernel = model_config["variance_predictor"]["kernel_size"] 224 | self.conv_output_size = 
model_config["variance_predictor"]["filter_size"] 225 | self.dropout = model_config["variance_predictor"]["dropout"] 226 | 227 | self.conv_layer = nn.Sequential( 228 | OrderedDict( 229 | [ 230 | ( 231 | "conv1d_1", 232 | Conv( 233 | self.input_size, 234 | self.filter_size, 235 | kernel_size=self.kernel, 236 | padding=(self.kernel - 1) // 2, 237 | ), 238 | ), 239 | ("relu_1", nn.ReLU()), 240 | ("layer_norm_1", nn.LayerNorm(self.filter_size)), 241 | ("dropout_1", nn.Dropout(self.dropout)), 242 | ( 243 | "conv1d_2", 244 | Conv( 245 | self.filter_size, 246 | self.filter_size, 247 | kernel_size=self.kernel, 248 | padding=1, 249 | ), 250 | ), 251 | ("relu_2", nn.ReLU()), 252 | ("layer_norm_2", nn.LayerNorm(self.filter_size)), 253 | ("dropout_2", nn.Dropout(self.dropout)), 254 | ] 255 | ) 256 | ) 257 | 258 | self.linear_layer = nn.Linear(self.conv_output_size, 1) 259 | 260 | def forward(self, encoder_output, mask): 261 | out = self.conv_layer(encoder_output) 262 | out = self.linear_layer(out) 263 | out = out.squeeze(-1) 264 | 265 | if mask is not None: 266 | out = out.masked_fill(mask, 0.0) 267 | 268 | return out 269 | 270 | 271 | class Conv(nn.Module): 272 | """ 273 | Convolution Module 274 | """ 275 | 276 | def __init__( 277 | self, 278 | in_channels, 279 | out_channels, 280 | kernel_size=1, 281 | stride=1, 282 | padding=0, 283 | dilation=1, 284 | bias=True, 285 | w_init="linear", 286 | ): 287 | """ 288 | :param in_channels: dimension of input 289 | :param out_channels: dimension of output 290 | :param kernel_size: size of kernel 291 | :param stride: size of stride 292 | :param padding: size of padding 293 | :param dilation: dilation rate 294 | :param bias: boolean. if True, bias is included. 295 | :param w_init: str. weight inits with xavier initialization. 
296 | """ 297 | super(Conv, self).__init__() 298 | 299 | self.conv = nn.Conv1d( 300 | in_channels, 301 | out_channels, 302 | kernel_size=kernel_size, 303 | stride=stride, 304 | padding=padding, 305 | dilation=dilation, 306 | bias=bias, 307 | ) 308 | 309 | def forward(self, x): 310 | x = x.contiguous().transpose(1, 2) 311 | x = self.conv(x) 312 | x = x.contiguous().transpose(1, 2) 313 | 314 | return x 315 | -------------------------------------------------------------------------------- /libs/FastSpeech2/model/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class ScheduledOptim: 6 | """ A simple wrapper class for learning rate scheduling """ 7 | 8 | def __init__(self, model, train_config, model_config, current_step): 9 | 10 | self._optimizer = torch.optim.Adam( 11 | model.parameters(), 12 | betas=train_config["optimizer"]["betas"], 13 | eps=train_config["optimizer"]["eps"], 14 | weight_decay=train_config["optimizer"]["weight_decay"], 15 | ) 16 | self.n_warmup_steps = train_config["optimizer"]["warm_up_step"] 17 | self.anneal_steps = train_config["optimizer"]["anneal_steps"] 18 | self.anneal_rate = train_config["optimizer"]["anneal_rate"] 19 | self.current_step = current_step 20 | self.init_lr = np.power(model_config["transformer"]["encoder_hidden"], -0.5) 21 | 22 | def step_and_update_lr(self): 23 | self._update_learning_rate() 24 | self._optimizer.step() 25 | 26 | def zero_grad(self): 27 | # print(self.init_lr) 28 | self._optimizer.zero_grad() 29 | 30 | def load_state_dict(self, path): 31 | self._optimizer.load_state_dict(path) 32 | 33 | def _get_lr_scale(self): 34 | lr = np.min( 35 | [ 36 | np.power(self.current_step, -0.5), 37 | np.power(self.n_warmup_steps, -1.5) * self.current_step, 38 | ] 39 | ) 40 | for s in self.anneal_steps: 41 | if self.current_step > s: 42 | lr = lr * self.anneal_rate 43 | return lr 44 | 45 | def _update_learning_rate(self): 46 | """ Learning rate scheduling per step """ 47 | self.current_step += 1 48 | lr = self.init_lr * self._get_lr_scale() 49 | 50 | for param_group in self._optimizer.param_groups: 51 | param_group["lr"] = lr 52 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = "" 7 | UNK_WORD = "" 8 | BOS_WORD = "" 9 | EOS_WORD = "" 10 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Layers.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import numpy as np 6 | from torch.nn import functional as F 7 | 8 | from .SubLayers import MultiHeadAttention, PositionwiseFeedForward 9 | 10 | 11 | class FFTBlock(torch.nn.Module): 12 | """FFT Block""" 13 | 14 | def __init__(self, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1): 15 | super(FFTBlock, self).__init__() 16 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 17 | self.pos_ffn = PositionwiseFeedForward( 18 | d_model, d_inner, kernel_size, dropout=dropout 19 | ) 20 | 21 | def forward(self, enc_input, mask=None, slf_attn_mask=None): 22 | enc_output, enc_slf_attn = self.slf_attn( 23 | enc_input, enc_input, enc_input, mask=slf_attn_mask 24 | ) 25 | enc_output = 
enc_output.masked_fill(mask.unsqueeze(-1), 0) 26 | 27 | enc_output = self.pos_ffn(enc_output) 28 | enc_output = enc_output.masked_fill(mask.unsqueeze(-1), 0) 29 | 30 | return enc_output, enc_slf_attn 31 | 32 | 33 | class ConvNorm(torch.nn.Module): 34 | def __init__( 35 | self, 36 | in_channels, 37 | out_channels, 38 | kernel_size=1, 39 | stride=1, 40 | padding=None, 41 | dilation=1, 42 | bias=True, 43 | w_init_gain="linear", 44 | ): 45 | super(ConvNorm, self).__init__() 46 | 47 | if padding is None: 48 | assert kernel_size % 2 == 1 49 | padding = int(dilation * (kernel_size - 1) / 2) 50 | 51 | self.conv = torch.nn.Conv1d( 52 | in_channels, 53 | out_channels, 54 | kernel_size=kernel_size, 55 | stride=stride, 56 | padding=padding, 57 | dilation=dilation, 58 | bias=bias, 59 | ) 60 | 61 | def forward(self, signal): 62 | conv_signal = self.conv(signal) 63 | 64 | return conv_signal 65 | 66 | 67 | class PostNet(nn.Module): 68 | """ 69 | PostNet: Five 1-d convolution with 512 channels and kernel size 5 70 | """ 71 | 72 | def __init__( 73 | self, 74 | n_mel_channels=80, 75 | postnet_embedding_dim=512, 76 | postnet_kernel_size=5, 77 | postnet_n_convolutions=5, 78 | ): 79 | 80 | super(PostNet, self).__init__() 81 | self.convolutions = nn.ModuleList() 82 | 83 | self.convolutions.append( 84 | nn.Sequential( 85 | ConvNorm( 86 | n_mel_channels, 87 | postnet_embedding_dim, 88 | kernel_size=postnet_kernel_size, 89 | stride=1, 90 | padding=int((postnet_kernel_size - 1) / 2), 91 | dilation=1, 92 | w_init_gain="tanh", 93 | ), 94 | nn.BatchNorm1d(postnet_embedding_dim), 95 | ) 96 | ) 97 | 98 | for i in range(1, postnet_n_convolutions - 1): 99 | self.convolutions.append( 100 | nn.Sequential( 101 | ConvNorm( 102 | postnet_embedding_dim, 103 | postnet_embedding_dim, 104 | kernel_size=postnet_kernel_size, 105 | stride=1, 106 | padding=int((postnet_kernel_size - 1) / 2), 107 | dilation=1, 108 | w_init_gain="tanh", 109 | ), 110 | nn.BatchNorm1d(postnet_embedding_dim), 111 | ) 112 | ) 113 | 114 | self.convolutions.append( 115 | nn.Sequential( 116 | ConvNorm( 117 | postnet_embedding_dim, 118 | n_mel_channels, 119 | kernel_size=postnet_kernel_size, 120 | stride=1, 121 | padding=int((postnet_kernel_size - 1) / 2), 122 | dilation=1, 123 | w_init_gain="linear", 124 | ), 125 | nn.BatchNorm1d(n_mel_channels), 126 | ) 127 | ) 128 | 129 | def forward(self, x): 130 | x = x.contiguous().transpose(1, 2) 131 | 132 | for i in range(len(self.convolutions) - 1): 133 | x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) 134 | x = F.dropout(self.convolutions[-1](x), 0.5, self.training) 135 | 136 | x = x.contiguous().transpose(1, 2) 137 | return x 138 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | import libs.FastSpeech2.transformer.Constants as Constants 6 | from .Layers import FFTBlock 7 | 8 | 9 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 10 | """ Sinusoid position encoding table """ 11 | 12 | def cal_angle(position, hid_idx): 13 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 14 | 15 | def get_posi_angle_vec(position): 16 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 17 | 18 | sinusoid_table = np.array( 19 | [get_posi_angle_vec(pos_i) for pos_i in range(n_position)] 20 | ) 21 | 22 | sinusoid_table[:, 0::2] = 
np.sin(sinusoid_table[:, 0::2]) # dim 2i 23 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 24 | 25 | if padding_idx is not None: 26 | # zero vector for padding dimension 27 | sinusoid_table[padding_idx] = 0.0 28 | 29 | return torch.FloatTensor(sinusoid_table) 30 | 31 | 32 | class Encoder(nn.Module): 33 | """ Encoder """ 34 | 35 | def __init__(self, config): 36 | super(Encoder, self).__init__() 37 | 38 | n_position = config["max_seq_len"] + 1 39 | # n_src_vocab = len(symbols) + 1 40 | n_src_vocab = 360 + 1 41 | d_word_vec = config["transformer"]["encoder_hidden"] 42 | n_layers = config["transformer"]["encoder_layer"] 43 | n_head = config["transformer"]["encoder_head"] 44 | d_k = d_v = ( 45 | config["transformer"]["encoder_hidden"] 46 | // config["transformer"]["encoder_head"] 47 | ) 48 | d_model = config["transformer"]["encoder_hidden"] 49 | d_inner = config["transformer"]["conv_filter_size"] 50 | kernel_size = config["transformer"]["conv_kernel_size"] 51 | dropout = config["transformer"]["encoder_dropout"] 52 | 53 | self.max_seq_len = config["max_seq_len"] 54 | self.d_model = d_model 55 | 56 | self.src_word_emb = nn.Embedding( 57 | n_src_vocab, d_word_vec, padding_idx=Constants.PAD 58 | ) 59 | if config["soft_unit"]: # rkmt 2022.7.3 60 | self.soft_unit_dim = 256 if not ('soft_unit_dim' in config) else int(config['soft_unit_dim']) 61 | 62 | if self.soft_unit_dim == 256: 63 | self.src_word_emb = nn.Identity() 64 | else: 65 | self.src_word_emb = nn.Linear(self.soft_unit_dim, 256) 66 | 67 | self.position_enc = nn.Parameter( 68 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 69 | requires_grad=False, 70 | ) 71 | 72 | self.layer_stack = nn.ModuleList( 73 | [ 74 | FFTBlock( 75 | d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout 76 | ) 77 | for _ in range(n_layers) 78 | ] 79 | ) 80 | 81 | def forward(self, src_seq, mask, return_attns=False): 82 | 83 | enc_slf_attn_list = [] 84 | batch_size, max_len = src_seq.shape[0], src_seq.shape[1] 85 | 86 | # -- Prepare masks 87 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 88 | 89 | ''' 90 | print("### src_seq", src_seq.shape, type(src_seq), src_seq.dtype) 91 | print("### src_sec[..]", src_seq[0,0,:10]) 92 | print("###", self.src_word_emb(src_seq).shape) 93 | ''' 94 | 95 | # -- Forward 96 | if not self.training and src_seq.shape[1] > self.max_seq_len: 97 | enc_output = self.src_word_emb(src_seq) + get_sinusoid_encoding_table( 98 | src_seq.shape[1], self.d_model 99 | )[: src_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 100 | src_seq.device 101 | ) 102 | else: 103 | enc_output = self.src_word_emb(src_seq) + self.position_enc[ 104 | :, :max_len, : 105 | ].expand(batch_size, -1, -1) 106 | 107 | for enc_layer in self.layer_stack: 108 | enc_output, enc_slf_attn = enc_layer( 109 | enc_output, mask=mask, slf_attn_mask=slf_attn_mask 110 | ) 111 | if return_attns: 112 | enc_slf_attn_list += [enc_slf_attn] 113 | 114 | return enc_output 115 | 116 | 117 | class Decoder(nn.Module): 118 | """ Decoder """ 119 | 120 | def __init__(self, config): 121 | super(Decoder, self).__init__() 122 | 123 | n_position = config["max_seq_len"] + 1 124 | d_word_vec = config["transformer"]["decoder_hidden"] 125 | n_layers = config["transformer"]["decoder_layer"] 126 | n_head = config["transformer"]["decoder_head"] 127 | d_k = d_v = ( 128 | config["transformer"]["decoder_hidden"] 129 | // config["transformer"]["decoder_head"] 130 | ) 131 | d_model = config["transformer"]["decoder_hidden"] 132 | d_inner = 
config["transformer"]["conv_filter_size"] 133 | kernel_size = config["transformer"]["conv_kernel_size"] 134 | dropout = config["transformer"]["decoder_dropout"] 135 | 136 | self.max_seq_len = config["max_seq_len"] 137 | self.d_model = d_model 138 | 139 | self.position_enc = nn.Parameter( 140 | get_sinusoid_encoding_table(n_position, d_word_vec).unsqueeze(0), 141 | requires_grad=False, 142 | ) 143 | 144 | self.layer_stack = nn.ModuleList( 145 | [ 146 | FFTBlock( 147 | d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=dropout 148 | ) 149 | for _ in range(n_layers) 150 | ] 151 | ) 152 | 153 | def forward(self, enc_seq, mask, return_attns=False): 154 | 155 | dec_slf_attn_list = [] 156 | batch_size, max_len = enc_seq.shape[0], enc_seq.shape[1] 157 | 158 | # -- Forward 159 | if not self.training and enc_seq.shape[1] > self.max_seq_len: 160 | # -- Prepare masks 161 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 162 | dec_output = enc_seq + get_sinusoid_encoding_table( 163 | enc_seq.shape[1], self.d_model 164 | )[: enc_seq.shape[1], :].unsqueeze(0).expand(batch_size, -1, -1).to( 165 | enc_seq.device 166 | ) 167 | else: 168 | max_len = min(max_len, self.max_seq_len) 169 | 170 | # -- Prepare masks 171 | slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) 172 | dec_output = enc_seq[:, :max_len, :] + self.position_enc[ 173 | :, :max_len, : 174 | ].expand(batch_size, -1, -1) 175 | mask = mask[:, :max_len] 176 | slf_attn_mask = slf_attn_mask[:, :, :max_len] 177 | 178 | for dec_layer in self.layer_stack: 179 | dec_output, dec_slf_attn = dec_layer( 180 | dec_output, mask=mask, slf_attn_mask=slf_attn_mask 181 | ) 182 | if return_attns: 183 | dec_slf_attn_list += [dec_slf_attn] 184 | 185 | return dec_output, mask 186 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class ScaledDotProductAttention(nn.Module): 7 | """ Scaled Dot-Product Attention """ 8 | 9 | def __init__(self, temperature): 10 | super().__init__() 11 | self.temperature = temperature 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | 16 | attn = torch.bmm(q, k.transpose(1, 2)) 17 | attn = attn / self.temperature 18 | 19 | if mask is not None: 20 | attn = attn.masked_fill(mask, -np.inf) 21 | 22 | attn = self.softmax(attn) 23 | output = torch.bmm(attn, v) 24 | 25 | return output, attn 26 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from .Modules import ScaledDotProductAttention 6 | 7 | 8 | class MultiHeadAttention(nn.Module): 9 | """ Multi-Head Attention module """ 10 | 11 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 12 | super().__init__() 13 | 14 | self.n_head = n_head 15 | self.d_k = d_k 16 | self.d_v = d_v 17 | 18 | self.w_qs = nn.Linear(d_model, n_head * d_k) 19 | self.w_ks = nn.Linear(d_model, n_head * d_k) 20 | self.w_vs = nn.Linear(d_model, n_head * d_v) 21 | 22 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 23 | self.layer_norm = nn.LayerNorm(d_model) 24 | 25 | self.fc = nn.Linear(n_head * d_v, d_model) 26 | 27 | self.dropout = 
nn.Dropout(dropout) 28 | 29 | def forward(self, q, k, v, mask=None): 30 | 31 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 32 | 33 | sz_b, len_q, _ = q.size() 34 | sz_b, len_k, _ = k.size() 35 | sz_b, len_v, _ = v.size() 36 | 37 | residual = q 38 | 39 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 40 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 41 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 42 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 43 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 44 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 45 | 46 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 47 | output, attn = self.attention(q, k, v, mask=mask) 48 | 49 | output = output.view(n_head, sz_b, len_q, d_v) 50 | output = ( 51 | output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) 52 | ) # b x lq x (n*dv) 53 | 54 | output = self.dropout(self.fc(output)) 55 | output = self.layer_norm(output + residual) 56 | 57 | return output, attn 58 | 59 | 60 | class PositionwiseFeedForward(nn.Module): 61 | """ A two-feed-forward-layer module """ 62 | 63 | def __init__(self, d_in, d_hid, kernel_size, dropout=0.1): 64 | super().__init__() 65 | 66 | # Use Conv1D 67 | # position-wise 68 | self.w_1 = nn.Conv1d( 69 | d_in, 70 | d_hid, 71 | kernel_size=kernel_size[0], 72 | padding=(kernel_size[0] - 1) // 2, 73 | ) 74 | # position-wise 75 | self.w_2 = nn.Conv1d( 76 | d_hid, 77 | d_in, 78 | kernel_size=kernel_size[1], 79 | padding=(kernel_size[1] - 1) // 2, 80 | ) 81 | 82 | self.layer_norm = nn.LayerNorm(d_in) 83 | self.dropout = nn.Dropout(dropout) 84 | 85 | def forward(self, x): 86 | residual = x 87 | output = x.transpose(1, 2) 88 | output = self.w_2(F.relu(self.w_1(output))) 89 | output = output.transpose(1, 2) 90 | output = self.dropout(output) 91 | output = self.layer_norm(output + residual) 92 | 93 | return output 94 | -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .Models import Encoder, Decoder 2 | from .Layers import PostNet -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Constants.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Constants.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Constants.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Layers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Layers.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Layers.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Layers.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Models.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Models.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/Modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/Modules.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/SubLayers.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/model.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/tools.cpython-310.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/FastSpeech2/utils/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /libs/FastSpeech2/utils/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | def get_mask_from_lengths(lengths, max_len=None): 8 | batch_size = lengths.shape[0] 9 | if max_len is None: 10 | max_len = torch.max(lengths).item() 11 | 12 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(device) 13 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 14 | 15 | return mask 16 | 17 | def pad_2D(inputs, maxlen=None): 18 | def pad(x, max_len): 19 | PAD = 0 20 | if np.shape(x)[0] > max_len: 21 | raise ValueError("not max_len") 22 | 23 | s = np.shape(x)[1] 24 | x_padded = np.pad( 25 | x, (0, max_len - np.shape(x)[0]), mode="constant", constant_values=PAD 26 | ) 27 | return x_padded[:, :s] 28 | 29 | if maxlen: 30 | output = np.stack([pad(x, maxlen) for x in inputs]) 31 | else: 32 | max_len = max(np.shape(x)[0] for x in inputs) 33 | output = np.stack([pad(x, max_len) for x in inputs]) 34 | 35 | return output 36 | 37 | 38 | def pad(input_ele, mel_max_length=None): 39 | if mel_max_length: 40 | max_len = mel_max_length 41 | else: 42 | max_len = max([input_ele[i].size(0) for i in range(len(input_ele))]) 43 | 44 | out_list = list() 45 | for i, batch in enumerate(input_ele): 46 | if len(batch.shape) == 1: 47 | one_batch_padded = F.pad( 48 | batch, (0, max_len - batch.size(0)), "constant", 0.0 49 | ) 50 | elif len(batch.shape) == 2: 51 | one_batch_padded = F.pad( 52 | batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0 53 | ) 54 | out_list.append(one_batch_padded) 55 | out_padded = torch.stack(out_list) 56 | return out_padded 57 | -------------------------------------------------------------------------------- /libs/JDC/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import JDCNet -------------------------------------------------------------------------------- /libs/JDC/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/JDC/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/JDC/bst.t7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/JDC/bst.t7 -------------------------------------------------------------------------------- /libs/JDC/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of model from: 3 | Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using 4 | Convolutional Recurrent Neural Networks" (2019) 5 | Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d 6 | """ 7 | import torch 8 | from torch import nn 9 | 10 | class JDCNet(nn.Module): 11 | """ 12 | Joint Detection and Classification Network model for singing voice melody. 13 | """ 14 | def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01): 15 | super().__init__() 16 | self.num_class = num_class 17 | 18 | # input = (b, 1, 31, 513), b = batch size 19 | self.conv_block = nn.Sequential( 20 | nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), # out: (b, 64, 31, 513) 21 | nn.BatchNorm2d(num_features=64), 22 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 23 | nn.Conv2d(64, 64, 3, padding=1, bias=False), # (b, 64, 31, 513) 24 | ) 25 | 26 | # res blocks 27 | self.res_block1 = ResBlock(in_channels=64, out_channels=128) # (b, 128, 31, 128) 28 | self.res_block2 = ResBlock(in_channels=128, out_channels=192) # (b, 192, 31, 32) 29 | self.res_block3 = ResBlock(in_channels=192, out_channels=256) # (b, 256, 31, 8) 30 | 31 | # pool block 32 | self.pool_block = nn.Sequential( 33 | nn.BatchNorm2d(num_features=256), 34 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 35 | nn.MaxPool2d(kernel_size=(1, 4)), # (b, 256, 31, 2) 36 | nn.Dropout(p=0.2), 37 | ) 38 | 39 | # maxpool layers (for auxiliary network inputs) 40 | # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2) 41 | self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40)) 42 | # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2) 43 | self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20)) 44 | # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2) 45 | self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10)) 46 | 47 | # in = (b, 640, 31, 2), out = (b, 256, 31, 2) 48 | self.detector_conv = nn.Sequential( 49 | nn.Conv2d(640, 256, 1, bias=False), 50 | nn.BatchNorm2d(256), 51 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 52 | nn.Dropout(p=0.2), 53 | ) 54 | 55 | # input: (b, 31, 512) - resized from (b, 256, 31, 2) 56 | self.bilstm_classifier = nn.LSTM( 57 | input_size=512, hidden_size=256, 58 | batch_first=True, bidirectional=True) # (b, 31, 512) 59 | 60 | # input: (b, 31, 512) - resized from (b, 256, 31, 2) 61 | self.bilstm_detector = nn.LSTM( 62 | input_size=512, 
hidden_size=256, 63 | batch_first=True, bidirectional=True) # (b, 31, 512) 64 | 65 | # input: (b * 31, 512) 66 | self.classifier = nn.Linear(in_features=512, out_features=self.num_class) # (b * 31, num_class) 67 | 68 | # input: (b * 31, 512) 69 | self.detector = nn.Linear(in_features=512, out_features=2) # (b * 31, 2) - binary classifier 70 | 71 | # initialize weights 72 | self.apply(self.init_weights) 73 | 74 | def get_feature_GAN(self, x): 75 | seq_len = x.shape[-2] 76 | x = x.float().transpose(-1, -2) 77 | 78 | convblock_out = self.conv_block(x) 79 | 80 | resblock1_out = self.res_block1(convblock_out) 81 | resblock2_out = self.res_block2(resblock1_out) 82 | resblock3_out = self.res_block3(resblock2_out) 83 | poolblock_out = self.pool_block[0](resblock3_out) 84 | poolblock_out = self.pool_block[1](poolblock_out) 85 | 86 | return poolblock_out.transpose(-1, -2) 87 | 88 | def get_feature(self, x): 89 | seq_len = x.shape[-2] 90 | x = x.float().transpose(-1, -2) 91 | 92 | convblock_out = self.conv_block(x) 93 | 94 | resblock1_out = self.res_block1(convblock_out) 95 | resblock2_out = self.res_block2(resblock1_out) 96 | resblock3_out = self.res_block3(resblock2_out) 97 | poolblock_out = self.pool_block[0](resblock3_out) 98 | poolblock_out = self.pool_block[1](poolblock_out) 99 | 100 | return self.pool_block[2](poolblock_out) 101 | 102 | def forward(self, x): 103 | """ 104 | Returns: 105 | classification_prediction, detection_prediction 106 | sizes: (b, 31, 722), (b, 31, 2) 107 | """ 108 | ############################### 109 | # forward pass for classifier # 110 | ############################### 111 | seq_len = x.shape[-1] 112 | x = x.float().transpose(-1, -2) 113 | 114 | convblock_out = self.conv_block(x) 115 | 116 | resblock1_out = self.res_block1(convblock_out) 117 | resblock2_out = self.res_block2(resblock1_out) 118 | resblock3_out = self.res_block3(resblock2_out) 119 | 120 | 121 | poolblock_out = self.pool_block[0](resblock3_out) 122 | poolblock_out = self.pool_block[1](poolblock_out) 123 | GAN_feature = poolblock_out.transpose(-1, -2) 124 | poolblock_out = self.pool_block[2](poolblock_out) 125 | 126 | # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512) 127 | classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512)) 128 | classifier_out, _ = self.bilstm_classifier(classifier_out) # ignore the hidden states 129 | 130 | classifier_out = classifier_out.contiguous().view((-1, 512)) # (b * 31, 512) 131 | classifier_out = self.classifier(classifier_out) 132 | classifier_out = classifier_out.view((-1, seq_len, self.num_class)) # (b, 31, num_class) 133 | 134 | # sizes: (b, 31, 722), (b, 31, 2) 135 | # classifier output consists of predicted pitch classes per frame 136 | # detector output consists of: (isvoice, notvoice) estimates per frame 137 | return torch.abs(classifier_out.squeeze(-1)), GAN_feature, poolblock_out 138 | 139 | @staticmethod 140 | def init_weights(m): 141 | if isinstance(m, nn.Linear): 142 | nn.init.kaiming_uniform_(m.weight) 143 | if m.bias is not None: 144 | nn.init.constant_(m.bias, 0) 145 | elif isinstance(m, nn.Conv2d): 146 | nn.init.xavier_normal_(m.weight) 147 | elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell): 148 | for p in m.parameters(): 149 | if p.data is None: 150 | continue 151 | 152 | if len(p.shape) >= 2: 153 | nn.init.orthogonal_(p.data) 154 | else: 155 | nn.init.normal_(p.data) 156 | 157 | 158 | class ResBlock(nn.Module): 159 | def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01): 160 | 
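        # Residual unit used by JDCNet: forward() computes conv(pre_conv(x)) plus a shortcut,
        # where the shortcut is the 1x1 conv defined below whenever in/out channel counts differ.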
super().__init__() 161 | self.downsample = in_channels != out_channels 162 | 163 | # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper 164 | self.pre_conv = nn.Sequential( 165 | nn.BatchNorm2d(num_features=in_channels), 166 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 167 | nn.MaxPool2d(kernel_size=(1, 2)), # apply downsampling on the y axis only 168 | ) 169 | 170 | # conv layers 171 | self.conv = nn.Sequential( 172 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, 173 | kernel_size=3, padding=1, bias=False), 174 | nn.BatchNorm2d(out_channels), 175 | nn.LeakyReLU(leaky_relu_slope, inplace=True), 176 | nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False), 177 | ) 178 | 179 | # 1 x 1 convolution layer to match the feature dimensions 180 | self.conv1by1 = None 181 | if self.downsample: 182 | self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False) 183 | 184 | def forward(self, x): 185 | x = self.pre_conv(x) 186 | if self.downsample: 187 | x = self.conv(x) + self.conv1by1(x) 188 | else: 189 | x = self.conv(x) + x 190 | return x -------------------------------------------------------------------------------- /libs/__init__.py: -------------------------------------------------------------------------------- 1 | from .hubert.model import HubertSoft 2 | from .JDC.model import JDCNet 3 | 4 | __all__ = ['HubertSoft', 'JDCNet'] -------------------------------------------------------------------------------- /libs/hifigan/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import Generator 2 | 3 | 4 | class AttrDict(dict): 5 | def __init__(self, *args, **kwargs): 6 | super(AttrDict, self).__init__(*args, **kwargs) 7 | self.__dict__ = self 8 | 9 | from .model import hifigan 10 | 11 | __all__ = ['hifigan'] -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hifigan/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hifigan/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hifigan/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,8,2,2], 12 | "upsample_kernel_sizes": [16,16,4,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8192, 18 | "num_mels": 80, 19 | "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 256, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 22050, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /libs/hifigan/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import libs.hifigan as hifigan 4 | 5 | 6 | def get_vocoder(config, device): 7 | 8 | with open("./libs/hifigan/my_config_v1_16000.json", "r") as f: 9 | config = json.load(f) 10 | config = hifigan.AttrDict(config) 11 | vocoder = hifigan.Generator(config) 12 | ckpt = torch.load("./libs/hifigan/g_00180000.zip") 13 | vocoder.load_state_dict(ckpt["generator"]) 14 | vocoder.eval() 15 | vocoder.remove_weight_norm() 16 | vocoder.to(device) 17 | 18 | return vocoder 19 | 20 | 21 | def vocoder_infer(mels, vocoder): 22 | 23 | with torch.no_grad(): 24 | wavs = vocoder(mels).squeeze(1) # rkmt 2022.6.1 25 | 26 | # wavs = (wavs.cpu().numpy() * 32768.0).astype("int16") 27 | # wavs = [wav for wav in wavs] 28 | 29 | return wavs 30 | -------------------------------------------------------------------------------- /libs/hifigan/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Conv1d, ConvTranspose1d 5 | from torch.nn.utils import weight_norm, remove_weight_norm 6 | 7 | LRELU_SLOPE = 0.1 8 | 9 | 10 | def init_weights(m, mean=0.0, std=0.01): 11 | classname = m.__class__.__name__ 12 | if classname.find("Conv") != -1: 13 | m.weight.data.normal_(mean, std) 14 | 15 | 16 | def get_padding(kernel_size, dilation=1): 17 | return int((kernel_size * dilation - dilation) / 2) 18 | 19 | 20 | class ResBlock(torch.nn.Module): 21 | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): 22 | super(ResBlock, self).__init__() 23 | self.h = h 24 | self.convs1 = nn.ModuleList( 25 | [ 26 | weight_norm( 27 | Conv1d( 28 | channels, 29 | channels, 30 | kernel_size, 31 | 1, 32 | dilation=dilation[0], 33 | padding=get_padding(kernel_size, dilation[0]), 34 | ) 35 | ), 36 | weight_norm( 37 | Conv1d( 38 | channels, 39 | channels, 40 | kernel_size, 41 | 1, 42 | dilation=dilation[1], 43 | padding=get_padding(kernel_size, dilation[1]), 44 | ) 45 | ), 46 | weight_norm( 47 | Conv1d( 48 | channels, 49 | channels, 50 | kernel_size, 51 | 1, 52 | dilation=dilation[2], 53 | padding=get_padding(kernel_size, dilation[2]), 54 | ) 55 | ), 56 | ] 57 | ) 58 | 
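        # init_weights (defined above) draws Conv1d weights from N(0.0, 0.01); convs2 below mirrors convs1
        # with dilation 1, and forward() pairs them as residual units (x = c2(lrelu(c1(lrelu(x)))) + x).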
self.convs1.apply(init_weights) 59 | 60 | self.convs2 = nn.ModuleList( 61 | [ 62 | weight_norm( 63 | Conv1d( 64 | channels, 65 | channels, 66 | kernel_size, 67 | 1, 68 | dilation=1, 69 | padding=get_padding(kernel_size, 1), 70 | ) 71 | ), 72 | weight_norm( 73 | Conv1d( 74 | channels, 75 | channels, 76 | kernel_size, 77 | 1, 78 | dilation=1, 79 | padding=get_padding(kernel_size, 1), 80 | ) 81 | ), 82 | weight_norm( 83 | Conv1d( 84 | channels, 85 | channels, 86 | kernel_size, 87 | 1, 88 | dilation=1, 89 | padding=get_padding(kernel_size, 1), 90 | ) 91 | ), 92 | ] 93 | ) 94 | self.convs2.apply(init_weights) 95 | 96 | def forward(self, x): 97 | for c1, c2 in zip(self.convs1, self.convs2): 98 | xt = F.leaky_relu(x, LRELU_SLOPE) 99 | xt = c1(xt) 100 | xt = F.leaky_relu(xt, LRELU_SLOPE) 101 | xt = c2(xt) 102 | x = xt + x 103 | return x 104 | 105 | def remove_weight_norm(self): 106 | for l in self.convs1: 107 | remove_weight_norm(l) 108 | for l in self.convs2: 109 | remove_weight_norm(l) 110 | 111 | 112 | class Generator(torch.nn.Module): 113 | def __init__(self, h): 114 | super(Generator, self).__init__() 115 | self.h = h 116 | self.num_kernels = len(h.resblock_kernel_sizes) 117 | self.num_upsamples = len(h.upsample_rates) 118 | self.conv_pre = weight_norm( 119 | Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3) 120 | ) 121 | resblock = ResBlock 122 | 123 | self.ups = nn.ModuleList() 124 | for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): 125 | self.ups.append( 126 | weight_norm( 127 | ConvTranspose1d( 128 | h.upsample_initial_channel // (2 ** i), 129 | h.upsample_initial_channel // (2 ** (i + 1)), 130 | k, 131 | u, 132 | padding=(k - u) // 2, 133 | ) 134 | ) 135 | ) 136 | 137 | self.resblocks = nn.ModuleList() 138 | for i in range(len(self.ups)): 139 | ch = h.upsample_initial_channel // (2 ** (i + 1)) 140 | for j, (k, d) in enumerate( 141 | zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes) 142 | ): 143 | self.resblocks.append(resblock(h, ch, k, d)) 144 | 145 | self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) 146 | self.ups.apply(init_weights) 147 | self.conv_post.apply(init_weights) 148 | 149 | def forward(self, x): 150 | x = self.conv_pre(x) 151 | for i in range(self.num_upsamples): 152 | x = F.leaky_relu(x, LRELU_SLOPE) 153 | x = self.ups[i](x) 154 | xs = None 155 | for j in range(self.num_kernels): 156 | if xs is None: 157 | xs = self.resblocks[i * self.num_kernels + j](x) 158 | else: 159 | xs += self.resblocks[i * self.num_kernels + j](x) 160 | x = xs / self.num_kernels 161 | x = F.leaky_relu(x) 162 | x = self.conv_post(x) 163 | x = torch.tanh(x) 164 | 165 | return x 166 | 167 | def remove_weight_norm(self): 168 | for l in self.ups: 169 | remove_weight_norm(l) 170 | for l in self.resblocks: 171 | l.remove_weight_norm() 172 | remove_weight_norm(self.conv_pre) 173 | remove_weight_norm(self.conv_post) -------------------------------------------------------------------------------- /libs/hifigan/my_config_v1_16000.json: -------------------------------------------------------------------------------- 1 | { 2 | "resblock": "1", 3 | "num_gpus": 0, 4 | "batch_size": 16, 5 | "learning_rate": 0.0002, 6 | "adam_b1": 0.8, 7 | "adam_b2": 0.99, 8 | "lr_decay": 0.999, 9 | "seed": 1234, 10 | 11 | "upsample_rates": [8,5,4,2], 12 | "upsample_kernel_sizes": [16,10,8,4], 13 | "upsample_initial_channel": 512, 14 | "resblock_kernel_sizes": [3,7,11], 15 | "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], 16 | 17 | "segment_size": 8000, 18 | "num_mels": 80, 19 
| "num_freq": 1025, 20 | "n_fft": 1024, 21 | "hop_size": 320, 22 | "win_size": 1024, 23 | 24 | "sampling_rate": 16000, 25 | 26 | "fmin": 0, 27 | "fmax": 8000, 28 | "fmax_for_loss": null, 29 | 30 | "num_workers": 4, 31 | 32 | "dist_config": { 33 | "dist_backend": "nccl", 34 | "dist_url": "tcp://localhost:54321", 35 | "world_size": 1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /libs/hubert/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import ( 2 | Hubert, 3 | HubertDiscrete, 4 | HubertSoft, 5 | hubert_discrete, 6 | hubert_soft, 7 | kmeans100, 8 | ) 9 | -------------------------------------------------------------------------------- /libs/hubert/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/model.cpython-310.pyc -------------------------------------------------------------------------------- /libs/hubert/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/hubert/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /libs/hubert/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Optional, Tuple 3 | import random 4 | 5 | from sklearn.cluster import KMeans 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 11 | 12 | URLS = { 13 | "hubert-discrete": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-discrete-e9416457.pt", 14 | "hubert-soft": "https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt", 15 | "kmeans100": "https://github.com/bshall/hubert/releases/download/v0.1/kmeans100-50f36a95.pt", 16 | } 17 | 18 | 19 | class Hubert(nn.Module): 20 | def __init__(self, num_label_embeddings: int = 100, mask: bool = True): 21 | super().__init__() 22 | self._mask = mask 23 | self.feature_extractor = FeatureExtractor() 24 | self.feature_projection = FeatureProjection() 25 | self.positional_embedding = PositionalConvEmbedding() 26 | self.norm = nn.LayerNorm(768) 27 | self.dropout = nn.Dropout(0.1) 28 | self.encoder = TransformerEncoder( 29 | nn.TransformerEncoderLayer( 30 | 768, 12, 3072, activation="gelu", batch_first=True 31 | ), 32 | 12, 33 | ) 34 | self.proj = nn.Linear(768, 256) 35 | 36 | self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_()) 
37 | self.label_embedding = nn.Embedding(num_label_embeddings, 256) 38 | 39 | def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 40 | mask = None 41 | if self.training and self._mask: 42 | mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2) 43 | x[mask] = self.masked_spec_embed.to(x.dtype) 44 | return x, mask 45 | 46 | def encode( 47 | self, x: torch.Tensor, layer: Optional[int] = None 48 | ) -> Tuple[torch.Tensor, torch.Tensor]: 49 | x = self.feature_extractor(x) 50 | x = self.feature_projection(x.transpose(1, 2)) 51 | x, mask = self.mask(x) 52 | x = x + self.positional_embedding(x) 53 | x = self.dropout(self.norm(x)) 54 | x = self.encoder(x, output_layer=layer) 55 | return x, mask 56 | 57 | def logits(self, x: torch.Tensor) -> torch.Tensor: 58 | logits = torch.cosine_similarity( 59 | x.unsqueeze(2), 60 | self.label_embedding.weight.unsqueeze(0).unsqueeze(0), 61 | dim=-1, 62 | ) 63 | return logits / 0.1 64 | 65 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 66 | x, mask = self.encode(x) 67 | x = self.proj(x) 68 | logits = self.logits(x) 69 | return logits, mask 70 | 71 | 72 | class HubertSoft(Hubert): 73 | def __init__(self): 74 | super().__init__() 75 | 76 | @torch.inference_mode() 77 | def units(self, wav: torch.Tensor) -> torch.Tensor: 78 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 79 | x, _ = self.encode(wav) 80 | x = self.proj(x) 81 | x = x.transpose(2, 1) 82 | return x 83 | 84 | 85 | class HubertDiscrete(Hubert): 86 | def __init__(self, kmeans): 87 | super().__init__(504) 88 | self.kmeans = kmeans 89 | 90 | @torch.inference_mode() 91 | def units(self, wav: torch.Tensor) -> torch.LongTensor: 92 | wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2)) 93 | x, _ = self.encode(wav, layer=7) 94 | x = self.kmeans.predict(x.squeeze().cpu().numpy()) 95 | return torch.tensor(x, dtype=torch.long, device=wav.device) 96 | 97 | 98 | class FeatureExtractor(nn.Module): 99 | def __init__(self): 100 | super().__init__() 101 | self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False) 102 | self.norm0 = nn.GroupNorm(512, 512) 103 | self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False) 104 | self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False) 105 | self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False) 106 | self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False) 107 | self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False) 108 | self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False) 109 | 110 | def forward(self, x: torch.Tensor) -> torch.Tensor: 111 | x = F.gelu(self.norm0(self.conv0(x))) 112 | x = F.gelu(self.conv1(x)) 113 | x = F.gelu(self.conv2(x)) 114 | x = F.gelu(self.conv3(x)) 115 | x = F.gelu(self.conv4(x)) 116 | x = F.gelu(self.conv5(x)) 117 | x = F.gelu(self.conv6(x)) 118 | return x 119 | 120 | 121 | class FeatureProjection(nn.Module): 122 | def __init__(self): 123 | super().__init__() 124 | self.norm = nn.LayerNorm(512) 125 | self.projection = nn.Linear(512, 768) 126 | self.dropout = nn.Dropout(0.1) 127 | 128 | def forward(self, x: torch.Tensor) -> torch.Tensor: 129 | x = self.norm(x) 130 | x = self.projection(x) 131 | x = self.dropout(x) 132 | return x 133 | 134 | 135 | class PositionalConvEmbedding(nn.Module): 136 | def __init__(self): 137 | super().__init__() 138 | self.conv = nn.Conv1d( 139 | 768, 140 | 768, 141 | kernel_size=128, 142 | padding=128 // 2, 143 | groups=16, 144 | ) 145 | self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) 146 | 147 | def forward(self, x: torch.Tensor) -> torch.Tensor: 148 | x = 
self.conv(x.transpose(1, 2)) 149 | x = F.gelu(x[:, :, :-1]) 150 | return x.transpose(1, 2) 151 | 152 | 153 | class TransformerEncoder(nn.Module): 154 | def __init__( 155 | self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int 156 | ) -> None: 157 | super(TransformerEncoder, self).__init__() 158 | self.layers = nn.ModuleList( 159 | [copy.deepcopy(encoder_layer) for _ in range(num_layers)] 160 | ) 161 | self.num_layers = num_layers 162 | 163 | def forward( 164 | self, 165 | src: torch.Tensor, 166 | mask: torch.Tensor = None, 167 | src_key_padding_mask: torch.Tensor = None, 168 | output_layer: Optional[int] = None, 169 | ) -> torch.Tensor: 170 | output = src 171 | for layer in self.layers[:output_layer]: 172 | output = layer( 173 | output, src_mask=mask, src_key_padding_mask=src_key_padding_mask 174 | ) 175 | return output 176 | 177 | 178 | def _compute_mask( 179 | shape: Tuple[int, int], 180 | mask_prob: float, 181 | mask_length: int, 182 | device: torch.device, 183 | min_masks: int = 0, 184 | ) -> torch.Tensor: 185 | batch_size, sequence_length = shape 186 | 187 | if mask_length < 1: 188 | raise ValueError("`mask_length` has to be bigger than 0.") 189 | 190 | if mask_length > sequence_length: 191 | raise ValueError( 192 | f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" 193 | ) 194 | 195 | # compute number of masked spans in batch 196 | num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random()) 197 | num_masked_spans = max(num_masked_spans, min_masks) 198 | 199 | # make sure num masked indices <= sequence_length 200 | if num_masked_spans * mask_length > sequence_length: 201 | num_masked_spans = sequence_length // mask_length 202 | 203 | # SpecAugment mask to fill 204 | mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool) 205 | 206 | # uniform distribution to sample from, make sure that offset samples are < sequence_length 207 | uniform_dist = torch.ones( 208 | (batch_size, sequence_length - (mask_length - 1)), device=device 209 | ) 210 | 211 | # get random indices to mask 212 | mask_indices = torch.multinomial(uniform_dist, num_masked_spans) 213 | 214 | # expand masked indices to masked spans 215 | mask_indices = ( 216 | mask_indices.unsqueeze(dim=-1) 217 | .expand((batch_size, num_masked_spans, mask_length)) 218 | .reshape(batch_size, num_masked_spans * mask_length) 219 | ) 220 | offsets = ( 221 | torch.arange(mask_length, device=device)[None, None, :] 222 | .expand((batch_size, num_masked_spans, mask_length)) 223 | .reshape(batch_size, num_masked_spans * mask_length) 224 | ) 225 | mask_idxs = mask_indices + offsets 226 | 227 | # scatter indices to mask 228 | mask = mask.scatter(1, mask_idxs, True) 229 | 230 | return mask 231 | 232 | 233 | def hubert_discrete( 234 | pretrained: bool = True, 235 | progress: bool = True, 236 | ) -> HubertDiscrete: 237 | r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 
238 | Args: 239 | pretrained (bool): load pretrained weights into the model 240 | progress (bool): show progress bar when downloading model 241 | """ 242 | kmeans = kmeans100(pretrained=pretrained, progress=progress) 243 | hubert = HubertDiscrete(kmeans) 244 | if pretrained: 245 | checkpoint = torch.hub.load_state_dict_from_url( 246 | URLS["hubert-discrete"], progress=progress 247 | ) 248 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 249 | hubert.load_state_dict(checkpoint) 250 | hubert.eval() 251 | return hubert 252 | 253 | 254 | def hubert_soft( 255 | pretrained: bool = True, 256 | progress: bool = True, 257 | ) -> HubertSoft: 258 | r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`. 259 | Args: 260 | pretrained (bool): load pretrained weights into the model 261 | progress (bool): show progress bar when downloading model 262 | """ 263 | hubert = HubertSoft() 264 | if pretrained: 265 | checkpoint = torch.hub.load_state_dict_from_url( 266 | URLS["hubert-soft"], progress=progress 267 | ) 268 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 269 | hubert.load_state_dict(checkpoint) 270 | hubert.eval() 271 | return hubert 272 | 273 | 274 | def _kmeans( 275 | num_clusters: int, pretrained: bool = True, progress: bool = True 276 | ) -> KMeans: 277 | kmeans = KMeans(num_clusters) 278 | if pretrained: 279 | checkpoint = torch.hub.load_state_dict_from_url( 280 | URLS[f"kmeans{num_clusters}"], progress=progress 281 | ) 282 | kmeans.__dict__["n_features_in_"] = checkpoint["n_features_in_"] 283 | kmeans.__dict__["_n_threads"] = checkpoint["_n_threads"] 284 | kmeans.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy() 285 | return kmeans 286 | 287 | 288 | def kmeans100(pretrained: bool = True, progress: bool = True) -> KMeans: 289 | r""" 290 | k-means checkpoint for HuBERT-Discrete with 100 clusters. 
291 | Args: 292 | pretrained (bool): load pretrained weights into the model 293 | progress (bool): show progress bar when downloading model 294 | """ 295 | return _kmeans(100, pretrained, progress) 296 | -------------------------------------------------------------------------------- /libs/hubert/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Metric: 5 | def __init__(self): 6 | self.steps = 0 7 | self.value = 0 8 | 9 | def update(self, value): 10 | self.steps += 1 11 | self.value += (value - self.value) / self.steps 12 | return self.value 13 | 14 | def reset(self): 15 | self.steps = 0 16 | self.value = 0 17 | 18 | 19 | def save_checkpoint( 20 | checkpoint_dir, 21 | hubert, 22 | optimizer, 23 | scaler, 24 | step, 25 | loss, 26 | best, 27 | logger, 28 | ): 29 | state = { 30 | "hubert": hubert.state_dict(), 31 | "optimizer": optimizer.state_dict(), 32 | "scaler": scaler.state_dict(), 33 | "step": step, 34 | "loss": loss, 35 | } 36 | checkpoint_dir.mkdir(exist_ok=True, parents=True) 37 | checkpoint_path = checkpoint_dir / f"model-{step}.pt" 38 | torch.save(state, checkpoint_path) 39 | if best: 40 | best_path = checkpoint_dir / "model-best.pt" 41 | torch.save(state, best_path) 42 | logger.info(f"Saved checkpoint: {checkpoint_path.stem}") 43 | 44 | 45 | def load_checkpoint( 46 | load_path, 47 | hubert, 48 | optimizer, 49 | scaler, 50 | rank, 51 | logger, 52 | ): 53 | logger.info(f"Loading checkpoint from {load_path}") 54 | checkpoint = torch.load(load_path, map_location={"cuda:0": f"cuda:{rank}"}) 55 | hubert.load_state_dict(checkpoint["hubert"]) 56 | scaler.load_state_dict(checkpoint["scaler"]) 57 | optimizer.load_state_dict(checkpoint["optimizer"]) 58 | return checkpoint["step"], checkpoint["loss"] 59 | -------------------------------------------------------------------------------- /libs/wavlm/WavLM-Large.pt.txt: -------------------------------------------------------------------------------- 1 | https://github.com/microsoft/unilm/tree/master/wavlm -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/WavLM.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/WavLM.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/WavLM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/WavLM.cpython-38.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
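Usage sketch (added for illustration, not a file in this repository): the factory functions in libs/hubert/model.py above can be used to pull per-frame HuBERT-Soft units from a 16 kHz mono waveform, which is the sample rate the configs in this repository assume. The audio path below is a placeholder, and torchaudio is used here only for loading and resampling.

import torch
import torchaudio

from libs.hubert.model import hubert_soft

hubert = hubert_soft(pretrained=True)                  # downloads the "hubert-soft" checkpoint listed in URLS
wav, sr = torchaudio.load("example.wav")               # placeholder path; expects a mono file
wav = torchaudio.functional.resample(wav, sr, 16000)   # the repository's models operate on 16 kHz audio
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))             # input shaped (batch, 1, samples)
print(units.shape)                                     # per-frame 256-dimensional soft units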
/libs/wavlm/__pycache__/modules.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/modules.cpython-310.pyc -------------------------------------------------------------------------------- /libs/wavlm/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/libs/wavlm/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /minimal_quickvc/commons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_weights(m, mean=0.0, std=0.01): 5 | classname = m.__class__.__name__ 6 | if classname.find("Conv") != -1: 7 | m.weight.data.normal_(mean, std) 8 | 9 | 10 | def get_padding(kernel_size, dilation=1): 11 | return int((kernel_size*dilation - dilation)/2) 12 | 13 | 14 | def rand_slice_segments(x, x_lengths=None, segment_size=4): 15 | b, d, t = x.size() 16 | if x_lengths is None: 17 | x_lengths = t 18 | ids_str_max = x_lengths - segment_size + 1 19 | ids_str = (torch.rand([b]).to(device=x.device) 20 | * ids_str_max).to(dtype=torch.long) 21 | ret = slice_segments(x, ids_str, segment_size) 22 | return ret, ids_str 23 | 24 | 25 | @torch.jit.script 26 | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): 27 | n_channels_int = n_channels[0] 28 | in_act = input_a + input_b 29 | t_act = torch.tanh(in_act[:, :n_channels_int, :]) 30 | s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) 31 | acts = t_act * s_act 32 | return acts 33 | 34 | 35 | def sequence_mask(length, max_length=None): 36 | if max_length is None: 37 | max_length = length.max() 38 | x = torch.arange(max_length, dtype=length.dtype, device=length.device) 39 | return x.unsqueeze(0) < length.unsqueeze(1) 40 | 41 | 42 | -------------------------------------------------------------------------------- /minimal_quickvc/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from torch.nn import Conv1d 6 | from torch.nn.utils import weight_norm, remove_weight_norm 7 | 8 | from . import commons 9 | 10 | 11 | LRELU_SLOPE = 0.1 12 | 13 | 14 | class LayerNorm(nn.Module): 15 | def __init__(self, channels, eps=1e-5): 16 | super().__init__() 17 | self.channels = channels 18 | self.eps = eps 19 | 20 | self.gamma = nn.Parameter(torch.ones(channels)) 21 | self.beta = nn.Parameter(torch.zeros(channels)) 22 | 23 | def forward(self, x): 24 | x = x.transpose(1, -1) 25 | x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) 26 | return x.transpose(1, -1) 27 | 28 | 29 | class ConvReluNorm(nn.Module): 30 | def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): 31 | super().__init__() 32 | self.in_channels = in_channels 33 | self.hidden_channels = hidden_channels 34 | self.out_channels = out_channels 35 | self.kernel_size = kernel_size 36 | self.n_layers = n_layers 37 | self.p_dropout = p_dropout 38 | assert n_layers > 1, "Number of layers should be larger than 0." 
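        # The layers assembled below apply Conv1d -> LayerNorm -> ReLU -> Dropout n_layers times,
        # then add a zero-initialized 1x1 projection of the result back onto the input as a residual.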
39 | 40 | self.conv_layers = nn.ModuleList() 41 | self.norm_layers = nn.ModuleList() 42 | self.conv_layers.append( 43 | nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 44 | self.norm_layers.append(LayerNorm(hidden_channels)) 45 | self.relu_drop = nn.Sequential( 46 | nn.ReLU(), 47 | nn.Dropout(p_dropout)) 48 | for _ in range(n_layers-1): 49 | self.conv_layers.append(nn.Conv1d( 50 | hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) 51 | self.norm_layers.append(LayerNorm(hidden_channels)) 52 | self.proj = nn.Conv1d(hidden_channels, out_channels, 1) 53 | self.proj.weight.data.zero_() 54 | self.proj.bias.data.zero_() 55 | 56 | def forward(self, x, x_mask): 57 | x_org = x 58 | for i in range(self.n_layers): 59 | x = self.conv_layers[i](x * x_mask) 60 | x = self.norm_layers[i](x) 61 | x = self.relu_drop(x) 62 | x = x_org + self.proj(x) 63 | return x * x_mask 64 | 65 | 66 | class DDSConv(nn.Module): 67 | """ 68 | Dialted and Depth-Separable Convolution 69 | """ 70 | 71 | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): 72 | super().__init__() 73 | self.channels = channels 74 | self.kernel_size = kernel_size 75 | self.n_layers = n_layers 76 | self.p_dropout = p_dropout 77 | 78 | self.drop = nn.Dropout(p_dropout) 79 | self.convs_sep = nn.ModuleList() 80 | self.convs_1x1 = nn.ModuleList() 81 | self.norms_1 = nn.ModuleList() 82 | self.norms_2 = nn.ModuleList() 83 | for i in range(n_layers): 84 | dilation = kernel_size ** i 85 | padding = (kernel_size * dilation - dilation) // 2 86 | self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, 87 | groups=channels, dilation=dilation, padding=padding 88 | )) 89 | self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) 90 | self.norms_1.append(LayerNorm(channels)) 91 | self.norms_2.append(LayerNorm(channels)) 92 | 93 | def forward(self, x, x_mask, g=None): 94 | if g is not None: 95 | x = x + g 96 | for i in range(self.n_layers): 97 | y = self.convs_sep[i](x * x_mask) 98 | y = self.norms_1[i](y) 99 | y = F.gelu(y) 100 | y = self.convs_1x1[i](y) 101 | y = self.norms_2[i](y) 102 | y = F.gelu(y) 103 | y = self.drop(y) 104 | x = x + y 105 | return x * x_mask 106 | 107 | 108 | class WN(torch.nn.Module): 109 | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): 110 | super(WN, self).__init__() 111 | assert (kernel_size % 2 == 1) 112 | self.hidden_channels = hidden_channels 113 | self.kernel_size = kernel_size, 114 | self.dilation_rate = dilation_rate 115 | self.n_layers = n_layers 116 | self.gin_channels = gin_channels 117 | self.p_dropout = p_dropout 118 | 119 | self.in_layers = torch.nn.ModuleList() 120 | self.res_skip_layers = torch.nn.ModuleList() 121 | self.drop = nn.Dropout(p_dropout) 122 | 123 | if gin_channels != 0: 124 | cond_layer = torch.nn.Conv1d( 125 | gin_channels, 2*hidden_channels*n_layers, 1) 126 | self.cond_layer = torch.nn.utils.weight_norm( 127 | cond_layer, name='weight') 128 | 129 | for i in range(n_layers): 130 | dilation = dilation_rate ** i 131 | padding = int((kernel_size * dilation - dilation) / 2) 132 | in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, 133 | dilation=dilation, padding=padding) 134 | in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') 135 | self.in_layers.append(in_layer) 136 | 137 | # last one is not necessary 138 | if i < n_layers - 1: 139 | res_skip_channels = 2 * hidden_channels 140 | else: 141 | res_skip_channels = hidden_channels 142 | 143 | 
res_skip_layer = torch.nn.Conv1d( 144 | hidden_channels, res_skip_channels, 1) 145 | res_skip_layer = torch.nn.utils.weight_norm( 146 | res_skip_layer, name='weight') 147 | self.res_skip_layers.append(res_skip_layer) 148 | 149 | def forward(self, x, x_mask, g=None, **kwargs): 150 | output = torch.zeros_like(x) 151 | n_channels_tensor = torch.IntTensor([self.hidden_channels]) 152 | 153 | if g is not None: 154 | g = self.cond_layer(g) 155 | 156 | for i in range(self.n_layers): 157 | x_in = self.in_layers[i](x) 158 | if g is not None: 159 | cond_offset = i * 2 * self.hidden_channels 160 | g_l = g[:, cond_offset:cond_offset+2*self.hidden_channels, :] 161 | else: 162 | g_l = torch.zeros_like(x_in) 163 | 164 | acts = commons.fused_add_tanh_sigmoid_multiply( 165 | x_in, 166 | g_l, 167 | n_channels_tensor) 168 | acts = self.drop(acts) 169 | 170 | res_skip_acts = self.res_skip_layers[i](acts) 171 | if i < self.n_layers - 1: 172 | res_acts = res_skip_acts[:, :self.hidden_channels, :] 173 | x = (x + res_acts) * x_mask 174 | output = output + res_skip_acts[:, self.hidden_channels:, :] 175 | else: 176 | output = output + res_skip_acts 177 | return output * x_mask 178 | 179 | def remove_weight_norm(self): 180 | if self.gin_channels != 0: 181 | torch.nn.utils.remove_weight_norm(self.cond_layer) 182 | for l in self.in_layers: 183 | torch.nn.utils.remove_weight_norm(l) 184 | for l in self.res_skip_layers: 185 | torch.nn.utils.remove_weight_norm(l) 186 | 187 | 188 | class ResBlock1(torch.nn.Module): 189 | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): 190 | super(ResBlock1, self).__init__() 191 | self.convs1 = nn.ModuleList([ 192 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 193 | padding=commons.get_padding(kernel_size, dilation[0]))), 194 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 195 | padding=commons.get_padding(kernel_size, dilation[1]))), 196 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], 197 | padding=commons.get_padding(kernel_size, dilation[2]))) 198 | ]) 199 | self.convs1.apply(commons.init_weights) 200 | 201 | self.convs2 = nn.ModuleList([ 202 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 203 | padding=commons.get_padding(kernel_size, 1))), 204 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 205 | padding=commons.get_padding(kernel_size, 1))), 206 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, 207 | padding=commons.get_padding(kernel_size, 1))) 208 | ]) 209 | self.convs2.apply(commons.init_weights) 210 | 211 | def forward(self, x, x_mask=None): 212 | for c1, c2 in zip(self.convs1, self.convs2): 213 | xt = F.leaky_relu(x, LRELU_SLOPE) 214 | if x_mask is not None: 215 | xt = xt * x_mask 216 | xt = c1(xt) 217 | xt = F.leaky_relu(xt, LRELU_SLOPE) 218 | # print(xt.size()) 219 | if x_mask is not None: 220 | xt = xt * x_mask 221 | xt = c2(xt) 222 | # print(xt.size()) 223 | x = xt + x 224 | if x_mask is not None: 225 | x = x * x_mask 226 | return x 227 | 228 | def remove_weight_norm(self): 229 | for l in self.convs1: 230 | remove_weight_norm(l) 231 | for l in self.convs2: 232 | remove_weight_norm(l) 233 | 234 | 235 | class ResBlock2(torch.nn.Module): 236 | def __init__(self, channels, kernel_size=3, dilation=(1, 3)): 237 | super(ResBlock2, self).__init__() 238 | self.convs = nn.ModuleList([ 239 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], 240 | padding=commons.get_padding(kernel_size, 
dilation[0]))), 241 | weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], 242 | padding=commons.get_padding(kernel_size, dilation[1]))) 243 | ]) 244 | self.convs.apply(commons.init_weights) 245 | 246 | def forward(self, x, x_mask=None): 247 | for c in self.convs: 248 | xt = F.leaky_relu(x, LRELU_SLOPE) 249 | if x_mask is not None: 250 | xt = xt * x_mask 251 | xt = c(xt) 252 | x = xt + x 253 | if x_mask is not None: 254 | x = x * x_mask 255 | return x 256 | 257 | def remove_weight_norm(self): 258 | for l in self.convs: 259 | remove_weight_norm(l) 260 | 261 | 262 | class Log(nn.Module): 263 | def forward(self, x, x_mask, reverse=False, **kwargs): 264 | if not reverse: 265 | y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask 266 | logdet = torch.sum(-y, [1, 2]) 267 | return y, logdet 268 | else: 269 | x = torch.exp(x) * x_mask 270 | return x 271 | 272 | 273 | class Flip(nn.Module): 274 | def forward(self, x, *args, reverse=False, **kwargs): 275 | x = torch.flip(x, [1]) 276 | if not reverse: 277 | logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) 278 | return x, logdet 279 | else: 280 | return x 281 | 282 | 283 | class ResidualCouplingLayer(nn.Module): 284 | def __init__(self, 285 | channels, 286 | hidden_channels, 287 | kernel_size, 288 | dilation_rate, 289 | n_layers, 290 | p_dropout=0, 291 | gin_channels=0, 292 | mean_only=False): 293 | assert channels % 2 == 0, "channels should be divisible by 2" 294 | super().__init__() 295 | self.channels = channels 296 | self.hidden_channels = hidden_channels 297 | self.kernel_size = kernel_size 298 | self.dilation_rate = dilation_rate 299 | self.n_layers = n_layers 300 | self.half_channels = channels // 2 301 | self.mean_only = mean_only 302 | 303 | self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) 304 | self.enc = WN(hidden_channels, kernel_size, dilation_rate, 305 | n_layers, p_dropout=p_dropout, gin_channels=gin_channels) 306 | self.post = nn.Conv1d( 307 | hidden_channels, self.half_channels * (2 - mean_only), 1) 308 | self.post.weight.data.zero_() 309 | self.post.bias.data.zero_() 310 | 311 | def forward(self, x, x_mask, g=None, reverse=False): 312 | x0, x1 = torch.split(x, [self.half_channels]*2, 1) 313 | h = self.pre(x0) * x_mask 314 | h = self.enc(h, x_mask, g=g) 315 | stats = self.post(h) * x_mask 316 | if not self.mean_only: 317 | m, logs = torch.split(stats, [self.half_channels]*2, 1) 318 | else: 319 | m = stats 320 | logs = torch.zeros_like(m) 321 | 322 | if not reverse: 323 | x1 = m + x1 * torch.exp(logs) * x_mask 324 | x = torch.cat([x0, x1], 1) 325 | logdet = torch.sum(logs, [1, 2]) 326 | return x, logdet 327 | else: 328 | x1 = (x1 - m) * torch.exp(-logs) * x_mask 329 | x = torch.cat([x0, x1], 1) 330 | return x 331 | -------------------------------------------------------------------------------- /minimal_quickvc/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import logging as logger 4 | 5 | 6 | def load_checkpoint(checkpoint_path, model, optimizer=None): 7 | assert os.path.isfile(checkpoint_path) 8 | checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') 9 | iteration = checkpoint_dict['iteration'] 10 | learning_rate = checkpoint_dict['learning_rate'] 11 | if optimizer is not None: 12 | optimizer.load_state_dict(checkpoint_dict['optimizer']) 13 | saved_state_dict = checkpoint_dict['model'] 14 | if hasattr(model, 'module'): 15 | state_dict = model.module.state_dict() 16 | else: 17 | state_dict = 
model.state_dict() 18 | new_state_dict = {} 19 | for k, v in state_dict.items(): 20 | try: 21 | new_state_dict[k] = saved_state_dict[k] 22 | except: 23 | logger.info("%s is not in the checkpoint" % k) 24 | new_state_dict[k] = v 25 | if hasattr(model, 'module'): 26 | model.module.load_state_dict(new_state_dict) 27 | else: 28 | model.load_state_dict(new_state_dict) 29 | logger.info("Loaded checkpoint '{}' (iteration {})" .format( 30 | checkpoint_path, iteration)) 31 | return model, optimizer, learning_rate, iteration 32 | -------------------------------------------------------------------------------- /minimal_wesper/config/LJ_hubert_layer12/stats.json: -------------------------------------------------------------------------------- 1 | {"pitch": [-3.017691628597761, 14.210434825858718, 206.99761689758864, 49.12105044064982], "energy": [-1.1875702142715454, 16.95836639404297, 20.390984369035806, 17.155741255242276]} -------------------------------------------------------------------------------- /minimal_wesper/config/my_model16000.yaml: -------------------------------------------------------------------------------- 1 | transformer: 2 | encoder_layer: 4 3 | encoder_head: 2 4 | encoder_hidden: 256 5 | decoder_layer: 6 6 | decoder_head: 2 7 | decoder_hidden: 256 8 | conv_filter_size: 1024 9 | conv_kernel_size: [9, 1] 10 | encoder_dropout: 0.2 11 | decoder_dropout: 0.2 12 | 13 | variance_predictor: 14 | filter_size: 256 15 | kernel_size: 3 16 | dropout: 0.5 17 | 18 | variance_embedding: 19 | pitch_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing 20 | energy_quantization: "linear" # support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing 21 | n_bins: 256 22 | 23 | # gst: 24 | # use_gst: False 25 | # conv_filters: [32, 32, 64, 64, 128, 128] 26 | # gru_hidden: 128 27 | # token_size: 128 28 | # n_style_token: 10 29 | # attn_head: 4 30 | 31 | multi_speaker: False 32 | 33 | max_seq_len: 1000 34 | 35 | soft_unit: True # rkmt 2022.7.3 for HuBERT soft unit 36 | #soft_unit_dim: 768 # rkmt 2022.7.9 for HuBERT 37 | 38 | vocoder: 39 | model: "HiFi-GAN16k" # support 'HiFi-GAN', 'MelGAN' Hifi-GAN16k 40 | speaker: "universal" # was LJSpeech : support 'LJSpeech', 'universal' 41 | -------------------------------------------------------------------------------- /minimal_wesper/config/my_preprocess16k_LJ.yaml: -------------------------------------------------------------------------------- 1 | dataset: "rkmt" 2 | 3 | path: 4 | raw_path: "/home/rekimoto/Dropbox/my/data/LJSpeech1.1/wavs" 5 | preprocessed_path: "./minimal_wesper/config/LJ_hubert_layer12" 6 | hubert_checkpoint_path: "./HuBERT/model-layer12-450000.pt" 7 | 8 | preprocessing: 9 | val_size: 512 # validation size 10 | text: 11 | text_cleaners: ["unit"] # was ["english_cleaners"] ["tkn"] 12 | # layer: 7 # layer number. 
0 if use soft unit 13 | language: "en" 14 | audio: 15 | sampling_rate: 16000 # was 22050 16 | max_wav_value: 32768.0 17 | stft: 18 | filter_length: 1024 19 | hop_length: 320 # was 256 20 | win_length: 1024 21 | mel: 22 | n_mel_channels: 80 23 | mel_fmin: 0 24 | mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder 25 | pitch: 26 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 27 | normalization: True 28 | energy: 29 | feature: "phoneme_level" # support 'phoneme_level' or 'frame_level' 30 | normalization: True 31 | -------------------------------------------------------------------------------- /minimal_wesper/whisper_normal.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import yaml 4 | 5 | from torch import nn 6 | from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present 7 | 8 | # FastSpeech2 9 | from libs.FastSpeech2 import FastSpeech2 10 | from libs.FastSpeech2.utils.tools import pad_2D 11 | # HuBERT 12 | from libs.hubert.model import HubertSoft 13 | # Hifigan 14 | from libs import hifigan 15 | from libs.hifigan.model import vocoder_infer 16 | 17 | def load_fastspeech2(device='cuda'): 18 | checkpoint_path = 'https://github.com/rkmt/wesper-demo/releases/download/v0.1/googletts_neutral_best.tar' 19 | preprocess_config = './minimal_wesper/config/my_preprocess16k_LJ.yaml' 20 | model_config = './minimal_wesper/config/my_model16000.yaml' 21 | preprocess_config = yaml.load(open(preprocess_config, "r"), Loader=yaml.FullLoader) 22 | model_config = yaml.load(open(model_config, "r"), Loader=yaml.FullLoader) 23 | 24 | model = FastSpeech2(preprocess_config, model_config).to(device) 25 | if checkpoint_path.startswith("http"): 26 | ckpt = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 27 | else: 28 | ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 29 | model.load_state_dict(ckpt["model"], strict=True) 30 | 31 | model = model.to(device) 32 | model.eval() 33 | model.requires_grad_ = False 34 | return model 35 | 36 | 37 | def load_hubert(device='cuda'): 38 | checkpoint_path = "https://github.com/rkmt/wesper-demo/releases/download/v0.1/model-layer12-450000.pt" 39 | if checkpoint_path.startswith("http"): 40 | checkpoint = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 41 | else: 42 | checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 43 | hubert = HubertSoft().to(device) 44 | 45 | checkpoint = checkpoint['hubert'] if checkpoint['hubert'] is not None else checkpoint 46 | consume_prefix_in_state_dict_if_present(checkpoint, "module.") 47 | 48 | hubert.load_state_dict(checkpoint, strict=True) 49 | hubert.eval().to(device) 50 | return hubert 51 | 52 | 53 | def load_hifigan(device='cuda'): 54 | checkpoint_path='https://github.com/rkmt/wesper-demo/releases/download/v0.1/g_00205000' 55 | with open("./libs/hifigan/my_config_v1_16000.json", "r") as f: 56 | config = json.load(f) 57 | config = hifigan.AttrDict(config) 58 | vocoder = hifigan.Generator(config) 59 | if checkpoint_path.startswith("http"): 60 | ckpt = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=torch.device('cpu')) if 
device!='cuda' else torch.hub.load_state_dict_from_url(checkpoint_path) 61 | else: 62 | ckpt = torch.load(checkpoint_path, map_location=torch.device('cpu')) if device!='cuda' else torch.load(checkpoint_path) 63 | 64 | vocoder.load_state_dict(ckpt['generator']) 65 | vocoder.eval() 66 | vocoder.remove_weight_norm() 67 | vocoder.to(device) 68 | 69 | return vocoder 70 | 71 | 72 | class SynthesizerTrn(nn.Module): 73 | def __init__(self): 74 | super().__init__() 75 | self.device = 'cuda' 76 | self.fs2model = load_fastspeech2(device=self.device) 77 | self.vocoder = load_hifigan(device=self.device) 78 | 79 | def infer(self, c): 80 | c = c.squeeze(0).detach().cpu().numpy() 81 | c = pad_2D([c]) 82 | c = torch.from_numpy(c).to(self.device) 83 | speakers = torch.tensor([0], device=self.device) 84 | max_src_len = c.shape[1] 85 | src_lens = torch.tensor([max_src_len], device=self.device) 86 | 87 | with torch.no_grad(): 88 | output = self.fs2model(speakers, c, src_lens, max_src_len) 89 | mel_len = output[9][0].item() 90 | mel_prediction = output[1][0, :mel_len].detach().transpose(0, 1) 91 | 92 | with torch.no_grad(): 93 | o = vocoder_infer(mel_prediction.unsqueeze(0),self.vocoder,)[0] 94 | return o 95 | 96 | 97 | class MyWhisper2Normal(object): 98 | def __init__(self, args): 99 | self.device = args.device 100 | 101 | self.encoder = load_hubert(device=self.device) 102 | self.syn = SynthesizerTrn() 103 | 104 | def convert(self, wav_from): 105 | wav_t = torch.from_numpy(wav_from).unsqueeze(0).unsqueeze(0).to(self.device) 106 | with torch.no_grad(): 107 | units = self.encoder.units(wav_t) 108 | wav_prediction = self.syn.infer(units) 109 | wav_prediction = (wav_prediction.cpu().numpy() * 32768.0).astype("int16") 110 | return wav_prediction 111 | 112 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .loss import MultiScaleMelSpectrogramLoss, t_axis_distill_loss 2 | from .discriminators import WaveDiscriminator, ReconstructionLoss, STFTDiscriminator 3 | from .s2u import call_feature_by_name, DVAEDecoder 4 | from .u2s import Reencoder, Decoder -------------------------------------------------------------------------------- /models/discriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List, Tuple 4 | 5 | class ReconstructionLoss(nn.Module): 6 | """Reconstruction loss from https://arxiv.org/pdf/2107.03312.pdf 7 | but uses STFT instead of mel-spectrogram 8 | """ 9 | def __init__(self, eps=1e-5): 10 | super().__init__() 11 | self.eps = eps 12 | 13 | def forward(self, input, target): 14 | loss = 0 15 | input = input.to(torch.float32) 16 | target = target.to(torch.float32) 17 | for i in range(6, 12): 18 | s = 2 ** i 19 | alpha = (s / 2) ** 0.5 20 | # We use STFT instead of 64-bin mel-spectrogram as n_fft=64 is too small 21 | # for 64 bins. 
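        # Each scale s = 2**i (i = 6..11) compares STFT magnitudes of input and target,
        # trimmed to a common frame count, with an L1 term plus an alpha = sqrt(s / 2)
        # weighted squared log-magnitude term; the total is averaged over the six scales.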
22 | x = torch.stft(input, n_fft=s, hop_length=s // 4, win_length=s, normalized=True, onesided=True, return_complex=True) 23 | x = torch.abs(x) 24 | y = torch.stft(target, n_fft=s, hop_length=s // 4, win_length=s, normalized=True, onesided=True, return_complex=True) 25 | y = torch.abs(y) 26 | if x.shape[-1] > y.shape[-1]: 27 | x = x[:, :, :y.shape[-1]] 28 | elif x.shape[-1] < y.shape[-1]: 29 | y = y[:, :, :x.shape[-1]] 30 | loss += torch.mean(torch.abs(x - y)) 31 | loss += alpha * torch.mean(torch.square(torch.log(x + self.eps) - torch.log(y + self.eps))) 32 | return loss / (12 - 6) 33 | 34 | 35 | class ResNet2d(nn.Module): 36 | def __init__( 37 | self, 38 | n_channels: int, 39 | factor: int, 40 | stride: Tuple[int, int] 41 | ) -> None: 42 | # https://arxiv.org/pdf/2005.00341.pdf 43 | # The original paper uses layer normalization, but here 44 | # we use batch normalization. 45 | super().__init__() 46 | self.conv0 = nn.Conv2d( 47 | n_channels, 48 | n_channels, 49 | kernel_size=(3, 3), 50 | padding='same') 51 | self.bn0 = nn.BatchNorm2d( 52 | n_channels 53 | ) 54 | self.conv1 = nn.Conv2d( 55 | n_channels, 56 | factor * n_channels, 57 | kernel_size=(stride[0] + 2, stride[1] + 2), 58 | stride=stride) 59 | self.bn1 = nn.BatchNorm2d( 60 | factor * n_channels 61 | ) 62 | self.conv2 = nn.Conv2d( 63 | n_channels, 64 | factor * n_channels, 65 | kernel_size=1, 66 | stride=stride) 67 | self.bn2 = nn.BatchNorm2d( 68 | factor * n_channels 69 | ) 70 | self.pad = nn.ReflectionPad2d([ 71 | (stride[1] + 1) // 2, 72 | (stride[1] + 2) // 2, 73 | (stride[0] + 1) // 2, 74 | (stride[0] + 2) // 2, 75 | ]) 76 | self.activation = nn.LeakyReLU(0.3) 77 | 78 | def forward(self, input): 79 | x = self.conv0(input) 80 | x = self.bn0(x) 81 | x = self.activation(x) 82 | x = self.pad(x) 83 | x = self.conv1(x) 84 | x = self.bn1(x) 85 | 86 | # shortcut 87 | y = self.conv2(input) 88 | y = self.bn2(y) 89 | 90 | x += y 91 | x = self.activation(x) 92 | return x 93 | 94 | 95 | class WaveDiscriminator(nn.Module): 96 | r"""MelGAN discriminator from https://arxiv.org/pdf/1910.06711.pdf 97 | """ 98 | def __init__(self, resolution: int = 1, n_channels: int = 4) -> None: 99 | super().__init__() 100 | assert resolution >= 1 101 | if resolution == 1: 102 | self.avg_pool = nn.Identity() 103 | else: 104 | self.avg_pool = nn.AvgPool1d(resolution * 2, stride=resolution) 105 | self.activation = nn.LeakyReLU(0.2, inplace=True) 106 | self.layers = nn.ModuleList([ 107 | nn.utils.weight_norm(nn.Conv1d(1, n_channels, kernel_size=15, padding=7)), 108 | nn.utils.weight_norm(nn.Conv1d(n_channels, 4 * n_channels, kernel_size=41, stride=4, padding=20, groups=4)), 109 | nn.utils.weight_norm(nn.Conv1d(4 * n_channels, 16 * n_channels, kernel_size=41, stride=4, padding=20, groups=16)), 110 | nn.utils.weight_norm(nn.Conv1d(16 * n_channels, 64 * n_channels, kernel_size=41, stride=4, padding=20, groups=64)), 111 | nn.utils.weight_norm(nn.Conv1d(64 * n_channels, 256 * n_channels, kernel_size=41, stride=4, padding=20, groups=256)), 112 | nn.utils.weight_norm(nn.Conv1d(256 * n_channels, 256 * n_channels, kernel_size=5, padding=2)), 113 | nn.utils.weight_norm(nn.Conv1d(256 * n_channels, 1, kernel_size=3, padding=1)), 114 | ]) 115 | 116 | def forward(self, x: torch.Tensor) -> List[torch.Tensor]: 117 | x = self.avg_pool(x) 118 | feats = [] 119 | for layer in self.layers[:-1]: 120 | x = layer(x) 121 | feats.append(x) 122 | x = self.activation(x) 123 | feats.append(self.layers[-1](x)) 124 | return feats 125 | 126 | 127 | class STFTDiscriminator(nn.Module): 128 | 
r"""STFT-based discriminator from https://arxiv.org/pdf/2107.03312.pdf 129 | """ 130 | def __init__( 131 | self, n_fft: int = 1024, hop_length: int = 256, 132 | n_channels: int = 32 133 | ) -> None: 134 | super().__init__() 135 | self.n_fft = n_fft 136 | self.hop_length = hop_length 137 | n = n_fft // 2 + 1 138 | for _ in range(6): 139 | n = (n - 1) // 2 + 1 140 | self.layers = nn.Sequential( 141 | nn.Conv2d(1, n_channels, kernel_size=7, padding='same'), 142 | nn.LeakyReLU(0.3, inplace=True), 143 | ResNet2d(n_channels, 2, stride=(2, 1)), 144 | ResNet2d(2 * n_channels, 2, stride=(2, 2)), 145 | ResNet2d(4 * n_channels, 1, stride=(2, 1)), 146 | ResNet2d(4 * n_channels, 2, stride=(2, 2)), 147 | ResNet2d(8 * n_channels, 1, stride=(2, 1)), 148 | ResNet2d(8 * n_channels, 2, stride=(2, 2)), 149 | nn.Conv2d(16 * n_channels, 1, kernel_size=(n, 1)) 150 | ) 151 | 152 | def forward(self, input: torch.Tensor) -> torch.Tensor: 153 | assert input.shape[1] == 1 154 | # input: [batch, channel, sequence] 155 | x = torch.squeeze(input, 1).to(torch.float32) # torch.stft() doesn't accept float16 156 | x = torch.stft(x, self.n_fft, self.hop_length, normalized=True, onesided=True, return_complex=True) 157 | x = torch.abs(x) 158 | x = torch.unsqueeze(x, dim=1) 159 | x = self.layers(x) 160 | return x -------------------------------------------------------------------------------- /models/loss.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import torch 3 | import torch.nn as nn 4 | import typing 5 | from typing import List 6 | from collections import namedtuple 7 | from scipy import signal 8 | from librosa.filters import mel as librosa_mel_fn 9 | import math 10 | 11 | # Adapted from https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/loss.py under the MIT license. 12 | # LICENSE is in incl_licenses directory. 
13 | class MultiScaleMelSpectrogramLoss(nn.Module): 14 | 15 | def __init__( 16 | self, 17 | sampling_rate: int, 18 | n_mels: List[int] = [5, 10, 20, 40, 80, 160, 320], 19 | window_lengths: List[int] = [32, 64, 128, 256, 512, 1024, 2048], 20 | loss_fn: typing.Callable = nn.L1Loss(), 21 | clamp_eps: float = 1e-5, 22 | mag_weight: float = 0.0, 23 | log_weight: float = 1.0, 24 | pow: float = 1.0, 25 | weight: float = 1.0, 26 | match_stride: bool = False, 27 | mel_fmin: List[float] = [0, 0, 0, 0, 0, 0, 0], 28 | mel_fmax: List[float] = [None, None, None, None, None, None, None], 29 | window_type: str = "hann", 30 | ): 31 | super().__init__() 32 | self.sampling_rate = sampling_rate 33 | 34 | STFTParams = namedtuple( 35 | "STFTParams", 36 | ["window_length", "hop_length", "window_type", "match_stride"], 37 | ) 38 | 39 | self.stft_params = [ 40 | STFTParams( 41 | window_length=w, 42 | hop_length=w // 4, 43 | match_stride=match_stride, 44 | window_type=window_type, 45 | ) 46 | for w in window_lengths 47 | ] 48 | self.n_mels = n_mels 49 | self.loss_fn = loss_fn 50 | self.clamp_eps = clamp_eps 51 | self.log_weight = log_weight 52 | self.mag_weight = mag_weight 53 | self.weight = weight 54 | self.mel_fmin = mel_fmin 55 | self.mel_fmax = mel_fmax 56 | self.pow = pow 57 | 58 | @staticmethod 59 | @functools.lru_cache(None) 60 | def get_window( 61 | window_type, 62 | window_length, 63 | ): 64 | return signal.get_window(window_type, window_length) 65 | 66 | @staticmethod 67 | @functools.lru_cache(None) 68 | def get_mel_filters(sr, n_fft, n_mels, fmin, fmax): 69 | return librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) 70 | 71 | def mel_spectrogram( 72 | self, 73 | wav, 74 | n_mels, 75 | fmin, 76 | fmax, 77 | window_length, 78 | hop_length, 79 | match_stride, 80 | window_type, 81 | ): 82 | 83 | B, C, T = wav.shape 84 | 85 | if match_stride: 86 | assert ( 87 | hop_length == window_length // 4 88 | ), "For match_stride, hop must equal n_fft // 4" 89 | right_pad = math.ceil(T / hop_length) * hop_length - T 90 | pad = (window_length - hop_length) // 2 91 | else: 92 | right_pad = 0 93 | pad = 0 94 | 95 | wav = torch.nn.functional.pad(wav, (pad, pad + right_pad), mode="reflect") 96 | 97 | window = self.get_window(window_type, window_length) 98 | window = torch.from_numpy(window).to(wav.device).float() 99 | 100 | stft = torch.stft( 101 | wav.reshape(-1, T), 102 | n_fft=window_length, 103 | hop_length=hop_length, 104 | window=window, 105 | return_complex=True, 106 | center=True, 107 | ) 108 | _, nf, nt = stft.shape 109 | stft = stft.reshape(B, C, nf, nt) 110 | if match_stride: 111 | stft = stft[..., 2:-2] 112 | magnitude = torch.abs(stft) 113 | 114 | nf = magnitude.shape[2] 115 | mel_basis = self.get_mel_filters( 116 | self.sampling_rate, 2 * (nf - 1), n_mels, fmin, fmax 117 | ) 118 | mel_basis = torch.from_numpy(mel_basis).to(wav.device) 119 | mel_spectrogram = magnitude.transpose(2, -1) @ mel_basis.T 120 | mel_spectrogram = mel_spectrogram.transpose(-1, 2) 121 | 122 | return mel_spectrogram 123 | 124 | def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: 125 | 126 | loss = 0.0 127 | for n_mels, fmin, fmax, s in zip( 128 | self.n_mels, self.mel_fmin, self.mel_fmax, self.stft_params 129 | ): 130 | kwargs = { 131 | "n_mels": n_mels, 132 | "fmin": fmin, 133 | "fmax": fmax, 134 | "window_length": s.window_length, 135 | "hop_length": s.hop_length, 136 | "match_stride": s.match_stride, 137 | "window_type": s.window_type, 138 | } 139 | 140 | x_mels = self.mel_spectrogram(x, **kwargs) 
141 | y_mels = self.mel_spectrogram(y, **kwargs) 142 | x_logmels = torch.log( 143 | x_mels.clamp(min=self.clamp_eps).pow(self.pow) 144 | ) / torch.log(torch.tensor(10.0)) 145 | y_logmels = torch.log( 146 | y_mels.clamp(min=self.clamp_eps).pow(self.pow) 147 | ) / torch.log(torch.tensor(10.0)) 148 | 149 | loss += self.log_weight * self.loss_fn(x_logmels, y_logmels) 150 | loss += self.mag_weight * self.loss_fn(x_logmels, y_logmels) 151 | 152 | return loss 153 | 154 | # t_axis_distill_loss copied from https://github.com/ZhangXInFD/SpeechTokenizer 155 | class t_axis_distill_loss(nn.Module): 156 | def __init__(self, **params): 157 | super().__init__() 158 | 159 | def forward(self, feature, target_feature, lambda_sim=1): 160 | n = min(feature.size(1), target_feature.size(1)) 161 | l1_loss = torch.nn.functional.mse_loss(feature[:, :n], target_feature[:, :n], reduction='mean') 162 | sim_loss = - torch.log(torch.sigmoid(torch.nn.functional.cosine_similarity(feature[:, :n], target_feature[:, :n], axis=-1))).mean() 163 | distill_loss = l1_loss + lambda_sim * sim_loss 164 | return distill_loss -------------------------------------------------------------------------------- /models/s2u.py: -------------------------------------------------------------------------------- 1 | # Torch and related libraries 2 | import torch 3 | import torch.nn as nn 4 | from nnAudio import features 5 | from utils.config import Config 6 | 7 | def call_feature_by_name(name, *args, **kwargs): 8 | func = globals().get(name) 9 | if func and callable(func): 10 | return func(*args, **kwargs) 11 | else: 12 | print("Function not found or not callable.") 13 | 14 | # Learnable MFCCs Extractor 15 | class mfcc(nn.Module): 16 | def __init__(self, trainable=False, **params): 17 | super().__init__() 18 | config = Config({}) 19 | self.spec = features.MFCC( 20 | sr=config.sample_rate, 21 | n_fft=config.n_fft, 22 | win_length=config.win_length, 23 | hop_length=config.hop_length, 24 | n_mfcc=config.n_mels, 25 | trainable_mel=trainable, 26 | trainable_STFT=trainable, 27 | ) 28 | # self.conv = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=1) 29 | self.linear = nn.Linear(config.n_mels, config.n_embed_dim) 30 | 31 | def forward(self, input): 32 | x = self.spec(input) 33 | # y = torch.repeat_interleave(x, 2, dim=1) 34 | # y = self.conv(x) 35 | x = x.permute(0, 2, 1) 36 | y = self.linear(x) 37 | y = y.permute(0, 2, 1) 38 | return y 39 | 40 | class melspec(nn.Module): 41 | def __init__(self, **params): 42 | super().__init__() 43 | # self.spec = features.MelSpectrogram( 44 | # sr=16000, 45 | # n_fft=1024, 46 | # win_length=1024, 47 | # hop_length=320, 48 | # n_mels=256, 49 | # fmin=0.0, 50 | # fmax=None, 51 | # trainable_mel=True, 52 | # trainable_STFT=True 53 | # ) 54 | self.spec = features.gammatone.Gammatonegram( 55 | sr=16000, 56 | n_fft=1024, 57 | hop_length=320, 58 | n_bins=256, 59 | fmin=0.0, 60 | fmax=None, 61 | trainable_bins=True, 62 | trainable_STFT=True 63 | ) 64 | 65 | def forward(self, input): 66 | # logmel = F.interpolate(logmel, scale_factor=2) 67 | x = self.spec(input) 68 | return x[..., :-1] 69 | 70 | class stftspec(nn.Module): 71 | def __init__(self, **params): 72 | super().__init__() 73 | self.spec = features.STFT( 74 | n_fft=1024, 75 | win_length=1024, 76 | freq_bins=256, 77 | hop_length=320, 78 | output_format="Magnitude", 79 | ) # trainable=True, 80 | 81 | def forward(self, input): 82 | return self.spec(input) 83 | 84 | 85 | # Encoder 86 | class ConvNeXtBlock(nn.Module): 87 | def __init__( 88 | self, 89 | dim: int, 90 
| intermediate_dim: int, 91 | kernel, dilation, 92 | layer_scale_init_value: float = 1e-6, 93 | ): 94 | # ConvNeXt Block copied from Vocos. 95 | super().__init__() 96 | self.dwconv = nn.Conv1d(dim, dim, 97 | kernel_size=kernel, padding=dilation*(kernel//2), 98 | dilation=dilation, groups=dim 99 | ) # depthwise conv 100 | 101 | self.norm = nn.LayerNorm(dim, eps=1e-6) 102 | self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers 103 | self.act = nn.GELU() 104 | self.pwconv2 = nn.Linear(intermediate_dim, dim) 105 | self.gamma = ( 106 | nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True) 107 | if layer_scale_init_value > 0 108 | else None 109 | ) 110 | 111 | def forward(self, x: torch.Tensor, cond = None) -> torch.Tensor: 112 | residual = x 113 | x = self.dwconv(x) 114 | x = x.transpose(1, 2) # (B, C, T) -> (B, T, C) 115 | x = self.norm(x) 116 | x = self.pwconv1(x) 117 | x = self.act(x) 118 | x = self.pwconv2(x) 119 | if self.gamma is not None: 120 | x = self.gamma * x 121 | x = x.transpose(1, 2) # (B, T, C) -> (B, C, T) 122 | 123 | x = residual + x 124 | return x 125 | 126 | class DVAEDecoder(nn.Module): 127 | def __init__(self, idim, odim, 128 | n_layer = 12, bn_dim = 64, hidden = 256, 129 | kernel = 7, dilation = 2, up = False 130 | ): 131 | super().__init__() 132 | self.up = up 133 | self.conv_in = nn.Sequential( 134 | nn.Conv1d(idim, bn_dim, 3, 1, 1), nn.GELU(), 135 | nn.Conv1d(bn_dim, hidden, 3, 1, 1) 136 | ) 137 | self.decoder_block = nn.ModuleList([ 138 | ConvNeXtBlock(hidden, hidden* 4, kernel, dilation,) 139 | for _ in range(n_layer)]) 140 | self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False) 141 | # self.layernorm1 = nn.LayerNorm(256) 142 | # self.layernorm2 = nn.LayerNorm(256, bias=False) 143 | 144 | def forward(self, input, conditioning=None): 145 | # B, T, C 146 | # x = self.layernorm1(input) 147 | x = input.transpose(1, 2) 148 | x = self.conv_in(x) 149 | for f in self.decoder_block: 150 | x = f(x, conditioning) 151 | x = self.conv_out(x) 152 | x = x.transpose(1, 2) 153 | # x = self.layernorm2(x) 154 | return x 155 | -------------------------------------------------------------------------------- /models/u2s.py: -------------------------------------------------------------------------------- 1 | # Torch and related libraries 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | # Reencoder 7 | class FiLMLayer(nn.Module): 8 | """ 9 | Feature-wise Linear Modulation (FiLM) layer 10 | Reference: https://arxiv.org/abs/1709.07871 11 | """ 12 | def __init__(self, in_channels, out_channels, cond_channels): 13 | super(FiLMLayer, self).__init__() 14 | self.in_channels = in_channels 15 | self.film = nn.Conv1d(cond_channels, (in_channels + out_channels), 1) 16 | 17 | def forward(self, x, c): 18 | gamma, beta = torch.chunk(self.film(c.unsqueeze(2)), chunks=2, dim=1) 19 | return gamma * x + beta 20 | 21 | class StyleAdaptiveLayerNorm(nn.Module): 22 | def __init__(self, in_channels, cond_channels): 23 | """ 24 | Style Adaptive Layer Normalization (SALN) module. 25 | 26 | Parameters: 27 | in_channels: The number of channels in the input feature maps. 28 | cond_channels: The number of channels in the conditioning input. 
29 | """ 30 | super(StyleAdaptiveLayerNorm, self).__init__() 31 | self.in_channels = in_channels 32 | 33 | self.saln = nn.Linear(cond_channels, in_channels * 2, 1) 34 | self.norm = nn.LayerNorm(in_channels, elementwise_affine=False) 35 | 36 | self.reset_parameters() 37 | 38 | def reset_parameters(self): 39 | nn.init.constant_(self.saln.bias.data[:self.in_channels], 1) 40 | nn.init.constant_(self.saln.bias.data[self.in_channels:], 0) 41 | 42 | def forward(self, x, c): 43 | c = self.saln(c.unsqueeze(1)) 44 | gamma, beta = torch.chunk(c, chunks=2, dim=-1) 45 | return gamma * self.norm(x) + beta 46 | 47 | class ConvNeXtBlock_Adapt(nn.Module): 48 | def __init__(self, gin_channels, layer_scale_init_value: float = 1e-6,): 49 | super().__init__() 50 | self.dwconv = nn.Conv1d(256, 256, kernel_size=7, padding=3, groups=256) 51 | self.norm = StyleAdaptiveLayerNorm(256, gin_channels) 52 | self.pwconv_2 = nn.Sequential(nn.Linear(256, 256*4), 53 | nn.GELU(), 54 | nn.Linear(256*4, 256)) 55 | self.gamma = ( 56 | nn.Parameter(layer_scale_init_value * torch.ones(256), requires_grad=True) 57 | if layer_scale_init_value > 0 58 | else None 59 | ) 60 | 61 | def forward(self, x, c) -> torch.Tensor: 62 | residual = x # 24,256,102 63 | x = self.dwconv(x) # 24,512,102 64 | x = self.norm(x.transpose(1, 2), c) # 24,512,102 65 | x = self.pwconv_2(x) 66 | if self.gamma is not None: 67 | x = self.gamma * x 68 | x = x.transpose(1, 2) 69 | x = residual + x 70 | return x 71 | 72 | class Reencoder(torch.nn.Module): 73 | def __init__(self, n_layers: int, wavenet_embed_dim: int, 74 | decoder_causal: bool = False, nn_type='conv'): 75 | super(Reencoder, self).__init__() 76 | self.nn_type = nn_type 77 | if nn_type == 'film': 78 | self.film = FiLMLayer(in_channels=256, out_channels=256, cond_channels=192) 79 | elif nn_type == 'adapt': 80 | self.adapt = ConvNeXtBlock_Adapt(gin_channels=192) 81 | elif nn_type == 'norm': 82 | self.norm = StyleAdaptiveLayerNorm(256, 192) 83 | # self.conv_out = torch.nn.Conv1d(256, 512, 1) 84 | 85 | 86 | def forward(self, c_code, spk_emb): # c_code.shape [B, 256, 100] 87 | if self.nn_type == 'conv': 88 | spk_emb = self.spk_proj(spk_emb.unsqueeze(2)) # [B, 256] 89 | c_code = c_code + spk_emb 90 | # z = self.conv_out(c_code) 91 | elif self.nn_type == 'film': 92 | x = self.film(c_code, spk_emb) 93 | c_code = self.adapt(c_code, spk_emb) 94 | # z = self.conv_out(c_code) 95 | elif self.nn_type == 'adapt': 96 | c_code = self.adapt(c_code, spk_emb) 97 | # z = self.conv_out(c_code) 98 | elif self.nn_type == 'norm': 99 | x = self.norm(c_code.transpose(1, 2), spk_emb) 100 | c_code = x.transpose(1, 2) 101 | # z = self.conv_out(c_code) 102 | # elif self.nn_type == 'wo': 103 | # # z = self.conv_out(c_code) 104 | return c_code 105 | 106 | # Decoder copied from https://github.com/kaiidams/soundstream-pytorch 107 | class ResNet1d(nn.Module): 108 | def __init__( 109 | self, 110 | n_channels, 111 | kernel_size: int = 7, 112 | padding: str = 'valid', 113 | dilation: int = 1 114 | ) -> None: 115 | super().__init__() 116 | assert padding in ['valid', 'same'] 117 | self.kernel_size = kernel_size 118 | self.padding = padding 119 | self.dilation = dilation 120 | self._padding_size = (kernel_size // 2) * dilation 121 | self.conv0 = nn.Conv1d( 122 | n_channels, 123 | n_channels, 124 | kernel_size=kernel_size, 125 | padding=padding, 126 | dilation=dilation) 127 | self.conv1 = nn.Conv1d( 128 | n_channels, 129 | n_channels, 130 | kernel_size=1) 131 | 132 | def forward(self, input): 133 | y = input 134 | x = self.conv0(input) 135 | 
x = F.elu(x) 136 | x = self.conv1(x) 137 | if self.padding == 'valid': 138 | y = y[:, :, self._padding_size:-self._padding_size] 139 | x += y 140 | x = F.elu(x) 141 | return x 142 | 143 | class DecoderBlock(nn.Module): 144 | def __init__( 145 | self, 146 | n_channels: int, 147 | padding: str, 148 | stride: int 149 | ) -> None: 150 | super().__init__() 151 | assert padding in ['valid', 'same'] 152 | self.layers = nn.Sequential( 153 | nn.ConvTranspose1d( 154 | n_channels, n_channels // 2, 155 | kernel_size=2 * stride, 156 | padding=(2 * stride) // 2 if padding == 'same' else 0, 157 | stride=stride), 158 | nn.ELU(), 159 | ResNet1d(n_channels // 2, padding=padding, dilation=1), 160 | ResNet1d(n_channels // 2, padding=padding, dilation=3), 161 | ResNet1d(n_channels // 2, padding=padding, dilation=9), 162 | ) 163 | 164 | def forward(self, input: torch.Tensor) -> torch.Tensor: 165 | return self.layers(input) 166 | 167 | class Decoder(nn.Module): 168 | def __init__(self, n_channels: int, padding): 169 | super().__init__() 170 | assert padding in ['valid', 'same'] 171 | self.layers = nn.Sequential( 172 | nn.Conv1d(16 * n_channels, 16 * n_channels, kernel_size=7, padding=padding), 173 | nn.ELU(), 174 | DecoderBlock(16 * n_channels, padding=padding, stride=8), 175 | DecoderBlock(8 * n_channels, padding=padding, stride=5), 176 | DecoderBlock(4 * n_channels, padding=padding, stride=4), 177 | DecoderBlock(2 * n_channels, padding=padding, stride=2), 178 | nn.Conv1d(n_channels, 1, kernel_size=7, padding=padding), 179 | nn.Tanh(), 180 | ) 181 | 182 | def forward(self, input: torch.Tensor) -> torch.Tensor: 183 | return self.layers(input) -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/DisocoGAN_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/DisocoGAN_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/MSpeC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/MSpeC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/WES_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/WES_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_403_headset.wav 
-------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s000_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s000_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s001_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s001_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s002_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s002_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s003_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_403_headset.wav -------------------------------------------------------------------------------- 
/raw/data_in_the_wild/W2S_403_headset/s003_QuickVC_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_QuickVC_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_403_headset/s003_W2S_403_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_403_headset/s003_W2S_403_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/DisocoGAN_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/DisocoGAN_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/MSpeC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/MSpeC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/WES_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/WES_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s000_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s000_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s000_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s000_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s001_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s001_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s001_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s001_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s002_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s002_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s002_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s002_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s003_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s003_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/W2S_416_headset/s003_QuickVC_416_headset.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/W2S_416_headset/s003_QuickVC_416_headset.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/AGAN-W2SC_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/AGAN-W2SC_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/BLSTM_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/BLSTM_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/CycleGAN-VC_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/CycleGAN-VC_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/GMM_fn001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/GMM_fn001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/WES_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/WES_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/fw001.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s000_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s000_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s000_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s000_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s001_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s001_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s001_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s001_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s002_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s002_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s002_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s002_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s003_QuickVC_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s003_QuickVC_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw001/s003_fw001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw001/s003_fw001.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/AGAN-W2SC_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/AGAN-W2SC_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/BLSTM_fn002.wav: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/BLSTM_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/CycleGAN-VC_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/CycleGAN-VC_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/GMM_fn002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/GMM_fn002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/WES_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/WES_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s000_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s000_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s000_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s000_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s001_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s001_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s001_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s001_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s002_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s002_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s002_fw002.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s002_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s003_QuickVC_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s003_QuickVC_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/fw002/s003_fw002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/fw002/s003_fw002.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/WES_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/WES_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s000_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s000_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s000_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s000_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s001_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s001_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s001_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s001_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s002_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s002_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s002_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s002_sample_whisper.wav 
-------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s003_QuickVC_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s003_QuickVC_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/s003_sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/s003_sample_whisper.wav -------------------------------------------------------------------------------- /raw/data_in_the_wild/sample_whisper/sample_whisper.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/data_in_the_wild/sample_whisper/sample_whisper.wav -------------------------------------------------------------------------------- /raw/freevc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s000u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s001u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s002u003w.wav -------------------------------------------------------------------------------- /raw/freevc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/freevc/s003u003w.wav -------------------------------------------------------------------------------- /raw/gt/s000u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s000u003n.wav -------------------------------------------------------------------------------- /raw/gt/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s000u003w.wav -------------------------------------------------------------------------------- /raw/gt/s001u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s001u003n.wav -------------------------------------------------------------------------------- /raw/gt/s001u003w.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s001u003w.wav -------------------------------------------------------------------------------- /raw/gt/s002u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s002u003n.wav -------------------------------------------------------------------------------- /raw/gt/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s002u003w.wav -------------------------------------------------------------------------------- /raw/gt/s003u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s003u003n.wav -------------------------------------------------------------------------------- /raw/gt/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/gt/s003u003w.wav -------------------------------------------------------------------------------- /raw/pseudo/s000u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s000u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s001u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s001u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s002u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s002u003n.wav -------------------------------------------------------------------------------- /raw/pseudo/s003u003n.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/pseudo/s003u003n.wav -------------------------------------------------------------------------------- /raw/quickvc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s000u003w.wav -------------------------------------------------------------------------------- /raw/quickvc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s001u003w.wav -------------------------------------------------------------------------------- /raw/quickvc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s002u003w.wav 
-------------------------------------------------------------------------------- /raw/quickvc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/quickvc/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s001u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_fs2_hifigan/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_fs2_hifigan/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s001u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_ms_istft_vits/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_ms_istft_vits/s003u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s000u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s001u003w.wav 
-------------------------------------------------------------------------------- /raw/s2u_u2s/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s002u003w.wav -------------------------------------------------------------------------------- /raw/s2u_u2s/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/s2u_u2s/s003u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s000u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s001u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s002u003w.wav -------------------------------------------------------------------------------- /raw/softvc/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/softvc/s003u003w.wav -------------------------------------------------------------------------------- /raw/test/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s000u003w.wav -------------------------------------------------------------------------------- /raw/test/s001u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s001u003w.wav -------------------------------------------------------------------------------- /raw/test/s002u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s002u003w.wav -------------------------------------------------------------------------------- /raw/test/s003u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/test/s003u003w.wav -------------------------------------------------------------------------------- /raw/wesper/s000u003w.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s000u003w.wav -------------------------------------------------------------------------------- /raw/wesper/s001u003w.wav: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s001u003w.wav
--------------------------------------------------------------------------------
/raw/wesper/s002u003w.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s002u003w.wav
--------------------------------------------------------------------------------
/raw/wesper/s003u003w.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/raw/wesper/s003u003w.wav
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
nnAudio
pesq
Cython
packaging
nemo_toolkit[asr]
--------------------------------------------------------------------------------
/resources/system_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tan90xx/distillw2n/77532048a60b47bafc98d3323a7a9e30c145c1b7/resources/system_diagram.png
--------------------------------------------------------------------------------
/utils/__init_.py:
--------------------------------------------------------------------------------
from .s2f0 import load_F0_models, wav2F0
from .s2fhubert import load_hubert, wav2units
from .audioprep import resample_if_needed, squeeze_and_normalize, pad_if_needed
--------------------------------------------------------------------------------
/utils/audioprep.py:
--------------------------------------------------------------------------------
import torch
import torchaudio
import torch.nn.functional as F

# Resampling if necessary
def resample_if_needed(signal, orig_sr, target_sr):
    if orig_sr != target_sr:
        return torchaudio.functional.resample(signal, orig_sr, target_sr)
    return signal

# Squeeze and normalize
def squeeze_and_normalize(signal):
    signal = torch.squeeze(signal)
    return signal * (0.95 / torch.max(signal))

# Pad if necessary
def pad_if_needed(signal, length):
    if signal.shape[0] < length:
        return F.pad(signal, [0, length - signal.shape[0]], "constant")
    return signal

def process_signal(signal, orig_sr, target_sr, target_len, segment_len):
    signal = resample_if_needed(signal, orig_sr, target_sr)
    signal = squeeze_and_normalize(signal)
    signal = signal[:target_len]
    signal = pad_if_needed(signal, segment_len)
    return signal
--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
import os
from omegaconf import OmegaConf

DEFAULT_DICT = {
    # Configuration ID
    'id': "null",
    # Training configuration
    'seed': 1234,
    'lr': 1e-6,
    'b1': 0.5,
    'b2': 0.9,
    'segment_length': 32270,
    # Model configuration
    'n_channels': 16,
    'n_embed_dim': 256,
    'n_reencoder_layer': 1,
    'n_encoder_layer': 12,
    'sample_rate': 16000,
    'n_mels': 128,
    'n_fft': 1024,
    'win_length': 1024,
    'hop_length': 320,
    'trainable': True,
    'padding': 'same',
    # ROOT
    'pseudo_rate': 0.4,
    'datasets_root': '/data/ssd1/tianyi.tan/soundstream',
    'F0_model_path': './libs/JDC/bst.t7',
}

class ConfigItem(dict):
    __slots__ = ()

    def __init__(self, config_dict=None):
        if config_dict is None:
            config_dict = dict()
        if isinstance(config_dict, ConfigItem):
            config_dict = config_dict.to_dict()
        assert isinstance(config_dict, dict)

        # Set attributes (not dict in ConfigItem)
        for key, value in config_dict.items():
            if isinstance(value, (list, tuple)):
                value = [ConfigItem(x) if isinstance(x, dict) else x for x in value]
            elif isinstance(value, dict):
                value = ConfigItem(value)
            elif isinstance(value, ConfigItem):
                value = ConfigItem(value.to_dict())
            elif isinstance(value, str) and value.lower() == 'none':
                value = None
            self[key] = value

    def __getattr__(self, item):
        try:
            return self[item]
        except KeyError:
            raise AttributeError(item)

    def __setattr__(self, name, value):
        self[name] = value

    def to_dict(self, recursive=True):
        conf_dict = {}
        for k, v in self.items():
            if isinstance(v, ConfigItem) and recursive:
                v = v.to_dict(recursive)
            conf_dict[k] = v
        return conf_dict

    def update(self, obj):
        assert isinstance(obj, (ConfigItem, dict))

        for k, v in obj.items():
            if k not in self or not isinstance(v, (ConfigItem, dict)):
                self[k] = v
            else:
                self[k].update(v)


class Config(ConfigItem):
    def __init__(self, yaml_object, dot_list=None):
        super().__init__(DEFAULT_DICT)

        # Check yaml_object
        if isinstance(yaml_object, str):
            assert os.path.isfile(yaml_object), yaml_object
            cfg = OmegaConf.load(yaml_object)
            if dot_list is not None:
                cfg_extra = OmegaConf.from_dotlist(dot_list)
                cfg = OmegaConf.merge(cfg, cfg_extra)
            yaml_object = OmegaConf.to_container(cfg, resolve=True)

        if isinstance(yaml_object, dict):
            yaml_object = ConfigItem(yaml_object)

        assert isinstance(yaml_object, ConfigItem)

        self.update(yaml_object)
--------------------------------------------------------------------------------
/utils/s2f0.py:
--------------------------------------------------------------------------------
import torch
from libs import JDCNet

def load_F0_models(path, device):
    F0_model = JDCNet(num_class=1, seq_len=192)
    params = torch.load(path, map_location=device, weights_only=True)['net']
    F0_model.load_state_dict(params)
    _ = F0_model.train()
    return F0_model

def wav2F0(mels, F0_model, device, norm=True):
    mels = mels.to(device)
    mels = mels.squeeze()
    F0_model = F0_model.to(device)
    with torch.no_grad():
        F0_real, _, _ = F0_model(mels.unsqueeze(1))

    # normalize f0
    # Remove unvoiced frames (replace with -1)
    gt_glob_f0s = []
    f0_targets = []
    norm_f0 = True
    if not norm_f0:
        f0_targets.append(F0_real)
    else:
        for bib in range(len(F0_real)):
            voiced_indices = F0_real[bib] > 5.0
            f0_voiced = F0_real[bib][voiced_indices]

            if len(f0_voiced) != 0:
                # Convert to log scale
                log_f0 = f0_voiced.log2()

                # Calculate mean and standard deviation
                mean_f0 = log_f0.mean()
                std_f0 = log_f0.std()
                if norm:
                    # Normalize the F0 sequence
                    normalized_f0 = (log_f0 - mean_f0) / std_f0
                else:
                    normalized_f0 = log_f0

                # Create the normalized F0 sequence with unvoiced frames
                normalized_sequence = torch.zeros_like(F0_real[bib])
                normalized_sequence[voiced_indices] = normalized_f0.to(normalized_sequence.dtype)
                normalized_sequence[~voiced_indices] = -10  # Assign -10 to unvoiced frames

                gt_glob_f0s.append(mean_f0)
            else:
                normalized_sequence = torch.zeros_like(F0_real[bib]) - 10.0
                gt_glob_f0s.append(torch.tensor(0.0).to(device))

            # f0_targets.append(normalized_sequence[single_side_context // 200:-single_side_context // 200])
            f0_targets.append(normalized_sequence)

    f0_targets = torch.stack(f0_targets).to(device)
    # fill nan with -10
    f0_targets[torch.isnan(f0_targets)] = -10.0
    # fill inf with -10
    f0_targets[torch.isinf(f0_targets)] = -10.0

    return f0_targets
--------------------------------------------------------------------------------
/utils/s2fhubert.py:
--------------------------------------------------------------------------------
import torch
import torch.nn.functional as F
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
import numpy as np
from libs import HubertSoft

# Fine-tuned Soft-Hubert Block copied from https://github.com/rkmt/wesper-demo.
def load_hubert(checkpoint_path=None, device='cuda:0'):
    print("### load_hubert", checkpoint_path, device)
    assert checkpoint_path is not None
    print("### loading checkpoint from: ", checkpoint_path)
    checkpoint = torch.load(checkpoint_path)
    hubert = HubertSoft().to(device)

    checkpoint = checkpoint['hubert'] if checkpoint['hubert'] is not None else checkpoint
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")

    hubert.load_state_dict(checkpoint, strict=True)
    hubert = hubert.eval().to(device)
    return hubert

def wav2units(wav, encoder, layer=None, device='cuda:0'):
    '''
    encoder: HuBERT
    '''
    if type(wav) == np.ndarray:
        wav = torch.tensor([wav], dtype=torch.float32, device=device)
    else:
        wav = wav.to(device)
    assert type(wav) == torch.Tensor
    if len(wav.shape) == 2:
        wav = wav.unsqueeze(0)
    with torch.inference_mode():  # wav -> HuBERT soft units
        if layer is None or layer < 0:
            units = encoder.units(wav)
        else:
            wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
            units, _ = encoder.encode(wav, layer=layer)
    return units
--------------------------------------------------------------------------------
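Usage sketch: the snippet below shows one way the helpers in utils/ compose for inference. It is a minimal sketch, assuming it is run from the repository root with the project dependencies installed and a CUDA device available; the input WAV filename and the HuBERT-Soft checkpoint path are placeholders, not files shipped with the repository.

import torch
import torchaudio

from utils.audioprep import resample_if_needed, squeeze_and_normalize
from utils.config import Config
from utils.s2fhubert import load_hubert, wav2units

device = 'cuda:0'

# Start from the built-in defaults; a dict (or a YAML path) overrides DEFAULT_DICT.
cfg = Config({'id': 'demo'})

# Load one utterance and bring it to the model sample rate (16 kHz by default).
wav, sr = torchaudio.load('sample_whisper.wav')     # placeholder path, assumed mono
wav = resample_if_needed(wav, sr, cfg.sample_rate)
wav = squeeze_and_normalize(wav)                    # 1-D tensor, peak-normalized to 0.95

# Extract HuBERT-Soft units; the checkpoint path is a placeholder.
hubert = load_hubert(checkpoint_path='checkpoints/hubert_soft.pt', device=device)
units = wav2units(wav.unsqueeze(0), hubert, device=device)
print(units.shape)                                  # (1, n_frames, unit_dim)

Note that wav2F0 in utils/s2f0.py takes mel spectrograms rather than raw audio, so pitch extraction additionally needs a mel front end; the spectrogram settings in DEFAULT_DICT (n_fft, hop_length, n_mels) are a plausible starting point but should be checked against the JDC checkpoint's training setup.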